# Cargo manifest for the `milli` crate.

[package]
name = "milli"
edition = "2021"
publish = false
# The keys below use `*.workspace = true`, i.e. their values come from the
# workspace manifest rather than being set here.
version.workspace = true
authors.workspace = true
description.workspace = true
homepage.workspace = true
readme.workspace = true
# `edition` is set explicitly above, so the workspace value is left disabled.
# edition.workspace = true
license.workspace = true

[dependencies]
bimap = { version = "0.6.3", features = ["serde"] }
bincode = "1.3.3"
bstr = "1.9.1"
bytemuck = { version = "1.16.1", features = ["extern_crate_alloc"] }
byteorder = "1.5.0"
charabia = { version = "0.8.12", default-features = false }
concat-arrays = "0.1.2"
crossbeam-channel = "0.5.13"
deserr = "0.6.2"
either = { version = "1.13.0", features = ["serde"] }
flatten-serde-json = { path = "../flatten-serde-json" }
fst = "0.4.7"
fxhash = "0.2.1"
geoutils = "0.5.1"
grenad = { version = "0.4.7", default-features = false, features = [
    "rayon",
    "tempfile",
] }
heed = { version = "0.20.3", default-features = false, features = [
    "serde-json",
    "serde-bincode",
    "read-txn-no-tls",
] }
indexmap = { version = "2.2.6", features = ["serde"] }
json-depth-checker = { path = "../json-depth-checker" }
levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] }
memchr = "2.5.0"
memmap2 = "0.9.4"
obkv = "0.2.2"
once_cell = "1.19.0"
ordered-float = "4.2.1"
rayon = "1.10.0"
roaring = { version = "0.10.6", features = ["serde"] }
rstar = { version = "0.12.0", features = ["serde"] }
serde = { version = "1.0.204", features = ["derive"] }
serde_json = { version = "1.0.120", features = ["preserve_order"] }
slice-group-by = "0.3.1"
smallstr = { version = "0.3.0", features = ["serde"] }
smallvec = "1.13.2"
smartstring = "1.0.1"
tempfile = "3.10.1"
thiserror = "1.0.61"
time = { version = "0.3.36", features = [
    "serde-well-known",
    "formatting",
    "parsing",
    "macros",
] }
uuid = { version = "1.10.0", features = ["v4"] }
filter-parser = { path = "../filter-parser" }
# documents words self-join
itertools = "0.13.0"
redis = "0.25.4"
csv = "1.3.0"
candle-core = { version = "0.6.0" }
candle-transformers = { version = "0.6.0" }
candle-nn = { version = "0.6.0" }
tokenizers = { git = "https://github.com/huggingface/tokenizers.git", tag = "v0.15.2", version = "0.15.2", default-features = false, features = [
    "onig",
] }
hf-hub = { git = "https://github.com/dureuill/hf-hub.git", branch = "rust_tls", default-features = false, features = [
    "online",
] }
tiktoken-rs = "0.5.9"
liquid = "0.26.6"
rhai = { version = "1.19.0", features = [
    "serde",
    "no_module",
    "no_custom_syntax",
    "no_time",
    "sync",
] }
arroy = "0.4.0"
rand = "0.8.5"
tracing = "0.1.40"
ureq = { version = "2.10.0", features = ["json"] }
url = "2.5.2"
rayon-par-bridge = "0.1.0"

[dev-dependencies]
mimalloc = { version = "0.1.43", default-features = false }
big_s = "1.0.2"
insta = "1.39.0"
maplit = "1.0.2"
md5 = "0.7.0"
meili-snap = { path = "../meili-snap" }
rand = { version = "0.8.5", features = ["small_rng"] }

[features]
all-tokenizations = [
    "charabia/chinese",
    "charabia/hebrew",
    "charabia/japanese",
    "charabia/thai",
    "charabia/korean",
    "charabia/greek",
    "charabia/khmer",
    "charabia/vietnamese",
]
# Use POSIX semaphores instead of SysV semaphores in LMDB
# For more information on this feature, see heed's Cargo.toml
lmdb-posix-sem = ["heed/posix-sem"]
# allow chinese specialized tokenization
chinese = ["charabia/chinese"]
chinese-pinyin = ["chinese", "charabia/chinese-normalization-pinyin"]
# allow hebrew specialized tokenization
hebrew = ["charabia/hebrew"]
# allow japanese specialized tokenization
japanese = ["charabia/japanese"]
japanese-transliteration = ["charabia/japanese-transliteration"]
# allow korean specialized tokenization
korean = ["charabia/korean"]
# allow thai specialized tokenization
thai = ["charabia/thai"]
# allow greek specialized tokenization
greek = ["charabia/greek"]
# allow khmer specialized tokenization
khmer = ["charabia/khmer"]
# allow vietnamese specialized tokenization
vietnamese = ["charabia/vietnamese"]
# force swedish character recomposition
swedish-recomposition = ["charabia/swedish-recomposition"]
# allow CUDA support
# NOTE(review): this comment originally continued with a "see ..." reference
# whose target is missing from this copy; restore the link if it is known.
cuda = ["candle-core/cuda"]