mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-11-22 18:17:39 +08:00
Merge #3866
3866: Update charabia v0.8.0 r=dureuill a=ManyTheFish # Pull Request Update Charabia: - enhance Japanese segmentation - enhance Latin tokenization - words containing `_` are now properly segmented into several words - brackets `{([])}` are no longer considered context separators, so words separated by brackets are now considered close together for the proximity ranking rule - fixes #3815 - fixes #3778 - fixes [product#151](https://github.com/meilisearch/product/discussions/151) > Important note: float numbers are now segmented around the `.`, so `3.22` is segmented as [`3`, `.`, `22`], but the middle dot isn't considered a hard separator, which means that if we search for `3.22` we find documents containing `3.22`. Co-authored-by: ManyTheFish <many@meilisearch.com>
This commit is contained in:
commit
661d1f90dc
222
Cargo.lock
generated
222
Cargo.lock
generated
@ -152,7 +152,7 @@ dependencies = [
|
|||||||
"pin-project-lite",
|
"pin-project-lite",
|
||||||
"tokio-rustls 0.23.4",
|
"tokio-rustls 0.23.4",
|
||||||
"tokio-util",
|
"tokio-util",
|
||||||
"webpki-roots",
|
"webpki-roots 0.22.6",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@ -705,24 +705,27 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "charabia"
|
name = "charabia"
|
||||||
version = "0.7.2"
|
version = "0.8.1"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "413155d93157bff9130895c3bd83970ac7f35659ca57226a96aa35cf1e8e102c"
|
checksum = "bb49850f555eb71aa6fc6d4d79420e81f4d89fa56e0e9c0f6d19aace2f56c554"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
"aho-corasick",
|
||||||
"cow-utils",
|
"cow-utils",
|
||||||
"csv",
|
"csv",
|
||||||
"deunicode",
|
"deunicode",
|
||||||
|
"either",
|
||||||
"finl_unicode",
|
"finl_unicode",
|
||||||
"fst",
|
"fst",
|
||||||
"irg-kvariants",
|
"irg-kvariants",
|
||||||
"jieba-rs",
|
"jieba-rs",
|
||||||
"lindera",
|
"lindera-core",
|
||||||
|
"lindera-dictionary",
|
||||||
|
"lindera-tokenizer",
|
||||||
"once_cell",
|
"once_cell",
|
||||||
"pinyin",
|
"pinyin",
|
||||||
"serde",
|
"serde",
|
||||||
"slice-group-by",
|
"slice-group-by",
|
||||||
"unicode-normalization",
|
"unicode-normalization",
|
||||||
"unicode-segmentation",
|
|
||||||
"wana_kana",
|
"wana_kana",
|
||||||
"whatlang",
|
"whatlang",
|
||||||
]
|
]
|
||||||
@ -2135,15 +2138,6 @@ dependencies = [
|
|||||||
"simple_asn1",
|
"simple_asn1",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "kanaria"
|
|
||||||
version = "0.2.0"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "c0f9d9652540055ac4fded998a73aca97d965899077ab1212587437da44196ff"
|
|
||||||
dependencies = [
|
|
||||||
"bitflags",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "language-tags"
|
name = "language-tags"
|
||||||
version = "0.3.2"
|
version = "0.3.2"
|
||||||
@ -2211,38 +2205,11 @@ dependencies = [
|
|||||||
"vcpkg",
|
"vcpkg",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "lindera"
|
|
||||||
version = "0.23.1"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "72be283281bec2768687b1784be03a678609b51f2f90f6f9d9b4f07953e6dd25"
|
|
||||||
dependencies = [
|
|
||||||
"anyhow",
|
|
||||||
"bincode",
|
|
||||||
"byteorder",
|
|
||||||
"encoding",
|
|
||||||
"kanaria",
|
|
||||||
"lindera-cc-cedict-builder",
|
|
||||||
"lindera-core",
|
|
||||||
"lindera-dictionary",
|
|
||||||
"lindera-filter",
|
|
||||||
"lindera-ipadic-builder",
|
|
||||||
"lindera-ko-dic-builder",
|
|
||||||
"lindera-unidic-builder",
|
|
||||||
"regex",
|
|
||||||
"serde",
|
|
||||||
"serde_json",
|
|
||||||
"thiserror",
|
|
||||||
"unicode-blocks",
|
|
||||||
"unicode-normalization",
|
|
||||||
"yada",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "lindera-cc-cedict-builder"
|
name = "lindera-cc-cedict-builder"
|
||||||
version = "0.23.0"
|
version = "0.25.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "10fbafd37adab44ccc2668a40fba2dbc4e665cb3c36018c15dfe2e2b830e28ce"
|
checksum = "4c6bf79b29a90bcd22036e494d6cc9ac3abe9ab604b21f3258ba6dc1ce501801"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"bincode",
|
"bincode",
|
||||||
@ -2259,9 +2226,9 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "lindera-compress"
|
name = "lindera-compress"
|
||||||
version = "0.23.0"
|
version = "0.25.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "ed9196bf5995503f6878a090dfee6114ba86430c72f67ef3624246b564869937"
|
checksum = "8f2e99e67736352bbb6ed1c273643975822505067ca32194b0981040bc50527a"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"flate2",
|
"flate2",
|
||||||
@ -2270,9 +2237,9 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "lindera-core"
|
name = "lindera-core"
|
||||||
version = "0.23.0"
|
version = "0.25.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "e5f0baa9932f682e9c5b388897330f155d3c40de80016e60125897fde5e0e246"
|
checksum = "7c3935e966409156f22cb4b334b21b0dce84b7aa1cad62214b466489d249c8e5"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"bincode",
|
"bincode",
|
||||||
@ -2287,9 +2254,9 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "lindera-decompress"
|
name = "lindera-decompress"
|
||||||
version = "0.23.0"
|
version = "0.25.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "a6e63fa6ef0bc3ce2c26d372aa6185b7a316194494a84f81678f5da2893bf4a2"
|
checksum = "7476406abb63c49d7f59c88b9b868ee8d2981495ea7e2c3ad129902f9916b3c6"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"flate2",
|
"flate2",
|
||||||
@ -2298,63 +2265,50 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "lindera-dictionary"
|
name = "lindera-dictionary"
|
||||||
version = "0.23.0"
|
version = "0.25.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "fd765c36166016de87a1f447ea971573e4c63e334836c46ad0020f0408c88bfc"
|
checksum = "808b7d2b3cabc25a4022526d484a4cfd1d5924dc76a26e0379707698841acef2"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"bincode",
|
"bincode",
|
||||||
"byteorder",
|
"byteorder",
|
||||||
|
"lindera-cc-cedict-builder",
|
||||||
"lindera-core",
|
"lindera-core",
|
||||||
"lindera-ipadic",
|
|
||||||
"lindera-ko-dic",
|
|
||||||
"serde",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "lindera-filter"
|
|
||||||
version = "0.23.1"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "a5345e37fb9521ab3cee19283bed135d46b3521dc1fd13a49fa0992379056203"
|
|
||||||
dependencies = [
|
|
||||||
"anyhow",
|
|
||||||
"bincode",
|
|
||||||
"byteorder",
|
|
||||||
"kanaria",
|
|
||||||
"lindera-core",
|
|
||||||
"lindera-dictionary",
|
|
||||||
"once_cell",
|
|
||||||
"regex",
|
|
||||||
"serde",
|
|
||||||
"serde_json",
|
|
||||||
"unicode-blocks",
|
|
||||||
"unicode-normalization",
|
|
||||||
"unicode-segmentation",
|
|
||||||
"yada",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "lindera-ipadic"
|
|
||||||
version = "0.23.0"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "60eeb356295f784e7db4cfd2c6772f2bd059e565a7744e246642a07bc333a88a"
|
|
||||||
dependencies = [
|
|
||||||
"bincode",
|
|
||||||
"byteorder",
|
|
||||||
"encoding",
|
|
||||||
"flate2",
|
|
||||||
"lindera-core",
|
|
||||||
"lindera-decompress",
|
|
||||||
"lindera-ipadic-builder",
|
"lindera-ipadic-builder",
|
||||||
"once_cell",
|
"lindera-ipadic-neologd-builder",
|
||||||
"tar",
|
"lindera-ko-dic",
|
||||||
|
"lindera-ko-dic-builder",
|
||||||
|
"lindera-unidic",
|
||||||
|
"lindera-unidic-builder",
|
||||||
|
"serde",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "lindera-ipadic-builder"
|
name = "lindera-ipadic-builder"
|
||||||
version = "0.23.0"
|
version = "0.25.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "0a16a2a88db9d956f5086bc976deb9951ca2dbbfef41a002df0a7bfb2c845aab"
|
checksum = "31f373a280958c930e5ee4a1e4db3a0ee0542afaf02d3b5cacb8cab4e298648e"
|
||||||
|
dependencies = [
|
||||||
|
"anyhow",
|
||||||
|
"bincode",
|
||||||
|
"byteorder",
|
||||||
|
"csv",
|
||||||
|
"encoding_rs",
|
||||||
|
"encoding_rs_io",
|
||||||
|
"env_logger",
|
||||||
|
"glob",
|
||||||
|
"lindera-core",
|
||||||
|
"lindera-decompress",
|
||||||
|
"log",
|
||||||
|
"serde",
|
||||||
|
"yada",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "lindera-ipadic-neologd-builder"
|
||||||
|
version = "0.25.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "92eff98e9ed1a7a412b91709c2343457a04ef02fa0c27c27e3a5892f5591eae9"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"bincode",
|
"bincode",
|
||||||
@ -2364,7 +2318,6 @@ dependencies = [
|
|||||||
"encoding_rs_io",
|
"encoding_rs_io",
|
||||||
"env_logger",
|
"env_logger",
|
||||||
"glob",
|
"glob",
|
||||||
"lindera-compress",
|
|
||||||
"lindera-core",
|
"lindera-core",
|
||||||
"lindera-decompress",
|
"lindera-decompress",
|
||||||
"log",
|
"log",
|
||||||
@ -2374,9 +2327,9 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "lindera-ko-dic"
|
name = "lindera-ko-dic"
|
||||||
version = "0.23.0"
|
version = "0.25.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "abb479b170a841b8cfbe602d772e30849ffe0562b219190a378368968b8c8f66"
|
checksum = "74c6d5bf7d8092bd6d10de7a5d74b70ea7cf234586235b0d6cdb903b05a6c9e2"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"bincode",
|
"bincode",
|
||||||
"byteorder",
|
"byteorder",
|
||||||
@ -2391,9 +2344,9 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "lindera-ko-dic-builder"
|
name = "lindera-ko-dic-builder"
|
||||||
version = "0.23.0"
|
version = "0.25.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "9b9b58213552560717c48e7833444a20d2d7fe26a6e565f7ce0cbbf85784c7cf"
|
checksum = "f0a4add6d3c1e41ec9e2690d33e287d0223fb59a30ccee4980c23f31368cae1e"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"bincode",
|
"bincode",
|
||||||
@ -2410,10 +2363,42 @@ dependencies = [
|
|||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "lindera-unidic-builder"
|
name = "lindera-tokenizer"
|
||||||
version = "0.23.0"
|
version = "0.25.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "6858147cdaf4a7b564c08a247449d3aca38e9b4812499651af08afbf85324596"
|
checksum = "cb6a8acbd068019d1cdac7316f0dcb87f8e33ede2b13aa237f45114f9750afb8"
|
||||||
|
dependencies = [
|
||||||
|
"bincode",
|
||||||
|
"byteorder",
|
||||||
|
"lindera-core",
|
||||||
|
"lindera-dictionary",
|
||||||
|
"once_cell",
|
||||||
|
"serde",
|
||||||
|
"serde_json",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "lindera-unidic"
|
||||||
|
version = "0.25.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "14abf0613d350b30d3b0406a33b1de8fa8d829f26516909421702174785991c8"
|
||||||
|
dependencies = [
|
||||||
|
"bincode",
|
||||||
|
"byteorder",
|
||||||
|
"encoding",
|
||||||
|
"lindera-core",
|
||||||
|
"lindera-decompress",
|
||||||
|
"lindera-unidic-builder",
|
||||||
|
"once_cell",
|
||||||
|
"ureq",
|
||||||
|
"zip",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "lindera-unidic-builder"
|
||||||
|
version = "0.25.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "e204ed53d9bd63227d1e6a6c1f122ca039e00a8634ac32e7fb0281eeec8615c4"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"bincode",
|
"bincode",
|
||||||
@ -2422,6 +2407,7 @@ dependencies = [
|
|||||||
"encoding",
|
"encoding",
|
||||||
"env_logger",
|
"env_logger",
|
||||||
"glob",
|
"glob",
|
||||||
|
"lindera-compress",
|
||||||
"lindera-core",
|
"lindera-core",
|
||||||
"lindera-decompress",
|
"lindera-decompress",
|
||||||
"log",
|
"log",
|
||||||
@ -3428,7 +3414,7 @@ dependencies = [
|
|||||||
"wasm-bindgen",
|
"wasm-bindgen",
|
||||||
"wasm-bindgen-futures",
|
"wasm-bindgen-futures",
|
||||||
"web-sys",
|
"web-sys",
|
||||||
"webpki-roots",
|
"webpki-roots 0.22.6",
|
||||||
"winreg",
|
"winreg",
|
||||||
]
|
]
|
||||||
|
|
||||||
@ -4211,12 +4197,6 @@ version = "0.3.13"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460"
|
checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460"
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "unicode-blocks"
|
|
||||||
version = "0.1.6"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "943e3f1f50cc455d072e0801ccb71ff893b0c88060b1169f92e35fb5bb881cc6"
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "unicode-ident"
|
name = "unicode-ident"
|
||||||
version = "1.0.9"
|
version = "1.0.9"
|
||||||
@ -4250,6 +4230,21 @@ version = "0.7.1"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a"
|
checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "ureq"
|
||||||
|
version = "2.7.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "0b11c96ac7ee530603dcdf68ed1557050f374ce55a5a07193ebf8cbc9f8927e9"
|
||||||
|
dependencies = [
|
||||||
|
"base64 0.21.2",
|
||||||
|
"log",
|
||||||
|
"once_cell",
|
||||||
|
"rustls 0.21.1",
|
||||||
|
"rustls-webpki",
|
||||||
|
"url",
|
||||||
|
"webpki-roots 0.23.1",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "url"
|
name = "url"
|
||||||
version = "2.3.1"
|
version = "2.3.1"
|
||||||
@ -4458,6 +4453,15 @@ dependencies = [
|
|||||||
"webpki",
|
"webpki",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "webpki-roots"
|
||||||
|
version = "0.23.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "b03058f88386e5ff5310d9111d53f48b17d732b401aeb83a8d5190f2ac459338"
|
||||||
|
dependencies = [
|
||||||
|
"rustls-webpki",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "whatlang"
|
name = "whatlang"
|
||||||
version = "0.16.2"
|
version = "0.16.2"
|
||||||
|
@ -814,10 +814,10 @@ fn extract_field(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn format_fields<A: AsRef<[u8]>>(
|
fn format_fields<'a>(
|
||||||
document: &Document,
|
document: &Document,
|
||||||
field_ids_map: &FieldsIdsMap,
|
field_ids_map: &FieldsIdsMap,
|
||||||
builder: &MatcherBuilder<'_, A>,
|
builder: &'a MatcherBuilder<'a>,
|
||||||
formatted_options: &BTreeMap<FieldId, FormatOptions>,
|
formatted_options: &BTreeMap<FieldId, FormatOptions>,
|
||||||
compute_matches: bool,
|
compute_matches: bool,
|
||||||
displayable_ids: &BTreeSet<FieldId>,
|
displayable_ids: &BTreeSet<FieldId>,
|
||||||
@ -862,9 +862,9 @@ fn format_fields<A: AsRef<[u8]>>(
|
|||||||
Ok((matches_position, document))
|
Ok((matches_position, document))
|
||||||
}
|
}
|
||||||
|
|
||||||
fn format_value<A: AsRef<[u8]>>(
|
fn format_value<'a>(
|
||||||
value: Value,
|
value: Value,
|
||||||
builder: &MatcherBuilder<'_, A>,
|
builder: &'a MatcherBuilder<'a>,
|
||||||
format_options: Option<FormatOptions>,
|
format_options: Option<FormatOptions>,
|
||||||
infos: &mut Vec<MatchBounds>,
|
infos: &mut Vec<MatchBounds>,
|
||||||
compute_matches: bool,
|
compute_matches: bool,
|
||||||
|
@ -17,7 +17,7 @@ bincode = "1.3.3"
|
|||||||
bstr = "1.4.0"
|
bstr = "1.4.0"
|
||||||
bytemuck = { version = "1.13.1", features = ["extern_crate_alloc"] }
|
bytemuck = { version = "1.13.1", features = ["extern_crate_alloc"] }
|
||||||
byteorder = "1.4.3"
|
byteorder = "1.4.3"
|
||||||
charabia = { version = "0.7.2", default-features = false }
|
charabia = { version = "0.8.1", default-features = false }
|
||||||
concat-arrays = "0.1.2"
|
concat-arrays = "0.1.2"
|
||||||
crossbeam-channel = "0.5.8"
|
crossbeam-channel = "0.5.8"
|
||||||
deserr = "0.5.0"
|
deserr = "0.5.0"
|
||||||
|
@ -256,7 +256,8 @@ pub(crate) mod tests {
|
|||||||
let temp_index = temp_index_with_documents();
|
let temp_index = temp_index_with_documents();
|
||||||
let rtxn = temp_index.read_txn().unwrap();
|
let rtxn = temp_index.read_txn().unwrap();
|
||||||
let mut ctx = SearchContext::new(&temp_index, &rtxn);
|
let mut ctx = SearchContext::new(&temp_index, &rtxn);
|
||||||
let tokenizer = TokenizerBuilder::new().build();
|
let mut builder = TokenizerBuilder::default();
|
||||||
|
let tokenizer = builder.build();
|
||||||
let tokens = tokenizer.tokenize("split this world");
|
let tokens = tokenizer.tokenize("split this world");
|
||||||
let query_terms = located_query_terms_from_tokens(&mut ctx, tokens, None).unwrap();
|
let query_terms = located_query_terms_from_tokens(&mut ctx, tokens, None).unwrap();
|
||||||
let matching_words = MatchingWords::new(ctx, query_terms);
|
let matching_words = MatchingWords::new(ctx, query_terms);
|
||||||
|
@ -12,16 +12,16 @@ const DEFAULT_HIGHLIGHT_PREFIX: &str = "<em>";
|
|||||||
const DEFAULT_HIGHLIGHT_SUFFIX: &str = "</em>";
|
const DEFAULT_HIGHLIGHT_SUFFIX: &str = "</em>";
|
||||||
|
|
||||||
/// Structure used to build a Matcher allowing to customize formating tags.
|
/// Structure used to build a Matcher allowing to customize formating tags.
|
||||||
pub struct MatcherBuilder<'a, A> {
|
pub struct MatcherBuilder<'m> {
|
||||||
matching_words: MatchingWords,
|
matching_words: MatchingWords,
|
||||||
tokenizer: Tokenizer<'a, 'a, A>,
|
tokenizer: Tokenizer<'m>,
|
||||||
crop_marker: Option<String>,
|
crop_marker: Option<String>,
|
||||||
highlight_prefix: Option<String>,
|
highlight_prefix: Option<String>,
|
||||||
highlight_suffix: Option<String>,
|
highlight_suffix: Option<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a, A> MatcherBuilder<'a, A> {
|
impl<'m> MatcherBuilder<'m> {
|
||||||
pub fn new(matching_words: MatchingWords, tokenizer: Tokenizer<'a, 'a, A>) -> Self {
|
pub fn new(matching_words: MatchingWords, tokenizer: Tokenizer<'m>) -> Self {
|
||||||
Self {
|
Self {
|
||||||
matching_words,
|
matching_words,
|
||||||
tokenizer,
|
tokenizer,
|
||||||
@ -46,7 +46,7 @@ impl<'a, A> MatcherBuilder<'a, A> {
|
|||||||
self
|
self
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn build<'t, 'm>(&'m self, text: &'t str) -> Matcher<'t, 'm, A> {
|
pub fn build<'t>(&'m self, text: &'t str) -> Matcher<'t, 'm> {
|
||||||
let crop_marker = match &self.crop_marker {
|
let crop_marker = match &self.crop_marker {
|
||||||
Some(marker) => marker.as_str(),
|
Some(marker) => marker.as_str(),
|
||||||
None => DEFAULT_CROP_MARKER,
|
None => DEFAULT_CROP_MARKER,
|
||||||
@ -103,17 +103,17 @@ pub struct MatchBounds {
|
|||||||
|
|
||||||
/// Structure used to analize a string, compute words that match,
|
/// Structure used to analize a string, compute words that match,
|
||||||
/// and format the source string, returning a highlighted and cropped sub-string.
|
/// and format the source string, returning a highlighted and cropped sub-string.
|
||||||
pub struct Matcher<'t, 'm, A> {
|
pub struct Matcher<'t, 'm> {
|
||||||
text: &'t str,
|
text: &'t str,
|
||||||
matching_words: &'m MatchingWords,
|
matching_words: &'m MatchingWords,
|
||||||
tokenizer: &'m Tokenizer<'m, 'm, A>,
|
tokenizer: &'m Tokenizer<'m>,
|
||||||
crop_marker: &'m str,
|
crop_marker: &'m str,
|
||||||
highlight_prefix: &'m str,
|
highlight_prefix: &'m str,
|
||||||
highlight_suffix: &'m str,
|
highlight_suffix: &'m str,
|
||||||
matches: Option<(Vec<Token<'t>>, Vec<Match>)>,
|
matches: Option<(Vec<Token<'t>>, Vec<Match>)>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> {
|
impl<'t> Matcher<'t, '_> {
|
||||||
/// Iterates over tokens and save any of them that matches the query.
|
/// Iterates over tokens and save any of them that matches the query.
|
||||||
fn compute_matches(&mut self) -> &mut Self {
|
fn compute_matches(&mut self) -> &mut Self {
|
||||||
/// some words are counted as matches only if they are close together and in the good order,
|
/// some words are counted as matches only if they are close together and in the good order,
|
||||||
@ -503,7 +503,7 @@ mod tests {
|
|||||||
use crate::index::tests::TempIndex;
|
use crate::index::tests::TempIndex;
|
||||||
use crate::{execute_search, SearchContext};
|
use crate::{execute_search, SearchContext};
|
||||||
|
|
||||||
impl<'a> MatcherBuilder<'a, &[u8]> {
|
impl<'a> MatcherBuilder<'a> {
|
||||||
fn new_test(rtxn: &'a heed::RoTxn, index: &'a TempIndex, query: &str) -> Self {
|
fn new_test(rtxn: &'a heed::RoTxn, index: &'a TempIndex, query: &str) -> Self {
|
||||||
let mut ctx = SearchContext::new(index, rtxn);
|
let mut ctx = SearchContext::new(index, rtxn);
|
||||||
let crate::search::PartialSearchResult { located_query_terms, .. } = execute_search(
|
let crate::search::PartialSearchResult { located_query_terms, .. } = execute_search(
|
||||||
@ -530,7 +530,7 @@ mod tests {
|
|||||||
None => MatchingWords::default(),
|
None => MatchingWords::default(),
|
||||||
};
|
};
|
||||||
|
|
||||||
MatcherBuilder::new(matching_words, TokenizerBuilder::new().build())
|
MatcherBuilder::new(matching_words, TokenizerBuilder::default().into_tokenizer())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -690,7 +690,7 @@ mod tests {
|
|||||||
// should crop the phrase instead of croping around the match.
|
// should crop the phrase instead of croping around the match.
|
||||||
insta::assert_snapshot!(
|
insta::assert_snapshot!(
|
||||||
matcher.format(format_options),
|
matcher.format(format_options),
|
||||||
@"… Split The World is a book written by Emily Henry…"
|
@"…Split The World is a book written by Emily Henry…"
|
||||||
);
|
);
|
||||||
|
|
||||||
// Text containing some matches.
|
// Text containing some matches.
|
||||||
|
@ -7,7 +7,7 @@ use crate::{Result, SearchContext, MAX_WORD_LENGTH};
|
|||||||
/// Convert the tokenised search query into a list of located query terms.
|
/// Convert the tokenised search query into a list of located query terms.
|
||||||
pub fn located_query_terms_from_tokens(
|
pub fn located_query_terms_from_tokens(
|
||||||
ctx: &mut SearchContext,
|
ctx: &mut SearchContext,
|
||||||
query: NormalizedTokenIter<&[u8]>,
|
query: NormalizedTokenIter,
|
||||||
words_limit: Option<usize>,
|
words_limit: Option<usize>,
|
||||||
) -> Result<Vec<LocatedQueryTerm>> {
|
) -> Result<Vec<LocatedQueryTerm>> {
|
||||||
let nbr_typos = number_of_typos_allowed(ctx)?;
|
let nbr_typos = number_of_typos_allowed(ctx)?;
|
||||||
@ -303,7 +303,8 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn start_with_hard_separator() -> Result<()> {
|
fn start_with_hard_separator() -> Result<()> {
|
||||||
let tokenizer = TokenizerBuilder::new().build();
|
let mut builder = TokenizerBuilder::default();
|
||||||
|
let tokenizer = builder.build();
|
||||||
let tokens = tokenizer.tokenize(".");
|
let tokens = tokenizer.tokenize(".");
|
||||||
let index = temp_index_with_documents();
|
let index = temp_index_with_documents();
|
||||||
let rtxn = index.read_txn()?;
|
let rtxn = index.read_txn()?;
|
||||||
|
@ -128,10 +128,10 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
|||||||
.map(|reader| (documents_ids, reader, script_language_docids))
|
.map(|reader| (documents_ids, reader, script_language_docids))
|
||||||
}
|
}
|
||||||
|
|
||||||
fn extract_tokens_from_document<T: AsRef<[u8]>>(
|
fn extract_tokens_from_document(
|
||||||
obkv: &KvReader<FieldId>,
|
obkv: &KvReader<FieldId>,
|
||||||
searchable_fields: &Option<HashSet<FieldId>>,
|
searchable_fields: &Option<HashSet<FieldId>>,
|
||||||
tokenizer: &Tokenizer<T>,
|
tokenizer: &Tokenizer,
|
||||||
max_positions_per_attributes: u32,
|
max_positions_per_attributes: u32,
|
||||||
buffers: &mut Buffers,
|
buffers: &mut Buffers,
|
||||||
script_language_word_count: &mut HashMap<Script, Vec<(Language, usize)>>,
|
script_language_word_count: &mut HashMap<Script, Vec<(Language, usize)>>,
|
||||||
|
@ -1,18 +1,21 @@
|
|||||||
---
|
---
|
||||||
source: milli/src/update/index_documents/mod.rs
|
source: milli/src/update/index_documents/mod.rs
|
||||||
---
|
---
|
||||||
|
0 [1, ]
|
||||||
1 [2, ]
|
1 [2, ]
|
||||||
10.0 [1, ]
|
10 [1, ]
|
||||||
12 [0, ]
|
12 [0, ]
|
||||||
1344 [3, ]
|
1344 [3, ]
|
||||||
2 [0, ]
|
2 [0, ]
|
||||||
23 [5, ]
|
23 [5, ]
|
||||||
25.99 [2, ]
|
25 [2, ]
|
||||||
3.5 [0, ]
|
3 [0, ]
|
||||||
35 [5, ]
|
35 [5, ]
|
||||||
4 [4, ]
|
4 [4, ]
|
||||||
42 [0, 5, ]
|
42 [0, 5, ]
|
||||||
456 [1, ]
|
456 [1, ]
|
||||||
|
5 [0, ]
|
||||||
|
99 [2, ]
|
||||||
adams [5, ]
|
adams [5, ]
|
||||||
adventure [1, ]
|
adventure [1, ]
|
||||||
alice [2, ]
|
alice [2, ]
|
||||||
@ -29,7 +32,7 @@ galaxy [5, ]
|
|||||||
guide [5, ]
|
guide [5, ]
|
||||||
half [4, ]
|
half [4, ]
|
||||||
harry [4, ]
|
harry [4, ]
|
||||||
hitchhiker' [5, ]
|
hitchhiker [5, ]
|
||||||
hobbit [3, ]
|
hobbit [3, ]
|
||||||
in [2, ]
|
in [2, ]
|
||||||
j [3, 4, ]
|
j [3, 4, ]
|
||||||
|
@ -1,19 +1,22 @@
|
|||||||
---
|
---
|
||||||
source: milli/src/update/index_documents/mod.rs
|
source: milli/src/update/index_documents/mod.rs
|
||||||
---
|
---
|
||||||
|
0 [1, 7, ]
|
||||||
1 [2, ]
|
1 [2, ]
|
||||||
10.0 [1, 7, ]
|
10 [1, 7, ]
|
||||||
12 [0, 8, ]
|
12 [0, 8, ]
|
||||||
1344 [3, ]
|
1344 [3, ]
|
||||||
1813 [8, ]
|
1813 [8, ]
|
||||||
2 [0, 8, ]
|
2 [0, 8, ]
|
||||||
23 [5, ]
|
23 [5, ]
|
||||||
25.99 [2, ]
|
25 [2, ]
|
||||||
3.5 [0, 8, ]
|
3 [0, 8, ]
|
||||||
35 [5, ]
|
35 [5, ]
|
||||||
4 [4, 6, ]
|
4 [4, 6, ]
|
||||||
42 [0, 5, 8, ]
|
42 [0, 5, 8, ]
|
||||||
456 [1, 7, ]
|
456 [1, 7, ]
|
||||||
|
5 [0, 8, ]
|
||||||
|
99 [2, ]
|
||||||
adams [5, ]
|
adams [5, ]
|
||||||
adventure [1, 7, ]
|
adventure [1, 7, ]
|
||||||
alice [2, ]
|
alice [2, ]
|
||||||
@ -31,7 +34,7 @@ galaxy [5, ]
|
|||||||
guide [5, ]
|
guide [5, ]
|
||||||
half [4, 6, ]
|
half [4, 6, ]
|
||||||
harry [4, 6, ]
|
harry [4, 6, ]
|
||||||
hitchhiker' [5, ]
|
hitchhiker [5, ]
|
||||||
hobbit [3, ]
|
hobbit [3, ]
|
||||||
in [2, ]
|
in [2, ]
|
||||||
j [3, 4, 6, 8, ]
|
j [3, 4, 6, 8, ]
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
use std::collections::{BTreeSet, HashMap, HashSet};
|
use std::collections::{BTreeSet, HashMap, HashSet};
|
||||||
use std::result::Result as StdResult;
|
use std::result::Result as StdResult;
|
||||||
|
|
||||||
use charabia::{Tokenizer, TokenizerBuilder};
|
use charabia::{Normalize, Tokenizer, TokenizerBuilder};
|
||||||
use deserr::{DeserializeError, Deserr};
|
use deserr::{DeserializeError, Deserr};
|
||||||
use itertools::Itertools;
|
use itertools::Itertools;
|
||||||
use serde::{Deserialize, Deserializer, Serialize, Serializer};
|
use serde::{Deserialize, Deserializer, Serialize, Serializer};
|
||||||
@ -423,6 +423,12 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
|||||||
match self.stop_words {
|
match self.stop_words {
|
||||||
Setting::Set(ref stop_words) => {
|
Setting::Set(ref stop_words) => {
|
||||||
let current = self.index.stop_words(self.wtxn)?;
|
let current = self.index.stop_words(self.wtxn)?;
|
||||||
|
|
||||||
|
// Apply an unlossy normalization on stop_words
|
||||||
|
let stop_words = stop_words
|
||||||
|
.iter()
|
||||||
|
.map(|w| w.as_str().normalize(&Default::default()).into_owned());
|
||||||
|
|
||||||
// since we can't compare a BTreeSet with an FST we are going to convert the
|
// since we can't compare a BTreeSet with an FST we are going to convert the
|
||||||
// BTreeSet to an FST and then compare bytes per bytes the two FSTs.
|
// BTreeSet to an FST and then compare bytes per bytes the two FSTs.
|
||||||
let fst = fst::Set::from_iter(stop_words)?;
|
let fst = fst::Set::from_iter(stop_words)?;
|
||||||
@ -446,7 +452,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
|||||||
fn update_synonyms(&mut self) -> Result<bool> {
|
fn update_synonyms(&mut self) -> Result<bool> {
|
||||||
match self.synonyms {
|
match self.synonyms {
|
||||||
Setting::Set(ref synonyms) => {
|
Setting::Set(ref synonyms) => {
|
||||||
fn normalize(tokenizer: &Tokenizer<&[u8]>, text: &str) -> Vec<String> {
|
fn normalize(tokenizer: &Tokenizer, text: &str) -> Vec<String> {
|
||||||
tokenizer
|
tokenizer
|
||||||
.tokenize(text)
|
.tokenize(text)
|
||||||
.filter_map(|token| {
|
.filter_map(|token| {
|
||||||
@ -647,7 +653,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
|||||||
fn update_exact_words(&mut self) -> Result<()> {
|
fn update_exact_words(&mut self) -> Result<()> {
|
||||||
match self.exact_words {
|
match self.exact_words {
|
||||||
Setting::Set(ref mut words) => {
|
Setting::Set(ref mut words) => {
|
||||||
fn normalize(tokenizer: &Tokenizer<&[u8]>, text: &str) -> String {
|
fn normalize(tokenizer: &Tokenizer, text: &str) -> String {
|
||||||
tokenizer.tokenize(text).map(|token| token.lemma().to_string()).collect()
|
tokenizer.tokenize(text).map(|token| token.lemma().to_string()).collect()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,27 +1,28 @@
|
|||||||
---
|
---
|
||||||
source: milli/src/update/delete_documents.rs
|
source: milli/src/update/delete_documents.rs
|
||||||
---
|
---
|
||||||
1_36 [3, ]
|
1 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, ]
|
||||||
1_37 [4, ]
|
2 [21, ]
|
||||||
1_38 [5, ]
|
36 [3, ]
|
||||||
1_39 [6, ]
|
37 [4, ]
|
||||||
1_40 [7, ]
|
38 [5, ]
|
||||||
1_41 [8, ]
|
39 [6, ]
|
||||||
1_42 [9, ]
|
40 [7, ]
|
||||||
1_43 [10, ]
|
41 [8, ]
|
||||||
1_44 [11, ]
|
42 [9, ]
|
||||||
1_45 [12, ]
|
43 [10, ]
|
||||||
1_46 [13, ]
|
44 [11, ]
|
||||||
1_47 [14, ]
|
45 [12, ]
|
||||||
1_5 [1, ]
|
46 [13, ]
|
||||||
1_52 [15, ]
|
47 [14, ]
|
||||||
1_57 [16, ]
|
5 [1, ]
|
||||||
1_58 [17, ]
|
52 [15, ]
|
||||||
1_68 [18, ]
|
57 [16, ]
|
||||||
1_69 [19, ]
|
58 [17, ]
|
||||||
1_7 [2, ]
|
68 [18, ]
|
||||||
1_71 [21, ]
|
69 [19, ]
|
||||||
2.2 [21, ]
|
7 [2, ]
|
||||||
|
71 [21, ]
|
||||||
abstract [2, 6, 10, 13, 14, 15, 16, 17, ]
|
abstract [2, 6, 10, 13, 14, 15, 16, 17, ]
|
||||||
aquarium [5, ]
|
aquarium [5, ]
|
||||||
art [4, 5, 8, 9, 10, 12, 17, ]
|
art [4, 5, 8, 9, 10, 12, 17, ]
|
||||||
|
@ -1,4 +1,25 @@
|
|||||||
---
|
---
|
||||||
source: milli/src/update/delete_documents.rs
|
source: milli/src/update/delete_documents.rs
|
||||||
---
|
---
|
||||||
|
1 1 36 [3, ]
|
||||||
|
1 1 37 [4, ]
|
||||||
|
1 1 38 [5, ]
|
||||||
|
1 1 39 [6, ]
|
||||||
|
1 1 40 [7, ]
|
||||||
|
1 1 41 [8, ]
|
||||||
|
1 1 42 [9, ]
|
||||||
|
1 1 43 [10, ]
|
||||||
|
1 1 44 [11, ]
|
||||||
|
1 1 45 [12, ]
|
||||||
|
1 1 46 [13, ]
|
||||||
|
1 1 47 [14, ]
|
||||||
|
1 1 5 [1, ]
|
||||||
|
1 1 52 [15, ]
|
||||||
|
1 1 57 [16, ]
|
||||||
|
1 1 58 [17, ]
|
||||||
|
1 1 68 [18, ]
|
||||||
|
1 1 69 [19, ]
|
||||||
|
1 1 7 [2, ]
|
||||||
|
1 1 71 [21, ]
|
||||||
|
1 2 2 [21, ]
|
||||||
|
|
||||||
|
@ -1,31 +1,31 @@
|
|||||||
---
|
---
|
||||||
source: milli/src/update/delete_documents.rs
|
source: milli/src/update/delete_documents.rs
|
||||||
---
|
---
|
||||||
1.2 [20, 22, ]
|
1 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, ]
|
||||||
1_36 [3, ]
|
2 [20, 21, 22, ]
|
||||||
1_37 [4, ]
|
36 [3, ]
|
||||||
1_38 [5, ]
|
37 [4, ]
|
||||||
1_39 [6, ]
|
38 [5, ]
|
||||||
1_4 [0, ]
|
39 [6, ]
|
||||||
1_40 [7, ]
|
4 [0, ]
|
||||||
1_41 [8, ]
|
40 [7, ]
|
||||||
1_42 [9, ]
|
41 [8, ]
|
||||||
1_43 [10, ]
|
42 [9, ]
|
||||||
1_44 [11, ]
|
43 [10, ]
|
||||||
1_45 [12, ]
|
44 [11, ]
|
||||||
1_46 [13, ]
|
45 [12, ]
|
||||||
1_47 [14, ]
|
46 [13, ]
|
||||||
1_5 [1, ]
|
47 [14, ]
|
||||||
1_52 [15, ]
|
5 [1, ]
|
||||||
1_57 [16, ]
|
52 [15, ]
|
||||||
1_58 [17, ]
|
57 [16, ]
|
||||||
1_68 [18, ]
|
58 [17, ]
|
||||||
1_69 [19, ]
|
68 [18, ]
|
||||||
1_7 [2, ]
|
69 [19, ]
|
||||||
1_70 [20, ]
|
7 [2, ]
|
||||||
1_71 [21, ]
|
70 [20, ]
|
||||||
1_72 [22, ]
|
71 [21, ]
|
||||||
2.2 [21, ]
|
72 [22, ]
|
||||||
abstract [2, 6, 10, 13, 14, 15, 16, 17, ]
|
abstract [2, 6, 10, 13, 14, 15, 16, 17, ]
|
||||||
aquarium [5, ]
|
aquarium [5, ]
|
||||||
art [4, 5, 8, 9, 10, 12, 17, ]
|
art [4, 5, 8, 9, 10, 12, 17, ]
|
||||||
|
@ -1,4 +1,29 @@
|
|||||||
---
|
---
|
||||||
source: milli/src/update/delete_documents.rs
|
source: milli/src/update/delete_documents.rs
|
||||||
---
|
---
|
||||||
|
1 1 2 [20, 22, ]
|
||||||
|
1 1 36 [3, ]
|
||||||
|
1 1 37 [4, ]
|
||||||
|
1 1 38 [5, ]
|
||||||
|
1 1 39 [6, ]
|
||||||
|
1 1 4 [0, ]
|
||||||
|
1 1 40 [7, ]
|
||||||
|
1 1 41 [8, ]
|
||||||
|
1 1 42 [9, ]
|
||||||
|
1 1 43 [10, ]
|
||||||
|
1 1 44 [11, ]
|
||||||
|
1 1 45 [12, ]
|
||||||
|
1 1 46 [13, ]
|
||||||
|
1 1 47 [14, ]
|
||||||
|
1 1 5 [1, ]
|
||||||
|
1 1 52 [15, ]
|
||||||
|
1 1 57 [16, ]
|
||||||
|
1 1 58 [17, ]
|
||||||
|
1 1 68 [18, ]
|
||||||
|
1 1 69 [19, ]
|
||||||
|
1 1 7 [2, ]
|
||||||
|
1 1 70 [20, ]
|
||||||
|
1 1 71 [21, ]
|
||||||
|
1 1 72 [22, ]
|
||||||
|
1 2 2 [21, ]
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user