mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-01-18 08:48:32 +08:00
Merge #2468
2468: Update milli 0.29 r=Kerollmops a=ManyTheFish

- [x] Update milli to 0.29
- [x] Integrate charabia
- [x] Set disabled_words to default when Index::exact_words returns None
- [x] Fix ranking rules integration test

fixes #2375
fixes #2144
fixes #2417
fixes #2407

Co-authored-by: ManyTheFish <many@meilisearch.com>
This commit is contained in:
commit
6171f17f1d
99
Cargo.lock
generated
99
Cargo.lock
generated
@ -643,12 +643,33 @@ dependencies = [
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "character_converter"
|
||||
version = "1.0.0"
|
||||
name = "charabia"
|
||||
version = "0.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2e48477ece09d6a21c033cb604968524a37782532727055d6f6faafac1781e5c"
|
||||
checksum = "4a26a3df4d9c9231eb1e757fe6b1c66c471e0c2cd5410265e7c3109a726663c4"
|
||||
dependencies = [
|
||||
"character_converter",
|
||||
"cow-utils",
|
||||
"deunicode",
|
||||
"fst",
|
||||
"jieba-rs",
|
||||
"lindera",
|
||||
"lindera-core",
|
||||
"once_cell",
|
||||
"slice-group-by",
|
||||
"unicode-segmentation",
|
||||
"whatlang",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "character_converter"
|
||||
version = "2.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d7064c6e919124b6541c52fef59d88c3c3eabdf4bc97c13b14551df775aead02"
|
||||
dependencies = [
|
||||
"bincode",
|
||||
"fst",
|
||||
"once_cell",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@ -1102,8 +1123,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "filter-parser"
|
||||
version = "0.28.0"
|
||||
source = "git+https://github.com/meilisearch/milli.git?tag=v0.28.0#19dac01c5ca81543b751f66ad51fcff61608d969"
|
||||
version = "0.29.1"
|
||||
source = "git+https://github.com/meilisearch/milli.git?tag=v0.29.1#7313d6c5331e7dc13e9ded70b60b1f56dd7e583c"
|
||||
dependencies = [
|
||||
"nom",
|
||||
"nom_locate",
|
||||
@ -1127,8 +1148,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "flatten-serde-json"
|
||||
version = "0.28.0"
|
||||
source = "git+https://github.com/meilisearch/milli.git?tag=v0.28.0#19dac01c5ca81543b751f66ad51fcff61608d969"
|
||||
version = "0.29.1"
|
||||
source = "git+https://github.com/meilisearch/milli.git?tag=v0.29.1#7313d6c5331e7dc13e9ded70b60b1f56dd7e583c"
|
||||
dependencies = [
|
||||
"serde_json",
|
||||
]
|
||||
@ -1640,8 +1661,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "json-depth-checker"
|
||||
version = "0.28.0"
|
||||
source = "git+https://github.com/meilisearch/milli.git?tag=v0.28.0#19dac01c5ca81543b751f66ad51fcff61608d969"
|
||||
version = "0.29.1"
|
||||
source = "git+https://github.com/meilisearch/milli.git?tag=v0.29.1#7313d6c5331e7dc13e9ded70b60b1f56dd7e583c"
|
||||
dependencies = [
|
||||
"serde_json",
|
||||
]
|
||||
@ -1719,9 +1740,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lindera"
|
||||
version = "0.12.6"
|
||||
version = "0.13.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3dea10df226936ff54f16d3922500e08ef4be2ba7c0070bec9ad4a1474316111"
|
||||
checksum = "7d1c5db4b1d12637aa316dc1adb215f78fe79025080af750942516c5ff17d1a0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"bincode",
|
||||
@ -1741,9 +1762,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lindera-cc-cedict-builder"
|
||||
version = "0.12.6"
|
||||
version = "0.13.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4392785248c3d8755c6fae9d0086d27ad7a1d6810155a2494fe5206e2021f471"
|
||||
checksum = "73a3509fb497340571d49feddb57e1db2ce5248c4d449f2548d0ee8cb745eb1e"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"bincode",
|
||||
@ -1761,9 +1782,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lindera-core"
|
||||
version = "0.12.6"
|
||||
version = "0.13.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "af63a4484334d4b83277621f1ba62fb83472858cc37fb4ab2181a4c19eebcb38"
|
||||
checksum = "5d20d1b2c085393aed58625d741beca69410e1143fc35bc67ebc35c9885f9f74"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"bincode",
|
||||
@ -1777,9 +1798,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lindera-decompress"
|
||||
version = "0.12.6"
|
||||
version = "0.13.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "817ee62bc8973ec2457805df83796c59f074e49a4a0ee9baffe2663fe157f54a"
|
||||
checksum = "b96b8050cded13927a99bcb8cbb0987f89fc8f35429fc153b4bc05ddc7a53a44"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"lzma-rs",
|
||||
@ -1788,9 +1809,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lindera-dictionary"
|
||||
version = "0.12.6"
|
||||
version = "0.13.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fd57501ee44a6aba0431d043c7926347e29883a79d8fc3955b8837e4ad1fee3c"
|
||||
checksum = "5abe3dddc22303402957edb4472ab0c996e0d93b3b00643de3bee8b28c2f9297"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"bincode",
|
||||
@ -1800,9 +1821,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lindera-ipadic"
|
||||
version = "0.12.6"
|
||||
version = "0.13.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ade3bd3faa5f0db629c26264663e901dee5f46221eb04c2c7b592bd7485d44f9"
|
||||
checksum = "b8f4c111f6ad9eb9e015d02061af2ed36fc0255f29359294415c7c2f1ea5b5b6"
|
||||
dependencies = [
|
||||
"bincode",
|
||||
"byteorder",
|
||||
@ -1817,9 +1838,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lindera-ipadic-builder"
|
||||
version = "0.12.6"
|
||||
version = "0.13.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ee61f8dd6566738c5fd0ee9b1c11212ffc2d1f97af69c08a02cbb5c49995250a"
|
||||
checksum = "a2b9893f22a4a7511ac70ff7d96cda9b8d7259b7d7121784183c73bc593ce6e7"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"bincode",
|
||||
@ -1837,9 +1858,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lindera-ko-dic-builder"
|
||||
version = "0.12.6"
|
||||
version = "0.13.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "01f05950d9adc7aa42aa8b16be1616f9625576c867179ac29372714eaed6993d"
|
||||
checksum = "14282600ebfe7ab6fd4f3042143024ff9d74c09d58fd983d0c587839cf940d4a"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"bincode",
|
||||
@ -1857,9 +1878,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lindera-unidic-builder"
|
||||
version = "0.12.6"
|
||||
version = "0.13.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3836c1278b8309ebf209c67bc7a935f4ce7c9246a578b250540398806a40b81d"
|
||||
checksum = "b20825d46c95854e47c532c3e548dfec07c8f187c1ed89383cb6c35790338088"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"bincode",
|
||||
@ -2142,24 +2163,6 @@ dependencies = [
|
||||
"whoami",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "meilisearch-tokenizer"
|
||||
version = "0.2.9"
|
||||
source = "git+https://github.com/meilisearch/tokenizer.git?tag=v0.2.9#1dfc8ad9f5b338c39c3bc5fd5b2d0c1328314ddc"
|
||||
dependencies = [
|
||||
"character_converter",
|
||||
"cow-utils",
|
||||
"deunicode",
|
||||
"fst",
|
||||
"jieba-rs",
|
||||
"lindera",
|
||||
"lindera-core",
|
||||
"once_cell",
|
||||
"slice-group-by",
|
||||
"unicode-segmentation",
|
||||
"whatlang",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "memchr"
|
||||
version = "2.5.0"
|
||||
@ -2186,13 +2189,14 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "milli"
|
||||
version = "0.28.0"
|
||||
source = "git+https://github.com/meilisearch/milli.git?tag=v0.28.0#19dac01c5ca81543b751f66ad51fcff61608d969"
|
||||
version = "0.29.1"
|
||||
source = "git+https://github.com/meilisearch/milli.git?tag=v0.29.1#7313d6c5331e7dc13e9ded70b60b1f56dd7e583c"
|
||||
dependencies = [
|
||||
"bimap",
|
||||
"bincode",
|
||||
"bstr",
|
||||
"byteorder",
|
||||
"charabia",
|
||||
"concat-arrays",
|
||||
"crossbeam-channel",
|
||||
"csv",
|
||||
@ -2209,7 +2213,6 @@ dependencies = [
|
||||
"levenshtein_automata",
|
||||
"log",
|
||||
"logging_timer",
|
||||
"meilisearch-tokenizer",
|
||||
"memmap2",
|
||||
"obkv",
|
||||
"once_cell",
|
||||
|
@ -8,7 +8,7 @@ base64 = "0.13.0"
|
||||
enum-iterator = "0.7.0"
|
||||
hmac = "0.12.1"
|
||||
meilisearch-error = { path = "../meilisearch-error" }
|
||||
milli = { git = "https://github.com/meilisearch/milli.git", tag = "v0.28.0" }
|
||||
milli = { git = "https://github.com/meilisearch/milli.git", tag = "v0.29.1" }
|
||||
rand = "0.8.4"
|
||||
serde = { version = "1.0.136", features = ["derive"] }
|
||||
serde_json = { version = "1.0.79", features = ["preserve_order"] }
|
||||
|
@ -89,9 +89,9 @@ impl Index<'_> {
|
||||
}
|
||||
|
||||
pub async fn wait_task(&self, update_id: u64) -> Value {
|
||||
// try 10 times to get status, or panic to not wait forever
|
||||
// try several times to get status, or panic to not wait forever
|
||||
let url = format!("/tasks/{}", update_id);
|
||||
for _ in 0..10 {
|
||||
for _ in 0..100 {
|
||||
let (response, status_code) = self.service.get(&url).await;
|
||||
assert_eq!(200, status_code, "response: {}", response);
|
||||
|
||||
@ -99,7 +99,8 @@ impl Index<'_> {
|
||||
return response;
|
||||
}
|
||||
|
||||
sleep(Duration::from_secs(1)).await;
|
||||
// wait 0.5 second.
|
||||
sleep(Duration::from_millis(500)).await;
|
||||
}
|
||||
panic!("Timeout waiting for update id");
|
||||
}
|
||||
|
@ -43,7 +43,7 @@ async fn get_document() {
|
||||
]);
|
||||
let (_, code) = index.add_documents(documents, None).await;
|
||||
assert_eq!(code, 202);
|
||||
index.wait_task(0).await;
|
||||
index.wait_task(1).await;
|
||||
let (response, code) = index.get_document(0, None).await;
|
||||
assert_eq!(code, 200);
|
||||
assert_eq!(
|
||||
@ -306,7 +306,7 @@ async fn get_document_s_nested_attributes_to_retrieve() {
|
||||
]);
|
||||
let (_, code) = index.add_documents(documents, None).await;
|
||||
assert_eq!(code, 202);
|
||||
index.wait_task(0).await;
|
||||
index.wait_task(1).await;
|
||||
|
||||
let (response, code) = index
|
||||
.get_document(
|
||||
|
@ -283,7 +283,7 @@ async fn error_set_invalid_ranking_rules() {
|
||||
assert_eq!(response["status"], "failed");
|
||||
|
||||
let expected_error = json!({
|
||||
"message": r#"`manyTheFish` ranking rule is invalid. Valid ranking rules are Words, Typo, Sort, Proximity, Attribute, Exactness and custom ranking rules."#,
|
||||
"message": r#"`manyTheFish` ranking rule is invalid. Valid ranking rules are words, typo, sort, proximity, attribute, exactness and custom ranking rules."#,
|
||||
"code": "invalid_ranking_rule",
|
||||
"type": "invalid_request",
|
||||
"link": "https://docs.meilisearch.com/errors#invalid_ranking_rule"
|
||||
|
@ -30,7 +30,7 @@ lazy_static = "1.4.0"
|
||||
log = "0.4.14"
|
||||
meilisearch-auth = { path = "../meilisearch-auth" }
|
||||
meilisearch-error = { path = "../meilisearch-error" }
|
||||
milli = { git = "https://github.com/meilisearch/milli.git", tag = "v0.28.0" }
|
||||
milli = { git = "https://github.com/meilisearch/milli.git", tag = "v0.29.1" }
|
||||
mime = "0.3.16"
|
||||
num_cpus = "1.13.1"
|
||||
obkv = "0.2.0"
|
||||
|
@ -175,12 +175,10 @@ impl Index {
|
||||
two_typos: Setting::Set(self.min_word_len_two_typos(txn)?),
|
||||
};
|
||||
|
||||
let disabled_words = self
|
||||
.exact_words(txn)?
|
||||
.into_stream()
|
||||
.into_strs()?
|
||||
.into_iter()
|
||||
.collect();
|
||||
let disabled_words = match self.exact_words(txn)? {
|
||||
Some(fst) => fst.into_stream().into_strs()?.into_iter().collect(),
|
||||
None => BTreeSet::new(),
|
||||
};
|
||||
|
||||
let disabled_attributes = self
|
||||
.exact_attributes(txn)?
|
||||
|
@ -4,7 +4,7 @@ use std::str::FromStr;
|
||||
use std::time::Instant;
|
||||
|
||||
use either::Either;
|
||||
use milli::tokenizer::{Analyzer, AnalyzerConfig};
|
||||
use milli::tokenizer::TokenizerBuilder;
|
||||
use milli::{
|
||||
AscDesc, FieldId, FieldsIdsMap, Filter, FormatOptions, MatchBounds, MatcherBuilder, SortError,
|
||||
};
|
||||
@ -175,12 +175,9 @@ impl Index {
|
||||
&displayed_ids,
|
||||
);
|
||||
|
||||
let stop_words = fst::Set::default();
|
||||
let mut config = AnalyzerConfig::default();
|
||||
config.stop_words(&stop_words);
|
||||
let analyzer = Analyzer::new(config);
|
||||
let tokenizer = TokenizerBuilder::default().build();
|
||||
|
||||
let mut formatter_builder = MatcherBuilder::from_matching_words(matching_words);
|
||||
let mut formatter_builder = MatcherBuilder::new(matching_words, tokenizer);
|
||||
formatter_builder.crop_marker(query.crop_marker);
|
||||
formatter_builder.highlight_prefix(query.highlight_pre_tag);
|
||||
formatter_builder.highlight_suffix(query.highlight_post_tag);
|
||||
@ -204,7 +201,6 @@ impl Index {
|
||||
&displayed_document,
|
||||
&fields_ids_map,
|
||||
&formatter_builder,
|
||||
&analyzer,
|
||||
&formatted_options,
|
||||
query.show_matches_position,
|
||||
&displayed_ids,
|
||||
@ -414,8 +410,7 @@ fn make_document(
|
||||
fn format_fields<'a, A: AsRef<[u8]>>(
|
||||
document: &Document,
|
||||
field_ids_map: &FieldsIdsMap,
|
||||
builder: &MatcherBuilder,
|
||||
analyzer: &'a Analyzer<'a, A>,
|
||||
builder: &MatcherBuilder<'a, A>,
|
||||
formatted_options: &BTreeMap<FieldId, FormatOptions>,
|
||||
compute_matches: bool,
|
||||
displayable_ids: &BTreeSet<FieldId>,
|
||||
@ -446,7 +441,6 @@ fn format_fields<'a, A: AsRef<[u8]>>(
|
||||
std::mem::take(value),
|
||||
builder,
|
||||
format,
|
||||
analyzer,
|
||||
&mut infos,
|
||||
compute_matches,
|
||||
);
|
||||
@ -470,19 +464,14 @@ fn format_fields<'a, A: AsRef<[u8]>>(
|
||||
|
||||
fn format_value<'a, A: AsRef<[u8]>>(
|
||||
value: Value,
|
||||
builder: &MatcherBuilder,
|
||||
builder: &MatcherBuilder<'a, A>,
|
||||
format_options: Option<FormatOptions>,
|
||||
analyzer: &'a Analyzer<'a, A>,
|
||||
infos: &mut Vec<MatchBounds>,
|
||||
compute_matches: bool,
|
||||
) -> Value {
|
||||
match value {
|
||||
Value::String(old_string) => {
|
||||
// this will be removed with charabia
|
||||
let analyzed = analyzer.analyze(&old_string);
|
||||
let tokens: Vec<_> = analyzed.tokens().collect();
|
||||
|
||||
let mut matcher = builder.build(&tokens[..], &old_string);
|
||||
let mut matcher = builder.build(&old_string);
|
||||
if compute_matches {
|
||||
let matches = matcher.matches();
|
||||
infos.extend_from_slice(&matches[..]);
|
||||
@ -507,7 +496,6 @@ fn format_value<'a, A: AsRef<[u8]>>(
|
||||
highlight: format_options.highlight,
|
||||
crop: None,
|
||||
}),
|
||||
analyzer,
|
||||
infos,
|
||||
compute_matches,
|
||||
)
|
||||
@ -527,7 +515,6 @@ fn format_value<'a, A: AsRef<[u8]>>(
|
||||
highlight: format_options.highlight,
|
||||
crop: None,
|
||||
}),
|
||||
analyzer,
|
||||
infos,
|
||||
compute_matches,
|
||||
),
|
||||
@ -536,12 +523,9 @@ fn format_value<'a, A: AsRef<[u8]>>(
|
||||
.collect(),
|
||||
),
|
||||
Value::Number(number) => {
|
||||
// this will be removed with charabia
|
||||
let s = number.to_string();
|
||||
let analyzed = analyzer.analyze(&s);
|
||||
let tokens: Vec<_> = analyzed.tokens().collect();
|
||||
|
||||
let mut matcher = builder.build(&tokens[..], &s);
|
||||
let mut matcher = builder.build(&s);
|
||||
if compute_matches {
|
||||
let matches = matcher.matches();
|
||||
infos.extend_from_slice(&matches[..]);
|
||||
|
Loading…
Reference in New Issue
Block a user