From 40ef9a3c6a9ce73532a13c2419c6a4fa2d0a55e2 Mon Sep 17 00:00:00 2001 From: tamo Date: Tue, 6 Apr 2021 15:41:03 +0200 Subject: [PATCH] push a first implementation of the stop_words --- Cargo.lock | 30 +++++++++++++++---- meilisearch-http/Cargo.toml | 2 +- meilisearch-http/src/index/mod.rs | 15 +++++++++- meilisearch-http/src/index/updates.rs | 19 ++++++++++-- meilisearch-http/src/routes/settings/mod.rs | 9 +++++- .../tests/settings/get_settings.rs | 6 ++-- 6 files changed, 66 insertions(+), 15 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 94d2d8ba1..7fd30f444 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,5 +1,7 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. +version = 3 + [[package]] name = "actix-codec" version = "0.3.0" @@ -1845,7 +1847,7 @@ dependencies = [ "log", "main_error", "meilisearch-error", - "meilisearch-tokenizer", + "meilisearch-tokenizer 0.1.1 (git+https://github.com/meilisearch/Tokenizer.git?branch=main)", "memmap", "milli", "mime", @@ -1875,6 +1877,22 @@ dependencies = [ "vergen", ] +[[package]] +name = "meilisearch-tokenizer" +version = "0.1.1" +source = "git+https://github.com/meilisearch/Tokenizer.git?tag=v0.2.0#833c48b2ee39071f8b4f51abd15122afdb3c8c06" +dependencies = [ + "character_converter", + "cow-utils", + "deunicode", + "fst", + "jieba-rs", + "once_cell", + "slice-group-by", + "unicode-segmentation", + "whatlang", +] + [[package]] name = "meilisearch-tokenizer" version = "0.1.1" @@ -1919,7 +1937,7 @@ dependencies = [ [[package]] name = "milli" version = "0.1.0" -source = "git+https://github.com/meilisearch/milli.git?rev=b7b23cd#b7b23cd4a8e62932c66c2ebedf9d89ddf089e299" +source = "git+https://github.com/meilisearch/milli.git?rev=2bcdd8844c4ec9f6f8a34617ea0e4321fa633c0c#2bcdd8844c4ec9f6f8a34617ea0e4321fa633c0c" dependencies = [ "anyhow", "bstr", @@ -1939,7 +1957,7 @@ dependencies = [ "linked-hash-map", "log", "logging_timer", - "meilisearch-tokenizer", + "meilisearch-tokenizer 0.1.1 (git+https://github.com/meilisearch/Tokenizer.git?tag=v0.2.0)", "memmap", "num-traits", "obkv", @@ -2234,8 +2252,7 @@ checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e" [[package]] name = "pest" version = "2.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "10f4872ae94d7b90ae48754df22fd42ad52ce740b8f370b03da4835417403e53" +source = "git+https://github.com/pest-parser/pest.git?rev=51fd1d49f1041f7839975664ef71fe15c7dcaf67#51fd1d49f1041f7839975664ef71fe15c7dcaf67" dependencies = [ "ucd-trie", ] @@ -2243,7 +2260,8 @@ dependencies = [ [[package]] name = "pest" version = "2.1.3" -source = "git+https://github.com/pest-parser/pest.git?rev=51fd1d49f1041f7839975664ef71fe15c7dcaf67#51fd1d49f1041f7839975664ef71fe15c7dcaf67" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "10f4872ae94d7b90ae48754df22fd42ad52ce740b8f370b03da4835417403e53" dependencies = [ "ucd-trie", ] diff --git a/meilisearch-http/Cargo.toml b/meilisearch-http/Cargo.toml index a7564f4d9..03ae35729 100644 --- a/meilisearch-http/Cargo.toml +++ b/meilisearch-http/Cargo.toml @@ -42,7 +42,7 @@ main_error = "0.1.0" meilisearch-error = { path = "../meilisearch-error" } meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", branch = "main" } memmap = "0.7.0" -milli = { git = "https://github.com/meilisearch/milli.git", rev = "b7b23cd" } +milli = { git = "https://github.com/meilisearch/milli.git", rev = "2bcdd8844c4ec9f6f8a34617ea0e4321fa633c0c" } mime = "0.3.16" once_cell = "1.5.2" parking_lot = "0.11.1" diff --git a/meilisearch-http/src/index/mod.rs b/meilisearch-http/src/index/mod.rs index 188afd522..dfd2ebdc4 100644 --- a/meilisearch-http/src/index/mod.rs +++ b/meilisearch-http/src/index/mod.rs @@ -1,7 +1,7 @@ mod search; mod updates; -use std::collections::HashSet; +use std::collections::{BTreeSet, HashSet}; use std::ops::Deref; use std::sync::Arc; @@ -51,11 +51,24 @@ impl Index { .map(|c| c.to_string()) .collect(); + let stop_words = self + .stop_words(&txn)? + .map(|stop_words| -> anyhow::Result> { + Ok(stop_words + .stream() + .into_strs()? + .into_iter() + .collect()) + }) + .transpose()? + .unwrap_or_else(BTreeSet::new); + Ok(Settings { displayed_attributes: Some(Some(displayed_attributes)), searchable_attributes: Some(Some(searchable_attributes)), attributes_for_faceting: Some(Some(faceted_attributes)), ranking_rules: Some(Some(criteria)), + stop_words: Some(Some(stop_words)), }) } diff --git a/meilisearch-http/src/index/updates.rs b/meilisearch-http/src/index/updates.rs index 79558dd92..085115af6 100644 --- a/meilisearch-http/src/index/updates.rs +++ b/meilisearch-http/src/index/updates.rs @@ -1,4 +1,4 @@ -use std::collections::HashMap; +use std::collections::{BTreeSet, HashMap}; use std::io; use std::num::NonZeroUsize; @@ -44,8 +44,12 @@ pub struct Settings { )] pub ranking_rules: Option>>, - // TODO we are missing the stopWords, synonyms and distinctAttribute for the GET settings - // request + #[serde( + default, + deserialize_with = "deserialize_some", + skip_serializing_if = "Option::is_none" + )] + pub stop_words: Option>>, } impl Settings { @@ -55,6 +59,7 @@ impl Settings { searchable_attributes: Some(None), attributes_for_faceting: Some(None), ranking_rules: Some(None), + stop_words: Some(None), } } } @@ -170,6 +175,14 @@ impl Index { } } + // We transpose the settings JSON struct into a real setting update. + if let Some(ref stop_words) = settings.stop_words { + match stop_words { + Some(stop_words) => builder.set_stop_words(stop_words.clone()), + _ => builder.reset_stop_words(), + } + } + let result = builder .execute(|indexing_step, update_id| info!("update {}: {:?}", update_id, indexing_step)); diff --git a/meilisearch-http/src/routes/settings/mod.rs b/meilisearch-http/src/routes/settings/mod.rs index 8c6e04b84..732888ec2 100644 --- a/meilisearch-http/src/routes/settings/mod.rs +++ b/meilisearch-http/src/routes/settings/mod.rs @@ -91,6 +91,12 @@ make_setting_route!( searchable_attributes ); +make_setting_route!( + "/indexes/{index_uid}/settings/stop-words", + std::collections::BTreeSet, + stop_words +); + //make_setting_route!( //"/indexes/{index_uid}/settings/distinct-attribute", //String, @@ -122,7 +128,8 @@ macro_rules! create_services { create_services!( attributes_for_faceting, displayed_attributes, - searchable_attributes + searchable_attributes, + stop_words ); #[post("/indexes/{index_uid}/settings", wrap = "Authentication::Private")] diff --git a/meilisearch-http/tests/settings/get_settings.rs b/meilisearch-http/tests/settings/get_settings.rs index d234cbb2b..82554ee22 100644 --- a/meilisearch-http/tests/settings/get_settings.rs +++ b/meilisearch-http/tests/settings/get_settings.rs @@ -16,21 +16,21 @@ async fn get_settings() { let (response, code) = index.settings().await; assert_eq!(code, 200); let settings = response.as_object().unwrap(); - assert_eq!(settings.keys().len(), 4); + assert_eq!(settings.keys().len(), 5); assert_eq!(settings["displayedAttributes"], json!(["*"])); assert_eq!(settings["searchableAttributes"], json!(["*"])); assert_eq!(settings["attributesForFaceting"], json!({})); assert_eq!( settings["rankingRules"], json!([ - "typo", "words", + "typo", "proximity", "attribute", - "wordsPosition", "exactness" ]) ); + assert_eq!(settings["stopWords"], json!(null)); } #[actix_rt::test]