diff --git a/Cargo.lock b/Cargo.lock
index a53930367..8f296f2fa 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1206,7 +1206,7 @@ checksum = "7ffc5c5338469d4d3ea17d269fa8ea3512ad247247c30bd2df69e68309ed0a08"
 [[package]]
 name = "meilisearch-tokenizer"
 version = "0.1.1"
-source = "git+https://github.com/meilisearch/Tokenizer.git?tag=v0.1.4#31ba3ff4a15501f12b7d37ac64ddce7c35a9757c"
+source = "git+https://github.com/meilisearch/Tokenizer.git?tag=v0.2.0#833c48b2ee39071f8b4f51abd15122afdb3c8c06"
 dependencies = [
  "character_converter",
  "cow-utils",
diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml
index 75e5daebf..02a799091 100644
--- a/http-ui/Cargo.toml
+++ b/http-ui/Cargo.toml
@@ -10,7 +10,7 @@ anyhow = "1.0.38"
 byte-unit = { version = "4.0.9", default-features = false, features = ["std"] }
 grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "3adcb26" }
 heed = "0.10.6"
-meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", tag = "v0.1.4" }
+meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", tag = "v0.2.0" }
 memmap = "0.7.0"
 milli = { path = "../milli" }
 once_cell = "1.5.2"
diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs
index 86f965368..f068b5b9a 100644
--- a/http-ui/src/main.rs
+++ b/http-ui/src/main.rs
@@ -1,4 +1,4 @@
-use std::collections::{BTreeMap, HashMap, HashSet};
+use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
 use std::fmt::Display;
 use std::fs::{File, create_dir_all};
 use std::net::SocketAddr;
@@ -128,7 +128,10 @@ struct Highlighter<'a, A> {

 impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> {
     fn new(stop_words: &'a fst::Set<A>) -> Self {
-        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+        let mut config = AnalyzerConfig::default();
+        config.stop_words(stop_words);
+        let analyzer = Analyzer::new(config);
+
         Self { analyzer }
     }

@@ -266,6 +269,13 @@ struct Settings {
         skip_serializing_if = "Option::is_none",
     )]
     criteria: Option<Option<Vec<String>>>,
+
+    #[serde(
+        default,
+        deserialize_with = "deserialize_some",
+        skip_serializing_if = "Option::is_none",
+    )]
+    stop_words: Option<Option<BTreeSet<String>>>,
 }

 #[derive(Debug, Clone, Serialize, Deserialize)]
@@ -439,6 +449,14 @@ async fn main() -> anyhow::Result<()> {
                 }
             }

+            // We transpose the settings JSON struct into a real setting update.
+            if let Some(stop_words) = settings.stop_words {
+                match stop_words {
+                    Some(stop_words) => builder.set_stop_words(stop_words),
+                    None => builder.reset_stop_words(),
+                }
+            }
+
             let result = builder.execute(|indexing_step, update_id| {
                 let (current, total) = match indexing_step {
                     TransformFromUserIntoGenericFormat { documents_seen } => (documents_seen, None),
diff --git a/milli/Cargo.toml b/milli/Cargo.toml
index d18628149..b1a54d22d 100644
--- a/milli/Cargo.toml
+++ b/milli/Cargo.toml
@@ -20,7 +20,7 @@ heed = { version = "0.10.6", default-features = false, features = ["lmdb", "sync
 human_format = "1.0.3"
 levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] }
 linked-hash-map = "0.5.4"
-meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", tag = "v0.1.4" }
+meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", tag = "v0.2.0" }
 memmap = "0.7.0"
 num-traits = "0.2.14"
 obkv = "0.1.1"
diff --git a/milli/src/index.rs b/milli/src/index.rs
index 2e0d329ef..642ad4ab7 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -28,6 +28,7 @@ pub const SEARCHABLE_FIELDS_KEY: &str = "searchable-fields";
 pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids";
 pub const SOFT_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "soft-external-documents-ids";
 pub const WORDS_FST_KEY: &str = "words-fst";
+pub const STOP_WORDS_KEY: &str = "stop-words";
 pub const WORDS_PREFIXES_FST_KEY: &str = "words-prefixes-fst";
 const CREATED_AT_KEY: &str = "created-at";
 const UPDATED_AT_KEY: &str = "updated-at";
@@ -377,6 +378,22 @@ impl Index {
         }
     }

+    /* stop words */
+
+    pub fn put_stop_words<A: AsRef<[u8]>>(&self, wtxn: &mut RwTxn, fst: &fst::Set<A>) -> heed::Result<()> {
+        self.main.put::<_, Str, ByteSlice>(wtxn, STOP_WORDS_KEY, fst.as_fst().as_bytes())
+    }
+
+    pub fn delete_stop_words(&self, wtxn: &mut RwTxn) -> heed::Result<bool> {
+        self.main.delete::<_, Str>(wtxn, STOP_WORDS_KEY)
+    }
+    pub fn stop_words<'t>(&self, rtxn: &'t RoTxn) -> anyhow::Result<Option<fst::Set<&'t [u8]>>> {
+        match self.main.get::<_, Str, ByteSlice>(rtxn, STOP_WORDS_KEY)? {
+            Some(bytes) => Ok(Some(fst::Set::new(bytes)?)),
+            None => Ok(None),
+        }
+    }
+
     /* words prefixes fst */

     /// Writes the FST which is the words prefixes dictionnary of the engine.
diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs
index 7560fbf0a..c88800f38 100644
--- a/milli/src/search/mod.rs
+++ b/milli/src/search/mod.rs
@@ -4,7 +4,7 @@ use std::fmt;
 use std::str::Utf8Error;
 use std::time::Instant;

-use fst::{IntoStreamer, Streamer, Set};
+use fst::{IntoStreamer, Streamer};
 use levenshtein_automata::{DFA, LevenshteinAutomatonBuilder as LevBuilder};
 use log::debug;
 use meilisearch_tokenizer::{AnalyzerConfig, Analyzer};
@@ -91,8 +91,7 @@ impl<'a> Search<'a> {
                 let mut builder = QueryTreeBuilder::new(self.rtxn, self.index);
                 builder.optional_words(self.optional_words);
                 builder.authorize_typos(self.authorize_typos);
-                let stop_words = &Set::default();
-                let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+                let analyzer = Analyzer::<Vec<u8>>::new(AnalyzerConfig::default());
                 let result = analyzer.analyze(query);
                 let tokens = result.tokens();
                 builder.build(tokens)?
diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs
index 114032eb8..fb5b5b87c 100644
--- a/milli/src/search/query_tree.rs
+++ b/milli/src/search/query_tree.rs
@@ -1,6 +1,7 @@
 use std::collections::HashSet;
 use std::{fmt, cmp, mem};

+use fst::Set;
 use levenshtein_automata::{DFA, Distance};
 use meilisearch_tokenizer::{TokenKind, tokenizer::TokenStream};
 use roaring::RoaringBitmap;
@@ -154,6 +155,10 @@ impl fmt::Debug for Query {

 trait Context {
     fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
+    fn stop_words(&self) -> anyhow::Result<Option<Set<&[u8]>>>;
+    fn is_stop_word(&self, word: &str) -> anyhow::Result<bool> {
+        Ok(self.stop_words()?.map_or(false, |s| s.contains(word)))
+    }
     fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>>;
     fn word_documents_count(&self, word: &str) -> heed::Result<Option<u64>> {
         match self.word_docids(word)? {
@@ -183,6 +188,10 @@ impl<'a> Context for QueryTreeBuilder<'a> {
     fn synonyms<S: AsRef<str>>(&self, _words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>> {
         Ok(None)
     }
+
+    fn stop_words(&self) -> anyhow::Result<Option<Set<&[u8]>>> {
+        self.index.stop_words(self.rtxn)
+    }
 }

 impl<'a> QueryTreeBuilder<'a> {
@@ -331,8 +340,7 @@ fn create_query_tree(
     optional_words: bool,
     authorize_typos: bool,
     query: PrimitiveQuery,
-) -> anyhow::Result<Operation>
-{
+) -> anyhow::Result<Operation> {
     /// Matches on the `PrimitiveQueryPart` and create an operation from it.
     fn resolve_primitive_part(
         ctx: &impl Context,
@@ -350,7 +358,12 @@ fn create_query_tree(
                 if let Some(child) = split_best_frequency(ctx, &word)? {
                     children.push(child);
                 }
-                children.push(Operation::Query(Query { prefix, kind: typos(word, authorize_typos) }));
+
+                let is_stop_word = ctx.is_stop_word(&word)?;
+                let query = Query { prefix, kind: typos(word, authorize_typos) };
+                if query.prefix || query.kind.is_tolerant() || !is_stop_word {
+                    children.push(Operation::Query(query));
+                }
                 Ok(Operation::or(false, children))
             },
             // create a CONSECUTIVE operation wrapping all word in the phrase
@@ -365,12 +378,11 @@ fn create_query_tree(
         ctx: &impl Context,
         authorize_typos: bool,
         query: &[PrimitiveQueryPart],
-    ) -> anyhow::Result<Operation>
-    {
+    ) -> anyhow::Result<Operation> {
         const MAX_NGRAM: usize = 3;
         let mut op_children = Vec::new();

-        for sub_query in query.linear_group_by(|a, b| !(a.is_phrase() || b.is_phrase()) ) {
+        for sub_query in query.linear_group_by(|a, b| !(a.is_phrase() || b.is_phrase())) {
             let mut or_op_children = Vec::new();

             for ngram in 1..=MAX_NGRAM.min(sub_query.len()) {
@@ -381,23 +393,31 @@ fn create_query_tree(

                 match group {
                     [part] => {
-                        let operation = resolve_primitive_part(ctx, authorize_typos, part.clone())?;
+                        let operation =
+                            resolve_primitive_part(ctx, authorize_typos, part.clone())?;
                         and_op_children.push(operation);
-                    },
+                    }
                     words => {
-                        let is_prefix = words.last().map(|part| part.is_prefix()).unwrap_or(false);
-                        let words: Vec<_> = words.iter().filter_map(| part| {
-                            if let PrimitiveQueryPart::Word(word, _) = part {
-                                Some(word.as_str())
-                            } else {
-                                None
-                            }
-                        }).collect();
+                        let is_prefix = words.last().map_or(false, |part| part.is_prefix());
+                        let words: Vec<_> = words
+                            .iter()
+                            .filter_map(|part| {
+                                if let PrimitiveQueryPart::Word(word, _) = part {
+                                    Some(word.as_str())
+                                } else {
+                                    None
+                                }
+                            })
+                            .collect();
                         let mut operations = synonyms(ctx, &words)?.unwrap_or_default();
                         let concat = words.concat();
+
+                        let is_stop_word = ctx.is_stop_word(&concat)?;
                         let query = Query { prefix: is_prefix, kind: typos(concat, authorize_typos) };
-                        operations.push(Operation::Query(query));
-                        and_op_children.push(Operation::or(false, operations));
+                        if query.prefix || query.kind.is_tolerant() || !is_stop_word {
+                            operations.push(Operation::Query(query));
+                            and_op_children.push(Operation::or(false, operations));
+                        }
                     }
                 }
@@ -543,7 +563,6 @@ pub fn maximum_proximity(operation: &Operation) -> usize {
 mod test {
     use std::collections::HashMap;

-    use fst::Set;
     use maplit::{hashmap, hashset};
     use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
     use rand::{Rng, SeedableRng, rngs::StdRng};
@@ -582,6 +601,10 @@ mod test {
         let words: Vec<_> = words.iter().map(|s| s.as_ref().to_owned()).collect();
         Ok(self.synonyms.get(&words).cloned())
     }
+
+    fn stop_words(&self) -> anyhow::Result<Option<Set<&[u8]>>> {
+        Ok(None)
+    }
 }

 impl Default for TestContext {
@@ -646,8 +669,7 @@ mod test {

     #[test]
     fn prefix() {
         let query = "hey friends";
-        let stop_words = &Set::default();
-        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
         let result = analyzer.analyze(query);
         let tokens = result.tokens();
@@ -667,8 +689,7 @@ mod test {

     #[test]
     fn no_prefix() {
         let query = "hey friends ";
-        let stop_words = &Set::default();
-        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
         let result = analyzer.analyze(query);
         let tokens = result.tokens();
@@ -688,8 +709,7 @@ mod test {

     #[test]
     fn synonyms() {
         let query = "hello world ";
-        let stop_words = &Set::default();
-        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
         let result = analyzer.analyze(query);
         let tokens = result.tokens();
@@ -720,8 +740,7 @@ mod test {

     #[test]
     fn complex_synonyms() {
         let query = "new york city ";
-        let stop_words = &Set::default();
-        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
         let result = analyzer.analyze(query);
         let tokens = result.tokens();
@@ -766,8 +785,7 @@ mod test {

     #[test]
     fn ngrams() {
         let query = "n grams ";
-        let stop_words = &Set::default();
-        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
         let result = analyzer.analyze(query);
         let tokens = result.tokens();
@@ -787,8 +805,7 @@ mod test {

     #[test]
     fn word_split() {
         let query = "wordsplit fish ";
-        let stop_words = &Set::default();
-        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
         let result = analyzer.analyze(query);
         let tokens = result.tokens();
@@ -814,8 +831,7 @@ mod test {

     #[test]
     fn phrase() {
         let query = "\"hey friends\" \" \" \"wooop";
-        let stop_words = &Set::default();
-        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
         let result = analyzer.analyze(query);
         let tokens = result.tokens();
@@ -835,8 +851,7 @@ mod test {

     #[test]
     fn optional_word() {
         let query = "hey my friend ";
-        let stop_words = &Set::default();
-        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
         let result = analyzer.analyze(query);
         let tokens = result.tokens();
@@ -875,8 +890,7 @@ mod test {

     #[test]
     fn optional_word_phrase() {
         let query = "\"hey my\"";
-        let stop_words = &Set::default();
-        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
         let result = analyzer.analyze(query);
         let tokens = result.tokens();
@@ -892,8 +906,7 @@ mod test {

     #[test]
     fn optional_word_multiple_phrases() {
         let query = r#""hey" my good "friend""#;
-        let stop_words = &Set::default();
-        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
         let result = analyzer.analyze(query);
         let tokens = result.tokens();
@@ -927,8 +940,7 @@ mod test {

     #[test]
     fn no_typo() {
         let query = "hey friends ";
-        let stop_words = &Set::default();
-        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
         let result = analyzer.analyze(query);
         let tokens = result.tokens();
@@ -947,8 +959,7 @@ mod test {

     #[test]
     fn fetching_words() {
         let query = "wordsplit nyc world";
-        let stop_words = &Set::default();
-        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
         let result = analyzer.analyze(query);
         let tokens = result.tokens();
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index a19d8c0a7..f4a7c7f25 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -410,6 +410,8 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
             None => fields_ids_map.iter().map(|(id, _name)| id).collect(),
         };

+        let stop_words = self.index.stop_words(self.wtxn)?;
+        let stop_words = stop_words.as_ref();
         let linked_hash_map_size = self.linked_hash_map_size;
         let max_nb_chunks = self.max_nb_chunks;
         let max_memory = self.max_memory;
@@ -436,7 +438,6 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
         let readers = rayon::iter::repeatn(documents, num_threads)
             .enumerate()
             .map(|(i, documents)| {
-                let stop_words = fst::Set::default();
                 let store = Store::new(
                     searchable_fields.clone(),
                     faceted_fields.clone(),
@@ -446,7 +447,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
                     chunk_compression_type,
                     chunk_compression_level,
                     chunk_fusing_shrink_size,
-                    &stop_words,
+                    stop_words,
                 )?;
                 store.index(
                     documents,
diff --git a/milli/src/update/index_documents/store.rs b/milli/src/update/index_documents/store.rs
index 05767080a..03d91af24 100644
--- a/milli/src/update/index_documents/store.rs
+++ b/milli/src/update/index_documents/store.rs
@@ -86,7 +86,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
         chunk_compression_type: CompressionType,
         chunk_compression_level: Option<u32>,
         chunk_fusing_shrink_size: Option<u64>,
-        stop_words: &'s Set<A>,
+        stop_words: Option<&'s Set<A>>,
     ) -> anyhow::Result<Self> {
         // We divide the max memory by the number of sorter the Store have.
@@ -141,7 +141,11 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
             create_writer(chunk_compression_type, chunk_compression_level, f)
         })?;

-        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+        let mut config = AnalyzerConfig::default();
+        if let Some(stop_words) = stop_words {
+            config.stop_words(stop_words);
+        }
+        let analyzer = Analyzer::new(config);

         Ok(Store {
             // Indexing parameters.
diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs
index 7ce8b98c1..451447102 100644
--- a/milli/src/update/settings.rs
+++ b/milli/src/update/settings.rs
@@ -1,4 +1,4 @@
-use std::collections::HashMap;
+use std::collections::{BTreeSet, HashMap};
 use std::str::FromStr;

 use anyhow::Context;
@@ -32,6 +32,7 @@ pub struct Settings<'a, 't, 'u, 'i> {
     displayed_fields: Option<Option<Vec<String>>>,
     faceted_fields: Option<Option<HashMap<String, String>>>,
     criteria: Option<Option<Vec<String>>>,
+    stop_words: Option<Option<BTreeSet<String>>>,
 }

 impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
@@ -55,6 +56,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
             displayed_fields: None,
             faceted_fields: None,
             criteria: None,
+            stop_words: None,
             update_id,
         }
     }
@@ -91,6 +93,18 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
         self.criteria = Some(Some(criteria));
     }

+    pub fn reset_stop_words(&mut self) {
+        self.stop_words = Some(None);
+    }
+
+    pub fn set_stop_words(&mut self, stop_words: BTreeSet<String>) {
+        self.stop_words = if stop_words.is_empty() {
+            Some(None)
+        } else {
+            Some(Some(stop_words))
+        }
+    }
+
     fn reindex<F>(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> anyhow::Result<()>
     where
         F: Fn(UpdateIndexingStep, u64) + Sync
@@ -210,6 +224,28 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
         Ok(true)
     }

+    fn update_stop_words(&mut self) -> anyhow::Result<bool> {
+        match self.stop_words {
+            Some(Some(ref stop_words)) => {
+                let current = self.index.stop_words(self.wtxn)?;
+                // since we can't compare a BTreeSet with an FST we are going to convert the
+                // BTreeSet to an FST and then compare bytes per bytes the two FSTs.
+                let fst = fst::Set::from_iter(&*stop_words)?;
+
+                // Does the new FST differ from the previous one?
+                if current.map_or(true, |current| current.as_fst().as_bytes() != fst.as_fst().as_bytes()) {
+                    // we want to re-create our FST.
+                    self.index.put_stop_words(self.wtxn, &fst)?;
+                    Ok(true)
+                } else {
+                    Ok(false)
+                }
+            }
+            Some(None) => Ok(self.index.delete_stop_words(self.wtxn)?),
+            None => Ok(false),
+        }
+    }
+
     fn update_facets(&mut self) -> anyhow::Result<bool> {
         match self.faceted_fields {
             Some(Some(ref fields)) => {
@@ -248,22 +284,23 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {

     pub fn execute<F>(mut self, progress_callback: F) -> anyhow::Result<()>
     where
-            F: Fn(UpdateIndexingStep, u64) + Sync
-        {
-            self.index.set_updated_at(self.wtxn, &Utc::now())?;
-            let old_fields_ids_map = self.index.fields_ids_map(&self.wtxn)?;
-            self.update_displayed()?;
-            let facets_updated = self.update_facets()?;
-            // update_criteria MUST be called after update_facets, since criterion fields must be set
-            // as facets.
-            self.update_criteria()?;
-            let searchable_updated = self.update_searchable()?;
+        F: Fn(UpdateIndexingStep, u64) + Sync
+    {
+        self.index.set_updated_at(self.wtxn, &Utc::now())?;
+        let old_fields_ids_map = self.index.fields_ids_map(&self.wtxn)?;
+        self.update_displayed()?;
+        let stop_words_updated = self.update_stop_words()?;
+        let facets_updated = self.update_facets()?;
+        // update_criteria MUST be called after update_facets, since criterion fields must be set
+        // as facets.
+        self.update_criteria()?;
+        let searchable_updated = self.update_searchable()?;

-            if facets_updated || searchable_updated {
-                self.reindex(&progress_callback, old_fields_ids_map)?;
-            }
-            Ok(())
+        if facets_updated || searchable_updated || stop_words_updated {
+            self.reindex(&progress_callback, old_fields_ids_map)?;
         }
+        Ok(())
+    }
 }

 #[cfg(test)]
@@ -271,7 +308,7 @@ mod tests {
     use super::*;

     use heed::EnvOpenOptions;
-    use maplit::hashmap;
+    use maplit::{hashmap, btreeset};
     use crate::facet::FacetType;
     use crate::update::{IndexDocuments, UpdateFormat};

@@ -328,7 +365,6 @@ mod tests {
         assert_eq!(result.documents_ids.len(), 1);
         let documents = index.documents(&rtxn, result.documents_ids).unwrap();
         assert_eq!(documents[0].1.get(0), Some(&br#""kevin""#[..]));
-        drop(rtxn);
     }

     #[test]
@@ -372,7 +408,6 @@ mod tests {
         let rtxn = index.read_txn().unwrap();
         let fields_ids = index.displayed_fields(&rtxn).unwrap();
         assert_eq!(fields_ids.unwrap(), &["age"][..]);
-        drop(rtxn);
     }

     #[test]
@@ -394,7 +429,6 @@ mod tests {
         let rtxn = index.read_txn().unwrap();
         let fields_ids = index.displayed_fields(&rtxn).unwrap();
         assert_eq!(fields_ids, None);
-        drop(rtxn);
     }

     #[test]
@@ -434,7 +468,6 @@ mod tests {
         let rtxn = index.read_txn().unwrap();
         let fields_ids = index.displayed_fields(&rtxn).unwrap();
         assert_eq!(fields_ids, None);
-        drop(rtxn);
     }

     #[test]
@@ -478,7 +511,96 @@ mod tests {
         // Only count the field_id 0 and level 0 facet values.
         let count = index.facet_field_id_value_docids.prefix_iter(&rtxn, &[0, 0]).unwrap().count();
         assert_eq!(count, 4);
-        drop(rtxn);
+    }
+
+    #[test]
+    fn default_stop_words() {
+        let path = tempfile::tempdir().unwrap();
+        let mut options = EnvOpenOptions::new();
+        options.map_size(10 * 1024 * 1024); // 10 MB
+        let index = Index::new(options, &path).unwrap();
+
+        // First we send 3 documents with ids from 1 to 3.
+        let mut wtxn = index.write_txn().unwrap();
+        let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..];
+        let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
+        builder.update_format(UpdateFormat::Csv);
+        builder.execute(content, |_, _| ()).unwrap();
+        wtxn.commit().unwrap();
+
+        // Ensure there is no stop_words by default
+        let rtxn = index.read_txn().unwrap();
+        let stop_words = index.stop_words(&rtxn).unwrap();
+        assert!(stop_words.is_none());
+    }
+
+    #[test]
+    fn set_and_reset_stop_words() {
+        let path = tempfile::tempdir().unwrap();
+        let mut options = EnvOpenOptions::new();
+        options.map_size(10 * 1024 * 1024); // 10 MB
+        let index = Index::new(options, &path).unwrap();
+
+        // First we send 3 documents with ids from 1 to 3.
+        let mut wtxn = index.write_txn().unwrap();
+        let content = &b"name,age,maxim\nkevin,23,I love dogs\nkevina,21,Doggos are the best\nbenoit,34,The crepes are really good\n"[..];
+        let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
+        builder.update_format(UpdateFormat::Csv);
+        builder.execute(content, |_, _| ()).unwrap();
+
+        // In the same transaction we provide some stop_words
+        let mut builder = Settings::new(&mut wtxn, &index, 0);
+        let set = btreeset!{ "i".to_string(), "the".to_string(), "are".to_string() };
+        builder.set_stop_words(set.clone());
+        builder.execute(|_, _| ()).unwrap();
+        wtxn.commit().unwrap();
+
+        // Ensure stop_words are effectively stored
+        let rtxn = index.read_txn().unwrap();
+        let stop_words = index.stop_words(&rtxn).unwrap();
+        assert!(stop_words.is_some()); // at this point the index should return something
+
+        let stop_words = stop_words.unwrap();
+        let expected = fst::Set::from_iter(&set).unwrap();
+        assert_eq!(stop_words.as_fst().as_bytes(), expected.as_fst().as_bytes());
+
+        // when we search for something that is a non prefix stop_words it should be ignored
+        let result = index.search(&rtxn).query("the ").execute().unwrap();
+        assert!(result.documents_ids.is_empty());
+        let result = index.search(&rtxn).query("i ").execute().unwrap();
+        assert!(result.documents_ids.is_empty());
+        let result = index.search(&rtxn).query("are ").execute().unwrap();
+        assert!(result.documents_ids.is_empty());
+
+        let result = index.search(&rtxn).query("dog").execute().unwrap();
+        assert_eq!(result.documents_ids.len(), 2); // we have two maxims talking about doggos
+        let result = index.search(&rtxn).query("benoît").execute().unwrap();
+        assert_eq!(result.documents_ids.len(), 1); // there is one benoit in our data
+
+        // now we'll reset the stop_words and ensure it's None
+        let mut wtxn = index.write_txn().unwrap();
+        let mut builder = Settings::new(&mut wtxn, &index, 0);
+        builder.reset_stop_words();
+        builder.execute(|_, _| ()).unwrap();
+        wtxn.commit().unwrap();
+
+        let rtxn = index.read_txn().unwrap();
+        let stop_words = index.stop_words(&rtxn).unwrap();
+        assert!(stop_words.is_none());
+
+        // now we can search for the stop words
+        let result = index.search(&rtxn).query("the").execute().unwrap();
+        assert_eq!(result.documents_ids.len(), 2);
+        let result = index.search(&rtxn).query("i").execute().unwrap();
+        assert_eq!(result.documents_ids.len(), 1);
+        let result = index.search(&rtxn).query("are").execute().unwrap();
+        assert_eq!(result.documents_ids.len(), 2);
+
+        // the rest of the search is still not impacted
+        let result = index.search(&rtxn).query("dog").execute().unwrap();
+        assert_eq!(result.documents_ids.len(), 2); // we have two maxims talking about doggos
+        let result = index.search(&rtxn).query("benoît").execute().unwrap();
+        assert_eq!(result.documents_ids.len(), 1); // there is one benoit in our data
     }

     #[test]
@@ -519,6 +641,5 @@ mod tests {
         assert_eq!(&["hello"][..], index.displayed_fields(&rtxn).unwrap().unwrap());
         assert!(index.primary_key(&rtxn).unwrap().is_none());
         assert_eq!(vec![Criterion::Asc("toto".to_string())], index.criteria(&rtxn).unwrap());
-        drop(rtxn);
     }
 }
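
Reviewer note: below is a minimal sketch of how the new stop-words setting could be exercised end to end, mirroring the `set_and_reset_stop_words` test above. The `set_stop_words`, `reset_stop_words`, and `Index::stop_words` calls come from this diff; the `main` wrapper, the `milli::update::Settings` import path, and the use of `tempfile`, `maplit`, and `heed` from the caller's side are assumptions for illustration only.

```rust
use heed::EnvOpenOptions;
use maplit::btreeset;
use milli::Index;
use milli::update::Settings;

fn main() -> anyhow::Result<()> {
    // Open a throw-away index, as the tests above do.
    let path = tempfile::tempdir()?;
    let mut options = EnvOpenOptions::new();
    options.map_size(10 * 1024 * 1024); // 10 MB
    let index = Index::new(options, &path)?;

    // Register a stop-words list: the BTreeSet is converted into an FST and
    // written under the "stop-words" key of the main database.
    let mut wtxn = index.write_txn()?;
    let mut builder = Settings::new(&mut wtxn, &index, 0);
    builder.set_stop_words(btreeset! { "the".to_string(), "are".to_string() });
    builder.execute(|_, _| ())?;
    wtxn.commit()?;

    // Read it back; `stop_words` returns `None` when nothing was ever set.
    let rtxn = index.read_txn()?;
    if let Some(stop_words) = index.stop_words(&rtxn)? {
        assert!(stop_words.contains("the"));
    }

    // `reset_stop_words` (or an empty set) deletes the entry again.
    let mut wtxn = index.write_txn()?;
    let mut builder = Settings::new(&mut wtxn, &index, 0);
    builder.reset_stop_words();
    builder.execute(|_, _| ())?;
    wtxn.commit()?;

    Ok(())
}
```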