implement a first version of the stop_words

The front end must provide a BTreeSet containing the stop words. The stop_words are set to None if an empty set is provided. Add the stop words to the http-ui interface. Use maplit in the tests and remove all the useless drop(rtxn) calls at the end of the tests.
2024-11-30 09:04:59 +08:00 · 2021-03-29 19:15:47 +02:00 · 2021-03-29 19:15:47 +02:00 · a2f46029c7
commit a2f46029c7
parent 62a8f1d707
7 changed files with 203 additions and 56 deletions
--- a/http-ui/src/main.rs
+++ b/http-ui/src/main.rs
@ -1,4 +1,4 @@
-use std::collections::{BTreeMap, HashMap, HashSet};
+use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
 use std::fmt::Display;
 use std::fs::{File, create_dir_all};
 use std::net::SocketAddr;
@ -128,7 +128,10 @@ struct Highlighter<'a, A> {

 impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> {
    fn new(stop_words: &'a fst::Set<A>) -> Self {
-        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+        let mut config = AnalyzerConfig::default();
+        config.stop_words(stop_words);
+        let analyzer = Analyzer::new(config);
+
        Self { analyzer }
    }

@ -266,6 +269,13 @@ struct Settings {
        skip_serializing_if = "Option::is_none",
    )]
    criteria: Option<Option<Vec<String>>>,
+
+    #[serde(
+        default,
+        deserialize_with = "deserialize_some",
+        skip_serializing_if = "Option::is_none",
+    )]
+    stop_words: Option<Option<BTreeSet<String>>>,
 }

 #[derive(Debug, Clone, Serialize, Deserialize)]
@ -439,6 +449,14 @@ async fn main() -> anyhow::Result<()> {
                        }
                    }

+                    // We transpose the settings JSON struct into a real setting update.
+                    if let Some(stop_words) = settings.stop_words {
+                        match stop_words {
+                            Some(stop_words) => builder.set_stop_words(stop_words),
+                            None => builder.reset_stop_words(),
+                        }
+                    }
+
                    let result = builder.execute(|indexing_step, update_id| {
                        let (current, total) = match indexing_step {
                            TransformFromUserIntoGenericFormat { documents_seen } => (documents_seen, None),
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@ -28,6 +28,7 @@ pub const SEARCHABLE_FIELDS_KEY: &str = "searchable-fields";
 pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids";
 pub const SOFT_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "soft-external-documents-ids";
 pub const WORDS_FST_KEY: &str = "words-fst";
+pub const STOP_WORDS_KEY: &str = "stop-words";
 pub const WORDS_PREFIXES_FST_KEY: &str = "words-prefixes-fst";
 const CREATED_AT_KEY: &str = "created-at";
 const UPDATED_AT_KEY: &str = "updated-at";
@ -377,6 +378,22 @@ impl Index {
        }
    }

+    /* stop words */
+
+    pub fn put_stop_words<A: AsRef<[u8]>>(&self, wtxn: &mut RwTxn, fst: &fst::Set<A>) -> heed::Result<()> {
+        self.main.put::<_, Str, ByteSlice>(wtxn, STOP_WORDS_KEY, fst.as_fst().as_bytes())
+    }
+
+    pub fn delete_stop_words(&self, wtxn: &mut RwTxn) -> heed::Result<bool> {
+        self.main.delete::<_, Str>(wtxn, STOP_WORDS_KEY)
+    }
+    pub fn stop_words<'t>(&self, rtxn: &'t RoTxn) -> anyhow::Result<Option<fst::Set<&'t [u8]>>> {
+        match self.main.get::<_, Str, ByteSlice>(rtxn, STOP_WORDS_KEY)? {
+            Some(bytes) => Ok(Some(fst::Set::new(bytes)?)),
+            None => Ok(None),
+        }
+    }
+
    /* words prefixes fst */

    /// Writes the FST which is the words prefixes dictionnary of the engine.
--- a/milli/src/search/mod.rs
+++ b/milli/src/search/mod.rs
@ -4,7 +4,7 @@ use std::fmt;
 use std::str::Utf8Error;
 use std::time::Instant;

-use fst::{IntoStreamer, Streamer, Set};
+use fst::{IntoStreamer, Streamer};
 use levenshtein_automata::{DFA, LevenshteinAutomatonBuilder as LevBuilder};
 use log::debug;
 use meilisearch_tokenizer::{AnalyzerConfig, Analyzer};
@ -91,8 +91,7 @@ impl<'a> Search<'a> {
                let mut builder = QueryTreeBuilder::new(self.rtxn, self.index);
                builder.optional_words(self.optional_words);
                builder.authorize_typos(self.authorize_typos);
-                let stop_words = &Set::default();
-                let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+                let analyzer = Analyzer::<Vec<u8>>::new(AnalyzerConfig::default());
                let result = analyzer.analyze(query);
                let tokens = result.tokens();
                builder.build(tokens)?
--- a/milli/src/search/query_tree.rs
+++ b/milli/src/search/query_tree.rs
@ -543,7 +543,6 @@ pub fn maximum_proximity(operation: &Operation) -> usize {
 mod test {
    use std::collections::HashMap;

-    use fst::Set;
    use maplit::{hashmap, hashset};
    use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
    use rand::{Rng, SeedableRng, rngs::StdRng};
@ -646,8 +645,7 @@ mod test {
    #[test]
    fn prefix() {
        let query = "hey friends";
-        let stop_words = &Set::default();
-        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
        let result = analyzer.analyze(query);
        let tokens = result.tokens();

@ -667,8 +665,7 @@ mod test {
    #[test]
    fn no_prefix() {
        let query = "hey friends ";
-        let stop_words = &Set::default();
-        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
        let result = analyzer.analyze(query);
        let tokens = result.tokens();

@ -688,8 +685,7 @@ mod test {
    #[test]
    fn synonyms() {
        let query = "hello world ";
-        let stop_words = &Set::default();
-        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
        let result = analyzer.analyze(query);
        let tokens = result.tokens();

@ -720,8 +716,7 @@ mod test {
    #[test]
    fn complex_synonyms() {
        let query = "new york city ";
-        let stop_words = &Set::default();
-        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
        let result = analyzer.analyze(query);
        let tokens = result.tokens();

@ -766,8 +761,7 @@ mod test {
    #[test]
    fn ngrams() {
        let query = "n grams ";
-        let stop_words = &Set::default();
-        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
        let result = analyzer.analyze(query);
        let tokens = result.tokens();

@ -787,8 +781,7 @@ mod test {
    #[test]
    fn word_split() {
        let query = "wordsplit fish ";
-        let stop_words = &Set::default();
-        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
        let result = analyzer.analyze(query);
        let tokens = result.tokens();

@ -814,8 +807,7 @@ mod test {
    #[test]
    fn phrase() {
        let query = "\"hey friends\" \" \" \"wooop";
-        let stop_words = &Set::default();
-        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
        let result = analyzer.analyze(query);
        let tokens = result.tokens();

@ -835,8 +827,7 @@ mod test {
    #[test]
    fn optional_word() {
        let query = "hey my friend ";
-        let stop_words = &Set::default();
-        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
        let result = analyzer.analyze(query);
        let tokens = result.tokens();

@ -875,8 +866,7 @@ mod test {
    #[test]
    fn optional_word_phrase() {
        let query = "\"hey my\"";
-        let stop_words = &Set::default();
-        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
        let result = analyzer.analyze(query);
        let tokens = result.tokens();

@ -892,8 +882,7 @@ mod test {
    #[test]
    fn optional_word_multiple_phrases() {
        let query = r#""hey" my good "friend""#;
-        let stop_words = &Set::default();
-        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
        let result = analyzer.analyze(query);
        let tokens = result.tokens();

@ -927,8 +916,7 @@ mod test {
    #[test]
    fn no_typo() {
        let query = "hey friends ";
-        let stop_words = &Set::default();
-        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
        let result = analyzer.analyze(query);
        let tokens = result.tokens();

@ -947,8 +935,7 @@ mod test {
    #[test]
    fn fetching_words() {
        let query = "wordsplit nyc world";
-        let stop_words = &Set::default();
-        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
        let result = analyzer.analyze(query);
        let tokens = result.tokens();

--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@ -410,6 +410,8 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
            None => fields_ids_map.iter().map(|(id, _name)| id).collect(),
        };

+        let stop_words = self.index.stop_words(self.wtxn)?;
+        let stop_words = stop_words.as_ref();
        let linked_hash_map_size = self.linked_hash_map_size;
        let max_nb_chunks = self.max_nb_chunks;
        let max_memory = self.max_memory;
@ -436,7 +438,6 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
            let readers = rayon::iter::repeatn(documents, num_threads)
                .enumerate()
                .map(|(i, documents)| {
-                    let stop_words = fst::Set::default();
                    let store = Store::new(
                        searchable_fields.clone(),
                        faceted_fields.clone(),
@ -446,7 +447,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
                        chunk_compression_type,
                        chunk_compression_level,
                        chunk_fusing_shrink_size,
-                        &stop_words,
+                        stop_words,
                    )?;
                    store.index(
                        documents,
--- a/milli/src/update/index_documents/store.rs
+++ b/milli/src/update/index_documents/store.rs
@ -86,7 +86,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
        chunk_compression_type: CompressionType,
        chunk_compression_level: Option<u32>,
        chunk_fusing_shrink_size: Option<u64>,
-        stop_words: &'s Set<A>,
+        stop_words: Option<&'s Set<A>>,
    ) -> anyhow::Result<Self>
    {
        // We divide the max memory by the number of sorter the Store have.
@ -141,7 +141,11 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
            create_writer(chunk_compression_type, chunk_compression_level, f)
        })?;

-        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+        let mut config = AnalyzerConfig::default();
+        if let Some(stop_words) = stop_words {
+            config.stop_words(stop_words);
+        }
+        let analyzer = Analyzer::new(config);

        Ok(Store {
            // Indexing parameters.
--- a/milli/src/update/settings.rs
+++ b/milli/src/update/settings.rs
@ -1,4 +1,4 @@
-use std::collections::HashMap;
+use std::collections::{BTreeSet, HashMap};
 use std::str::FromStr;

 use anyhow::Context;
@ -32,6 +32,7 @@ pub struct Settings<'a, 't, 'u, 'i> {
    displayed_fields: Option<Option<Vec<String>>>,
    faceted_fields: Option<Option<HashMap<String, String>>>,
    criteria: Option<Option<Vec<String>>>,
+    stop_words: Option<Option<BTreeSet<String>>>,
 }

 impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
@ -55,6 +56,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
            displayed_fields: None,
            faceted_fields: None,
            criteria: None,
+            stop_words: None,
            update_id,
        }
    }
@ -91,6 +93,18 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
        self.criteria = Some(Some(criteria));
    }

+    pub fn reset_stop_words(&mut self) {
+        self.stop_words = Some(None);
+    }
+
+    pub fn set_stop_words(&mut self, stop_words: BTreeSet<String>) {
+        self.stop_words = if stop_words.is_empty() {
+            Some(None)
+        } else {
+            Some(Some(stop_words))
+        }
+    }
+
    fn reindex<F>(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> anyhow::Result<()>
    where
        F: Fn(UpdateIndexingStep, u64) + Sync
@ -210,6 +224,28 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
        Ok(true)
    }

+    fn update_stop_words(&mut self) -> anyhow::Result<bool> {
+        match self.stop_words {
+            Some(Some(ref stop_words)) => {
+                let current = self.index.stop_words(self.wtxn)?;
+                // since we can't compare a BTreeSet with an FST we are going to convert the
+                // BTreeSet to an FST and then compare bytes per bytes the two FSTs.
+                let fst = fst::Set::from_iter(&*stop_words)?;
+
+                // Does the new FST differ from the previous one?
+                if current.map_or(true, |current| current.as_fst().as_bytes() != fst.as_fst().as_bytes()) {
+                    // we want to re-create our FST.
+                    self.index.put_stop_words(self.wtxn, &fst)?;
+                    Ok(true)
+                } else {
+                    Ok(false)
+                }
+            }
+            Some(None) => Ok(self.index.delete_stop_words(self.wtxn)?),
+            None => Ok(false),
+        }
+    }
+
    fn update_facets(&mut self) -> anyhow::Result<bool> {
        match self.faceted_fields {
            Some(Some(ref fields)) => {
@ -253,13 +289,14 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
        self.index.set_updated_at(self.wtxn, &Utc::now())?;
        let old_fields_ids_map = self.index.fields_ids_map(&self.wtxn)?;
        self.update_displayed()?;
+        let stop_words_updated = self.update_stop_words()?;
        let facets_updated = self.update_facets()?;
        // update_criteria MUST be called after update_facets, since criterion fields must be set
        // as facets.
        self.update_criteria()?;
        let searchable_updated = self.update_searchable()?;

-            if facets_updated || searchable_updated {
+        if facets_updated || searchable_updated || stop_words_updated {
            self.reindex(&progress_callback, old_fields_ids_map)?;
        }
        Ok(())
@ -271,7 +308,7 @@ mod tests {
    use super::*;

    use heed::EnvOpenOptions;
-    use maplit::hashmap;
+    use maplit::{hashmap, btreeset};

    use crate::facet::FacetType;
    use crate::update::{IndexDocuments, UpdateFormat};
@ -328,7 +365,6 @@ mod tests {
        assert_eq!(result.documents_ids.len(), 1);
        let documents = index.documents(&rtxn, result.documents_ids).unwrap();
        assert_eq!(documents[0].1.get(0), Some(&br#""kevin""#[..]));
-        drop(rtxn);
    }

    #[test]
@ -372,7 +408,6 @@ mod tests {
        let rtxn = index.read_txn().unwrap();
        let fields_ids = index.displayed_fields(&rtxn).unwrap();
        assert_eq!(fields_ids.unwrap(), &["age"][..]);
-        drop(rtxn);
    }

    #[test]
@ -394,7 +429,6 @@ mod tests {
        let rtxn = index.read_txn().unwrap();
        let fields_ids = index.displayed_fields(&rtxn).unwrap();
        assert_eq!(fields_ids, None);
-        drop(rtxn);
    }

    #[test]
@ -434,7 +468,6 @@ mod tests {
        let rtxn = index.read_txn().unwrap();
        let fields_ids = index.displayed_fields(&rtxn).unwrap();
        assert_eq!(fields_ids, None);
-        drop(rtxn);
    }

    #[test]
@ -478,7 +511,96 @@ mod tests {
        // Only count the field_id 0 and level 0 facet values.
        let count = index.facet_field_id_value_docids.prefix_iter(&rtxn, &[0, 0]).unwrap().count();
        assert_eq!(count, 4);
-        drop(rtxn);
+    }
+
+    #[test]
+    fn default_stop_words() {
+        let path = tempfile::tempdir().unwrap();
+        let mut options = EnvOpenOptions::new();
+        options.map_size(10 * 1024 * 1024); // 10 MB
+        let index = Index::new(options, &path).unwrap();
+
+        // First we send 3 documents with ids from 1 to 3.
+        let mut wtxn = index.write_txn().unwrap();
+        let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..];
+        let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
+        builder.update_format(UpdateFormat::Csv);
+        builder.execute(content, |_, _| ()).unwrap();
+        wtxn.commit().unwrap();
+
+        // Ensure there is no stop_words by default
+        let rtxn = index.read_txn().unwrap();
+        let stop_words = index.stop_words(&rtxn).unwrap();
+        assert!(stop_words.is_none());
+    }
+
+    #[test]
+    fn set_and_reset_stop_words() {
+        let path = tempfile::tempdir().unwrap();
+        let mut options = EnvOpenOptions::new();
+        options.map_size(10 * 1024 * 1024); // 10 MB
+        let index = Index::new(options, &path).unwrap();
+
+        // First we send 3 documents with ids from 1 to 3.
+        let mut wtxn = index.write_txn().unwrap();
+        let content = &b"name,age,maxim\nkevin,23,I love dogs\nkevina,21,Doggos are the best\nbenoit,34,The crepes are really good\n"[..];
+        let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
+        builder.update_format(UpdateFormat::Csv);
+        builder.execute(content, |_, _| ()).unwrap();
+
+        // In the same transaction we provide some stop_words
+        let mut builder = Settings::new(&mut wtxn, &index, 0);
+        let set = btreeset!{ "i".to_string(), "the".to_string(), "are".to_string() };
+        builder.set_stop_words(set.clone());
+        builder.execute(|_, _| ()).unwrap();
+        wtxn.commit().unwrap();
+
+        // Ensure stop_words are effectively stored
+        let rtxn = index.read_txn().unwrap();
+        let stop_words = index.stop_words(&rtxn).unwrap();
+        assert!(stop_words.is_some()); // at this point the index should return something
+
+        let stop_words = stop_words.unwrap();
+        let expected = fst::Set::from_iter(&set).unwrap();
+        assert_eq!(stop_words.as_fst().as_bytes(), expected.as_fst().as_bytes());
+
+        // when we search for something that is a non prefix stop_words it should be ignored
+        let result = index.search(&rtxn).query("the ").execute().unwrap();
+        assert!(result.documents_ids.is_empty());
+        let result = index.search(&rtxn).query("i ").execute().unwrap();
+        assert!(result.documents_ids.is_empty());
+        let result = index.search(&rtxn).query("are ").execute().unwrap();
+        assert!(result.documents_ids.is_empty());
+
+        let result = index.search(&rtxn).query("dog").execute().unwrap();
+        assert_eq!(result.documents_ids.len(), 2); // we have two maxims talking about doggos
+        let result = index.search(&rtxn).query("benoît").execute().unwrap();
+        assert_eq!(result.documents_ids.len(), 1); // there is one benoit in our data
+
+        // now we'll reset the stop_words and ensure it's None
+        let mut wtxn = index.write_txn().unwrap();
+        let mut builder = Settings::new(&mut wtxn, &index, 0);
+        builder.reset_stop_words();
+        builder.execute(|_, _| ()).unwrap();
+        wtxn.commit().unwrap();
+
+        let rtxn = index.read_txn().unwrap();
+        let stop_words = index.stop_words(&rtxn).unwrap();
+        assert!(stop_words.is_none());
+
+        // now we can search for the stop words
+        let result = index.search(&rtxn).query("the").execute().unwrap();
+        assert_eq!(result.documents_ids.len(), 2);
+        let result = index.search(&rtxn).query("i").execute().unwrap();
+        assert_eq!(result.documents_ids.len(), 1);
+        let result = index.search(&rtxn).query("are").execute().unwrap();
+        assert_eq!(result.documents_ids.len(), 2);
+
+        // the rest of the search is still not impacted
+        let result = index.search(&rtxn).query("dog").execute().unwrap();
+        assert_eq!(result.documents_ids.len(), 2); // we have two maxims talking about doggos
+        let result = index.search(&rtxn).query("benoît").execute().unwrap();
+        assert_eq!(result.documents_ids.len(), 1); // there is one benoit in our data
    }

    #[test]
@ -519,6 +641,5 @@ mod tests {
        assert_eq!(&["hello"][..], index.displayed_fields(&rtxn).unwrap().unwrap());
        assert!(index.primary_key(&rtxn).unwrap().is_none());
        assert_eq!(vec![Criterion::Asc("toto".to_string())], index.criteria(&rtxn).unwrap());
-        drop(rtxn);
    }
 }