Make the RawIndexer support stop words

2025-01-18 08:48:32 +08:00 · 2019-10-29 15:53:45 +01:00 · 2019-10-29 15:53:45 +01:00 · ff7dde7522
commit ff7dde7522
parent a226fd23c3
2 changed files with 28 additions and 13 deletions
--- a/meilidb-core/src/raw_indexer.rs
+++ b/meilidb-core/src/raw_indexer.rs
@@ -11,6 +11,7 @@ type Word = Vec<u8>; // TODO make it be a SmallVec

 pub struct RawIndexer {
    word_limit: usize, // the maximum number of indexed words
+    stop_words: fst::Set,
    words_doc_indexes: BTreeMap<Word, Vec<DocIndex>>,
    docs_words: HashMap<DocumentId, Vec<Word>>,
 }
@@ -21,13 +22,14 @@ pub struct Indexed {
 }

 impl RawIndexer {
-    pub fn new() -> RawIndexer {
-        RawIndexer::with_word_limit(1000)
+    pub fn new(stop_words: fst::Set) -> RawIndexer {
+        RawIndexer::with_word_limit(stop_words, 1000)
    }

-    pub fn with_word_limit(limit: usize) -> RawIndexer {
+    pub fn with_word_limit(stop_words: fst::Set, limit: usize) -> RawIndexer {
        RawIndexer {
            word_limit: limit,
+            stop_words,
            words_doc_indexes: BTreeMap::new(),
            docs_words: HashMap::new(),
        }
@@ -56,6 +58,7 @@ impl RawIndexer {
                    id,
                    attr,
                    self.word_limit,
+                    &self.stop_words,
                    &mut self.words_doc_indexes,
                    &mut self.docs_words,
                );
@@ -87,6 +90,7 @@ impl RawIndexer {
                id,
                attr,
                self.word_limit,
+                &self.stop_words,
                &mut self.words_doc_indexes,
                &mut self.docs_words,
            );
@@ -118,6 +122,7 @@ impl RawIndexer {
                id,
                attr,
                self.word_limit,
+                &self.stop_words,
                &mut self.words_doc_indexes,
                &mut self.docs_words,
            );
@@ -152,17 +157,12 @@ impl RawIndexer {
    }
 }

-impl Default for RawIndexer {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
 fn index_token(
    token: Token,
    id: DocumentId,
    attr: SchemaAttr,
    word_limit: usize,
+    stop_words: &fst::Set,
    words_doc_indexes: &mut BTreeMap<Word, Vec<DocIndex>>,
    docs_words: &mut HashMap<DocumentId, Vec<Word>>,
 ) -> bool {
@@ -170,6 +170,10 @@ fn index_token(
        return false;
    }

+    if stop_words.contains(&token.word) {
+        return false;
+    }
+
    match token_to_docindex(id, attr, token) {
        Some(docindex) => {
            let word = Vec::from(token.word);
@@ -207,7 +211,7 @@ mod tests {

    #[test]
    fn strange_apostrophe() {
-        let mut indexer = RawIndexer::new();
+        let mut indexer = RawIndexer::new(fst::Set::default());

        let docid = DocumentId(0);
        let attr = SchemaAttr(0);
@@ -231,7 +235,7 @@ mod tests {

    #[test]
    fn strange_apostrophe_in_sequence() {
-        let mut indexer = RawIndexer::new();
+        let mut indexer = RawIndexer::new(fst::Set::default());

        let docid = DocumentId(0);
        let attr = SchemaAttr(0);
--- a/meilidb-core/src/update/documents_addition.rs
+++ b/meilidb-core/src/update/documents_addition.rs
@@ -87,7 +87,6 @@ pub fn apply_documents_addition(
    addition: Vec<serde_json::Value>,
 ) -> MResult<()> {
    let mut documents_additions = HashMap::new();
-    let mut indexer = RawIndexer::new();

    let schema = match main_store.schema(writer)? {
        Some(schema) => schema,
@@ -124,7 +123,14 @@ pub fn apply_documents_addition(
        None => RankedMap::default(),
    };

+    let stop_words = match main_store.stop_words_fst(writer)? {
+        Some(stop_words) => stop_words,
+        None => fst::Set::default(),
+    };
+
    // 3. index the documents fields in the stores
+    let mut indexer = RawIndexer::new(stop_words);
+
    for (document_id, document) in documents_additions {
        let serializer = Serializer {
            txn: writer,
@@ -180,8 +186,13 @@ pub fn reindex_all_documents(
    postings_lists_store.clear(writer)?;
    docs_words_store.clear(writer)?;

+    let stop_words = match main_store.stop_words_fst(writer)? {
+        Some(stop_words) => stop_words,
+        None => fst::Set::default(),
+    };
+
    // 3. re-index one document by one document (otherwise we make the borrow checker unhappy)
-    let mut indexer = RawIndexer::new();
+    let mut indexer = RawIndexer::new(stop_words);
    let mut ram_store = HashMap::new();

    for document_id in documents_ids_to_reindex {