Merge pull request #232 from meilisearch/support-splitted-words

Support splitted words
Clément Renault 2019-10-23 13:38:16 +02:00 committed by GitHub
commit 93306c2326
2 changed files with 243 additions and 59 deletions
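
In short, the change teaches the query builder to split a single query word in two when both halves are known to the index (for example "searchengine" into "search" and "engine"), picking the split point whose rarer half has the highest postings-list frequency; the two halves are then grouped as a phrase query and only kept when they match at consecutive word positions of the same document attribute. Below is a minimal, self-contained sketch of that split heuristic, assuming a hypothetical frequency closure in place of the real postings-lists store and an illustrative best_split name (the function actually added by the diff is split_best_frequency):

// A minimal sketch of the split heuristic, assuming a hypothetical `frequency`
// closure in place of the real postings-lists lookup: the word is split at the
// position where both halves occur in the index, preferring the split whose
// rarer half is the most frequent.
fn best_split<'a>(word: &'a str, frequency: impl Fn(&str) -> usize) -> Option<(&'a str, &'a str)> {
    let mut best = None;
    for (i, _) in word.char_indices().skip(1) {
        let (left, right) = word.split_at(i);
        let min_freq = frequency(left).min(frequency(right));
        // only keep splits where both halves occur at least once
        if min_freq != 0 && best.map_or(true, |(old, _, _)| min_freq > old) {
            best = Some((min_freq, left, right));
        }
    }
    best.map(|(_, l, r)| (l, r))
}

fn main() {
    // hypothetical frequencies; only "search" and "engine" exist in the index
    let frequency = |w: &str| match w { "search" | "engine" => 2, _ => 0 };
    assert_eq!(best_split("searchengine", frequency), Some(("search", "engine")));
}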

View File

@@ -2,7 +2,7 @@ mod dfa;
 mod query_enhancer;
 
 use std::cmp::Reverse;
-use std::vec;
+use std::{cmp, vec};
 
 use fst::{IntoStreamer, Streamer};
 use levenshtein_automata::DFA;
@@ -18,7 +18,7 @@ use self::query_enhancer::QueryEnhancerBuilder;
 const NGRAMS: usize = 3;
 
 pub struct AutomatonProducer {
-    automatons: Vec<Vec<Automaton>>,
+    automatons: Vec<AutomatonGroup>,
 }
 
 impl AutomatonProducer {
@@ -26,19 +26,47 @@ impl AutomatonProducer {
         reader: &heed::RoTxn,
         query: &str,
         main_store: store::Main,
+        postings_list_store: store::PostingsLists,
         synonyms_store: store::Synonyms,
     ) -> MResult<(AutomatonProducer, QueryEnhancer)> {
-        let (automatons, query_enhancer) =
-            generate_automatons(reader, query, main_store, synonyms_store)?;
+        let (automatons, query_enhancer) = generate_automatons(
+            reader,
+            query,
+            main_store,
+            postings_list_store,
+            synonyms_store,
+        )?;
 
         Ok((AutomatonProducer { automatons }, query_enhancer))
     }
 
-    pub fn into_iter(self) -> vec::IntoIter<Vec<Automaton>> {
+    pub fn into_iter(self) -> vec::IntoIter<AutomatonGroup> {
        self.automatons.into_iter()
    }
 }
 
+#[derive(Debug)]
+pub struct AutomatonGroup {
+    pub is_phrase_query: bool,
+    pub automatons: Vec<Automaton>,
+}
+
+impl AutomatonGroup {
+    fn normal(automatons: Vec<Automaton>) -> AutomatonGroup {
+        AutomatonGroup {
+            is_phrase_query: false,
+            automatons,
+        }
+    }
+
+    fn phrase_query(automatons: Vec<Automaton>) -> AutomatonGroup {
+        AutomatonGroup {
+            is_phrase_query: true,
+            automatons,
+        }
+    }
+}
+
 #[derive(Debug)]
 pub struct Automaton {
     pub index: usize,
@@ -102,12 +130,41 @@ pub fn normalize_str(string: &str) -> String {
     string
 }
 
+fn split_best_frequency<'a>(
+    reader: &heed::RoTxn,
+    word: &'a str,
+    postings_lists_store: store::PostingsLists,
+) -> MResult<Option<(&'a str, &'a str)>> {
+    let chars = word.char_indices().skip(1);
+    let mut best = None;
+
+    for (i, _) in chars {
+        let (left, right) = word.split_at(i);
+
+        let left_freq = postings_lists_store
+            .postings_list(reader, left.as_ref())?
+            .map_or(0, |i| i.len());
+
+        let right_freq = postings_lists_store
+            .postings_list(reader, right.as_ref())?
+            .map_or(0, |i| i.len());
+
+        let min_freq = cmp::min(left_freq, right_freq);
+        if min_freq != 0 && best.map_or(true, |(old, _, _)| min_freq > old) {
+            best = Some((min_freq, left, right));
+        }
+    }
+
+    Ok(best.map(|(_, l, r)| (l, r)))
+}
+
 fn generate_automatons(
     reader: &heed::RoTxn,
     query: &str,
     main_store: store::Main,
+    postings_lists_store: store::PostingsLists,
     synonym_store: store::Synonyms,
-) -> MResult<(Vec<Vec<Automaton>>, QueryEnhancer)> {
+) -> MResult<(Vec<AutomatonGroup>, QueryEnhancer)> {
     let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
     let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect();
     let synonyms = match main_store.synonyms_fst(reader)? {
@@ -136,7 +193,7 @@ fn generate_automatons(
         original_automatons.push(automaton);
     }
 
-    automatons.push(original_automatons);
+    automatons.push(AutomatonGroup::normal(original_automatons));
 
     for n in 1..=NGRAMS {
         let mut ngrams = query_words.windows(n).enumerate().peekable();
@@ -188,13 +245,27 @@ fn generate_automatons(
                                 Automaton::non_exact(automaton_index, n, synonym)
                             };
                             automaton_index += 1;
-                            automatons.push(vec![automaton]);
+                            automatons.push(AutomatonGroup::normal(vec![automaton]));
                         }
                     }
                 }
             }
 
-            if n != 1 {
+            if n == 1 {
+                if let Some((left, right)) =
+                    split_best_frequency(reader, &normalized, postings_lists_store)?
+                {
+                    let a = Automaton::exact(automaton_index, 1, left);
+                    enhancer_builder.declare(query_range.clone(), automaton_index, &[left]);
+                    automaton_index += 1;
+
+                    let b = Automaton::exact(automaton_index, 1, right);
+                    enhancer_builder.declare(query_range.clone(), automaton_index, &[left]);
+                    automaton_index += 1;
+
+                    automatons.push(AutomatonGroup::phrase_query(vec![a, b]));
+                }
+            } else {
                 // automaton of concatenation of query words
                 let concat = ngram_slice.concat();
                 let normalized = normalize_str(&concat);
@@ -204,15 +275,15 @@ fn generate_automatons(
                 let automaton = Automaton::exact(automaton_index, n, &normalized);
                 automaton_index += 1;
-                automatons.push(vec![automaton]);
+                automatons.push(AutomatonGroup::normal(vec![automaton]));
             }
         }
     }
 
     // order automatons, the most important first,
     // we keep the original automatons at the front.
-    automatons[1..].sort_by_key(|a| {
-        let a = a.first().unwrap();
+    automatons[1..].sort_by_key(|group| {
+        let a = group.automatons.first().unwrap();
         (Reverse(a.is_exact), a.ngram)
     });

View File

@@ -8,7 +8,7 @@ use fst::{IntoStreamer, Streamer};
 use sdset::SetBuf;
 use slice_group_by::{GroupBy, GroupByMut};
 
-use crate::automaton::{Automaton, AutomatonProducer, QueryEnhancer};
+use crate::automaton::{Automaton, AutomatonGroup, AutomatonProducer, QueryEnhancer};
 use crate::distinct_map::{BufferedDistinctMap, DistinctMap};
 use crate::raw_document::{raw_documents_from, RawDocument};
 use crate::{criterion::Criteria, Document, DocumentId, Highlight, TmpMatch};
@@ -138,7 +138,7 @@ fn multiword_rewrite_matches(
 fn fetch_raw_documents(
     reader: &heed::RoTxn,
-    automatons: &[Automaton],
+    automatons_groups: &[AutomatonGroup],
     query_enhancer: &QueryEnhancer,
     searchables: Option<&ReorderedAttrs>,
     main_store: store::Main,
@@ -148,55 +148,94 @@ fn fetch_raw_documents(
     let mut matches = Vec::new();
     let mut highlights = Vec::new();
 
-    for automaton in automatons {
-        let Automaton {
-            index,
-            is_exact,
-            query_len,
-            ..
-        } = automaton;
-        let dfa = automaton.dfa();
-
-        let words = match main_store.words_fst(reader)? {
-            Some(words) => words,
-            None => return Ok(Vec::new()),
-        };
-
-        let mut stream = words.search(&dfa).into_stream();
-        while let Some(input) = stream.next() {
-            let distance = dfa.eval(input).to_u8();
-            let is_exact = *is_exact && distance == 0 && input.len() == *query_len;
-
-            let doc_indexes = match postings_lists_store.postings_list(reader, input)? {
-                Some(doc_indexes) => doc_indexes,
-                None => continue,
-            };
-
-            matches.reserve(doc_indexes.len());
-            highlights.reserve(doc_indexes.len());
-
-            for di in doc_indexes.as_ref() {
-                let attribute = searchables.map_or(Some(di.attribute), |r| r.get(di.attribute));
-                if let Some(attribute) = attribute {
-                    let match_ = TmpMatch {
-                        query_index: *index as u32,
-                        distance,
-                        attribute,
-                        word_index: di.word_index,
-                        is_exact,
-                    };
-
-                    let highlight = Highlight {
-                        attribute: di.attribute,
-                        char_index: di.char_index,
-                        char_length: di.char_length,
-                    };
-
-                    matches.push((di.document_id, match_));
-                    highlights.push((di.document_id, highlight));
-                }
-            }
-        }
-    }
+    for group in automatons_groups {
+        let AutomatonGroup {
+            is_phrase_query,
+            automatons,
+        } = group;
+        let phrase_query_len = automatons.len();
+
+        let mut tmp_matches = Vec::new();
+        for (id, automaton) in automatons.into_iter().enumerate() {
+            let Automaton {
+                index,
+                is_exact,
+                query_len,
+                ..
+            } = automaton;
+            let dfa = automaton.dfa();
+
+            let words = match main_store.words_fst(reader)? {
+                Some(words) => words,
+                None => return Ok(Vec::new()),
+            };
+
+            let mut stream = words.search(&dfa).into_stream();
+            while let Some(input) = stream.next() {
+                let distance = dfa.eval(input).to_u8();
+                let is_exact = *is_exact && distance == 0 && input.len() == *query_len;
+
+                let doc_indexes = match postings_lists_store.postings_list(reader, input)? {
+                    Some(doc_indexes) => doc_indexes,
+                    None => continue,
+                };
+
+                tmp_matches.reserve(doc_indexes.len());
+
+                for di in doc_indexes.as_ref() {
+                    let attribute = searchables.map_or(Some(di.attribute), |r| r.get(di.attribute));
+                    if let Some(attribute) = attribute {
+                        let match_ = TmpMatch {
+                            query_index: *index as u32,
+                            distance,
+                            attribute,
+                            word_index: di.word_index,
+                            is_exact,
+                        };
+
+                        let highlight = Highlight {
+                            attribute: di.attribute,
+                            char_index: di.char_index,
+                            char_length: di.char_length,
+                        };
+
+                        tmp_matches.push((di.document_id, id, match_, highlight));
+                    }
+                }
+            }
+        }
+
+        if *is_phrase_query {
+            tmp_matches.sort_unstable_by_key(|(id, _, m, _)| (*id, m.attribute, m.word_index));
+            for group in tmp_matches.linear_group_by_key(|(id, _, m, _)| (*id, m.attribute)) {
+                for window in group.windows(2) {
+                    let (ida, ia, ma, ha) = window[0];
+                    let (idb, ib, mb, hb) = window[1];
+                    debug_assert_eq!(ida, idb);
+
+                    // if matches must follow and actually follows themselves
+                    if ia + 1 == ib && ma.word_index + 1 == mb.word_index {
+                        // TODO we must make it work for phrase query longer than 2
+                        // if the second match is the last phrase query word
+                        if ib + 1 == phrase_query_len {
+                            // insert first match
+                            matches.push((ida, ma));
+                            highlights.push((ida, ha));
+
+                            // insert second match
+                            matches.push((idb, mb));
+                            highlights.push((idb, hb));
+                        }
+                    }
+                }
+            }
+        } else {
+            for (id, _, match_, highlight) in tmp_matches {
+                matches.push((id, match_));
+                highlights.push((id, highlight));
            }
        }
    }
 
     let matches = multiword_rewrite_matches(matches, &query_enhancer);
@@ -367,15 +406,20 @@ where
     let start_processing = Instant::now();
     let mut raw_documents_processed = Vec::with_capacity(range.len());
 
-    let (automaton_producer, query_enhancer) =
-        AutomatonProducer::new(reader, query, main_store, synonyms_store)?;
+    let (automaton_producer, query_enhancer) = AutomatonProducer::new(
+        reader,
+        query,
+        main_store,
+        postings_lists_store,
+        synonyms_store,
+    )?;
 
     let automaton_producer = automaton_producer.into_iter();
     let mut automatons = Vec::new();
 
     // aggregate automatons groups by groups after time
     for auts in automaton_producer {
-        automatons.extend(auts);
+        automatons.push(auts);
 
         // we must retrieve the documents associated
         // with the current automatons
@@ -480,15 +524,20 @@ where
     let start_processing = Instant::now();
     let mut raw_documents_processed = Vec::new();
 
-    let (automaton_producer, query_enhancer) =
-        AutomatonProducer::new(reader, query, main_store, synonyms_store)?;
+    let (automaton_producer, query_enhancer) = AutomatonProducer::new(
+        reader,
+        query,
+        main_store,
+        postings_lists_store,
+        synonyms_store,
+    )?;
 
     let automaton_producer = automaton_producer.into_iter();
     let mut automatons = Vec::new();
 
     // aggregate automatons groups by groups after time
     for auts in automaton_producer {
-        automatons.extend(auts);
+        automatons.push(auts);
 
         // we must retrieve the documents associated
         // with the current automatons
@@ -1697,4 +1746,68 @@ mod tests {
         });
         assert_matches!(iter.next(), None);
     }
+
+    #[test]
+    fn simple_phrase_query_splitting() {
+        let store = TempDatabase::from_iter(vec![
+            ("search", &[doc_index(0, 0)][..]),
+            ("engine", &[doc_index(0, 1)][..]),
+
+            ("search", &[doc_index(1, 0)][..]),
+            ("slow", &[doc_index(1, 1)][..]),
+            ("engine", &[doc_index(1, 2)][..]),
+        ]);
+
+        let env = &store.database.env;
+        let reader = env.read_txn().unwrap();
+
+        let builder = store.query_builder();
+        let results = builder.query(&reader, "searchengine", 0..20).unwrap();
+        let mut iter = results.into_iter();
+
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
+            let mut iter = matches.into_iter();
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, distance: 0, .. })); // search
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 1, distance: 0, .. })); // engine
+            assert_matches!(iter.next(), None);
+        });
+
+        assert_matches!(iter.next(), None);
+    }
+
+    #[test]
+    fn harder_phrase_query_splitting() {
+        let store = TempDatabase::from_iter(vec![
+            ("search", &[doc_index(0, 0)][..]),
+            ("search", &[doc_index(0, 1)][..]),
+            ("engine", &[doc_index(0, 2)][..]),
+
+            ("search", &[doc_index(1, 0)][..]),
+            ("slow", &[doc_index(1, 1)][..]),
+            ("search", &[doc_index(1, 2)][..]),
+            ("engine", &[doc_index(1, 3)][..]),
+
+            ("search", &[doc_index(1, 0)][..]),
+            ("search", &[doc_index(1, 1)][..]),
+            ("slow", &[doc_index(1, 2)][..]),
+            ("engine", &[doc_index(1, 3)][..]),
+        ]);
+
+        let env = &store.database.env;
+        let reader = env.read_txn().unwrap();
+
+        let builder = store.query_builder();
+        let results = builder.query(&reader, "searchengine", 0..20).unwrap();
+        let mut iter = results.into_iter();
+
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
+            let mut iter = matches.into_iter();
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 1, distance: 0, .. })); // search
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 2, distance: 0, .. })); // engine
+            assert_matches!(iter.next(), None);
+        });
+
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => {
+            let mut iter = matches.into_iter();
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 2, distance: 0, .. })); // search
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 3, distance: 0, .. })); // engine
+            assert_matches!(iter.next(), None);
        });
+
+        assert_matches!(iter.next(), None);
+    }
 }
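
For reference, here is a distilled, self-contained sketch of the phrase-query adjacency check performed in fetch_raw_documents above, with simplified types (attributes omitted) and illustrative names (Candidate and keep_phrase_pairs are not part of the crate): matches for the two split halves are kept only when they come from consecutive automatons of the group and sit at consecutive word positions in the same document.

// A distilled sketch of the adjacency filter, not the crate's actual API:
// attributes are omitted and all names here are illustrative only.
#[derive(Clone, Copy, Debug)]
struct Candidate {
    document_id: u64,
    automaton_id: usize, // position of the automaton inside the phrase group
    word_index: u16,     // position of the matched word inside the document
}

fn keep_phrase_pairs(mut candidates: Vec<Candidate>, phrase_query_len: usize) -> Vec<Candidate> {
    let mut kept = Vec::new();
    candidates.sort_unstable_by_key(|c| (c.document_id, c.word_index));
    for window in candidates.windows(2) {
        let (a, b) = (window[0], window[1]);
        // the halves must follow each other both in the group and in the document,
        // and the second half must be the last word of the phrase query
        if a.document_id == b.document_id
            && a.automaton_id + 1 == b.automaton_id
            && a.word_index + 1 == b.word_index
            && b.automaton_id + 1 == phrase_query_len
        {
            kept.push(a);
            kept.push(b);
        }
    }
    kept
}

fn main() {
    // document 0 contains "search engine", document 1 contains "search slow engine"
    let candidates = vec![
        Candidate { document_id: 0, automaton_id: 0, word_index: 0 }, // "search"
        Candidate { document_id: 0, automaton_id: 1, word_index: 1 }, // "engine"
        Candidate { document_id: 1, automaton_id: 0, word_index: 0 }, // "search"
        Candidate { document_id: 1, automaton_id: 1, word_index: 2 }, // "engine"
    ];
    // only document 0 keeps its pair: "search" and "engine" are adjacent there
    let kept = keep_phrase_pairs(candidates, 2);
    assert!(kept.len() == 2 && kept.iter().all(|c| c.document_id == 0));
}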