Enhance word splitting strategy

2025-02-20 17:45:54 +08:00 · 2022-10-12 13:18:23 +05:30 · 2022-10-12 13:18:23 +05:30 · 7f9680f0a0
commit 7f9680f0a0
parent 358aa337ea
1 changed files with 59 additions and 20 deletions
--- a/milli/src/search/query_tree.rs
+++ b/milli/src/search/query_tree.rs
@ -1,6 +1,6 @@
 use std::borrow::Cow;
 use std::cmp::max;
-use std::{cmp, fmt, mem};
+use std::{fmt, mem};

 use charabia::classifier::ClassifiedTokenIter;
 use charabia::{SeparatorKind, TokenKind};
@ -10,7 +10,7 @@ use slice_group_by::GroupBy;

 use crate::search::matches::matching_words::{MatchingWord, PrimitiveWordId};
 use crate::search::TermsMatchingStrategy;
-use crate::{Index, MatchingWords, Result};
+use crate::{CboRoaringBitmapLenCodec, Index, MatchingWords, Result};

 type IsOptionalWord = bool;
 type IsPrefix = bool;
@ -146,6 +146,7 @@ impl fmt::Debug for Query {

 trait Context {
    fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
+    fn word_pair_proximity_docids(&self, right_word: &str, left_word: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>>;
    fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>>;
    fn word_documents_count(&self, word: &str) -> heed::Result<Option<u64>> {
        match self.word_docids(word)? {
@ -156,6 +157,12 @@ trait Context {
    /// Returns the minimum word len for 1 and 2 typos.
    fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)>;
    fn exact_words(&self) -> Option<&fst::Set<Cow<[u8]>>>;
+    fn word_pair_frequency(&self, left_word: &str, right_word: &str, proximity: u8) -> heed::Result<Option<u64>> {
+        match self.word_pair_proximity_docids(right_word, left_word, proximity)? {
+            Some(rb) => Ok(Some(rb.len())),
+            None => Ok(None),
+        }
+    }
 }

 /// The query tree builder is the interface to build a query tree.
@ -173,6 +180,10 @@ impl<'a> Context for QueryTreeBuilder<'a> {
        self.index.word_docids.get(self.rtxn, word)
    }

+    fn word_pair_proximity_docids(&self, right_word: &str, left_word: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>> {
+        self.index.word_pair_proximity_docids.get(self.rtxn, &(left_word, right_word, proximity))
+    }
+
    fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>> {
        self.index.words_synonyms(self.rtxn, words)
    }
@ -190,6 +201,11 @@ impl<'a> Context for QueryTreeBuilder<'a> {
    fn exact_words(&self) -> Option<&fst::Set<Cow<[u8]>>> {
        self.exact_words.as_ref()
    }
+
+    fn word_pair_frequency(&self, left_word: &str, right_word: &str, proximity: u8) -> heed::Result<Option<u64>> {
+        let key = (left_word, right_word, proximity);
+        self.index.word_pair_proximity_docids.remap_data_type::<CboRoaringBitmapLenCodec>().get(&self.rtxn, &key)
+    }
 }

 impl<'a> QueryTreeBuilder<'a> {
@ -274,12 +290,10 @@ fn split_best_frequency<'a>(
    for (i, _) in chars {
        let (left, right) = word.split_at(i);

-        let left_freq = ctx.word_documents_count(left)?.unwrap_or(0);
-        let right_freq = ctx.word_documents_count(right)?.unwrap_or(0);
+        let pair_freq = ctx.word_pair_frequency(left, right, 1)?.unwrap_or(0);

-        let min_freq = cmp::min(left_freq, right_freq);
-        if min_freq != 0 && best.map_or(true, |(old, _, _)| min_freq > old) {
-            best = Some((min_freq, left, right));
+        if pair_freq != 0 && best.map_or(true, |(old, _, _)| pair_freq > old) {
+            best = Some((pair_freq, left, right));
        }
    }

@ -824,6 +838,11 @@ mod test {
            Ok(self.postings.get(word).cloned())
        }

+        fn word_pair_proximity_docids(&self, right_word: &str, left_word: &str, _: u8) -> heed::Result<Option<RoaringBitmap>> {
+            let bitmap = self.postings.get(&format!("{} {}", left_word, right_word));
+            Ok(bitmap.cloned())
+        }
+
        fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>> {
            let words: Vec<_> = words.iter().map(|s| s.as_ref().to_owned()).collect();
            Ok(self.synonyms.get(&words).cloned())
@ -881,19 +900,22 @@ mod test {
                    ],
                },
                postings: hashmap! {
-                    String::from("hello")      => random_postings(rng,   1500),
-                    String::from("hi")         => random_postings(rng,   4000),
-                    String::from("word")       => random_postings(rng,   2500),
-                    String::from("split")      => random_postings(rng,    400),
-                    String::from("ngrams")     => random_postings(rng,   1400),
-                    String::from("world")      => random_postings(rng, 15_000),
-                    String::from("earth")      => random_postings(rng,   8000),
-                    String::from("2021")       => random_postings(rng,    100),
-                    String::from("2020")       => random_postings(rng,    500),
-                    String::from("is")         => random_postings(rng, 50_000),
-                    String::from("this")       => random_postings(rng, 50_000),
-                    String::from("good")       => random_postings(rng,   1250),
-                    String::from("morning")    => random_postings(rng,    125),
+                    String::from("hello")           => random_postings(rng,   1500),
+                    String::from("hi")              => random_postings(rng,   4000),
+                    String::from("word")            => random_postings(rng,   2500),
+                    String::from("split")           => random_postings(rng,    400),
+                    String::from("ngrams")          => random_postings(rng,   1400),
+                    String::from("world")           => random_postings(rng, 15_000),
+                    String::from("earth")           => random_postings(rng,   8000),
+                    String::from("2021")            => random_postings(rng,    100),
+                    String::from("2020")            => random_postings(rng,    500),
+                    String::from("is")              => random_postings(rng, 50_000),
+                    String::from("this")            => random_postings(rng, 50_000),
+                    String::from("good")            => random_postings(rng,   1250),
+                    String::from("morning")         => random_postings(rng,    125),
+                    String::from("word split")      => random_postings(rng,   5000),
+                    String::from("quick brownfox")  => random_postings(rng,   7000),
+                    String::from("quickbrown fox")  => random_postings(rng,   8000),
                },
                exact_words,
            }
@ -1041,6 +1063,23 @@ mod test {
        "###);
    }

+    #[test]
+    fn word_split_choose_pair_with_max_freq() {
+        let query = "quickbrownfox";
+        let tokens = query.tokenize();
+
+        let (query_tree, _) = TestContext::default()
+            .build(TermsMatchingStrategy::All, true, None, tokens)
+            .unwrap()
+            .unwrap();
+
+        insta::assert_debug_snapshot!(query_tree, @r###"
+        OR
+          PHRASE ["quickbrown", "fox"]
+          PrefixTolerant { word: "quickbrownfox", max typo: 2 }
+        "###);
+    }
+
    #[test]
    fn phrase() {
        let query = "\"hey friends\" \" \" \"wooop";