From 7f9680f0a0e2c5113ec29500066af2d47c3275f8 Mon Sep 17 00:00:00 2001
From: Akshay Kulkarni <akshayk.gj@gmail.com>
Date: Wed, 12 Oct 2022 13:18:23 +0530
Subject: [PATCH 1/8] Enhance word splitting strategy

---
 milli/src/search/query_tree.rs | 79 +++++++++++++++++++++++++---------
 1 file changed, 59 insertions(+), 20 deletions(-)
 mode change 100644 => 100755 milli/src/search/query_tree.rs
diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs
old mode 100644
new mode 100755
index 1c60e41f7..b54fc5c9c
--- a/milli/src/search/query_tree.rs
+++ b/milli/src/search/query_tree.rs
@@ -1,6 +1,6 @@
 use std::borrow::Cow;
 use std::cmp::max;
-use std::{cmp, fmt, mem};
+use std::{fmt, mem};
 
 use charabia::classifier::ClassifiedTokenIter;
 use charabia::{SeparatorKind, TokenKind};
@@ -10,7 +10,7 @@ use slice_group_by::GroupBy;
 
 use crate::search::matches::matching_words::{MatchingWord, PrimitiveWordId};
 use crate::search::TermsMatchingStrategy;
-use crate::{Index, MatchingWords, Result};
+use crate::{CboRoaringBitmapLenCodec, Index, MatchingWords, Result};
 
 type IsOptionalWord = bool;
 type IsPrefix = bool;
@@ -146,6 +146,7 @@ impl fmt::Debug for Query {
 
 trait Context {
     fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
+    fn word_pair_proximity_docids(&self, right_word: &str, left_word: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>>;
     fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>>;
     fn word_documents_count(&self, word: &str) -> heed::Result<Option<u64>> {
         match self.word_docids(word)? {
@@ -156,6 +157,12 @@ trait Context {
     /// Returns the minimum word len for 1 and 2 typos.
     fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)>;
     fn exact_words(&self) -> Option<&fst::Set<Cow<[u8]>>>;
+    fn word_pair_frequency(&self, left_word: &str, right_word: &str, proximity: u8) -> heed::Result<Option<u64>> {
+        match self.word_pair_proximity_docids(right_word, left_word, proximity)? {
+            Some(rb) => Ok(Some(rb.len())),
+            None => Ok(None),
+        }
+    }
 }
 
 /// The query tree builder is the interface to build a query tree.
@@ -173,6 +180,10 @@ impl<'a> Context for QueryTreeBuilder<'a> {
         self.index.word_docids.get(self.rtxn, word)
     }
 
+    fn word_pair_proximity_docids(&self, right_word: &str, left_word: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>> {
+        self.index.word_pair_proximity_docids.get(self.rtxn, &(left_word, right_word, proximity))
+    }
+
     fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>> {
         self.index.words_synonyms(self.rtxn, words)
     }
@@ -190,6 +201,11 @@ impl<'a> Context for QueryTreeBuilder<'a> {
     fn exact_words(&self) -> Option<&fst::Set<Cow<[u8]>>> {
         self.exact_words.as_ref()
     }
+
+    fn word_pair_frequency(&self, left_word: &str, right_word: &str, proximity: u8) -> heed::Result<Option<u64>> {
+        let key = (left_word, right_word, proximity);
+        self.index.word_pair_proximity_docids.remap_data_type::<CboRoaringBitmapLenCodec>().get(&self.rtxn, &key)
+    }
 }
 
 impl<'a> QueryTreeBuilder<'a> {
@@ -274,12 +290,10 @@ fn split_best_frequency<'a>(
     for (i, _) in chars {
         let (left, right) = word.split_at(i);
 
-        let left_freq = ctx.word_documents_count(left)?.unwrap_or(0);
-        let right_freq = ctx.word_documents_count(right)?.unwrap_or(0);
+        let pair_freq = ctx.word_pair_frequency(left, right, 1)?.unwrap_or(0);
 
-        let min_freq = cmp::min(left_freq, right_freq);
-        if min_freq != 0 && best.map_or(true, |(old, _, _)| min_freq > old) {
-            best = Some((min_freq, left, right));
+        if pair_freq != 0 && best.map_or(true, |(old, _, _)| pair_freq > old) {
+            best = Some((pair_freq, left, right));
         }
     }
 
@@ -824,6 +838,11 @@ mod test {
             Ok(self.postings.get(word).cloned())
         }
 
+        fn word_pair_proximity_docids(&self, right_word: &str, left_word: &str, _: u8) -> heed::Result<Option<RoaringBitmap>> {
+            let bitmap = self.postings.get(&format!("{} {}", left_word, right_word));
+            Ok(bitmap.cloned())
+        }
+
         fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>> {
             let words: Vec<_> = words.iter().map(|s| s.as_ref().to_owned()).collect();
             Ok(self.synonyms.get(&words).cloned())
@@ -881,19 +900,22 @@ mod test {
                     ],
                 },
                 postings: hashmap! {
-                    String::from("hello")      => random_postings(rng,   1500),
-                    String::from("hi")         => random_postings(rng,   4000),
-                    String::from("word")       => random_postings(rng,   2500),
-                    String::from("split")      => random_postings(rng,    400),
-                    String::from("ngrams")     => random_postings(rng,   1400),
-                    String::from("world")      => random_postings(rng, 15_000),
-                    String::from("earth")      => random_postings(rng,   8000),
-                    String::from("2021")       => random_postings(rng,    100),
-                    String::from("2020")       => random_postings(rng,    500),
-                    String::from("is")         => random_postings(rng, 50_000),
-                    String::from("this")       => random_postings(rng, 50_000),
-                    String::from("good")       => random_postings(rng,   1250),
-                    String::from("morning")    => random_postings(rng,    125),
+                    String::from("hello")           => random_postings(rng,   1500),
+                    String::from("hi")              => random_postings(rng,   4000),
+                    String::from("word")            => random_postings(rng,   2500),
+                    String::from("split")           => random_postings(rng,    400),
+                    String::from("ngrams")          => random_postings(rng,   1400),
+                    String::from("world")           => random_postings(rng, 15_000),
+                    String::from("earth")           => random_postings(rng,   8000),
+                    String::from("2021")            => random_postings(rng,    100),
+                    String::from("2020")            => random_postings(rng,    500),
+                    String::from("is")              => random_postings(rng, 50_000),
+                    String::from("this")            => random_postings(rng, 50_000),
+                    String::from("good")            => random_postings(rng,   1250),
+                    String::from("morning")         => random_postings(rng,    125),
+                    String::from("word split")      => random_postings(rng,   5000),
+                    String::from("quick brownfox")  => random_postings(rng,   7000),
+                    String::from("quickbrown fox")  => random_postings(rng,   8000),
                 },
                 exact_words,
             }
@@ -1041,6 +1063,23 @@ mod test {
         "###);
     }
 
+    #[test]
+    fn word_split_choose_pair_with_max_freq() {
+        let query = "quickbrownfox";
+        let tokens = query.tokenize();
+
+        let (query_tree, _) = TestContext::default()
+            .build(TermsMatchingStrategy::All, true, None, tokens)
+            .unwrap()
+            .unwrap();
+
+        insta::assert_debug_snapshot!(query_tree, @r###"
+        OR
+          PHRASE ["quickbrown", "fox"]
+          PrefixTolerant { word: "quickbrownfox", max typo: 2 }
+        "###);
+    }
+
     #[test]
     fn phrase() {
         let query = "\"hey friends\" \" \" \"wooop";

From 63e79a9039feaa1caa117d2aea3275bbbd722ed0 Mon Sep 17 00:00:00 2001
From: Akshay Kulkarni <akshayk.gj@gmail.com>
Date: Wed, 12 Oct 2022 13:36:48 +0530
Subject: [PATCH 2/8] update comment

---
 milli/src/search/query_tree.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs
index b54fc5c9c..70227b8f9 100755
--- a/milli/src/search/query_tree.rs
+++ b/milli/src/search/query_tree.rs
@@ -279,7 +279,7 @@ impl<'a> QueryTreeBuilder<'a> {
     }
 }
 
-/// Split the word depending on the frequency of subwords in the database documents.
+/// Split the word depending on the frequency of pairs near together in the database documents.
 fn split_best_frequency<'a>(
     ctx: &impl Context,
     word: &'a str,

From 8c9245149e32468e57183066536389567db318ca Mon Sep 17 00:00:00 2001
From: Akshay Kulkarni <akshayk.gj@gmail.com>
Date: Wed, 12 Oct 2022 15:27:56 +0530
Subject: [PATCH 3/8] format file

---
 milli/src/search/query_tree.rs | 40 +++++++++++++++++++++++++++++-----
 1 file changed, 34 insertions(+), 6 deletions(-)

diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs
index 70227b8f9..43d903d16 100755
--- a/milli/src/search/query_tree.rs
+++ b/milli/src/search/query_tree.rs
@@ -146,7 +146,12 @@ impl fmt::Debug for Query {
 
 trait Context {
     fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
-    fn word_pair_proximity_docids(&self, right_word: &str, left_word: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>>;
+    fn word_pair_proximity_docids(
+        &self,
+        right_word: &str,
+        left_word: &str,
+        proximity: u8,
+    ) -> heed::Result<Option<RoaringBitmap>>;
     fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>>;
     fn word_documents_count(&self, word: &str) -> heed::Result<Option<u64>> {
         match self.word_docids(word)? {
@@ -157,7 +162,12 @@ trait Context {
     /// Returns the minimum word len for 1 and 2 typos.
     fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)>;
     fn exact_words(&self) -> Option<&fst::Set<Cow<[u8]>>>;
-    fn word_pair_frequency(&self, left_word: &str, right_word: &str, proximity: u8) -> heed::Result<Option<u64>> {
+    fn word_pair_frequency(
+        &self,
+        left_word: &str,
+        right_word: &str,
+        proximity: u8,
+    ) -> heed::Result<Option<u64>> {
         match self.word_pair_proximity_docids(right_word, left_word, proximity)? {
             Some(rb) => Ok(Some(rb.len())),
             None => Ok(None),
@@ -180,7 +190,12 @@ impl<'a> Context for QueryTreeBuilder<'a> {
         self.index.word_docids.get(self.rtxn, word)
     }
 
-    fn word_pair_proximity_docids(&self, right_word: &str, left_word: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>> {
+    fn word_pair_proximity_docids(
+        &self,
+        right_word: &str,
+        left_word: &str,
+        proximity: u8,
+    ) -> heed::Result<Option<RoaringBitmap>> {
         self.index.word_pair_proximity_docids.get(self.rtxn, &(left_word, right_word, proximity))
     }
 
@@ -202,9 +217,17 @@ impl<'a> Context for QueryTreeBuilder<'a> {
         self.exact_words.as_ref()
     }
 
-    fn word_pair_frequency(&self, left_word: &str, right_word: &str, proximity: u8) -> heed::Result<Option<u64>> {
+    fn word_pair_frequency(
+        &self,
+        left_word: &str,
+        right_word: &str,
+        proximity: u8,
+    ) -> heed::Result<Option<u64>> {
         let key = (left_word, right_word, proximity);
-        self.index.word_pair_proximity_docids.remap_data_type::<CboRoaringBitmapLenCodec>().get(&self.rtxn, &key)
+        self.index
+            .word_pair_proximity_docids
+            .remap_data_type::<CboRoaringBitmapLenCodec>()
+            .get(&self.rtxn, &key)
     }
 }
 
@@ -838,7 +861,12 @@ mod test {
             Ok(self.postings.get(word).cloned())
         }
 
-        fn word_pair_proximity_docids(&self, right_word: &str, left_word: &str, _: u8) -> heed::Result<Option<RoaringBitmap>> {
+        fn word_pair_proximity_docids(
+            &self,
+            right_word: &str,
+            left_word: &str,
+            _: u8,
+        ) -> heed::Result<Option<RoaringBitmap>> {
             let bitmap = self.postings.get(&format!("{} {}", left_word, right_word));
             Ok(bitmap.cloned())
         }

From 6cb8b46900492dd23b229e33604263839a32f06e Mon Sep 17 00:00:00 2001
From: Akshay Kulkarni <akshayk.gj@gmail.com>
Date: Thu, 13 Oct 2022 12:43:11 +0530
Subject: [PATCH 4/8] use word_pair_frequency and remove word_documents_count

---
 milli/src/search/query_tree.rs | 50 ++++++----------------------------
 1 file changed, 8 insertions(+), 42 deletions(-)

diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs
index 43d903d16..4ed1e9fbd 100755
--- a/milli/src/search/query_tree.rs
+++ b/milli/src/search/query_tree.rs
@@ -146,19 +146,7 @@ impl fmt::Debug for Query {
 
 trait Context {
     fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
-    fn word_pair_proximity_docids(
-        &self,
-        right_word: &str,
-        left_word: &str,
-        proximity: u8,
-    ) -> heed::Result<Option<RoaringBitmap>>;
     fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>>;
-    fn word_documents_count(&self, word: &str) -> heed::Result<Option<u64>> {
-        match self.word_docids(word)? {
-            Some(rb) => Ok(Some(rb.len())),
-            None => Ok(None),
-        }
-    }
     /// Returns the minimum word len for 1 and 2 typos.
     fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)>;
     fn exact_words(&self) -> Option<&fst::Set<Cow<[u8]>>>;
@@ -166,9 +154,9 @@ trait Context {
         &self,
         left_word: &str,
         right_word: &str,
-        proximity: u8,
+        _proximity: u8,
     ) -> heed::Result<Option<u64>> {
-        match self.word_pair_proximity_docids(right_word, left_word, proximity)? {
+        match self.word_docids(&format!("{} {}", left_word, right_word))? {
             Some(rb) => Ok(Some(rb.len())),
             None => Ok(None),
         }
@@ -190,23 +178,10 @@ impl<'a> Context for QueryTreeBuilder<'a> {
         self.index.word_docids.get(self.rtxn, word)
     }
 
-    fn word_pair_proximity_docids(
-        &self,
-        right_word: &str,
-        left_word: &str,
-        proximity: u8,
-    ) -> heed::Result<Option<RoaringBitmap>> {
-        self.index.word_pair_proximity_docids.get(self.rtxn, &(left_word, right_word, proximity))
-    }
-
     fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>> {
         self.index.words_synonyms(self.rtxn, words)
     }
 
-    fn word_documents_count(&self, word: &str) -> heed::Result<Option<u64>> {
-        self.index.word_documents_count(self.rtxn, word)
-    }
-
     fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)> {
         let one = self.index.min_word_len_one_typo(&self.rtxn)?;
         let two = self.index.min_word_len_two_typos(&self.rtxn)?;
@@ -306,7 +281,7 @@ impl<'a> QueryTreeBuilder<'a> {
 fn split_best_frequency<'a>(
     ctx: &impl Context,
     word: &'a str,
-) -> heed::Result<Option<(&'a str, &'a str)>> {
+) -> heed::Result<Option<(u64, &'a str, &'a str)>> {
     let chars = word.char_indices().skip(1);
     let mut best = None;
 
@@ -320,7 +295,7 @@ fn split_best_frequency<'a>(
         }
     }
 
-    Ok(best.map(|(_, left, right)| (left, right)))
+    Ok(best)
 }
 
 #[derive(Clone)]
@@ -389,7 +364,7 @@ fn create_query_tree(
             // 4. wrap all in an OR operation
             PrimitiveQueryPart::Word(word, prefix) => {
                 let mut children = synonyms(ctx, &[&word])?.unwrap_or_default();
-                if let Some((left, right)) = split_best_frequency(ctx, &word)? {
+                if let Some((_, left, right)) = split_best_frequency(ctx, &word)? {
                     children.push(Operation::Phrase(vec![left.to_string(), right.to_string()]));
                 }
                 let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?;
@@ -535,7 +510,8 @@ fn create_query_tree(
                 .filter(|(_, part)| !part.is_phrase())
                 .max_by_key(|(_, part)| match part {
                     PrimitiveQueryPart::Word(s, _) => {
-                        ctx.word_documents_count(s).unwrap_or_default().unwrap_or(u64::max_value())
+                        let (pair_freq, _, _) = split_best_frequency(ctx, s).unwrap_or_default().unwrap_or_default();
+                        pair_freq
                     }
                     _ => unreachable!(),
                 })
@@ -582,7 +558,7 @@ fn create_matching_words(
                     }
                 }
 
-                if let Some((left, right)) = split_best_frequency(ctx, &word)? {
+                if let Some((_, left, right)) = split_best_frequency(ctx, &word)? {
                     let left = MatchingWord::new(left.to_string(), 0, false);
                     let right = MatchingWord::new(right.to_string(), 0, false);
                     matching_words.push((vec![left, right], vec![id]));
@@ -861,16 +837,6 @@ mod test {
             Ok(self.postings.get(word).cloned())
         }
 
-        fn word_pair_proximity_docids(
-            &self,
-            right_word: &str,
-            left_word: &str,
-            _: u8,
-        ) -> heed::Result<Option<RoaringBitmap>> {
-            let bitmap = self.postings.get(&format!("{} {}", left_word, right_word));
-            Ok(bitmap.cloned())
-        }
-
         fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>> {
             let words: Vec<_> = words.iter().map(|s| s.as_ref().to_owned()).collect();
             Ok(self.synonyms.get(&words).cloned())

From ff8b2d4422f8b5363878dc065738935ac080ff42 Mon Sep 17 00:00:00 2001
From: Akshay Kulkarni <akshayk.gj@gmail.com>
Date: Thu, 13 Oct 2022 12:44:08 +0530
Subject: [PATCH 5/8] formatting

---
 milli/src/search/query_tree.rs | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs
index 4ed1e9fbd..723192d20 100755
--- a/milli/src/search/query_tree.rs
+++ b/milli/src/search/query_tree.rs
@@ -510,7 +510,8 @@ fn create_query_tree(
                 .filter(|(_, part)| !part.is_phrase())
                 .max_by_key(|(_, part)| match part {
                     PrimitiveQueryPart::Word(s, _) => {
-                        let (pair_freq, _, _) = split_best_frequency(ctx, s).unwrap_or_default().unwrap_or_default();
+                        let (pair_freq, _, _) =
+                            split_best_frequency(ctx, s).unwrap_or_default().unwrap_or_default();
                         pair_freq
                     }
                     _ => unreachable!(),

From 32f825d442f3b1ade6c6fe70c1f161db267f3abb Mon Sep 17 00:00:00 2001
From: Akshay Kulkarni <akshayk.gj@gmail.com>
Date: Thu, 13 Oct 2022 12:57:50 +0530
Subject: [PATCH 6/8] move default implementation of word_pair_frequency to
 TestContext

---
 milli/src/search/query_tree.rs | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs
index 723192d20..8ab5f81a4 100755
--- a/milli/src/search/query_tree.rs
+++ b/milli/src/search/query_tree.rs
@@ -155,12 +155,7 @@ trait Context {
         left_word: &str,
         right_word: &str,
         _proximity: u8,
-    ) -> heed::Result<Option<u64>> {
-        match self.word_docids(&format!("{} {}", left_word, right_word))? {
-            Some(rb) => Ok(Some(rb.len())),
-            None => Ok(None),
-        }
-    }
+    ) -> heed::Result<Option<u64>>;
 }
 
 /// The query tree builder is the interface to build a query tree.
@@ -850,6 +845,18 @@ mod test {
         fn exact_words(&self) -> Option<&fst::Set<Cow<[u8]>>> {
             self.exact_words.as_ref()
         }
+
+        fn word_pair_frequency(
+            &self,
+            left_word: &str,
+            right_word: &str,
+            _proximity: u8,
+        ) -> heed::Result<Option<u64>> {
+            match self.word_docids(&format!("{} {}", left_word, right_word))? {
+                Some(rb) => Ok(Some(rb.len())),
+                None => Ok(None),
+            }
+        }
     }
 
     impl Default for TestContext {

From 8195fc6141fb8da0a63b6a12b92f2132ba1c8640 Mon Sep 17 00:00:00 2001
From: Akshay Kulkarni <akshayk.gj@gmail.com>
Date: Thu, 13 Oct 2022 13:14:27 +0530
Subject: [PATCH 7/8] revert removal of word_documents_count method

---
 milli/src/search/query_tree.rs | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs
index 8ab5f81a4..94c28b3f9 100755
--- a/milli/src/search/query_tree.rs
+++ b/milli/src/search/query_tree.rs
@@ -147,6 +147,12 @@ impl fmt::Debug for Query {
 trait Context {
     fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
     fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>>;
+    fn word_documents_count(&self, word: &str) -> heed::Result<Option<u64>> {
+        match self.word_docids(word)? {
+            Some(rb) => Ok(Some(rb.len())),
+            None => Ok(None),
+        }
+    }
     /// Returns the minimum word len for 1 and 2 typos.
     fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)>;
     fn exact_words(&self) -> Option<&fst::Set<Cow<[u8]>>>;
@@ -276,7 +282,7 @@ impl<'a> QueryTreeBuilder<'a> {
 fn split_best_frequency<'a>(
     ctx: &impl Context,
     word: &'a str,
-) -> heed::Result<Option<(u64, &'a str, &'a str)>> {
+) -> heed::Result<Option<(&'a str, &'a str)>> {
     let chars = word.char_indices().skip(1);
     let mut best = None;
 
@@ -290,7 +296,7 @@ fn split_best_frequency<'a>(
         }
     }
 
-    Ok(best)
+    Ok(best.map(|(_, left, right)| (left, right)))
 }
 
 #[derive(Clone)]
@@ -359,7 +365,7 @@ fn create_query_tree(
             // 4. wrap all in an OR operation
             PrimitiveQueryPart::Word(word, prefix) => {
                 let mut children = synonyms(ctx, &[&word])?.unwrap_or_default();
-                if let Some((_, left, right)) = split_best_frequency(ctx, &word)? {
+                if let Some((left, right)) = split_best_frequency(ctx, &word)? {
                     children.push(Operation::Phrase(vec![left.to_string(), right.to_string()]));
                 }
                 let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?;
@@ -505,9 +511,7 @@ fn create_query_tree(
                 .filter(|(_, part)| !part.is_phrase())
                 .max_by_key(|(_, part)| match part {
                     PrimitiveQueryPart::Word(s, _) => {
-                        let (pair_freq, _, _) =
-                            split_best_frequency(ctx, s).unwrap_or_default().unwrap_or_default();
-                        pair_freq
+                        ctx.word_documents_count(s).unwrap_or_default().unwrap_or(u64::max_value());
                     }
                     _ => unreachable!(),
                 })
@@ -554,7 +558,7 @@ fn create_matching_words(
                     }
                 }
 
-                if let Some((_, left, right)) = split_best_frequency(ctx, &word)? {
+                if let Some((left, right)) = split_best_frequency(ctx, &word)? {
                     let left = MatchingWord::new(left.to_string(), 0, false);
                     let right = MatchingWord::new(right.to_string(), 0, false);
                     matching_words.push((vec![left, right], vec![id]));

From 85f30283173968628d343ac6b69f120c343ae99b Mon Sep 17 00:00:00 2001
From: Akshay Kulkarni <akshayk.gj@gmail.com>
Date: Thu, 13 Oct 2022 13:21:59 +0530
Subject: [PATCH 8/8] remove underscore and introduce back word_documents_count

---
 milli/src/search/query_tree.rs | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs
index 94c28b3f9..080f89080 100755
--- a/milli/src/search/query_tree.rs
+++ b/milli/src/search/query_tree.rs
@@ -160,7 +160,7 @@ trait Context {
         &self,
         left_word: &str,
         right_word: &str,
-        _proximity: u8,
+        proximity: u8,
     ) -> heed::Result<Option<u64>>;
 }
 
@@ -183,6 +183,10 @@ impl<'a> Context for QueryTreeBuilder<'a> {
         self.index.words_synonyms(self.rtxn, words)
     }
 
+    fn word_documents_count(&self, word: &str) -> heed::Result<Option<u64>> {
+        self.index.word_documents_count(self.rtxn, word)
+    }
+
     fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)> {
         let one = self.index.min_word_len_one_typo(&self.rtxn)?;
         let two = self.index.min_word_len_two_typos(&self.rtxn)?;
@@ -511,7 +515,7 @@ fn create_query_tree(
                 .filter(|(_, part)| !part.is_phrase())
                 .max_by_key(|(_, part)| match part {
                     PrimitiveQueryPart::Word(s, _) => {
-                        ctx.word_documents_count(s).unwrap_or_default().unwrap_or(u64::max_value());
+                        ctx.word_documents_count(s).unwrap_or_default().unwrap_or(u64::max_value())
                     }
                     _ => unreachable!(),
                 })