From 6cb8b46900492dd23b229e33604263839a32f06e Mon Sep 17 00:00:00 2001
From: Akshay Kulkarni
Date: Thu, 13 Oct 2022 12:43:11 +0530
Subject: [PATCH] use word_pair_frequency and remove word_documents_count

---
 milli/src/search/query_tree.rs | 50 ++++++----------------------------
 1 file changed, 8 insertions(+), 42 deletions(-)

diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs
index 43d903d16..4ed1e9fbd 100755
--- a/milli/src/search/query_tree.rs
+++ b/milli/src/search/query_tree.rs
@@ -146,19 +146,7 @@ impl fmt::Debug for Query {
 
 trait Context {
     fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
-    fn word_pair_proximity_docids(
-        &self,
-        right_word: &str,
-        left_word: &str,
-        proximity: u8,
-    ) -> heed::Result<Option<RoaringBitmap>>;
     fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>>;
-    fn word_documents_count(&self, word: &str) -> heed::Result<Option<u64>> {
-        match self.word_docids(word)? {
-            Some(rb) => Ok(Some(rb.len())),
-            None => Ok(None),
-        }
-    }
     /// Returns the minimum word len for 1 and 2 typos.
     fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)>;
     fn exact_words(&self) -> Option<&fst::Set<Cow<[u8]>>>;
@@ -166,9 +154,9 @@ trait Context {
         &self,
         left_word: &str,
         right_word: &str,
-        proximity: u8,
+        _proximity: u8,
     ) -> heed::Result<Option<u64>> {
-        match self.word_pair_proximity_docids(right_word, left_word, proximity)? {
+        match self.word_docids(&format!("{} {}", left_word, right_word))? {
             Some(rb) => Ok(Some(rb.len())),
             None => Ok(None),
         }
@@ -190,23 +178,10 @@ impl<'a> Context for QueryTreeBuilder<'a> {
         self.index.word_docids.get(self.rtxn, word)
     }
 
-    fn word_pair_proximity_docids(
-        &self,
-        right_word: &str,
-        left_word: &str,
-        proximity: u8,
-    ) -> heed::Result<Option<RoaringBitmap>> {
-        self.index.word_pair_proximity_docids.get(self.rtxn, &(left_word, right_word, proximity))
-    }
-
     fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>> {
         self.index.words_synonyms(self.rtxn, words)
     }
 
-    fn word_documents_count(&self, word: &str) -> heed::Result<Option<u64>> {
-        self.index.word_documents_count(self.rtxn, word)
-    }
-
     fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)> {
         let one = self.index.min_word_len_one_typo(&self.rtxn)?;
         let two = self.index.min_word_len_two_typos(&self.rtxn)?;
@@ -306,7 +281,7 @@ impl<'a> QueryTreeBuilder<'a> {
 fn split_best_frequency<'a>(
     ctx: &impl Context,
     word: &'a str,
-) -> heed::Result<Option<(&'a str, &'a str)>> {
+) -> heed::Result<Option<(u64, &'a str, &'a str)>> {
     let chars = word.char_indices().skip(1);
     let mut best = None;
 
@@ -320,7 +295,7 @@ fn split_best_frequency<'a>(
         }
     }
 
-    Ok(best.map(|(_, left, right)| (left, right)))
+    Ok(best)
 }
 
 #[derive(Clone)]
@@ -389,7 +364,7 @@ fn create_query_tree(
             // 4. wrap all in an OR operation
             PrimitiveQueryPart::Word(word, prefix) => {
                 let mut children = synonyms(ctx, &[&word])?.unwrap_or_default();
-                if let Some((left, right)) = split_best_frequency(ctx, &word)? {
+                if let Some((_, left, right)) = split_best_frequency(ctx, &word)? {
                     children.push(Operation::Phrase(vec![left.to_string(), right.to_string()]));
                 }
                 let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?;
@@ -535,7 +510,8 @@ fn create_query_tree(
                 .filter(|(_, part)| !part.is_phrase())
                 .max_by_key(|(_, part)| match part {
                     PrimitiveQueryPart::Word(s, _) => {
-                        ctx.word_documents_count(s).unwrap_or_default().unwrap_or(u64::max_value())
+                        let (pair_freq, _, _) = split_best_frequency(ctx, s).unwrap_or_default().unwrap_or_default();
+                        pair_freq
                     }
                     _ => unreachable!(),
                 })
@@ -582,7 +558,7 @@ fn create_matching_words(
                     }
                 }
 
-                if let Some((left, right)) = split_best_frequency(ctx, &word)? {
+                if let Some((_, left, right)) = split_best_frequency(ctx, &word)? {
                     let left = MatchingWord::new(left.to_string(), 0, false);
                     let right = MatchingWord::new(right.to_string(), 0, false);
                     matching_words.push((vec![left, right], vec![id]));
@@ -861,16 +837,6 @@ mod test {
             Ok(self.postings.get(word).cloned())
         }
 
-        fn word_pair_proximity_docids(
-            &self,
-            right_word: &str,
-            left_word: &str,
-            _: u8,
-        ) -> heed::Result<Option<RoaringBitmap>> {
-            let bitmap = self.postings.get(&format!("{} {}", left_word, right_word));
-            Ok(bitmap.cloned())
-        }
-
        fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>> {
            let words: Vec<_> = words.iter().map(|s| s.as_ref().to_owned()).collect();
            Ok(self.synonyms.get(&words).cloned())
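
To summarize the behaviour this patch introduces: the default implementation of Context::word_pair_frequency no longer queries word_pair_proximity_docids; it looks the pair up as a single "left right" key in word_docids and returns the bitmap length, and split_best_frequency now returns that frequency alongside the two halves so callers (the Operation::Phrase split and the max_by_key above) can reuse it. The sketch below illustrates only that logic and is not milli code: ToyContext, its HashMap<String, Vec<u32>> postings, and the example words are invented for illustration, standing in for the heed/RoaringBitmap-backed Context trait.

use std::collections::HashMap;

/// Stand-in for the index: maps a word, or a "left right" pair key as the
/// patched word_pair_frequency expects, to the ids of documents containing it.
/// A plain HashMap replaces milli's heed transactions and RoaringBitmap.
struct ToyContext {
    postings: HashMap<String, Vec<u32>>,
}

impl ToyContext {
    /// Mirrors the patched default implementation of Context::word_pair_frequency:
    /// the pair frequency is the document count stored under the "left right" key.
    fn word_pair_frequency(&self, left: &str, right: &str) -> Option<u64> {
        self.postings.get(&format!("{} {}", left, right)).map(|docs| docs.len() as u64)
    }

    /// Mirrors the patched split_best_frequency: try every split point of `word`
    /// and keep the split whose pair occurs in the most documents, returning the
    /// frequency together with the two halves (the new (u64, &str, &str) shape).
    fn split_best_frequency<'a>(&self, word: &'a str) -> Option<(u64, &'a str, &'a str)> {
        let mut best = None;
        for (i, _) in word.char_indices().skip(1) {
            let (left, right) = word.split_at(i);
            let pair_freq = self.word_pair_frequency(left, right).unwrap_or(0);
            if pair_freq != 0 && best.map_or(true, |(old, _, _)| pair_freq > old) {
                best = Some((pair_freq, left, right));
            }
        }
        best
    }
}

fn main() {
    let mut postings = HashMap::new();
    postings.insert("sun flower".to_string(), vec![1, 2, 3]);
    postings.insert("sunflo wer".to_string(), vec![4]);
    let ctx = ToyContext { postings };

    // "sun flower" occurs in 3 documents and "sunflo wer" in 1, so the best
    // split of "sunflower" is ("sun", "flower") with a pair frequency of 3.
    assert_eq!(ctx.split_best_frequency("sunflower"), Some((3, "sun", "flower")));
}

Returning the frequency from split_best_frequency is what lets the max_by_key hunk above drop word_documents_count entirely: the most "splittable" word is ranked by its best pair frequency instead of by its document count.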