From 7f9680f0a0e2c5113ec29500066af2d47c3275f8 Mon Sep 17 00:00:00 2001 From: Akshay Kulkarni Date: Wed, 12 Oct 2022 13:18:23 +0530 Subject: [PATCH 1/8] Enhance word splitting strategy --- milli/src/search/query_tree.rs | 79 +++++++++++++++++++++++++--------- 1 file changed, 59 insertions(+), 20 deletions(-) mode change 100644 => 100755 milli/src/search/query_tree.rs diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs old mode 100644 new mode 100755 index 1c60e41f7..b54fc5c9c --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -1,6 +1,6 @@ use std::borrow::Cow; use std::cmp::max; -use std::{cmp, fmt, mem}; +use std::{fmt, mem}; use charabia::classifier::ClassifiedTokenIter; use charabia::{SeparatorKind, TokenKind}; @@ -10,7 +10,7 @@ use slice_group_by::GroupBy; use crate::search::matches::matching_words::{MatchingWord, PrimitiveWordId}; use crate::search::TermsMatchingStrategy; -use crate::{Index, MatchingWords, Result}; +use crate::{CboRoaringBitmapLenCodec, Index, MatchingWords, Result}; type IsOptionalWord = bool; type IsPrefix = bool; @@ -146,6 +146,7 @@ impl fmt::Debug for Query { trait Context { fn word_docids(&self, word: &str) -> heed::Result>; + fn word_pair_proximity_docids(&self, right_word: &str, left_word: &str, proximity: u8) -> heed::Result>; fn synonyms>(&self, words: &[S]) -> heed::Result>>>; fn word_documents_count(&self, word: &str) -> heed::Result> { match self.word_docids(word)? { @@ -156,6 +157,12 @@ trait Context { /// Returns the minimum word len for 1 and 2 typos. fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)>; fn exact_words(&self) -> Option<&fst::Set>>; + fn word_pair_frequency(&self, left_word: &str, right_word: &str, proximity: u8) -> heed::Result> { + match self.word_pair_proximity_docids(right_word, left_word, proximity)? { + Some(rb) => Ok(Some(rb.len())), + None => Ok(None), + } + } } /// The query tree builder is the interface to build a query tree. @@ -173,6 +180,10 @@ impl<'a> Context for QueryTreeBuilder<'a> { self.index.word_docids.get(self.rtxn, word) } + fn word_pair_proximity_docids(&self, right_word: &str, left_word: &str, proximity: u8) -> heed::Result> { + self.index.word_pair_proximity_docids.get(self.rtxn, &(left_word, right_word, proximity)) + } + fn synonyms>(&self, words: &[S]) -> heed::Result>>> { self.index.words_synonyms(self.rtxn, words) } @@ -190,6 +201,11 @@ impl<'a> Context for QueryTreeBuilder<'a> { fn exact_words(&self) -> Option<&fst::Set>> { self.exact_words.as_ref() } + + fn word_pair_frequency(&self, left_word: &str, right_word: &str, proximity: u8) -> heed::Result> { + let key = (left_word, right_word, proximity); + self.index.word_pair_proximity_docids.remap_data_type::().get(&self.rtxn, &key) + } } impl<'a> QueryTreeBuilder<'a> { @@ -274,12 +290,10 @@ fn split_best_frequency<'a>( for (i, _) in chars { let (left, right) = word.split_at(i); - let left_freq = ctx.word_documents_count(left)?.unwrap_or(0); - let right_freq = ctx.word_documents_count(right)?.unwrap_or(0); + let pair_freq = ctx.word_pair_frequency(left, right, 1)?.unwrap_or(0); - let min_freq = cmp::min(left_freq, right_freq); - if min_freq != 0 && best.map_or(true, |(old, _, _)| min_freq > old) { - best = Some((min_freq, left, right)); + if pair_freq != 0 && best.map_or(true, |(old, _, _)| pair_freq > old) { + best = Some((pair_freq, left, right)); } } @@ -824,6 +838,11 @@ mod test { Ok(self.postings.get(word).cloned()) } + fn word_pair_proximity_docids(&self, right_word: &str, left_word: &str, _: u8) -> heed::Result> { + let bitmap = self.postings.get(&format!("{} {}", left_word, right_word)); + Ok(bitmap.cloned()) + } + fn synonyms>(&self, words: &[S]) -> heed::Result>>> { let words: Vec<_> = words.iter().map(|s| s.as_ref().to_owned()).collect(); Ok(self.synonyms.get(&words).cloned()) @@ -881,19 +900,22 @@ mod test { ], }, postings: hashmap! { - String::from("hello") => random_postings(rng, 1500), - String::from("hi") => random_postings(rng, 4000), - String::from("word") => random_postings(rng, 2500), - String::from("split") => random_postings(rng, 400), - String::from("ngrams") => random_postings(rng, 1400), - String::from("world") => random_postings(rng, 15_000), - String::from("earth") => random_postings(rng, 8000), - String::from("2021") => random_postings(rng, 100), - String::from("2020") => random_postings(rng, 500), - String::from("is") => random_postings(rng, 50_000), - String::from("this") => random_postings(rng, 50_000), - String::from("good") => random_postings(rng, 1250), - String::from("morning") => random_postings(rng, 125), + String::from("hello") => random_postings(rng, 1500), + String::from("hi") => random_postings(rng, 4000), + String::from("word") => random_postings(rng, 2500), + String::from("split") => random_postings(rng, 400), + String::from("ngrams") => random_postings(rng, 1400), + String::from("world") => random_postings(rng, 15_000), + String::from("earth") => random_postings(rng, 8000), + String::from("2021") => random_postings(rng, 100), + String::from("2020") => random_postings(rng, 500), + String::from("is") => random_postings(rng, 50_000), + String::from("this") => random_postings(rng, 50_000), + String::from("good") => random_postings(rng, 1250), + String::from("morning") => random_postings(rng, 125), + String::from("word split") => random_postings(rng, 5000), + String::from("quick brownfox") => random_postings(rng, 7000), + String::from("quickbrown fox") => random_postings(rng, 8000), }, exact_words, } @@ -1041,6 +1063,23 @@ mod test { "###); } + #[test] + fn word_split_choose_pair_with_max_freq() { + let query = "quickbrownfox"; + let tokens = query.tokenize(); + + let (query_tree, _) = TestContext::default() + .build(TermsMatchingStrategy::All, true, None, tokens) + .unwrap() + .unwrap(); + + insta::assert_debug_snapshot!(query_tree, @r###" + OR + PHRASE ["quickbrown", "fox"] + PrefixTolerant { word: "quickbrownfox", max typo: 2 } + "###); + } + #[test] fn phrase() { let query = "\"hey friends\" \" \" \"wooop"; From 63e79a9039feaa1caa117d2aea3275bbbd722ed0 Mon Sep 17 00:00:00 2001 From: Akshay Kulkarni Date: Wed, 12 Oct 2022 13:36:48 +0530 Subject: [PATCH 2/8] update comment --- milli/src/search/query_tree.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index b54fc5c9c..70227b8f9 100755 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -279,7 +279,7 @@ impl<'a> QueryTreeBuilder<'a> { } } -/// Split the word depending on the frequency of subwords in the database documents. +/// Split the word depending on the frequency of pairs near together in the database documents. fn split_best_frequency<'a>( ctx: &impl Context, word: &'a str, From 8c9245149e32468e57183066536389567db318ca Mon Sep 17 00:00:00 2001 From: Akshay Kulkarni Date: Wed, 12 Oct 2022 15:27:56 +0530 Subject: [PATCH 3/8] format file --- milli/src/search/query_tree.rs | 40 +++++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 70227b8f9..43d903d16 100755 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -146,7 +146,12 @@ impl fmt::Debug for Query { trait Context { fn word_docids(&self, word: &str) -> heed::Result>; - fn word_pair_proximity_docids(&self, right_word: &str, left_word: &str, proximity: u8) -> heed::Result>; + fn word_pair_proximity_docids( + &self, + right_word: &str, + left_word: &str, + proximity: u8, + ) -> heed::Result>; fn synonyms>(&self, words: &[S]) -> heed::Result>>>; fn word_documents_count(&self, word: &str) -> heed::Result> { match self.word_docids(word)? { @@ -157,7 +162,12 @@ trait Context { /// Returns the minimum word len for 1 and 2 typos. fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)>; fn exact_words(&self) -> Option<&fst::Set>>; - fn word_pair_frequency(&self, left_word: &str, right_word: &str, proximity: u8) -> heed::Result> { + fn word_pair_frequency( + &self, + left_word: &str, + right_word: &str, + proximity: u8, + ) -> heed::Result> { match self.word_pair_proximity_docids(right_word, left_word, proximity)? { Some(rb) => Ok(Some(rb.len())), None => Ok(None), @@ -180,7 +190,12 @@ impl<'a> Context for QueryTreeBuilder<'a> { self.index.word_docids.get(self.rtxn, word) } - fn word_pair_proximity_docids(&self, right_word: &str, left_word: &str, proximity: u8) -> heed::Result> { + fn word_pair_proximity_docids( + &self, + right_word: &str, + left_word: &str, + proximity: u8, + ) -> heed::Result> { self.index.word_pair_proximity_docids.get(self.rtxn, &(left_word, right_word, proximity)) } @@ -202,9 +217,17 @@ impl<'a> Context for QueryTreeBuilder<'a> { self.exact_words.as_ref() } - fn word_pair_frequency(&self, left_word: &str, right_word: &str, proximity: u8) -> heed::Result> { + fn word_pair_frequency( + &self, + left_word: &str, + right_word: &str, + proximity: u8, + ) -> heed::Result> { let key = (left_word, right_word, proximity); - self.index.word_pair_proximity_docids.remap_data_type::().get(&self.rtxn, &key) + self.index + .word_pair_proximity_docids + .remap_data_type::() + .get(&self.rtxn, &key) } } @@ -838,7 +861,12 @@ mod test { Ok(self.postings.get(word).cloned()) } - fn word_pair_proximity_docids(&self, right_word: &str, left_word: &str, _: u8) -> heed::Result> { + fn word_pair_proximity_docids( + &self, + right_word: &str, + left_word: &str, + _: u8, + ) -> heed::Result> { let bitmap = self.postings.get(&format!("{} {}", left_word, right_word)); Ok(bitmap.cloned()) } From 6cb8b46900492dd23b229e33604263839a32f06e Mon Sep 17 00:00:00 2001 From: Akshay Kulkarni Date: Thu, 13 Oct 2022 12:43:11 +0530 Subject: [PATCH 4/8] use word_pair_frequency and remove word_documents_count --- milli/src/search/query_tree.rs | 50 ++++++---------------------------- 1 file changed, 8 insertions(+), 42 deletions(-) diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 43d903d16..4ed1e9fbd 100755 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -146,19 +146,7 @@ impl fmt::Debug for Query { trait Context { fn word_docids(&self, word: &str) -> heed::Result>; - fn word_pair_proximity_docids( - &self, - right_word: &str, - left_word: &str, - proximity: u8, - ) -> heed::Result>; fn synonyms>(&self, words: &[S]) -> heed::Result>>>; - fn word_documents_count(&self, word: &str) -> heed::Result> { - match self.word_docids(word)? { - Some(rb) => Ok(Some(rb.len())), - None => Ok(None), - } - } /// Returns the minimum word len for 1 and 2 typos. fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)>; fn exact_words(&self) -> Option<&fst::Set>>; @@ -166,9 +154,9 @@ trait Context { &self, left_word: &str, right_word: &str, - proximity: u8, + _proximity: u8, ) -> heed::Result> { - match self.word_pair_proximity_docids(right_word, left_word, proximity)? { + match self.word_docids(&format!("{} {}", left_word, right_word))? { Some(rb) => Ok(Some(rb.len())), None => Ok(None), } @@ -190,23 +178,10 @@ impl<'a> Context for QueryTreeBuilder<'a> { self.index.word_docids.get(self.rtxn, word) } - fn word_pair_proximity_docids( - &self, - right_word: &str, - left_word: &str, - proximity: u8, - ) -> heed::Result> { - self.index.word_pair_proximity_docids.get(self.rtxn, &(left_word, right_word, proximity)) - } - fn synonyms>(&self, words: &[S]) -> heed::Result>>> { self.index.words_synonyms(self.rtxn, words) } - fn word_documents_count(&self, word: &str) -> heed::Result> { - self.index.word_documents_count(self.rtxn, word) - } - fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)> { let one = self.index.min_word_len_one_typo(&self.rtxn)?; let two = self.index.min_word_len_two_typos(&self.rtxn)?; @@ -306,7 +281,7 @@ impl<'a> QueryTreeBuilder<'a> { fn split_best_frequency<'a>( ctx: &impl Context, word: &'a str, -) -> heed::Result> { +) -> heed::Result> { let chars = word.char_indices().skip(1); let mut best = None; @@ -320,7 +295,7 @@ fn split_best_frequency<'a>( } } - Ok(best.map(|(_, left, right)| (left, right))) + Ok(best) } #[derive(Clone)] @@ -389,7 +364,7 @@ fn create_query_tree( // 4. wrap all in an OR operation PrimitiveQueryPart::Word(word, prefix) => { let mut children = synonyms(ctx, &[&word])?.unwrap_or_default(); - if let Some((left, right)) = split_best_frequency(ctx, &word)? { + if let Some((_, left, right)) = split_best_frequency(ctx, &word)? { children.push(Operation::Phrase(vec![left.to_string(), right.to_string()])); } let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?; @@ -535,7 +510,8 @@ fn create_query_tree( .filter(|(_, part)| !part.is_phrase()) .max_by_key(|(_, part)| match part { PrimitiveQueryPart::Word(s, _) => { - ctx.word_documents_count(s).unwrap_or_default().unwrap_or(u64::max_value()) + let (pair_freq, _, _) = split_best_frequency(ctx, s).unwrap_or_default().unwrap_or_default(); + pair_freq } _ => unreachable!(), }) @@ -582,7 +558,7 @@ fn create_matching_words( } } - if let Some((left, right)) = split_best_frequency(ctx, &word)? { + if let Some((_, left, right)) = split_best_frequency(ctx, &word)? { let left = MatchingWord::new(left.to_string(), 0, false); let right = MatchingWord::new(right.to_string(), 0, false); matching_words.push((vec![left, right], vec![id])); @@ -861,16 +837,6 @@ mod test { Ok(self.postings.get(word).cloned()) } - fn word_pair_proximity_docids( - &self, - right_word: &str, - left_word: &str, - _: u8, - ) -> heed::Result> { - let bitmap = self.postings.get(&format!("{} {}", left_word, right_word)); - Ok(bitmap.cloned()) - } - fn synonyms>(&self, words: &[S]) -> heed::Result>>> { let words: Vec<_> = words.iter().map(|s| s.as_ref().to_owned()).collect(); Ok(self.synonyms.get(&words).cloned()) From ff8b2d4422f8b5363878dc065738935ac080ff42 Mon Sep 17 00:00:00 2001 From: Akshay Kulkarni Date: Thu, 13 Oct 2022 12:44:08 +0530 Subject: [PATCH 5/8] formatting --- milli/src/search/query_tree.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 4ed1e9fbd..723192d20 100755 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -510,7 +510,8 @@ fn create_query_tree( .filter(|(_, part)| !part.is_phrase()) .max_by_key(|(_, part)| match part { PrimitiveQueryPart::Word(s, _) => { - let (pair_freq, _, _) = split_best_frequency(ctx, s).unwrap_or_default().unwrap_or_default(); + let (pair_freq, _, _) = + split_best_frequency(ctx, s).unwrap_or_default().unwrap_or_default(); pair_freq } _ => unreachable!(), From 32f825d442f3b1ade6c6fe70c1f161db267f3abb Mon Sep 17 00:00:00 2001 From: Akshay Kulkarni Date: Thu, 13 Oct 2022 12:57:50 +0530 Subject: [PATCH 6/8] move default implementation of word_pair_frequency to TestContext --- milli/src/search/query_tree.rs | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 723192d20..8ab5f81a4 100755 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -155,12 +155,7 @@ trait Context { left_word: &str, right_word: &str, _proximity: u8, - ) -> heed::Result> { - match self.word_docids(&format!("{} {}", left_word, right_word))? { - Some(rb) => Ok(Some(rb.len())), - None => Ok(None), - } - } + ) -> heed::Result>; } /// The query tree builder is the interface to build a query tree. @@ -850,6 +845,18 @@ mod test { fn exact_words(&self) -> Option<&fst::Set>> { self.exact_words.as_ref() } + + fn word_pair_frequency( + &self, + left_word: &str, + right_word: &str, + _proximity: u8, + ) -> heed::Result> { + match self.word_docids(&format!("{} {}", left_word, right_word))? { + Some(rb) => Ok(Some(rb.len())), + None => Ok(None), + } + } } impl Default for TestContext { From 8195fc6141fb8da0a63b6a12b92f2132ba1c8640 Mon Sep 17 00:00:00 2001 From: Akshay Kulkarni Date: Thu, 13 Oct 2022 13:14:27 +0530 Subject: [PATCH 7/8] revert removal of word_documents_count method --- milli/src/search/query_tree.rs | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 8ab5f81a4..94c28b3f9 100755 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -147,6 +147,12 @@ impl fmt::Debug for Query { trait Context { fn word_docids(&self, word: &str) -> heed::Result>; fn synonyms>(&self, words: &[S]) -> heed::Result>>>; + fn word_documents_count(&self, word: &str) -> heed::Result> { + match self.word_docids(word)? { + Some(rb) => Ok(Some(rb.len())), + None => Ok(None), + } + } /// Returns the minimum word len for 1 and 2 typos. fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)>; fn exact_words(&self) -> Option<&fst::Set>>; @@ -276,7 +282,7 @@ impl<'a> QueryTreeBuilder<'a> { fn split_best_frequency<'a>( ctx: &impl Context, word: &'a str, -) -> heed::Result> { +) -> heed::Result> { let chars = word.char_indices().skip(1); let mut best = None; @@ -290,7 +296,7 @@ fn split_best_frequency<'a>( } } - Ok(best) + Ok(best.map(|(_, left, right)| (left, right))) } #[derive(Clone)] @@ -359,7 +365,7 @@ fn create_query_tree( // 4. wrap all in an OR operation PrimitiveQueryPart::Word(word, prefix) => { let mut children = synonyms(ctx, &[&word])?.unwrap_or_default(); - if let Some((_, left, right)) = split_best_frequency(ctx, &word)? { + if let Some((left, right)) = split_best_frequency(ctx, &word)? { children.push(Operation::Phrase(vec![left.to_string(), right.to_string()])); } let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?; @@ -505,9 +511,7 @@ fn create_query_tree( .filter(|(_, part)| !part.is_phrase()) .max_by_key(|(_, part)| match part { PrimitiveQueryPart::Word(s, _) => { - let (pair_freq, _, _) = - split_best_frequency(ctx, s).unwrap_or_default().unwrap_or_default(); - pair_freq + ctx.word_documents_count(s).unwrap_or_default().unwrap_or(u64::max_value()); } _ => unreachable!(), }) @@ -554,7 +558,7 @@ fn create_matching_words( } } - if let Some((_, left, right)) = split_best_frequency(ctx, &word)? { + if let Some((left, right)) = split_best_frequency(ctx, &word)? { let left = MatchingWord::new(left.to_string(), 0, false); let right = MatchingWord::new(right.to_string(), 0, false); matching_words.push((vec![left, right], vec![id])); From 85f30283173968628d343ac6b69f120c343ae99b Mon Sep 17 00:00:00 2001 From: Akshay Kulkarni Date: Thu, 13 Oct 2022 13:21:59 +0530 Subject: [PATCH 8/8] remove underscore and introduce back word_documents_count --- milli/src/search/query_tree.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 94c28b3f9..080f89080 100755 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -160,7 +160,7 @@ trait Context { &self, left_word: &str, right_word: &str, - _proximity: u8, + proximity: u8, ) -> heed::Result>; } @@ -183,6 +183,10 @@ impl<'a> Context for QueryTreeBuilder<'a> { self.index.words_synonyms(self.rtxn, words) } + fn word_documents_count(&self, word: &str) -> heed::Result> { + self.index.word_documents_count(self.rtxn, word) + } + fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)> { let one = self.index.min_word_len_one_typo(&self.rtxn)?; let two = self.index.min_word_len_two_typos(&self.rtxn)?; @@ -511,7 +515,7 @@ fn create_query_tree( .filter(|(_, part)| !part.is_phrase()) .max_by_key(|(_, part)| match part { PrimitiveQueryPart::Word(s, _) => { - ctx.word_documents_count(s).unwrap_or_default().unwrap_or(u64::max_value()); + ctx.word_documents_count(s).unwrap_or_default().unwrap_or(u64::max_value()) } _ => unreachable!(), })