revert removal of word_documents_count method

This commit is contained in:
Akshay Kulkarni 2022-10-13 13:14:27 +05:30
parent 32f825d442
commit 8195fc6141
No known key found for this signature in database
GPG Key ID: CAB2F21265743909

View File

@ -147,6 +147,12 @@ impl fmt::Debug for Query {
trait Context { trait Context {
fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>; fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>>; fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>>;
fn word_documents_count(&self, word: &str) -> heed::Result<Option<u64>> {
match self.word_docids(word)? {
Some(rb) => Ok(Some(rb.len())),
None => Ok(None),
}
}
/// Returns the minimum word len for 1 and 2 typos. /// Returns the minimum word len for 1 and 2 typos.
fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)>; fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)>;
fn exact_words(&self) -> Option<&fst::Set<Cow<[u8]>>>; fn exact_words(&self) -> Option<&fst::Set<Cow<[u8]>>>;
@ -276,7 +282,7 @@ impl<'a> QueryTreeBuilder<'a> {
fn split_best_frequency<'a>( fn split_best_frequency<'a>(
ctx: &impl Context, ctx: &impl Context,
word: &'a str, word: &'a str,
) -> heed::Result<Option<(u64, &'a str, &'a str)>> { ) -> heed::Result<Option<(&'a str, &'a str)>> {
let chars = word.char_indices().skip(1); let chars = word.char_indices().skip(1);
let mut best = None; let mut best = None;
@ -290,7 +296,7 @@ fn split_best_frequency<'a>(
} }
} }
Ok(best) Ok(best.map(|(_, left, right)| (left, right)))
} }
#[derive(Clone)] #[derive(Clone)]
@ -359,7 +365,7 @@ fn create_query_tree(
// 4. wrap all in an OR operation // 4. wrap all in an OR operation
PrimitiveQueryPart::Word(word, prefix) => { PrimitiveQueryPart::Word(word, prefix) => {
let mut children = synonyms(ctx, &[&word])?.unwrap_or_default(); let mut children = synonyms(ctx, &[&word])?.unwrap_or_default();
if let Some((_, left, right)) = split_best_frequency(ctx, &word)? { if let Some((left, right)) = split_best_frequency(ctx, &word)? {
children.push(Operation::Phrase(vec![left.to_string(), right.to_string()])); children.push(Operation::Phrase(vec![left.to_string(), right.to_string()]));
} }
let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?; let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?;
@ -505,9 +511,7 @@ fn create_query_tree(
.filter(|(_, part)| !part.is_phrase()) .filter(|(_, part)| !part.is_phrase())
.max_by_key(|(_, part)| match part { .max_by_key(|(_, part)| match part {
PrimitiveQueryPart::Word(s, _) => { PrimitiveQueryPart::Word(s, _) => {
let (pair_freq, _, _) = ctx.word_documents_count(s).unwrap_or_default().unwrap_or(u64::max_value());
split_best_frequency(ctx, s).unwrap_or_default().unwrap_or_default();
pair_freq
} }
_ => unreachable!(), _ => unreachable!(),
}) })
@ -554,7 +558,7 @@ fn create_matching_words(
} }
} }
if let Some((_, left, right)) = split_best_frequency(ctx, &word)? { if let Some((left, right)) = split_best_frequency(ctx, &word)? {
let left = MatchingWord::new(left.to_string(), 0, false); let left = MatchingWord::new(left.to_string(), 0, false);
let right = MatchingWord::new(right.to_string(), 0, false); let right = MatchingWord::new(right.to_string(), 0, false);
matching_words.push((vec![left, right], vec![id])); matching_words.push((vec![left, right], vec![id]));