mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-01-19 01:18:31 +08:00
use word_pair_frequency and remove word_documents_count
This commit is contained in:
parent
8c9245149e
commit
6cb8b46900
@ -146,19 +146,7 @@ impl fmt::Debug for Query {
|
|||||||
|
|
||||||
trait Context {
|
trait Context {
|
||||||
fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
|
fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
|
||||||
fn word_pair_proximity_docids(
|
|
||||||
&self,
|
|
||||||
right_word: &str,
|
|
||||||
left_word: &str,
|
|
||||||
proximity: u8,
|
|
||||||
) -> heed::Result<Option<RoaringBitmap>>;
|
|
||||||
fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>>;
|
fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>>;
|
||||||
fn word_documents_count(&self, word: &str) -> heed::Result<Option<u64>> {
|
|
||||||
match self.word_docids(word)? {
|
|
||||||
Some(rb) => Ok(Some(rb.len())),
|
|
||||||
None => Ok(None),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
/// Returns the minimum word len for 1 and 2 typos.
|
/// Returns the minimum word len for 1 and 2 typos.
|
||||||
fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)>;
|
fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)>;
|
||||||
fn exact_words(&self) -> Option<&fst::Set<Cow<[u8]>>>;
|
fn exact_words(&self) -> Option<&fst::Set<Cow<[u8]>>>;
|
||||||
@ -166,9 +154,9 @@ trait Context {
|
|||||||
&self,
|
&self,
|
||||||
left_word: &str,
|
left_word: &str,
|
||||||
right_word: &str,
|
right_word: &str,
|
||||||
proximity: u8,
|
_proximity: u8,
|
||||||
) -> heed::Result<Option<u64>> {
|
) -> heed::Result<Option<u64>> {
|
||||||
match self.word_pair_proximity_docids(right_word, left_word, proximity)? {
|
match self.word_docids(&format!("{} {}", left_word, right_word))? {
|
||||||
Some(rb) => Ok(Some(rb.len())),
|
Some(rb) => Ok(Some(rb.len())),
|
||||||
None => Ok(None),
|
None => Ok(None),
|
||||||
}
|
}
|
||||||
@ -190,23 +178,10 @@ impl<'a> Context for QueryTreeBuilder<'a> {
|
|||||||
self.index.word_docids.get(self.rtxn, word)
|
self.index.word_docids.get(self.rtxn, word)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn word_pair_proximity_docids(
|
|
||||||
&self,
|
|
||||||
right_word: &str,
|
|
||||||
left_word: &str,
|
|
||||||
proximity: u8,
|
|
||||||
) -> heed::Result<Option<RoaringBitmap>> {
|
|
||||||
self.index.word_pair_proximity_docids.get(self.rtxn, &(left_word, right_word, proximity))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>> {
|
fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>> {
|
||||||
self.index.words_synonyms(self.rtxn, words)
|
self.index.words_synonyms(self.rtxn, words)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn word_documents_count(&self, word: &str) -> heed::Result<Option<u64>> {
|
|
||||||
self.index.word_documents_count(self.rtxn, word)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)> {
|
fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)> {
|
||||||
let one = self.index.min_word_len_one_typo(&self.rtxn)?;
|
let one = self.index.min_word_len_one_typo(&self.rtxn)?;
|
||||||
let two = self.index.min_word_len_two_typos(&self.rtxn)?;
|
let two = self.index.min_word_len_two_typos(&self.rtxn)?;
|
||||||
@ -306,7 +281,7 @@ impl<'a> QueryTreeBuilder<'a> {
|
|||||||
fn split_best_frequency<'a>(
|
fn split_best_frequency<'a>(
|
||||||
ctx: &impl Context,
|
ctx: &impl Context,
|
||||||
word: &'a str,
|
word: &'a str,
|
||||||
) -> heed::Result<Option<(&'a str, &'a str)>> {
|
) -> heed::Result<Option<(u64, &'a str, &'a str)>> {
|
||||||
let chars = word.char_indices().skip(1);
|
let chars = word.char_indices().skip(1);
|
||||||
let mut best = None;
|
let mut best = None;
|
||||||
|
|
||||||
@ -320,7 +295,7 @@ fn split_best_frequency<'a>(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(best.map(|(_, left, right)| (left, right)))
|
Ok(best)
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
@ -389,7 +364,7 @@ fn create_query_tree(
|
|||||||
// 4. wrap all in an OR operation
|
// 4. wrap all in an OR operation
|
||||||
PrimitiveQueryPart::Word(word, prefix) => {
|
PrimitiveQueryPart::Word(word, prefix) => {
|
||||||
let mut children = synonyms(ctx, &[&word])?.unwrap_or_default();
|
let mut children = synonyms(ctx, &[&word])?.unwrap_or_default();
|
||||||
if let Some((left, right)) = split_best_frequency(ctx, &word)? {
|
if let Some((_, left, right)) = split_best_frequency(ctx, &word)? {
|
||||||
children.push(Operation::Phrase(vec![left.to_string(), right.to_string()]));
|
children.push(Operation::Phrase(vec![left.to_string(), right.to_string()]));
|
||||||
}
|
}
|
||||||
let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?;
|
let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?;
|
||||||
@ -535,7 +510,8 @@ fn create_query_tree(
|
|||||||
.filter(|(_, part)| !part.is_phrase())
|
.filter(|(_, part)| !part.is_phrase())
|
||||||
.max_by_key(|(_, part)| match part {
|
.max_by_key(|(_, part)| match part {
|
||||||
PrimitiveQueryPart::Word(s, _) => {
|
PrimitiveQueryPart::Word(s, _) => {
|
||||||
ctx.word_documents_count(s).unwrap_or_default().unwrap_or(u64::max_value())
|
let (pair_freq, _, _) = split_best_frequency(ctx, s).unwrap_or_default().unwrap_or_default();
|
||||||
|
pair_freq
|
||||||
}
|
}
|
||||||
_ => unreachable!(),
|
_ => unreachable!(),
|
||||||
})
|
})
|
||||||
@ -582,7 +558,7 @@ fn create_matching_words(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if let Some((left, right)) = split_best_frequency(ctx, &word)? {
|
if let Some((_, left, right)) = split_best_frequency(ctx, &word)? {
|
||||||
let left = MatchingWord::new(left.to_string(), 0, false);
|
let left = MatchingWord::new(left.to_string(), 0, false);
|
||||||
let right = MatchingWord::new(right.to_string(), 0, false);
|
let right = MatchingWord::new(right.to_string(), 0, false);
|
||||||
matching_words.push((vec![left, right], vec![id]));
|
matching_words.push((vec![left, right], vec![id]));
|
||||||
@ -861,16 +837,6 @@ mod test {
|
|||||||
Ok(self.postings.get(word).cloned())
|
Ok(self.postings.get(word).cloned())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn word_pair_proximity_docids(
|
|
||||||
&self,
|
|
||||||
right_word: &str,
|
|
||||||
left_word: &str,
|
|
||||||
_: u8,
|
|
||||||
) -> heed::Result<Option<RoaringBitmap>> {
|
|
||||||
let bitmap = self.postings.get(&format!("{} {}", left_word, right_word));
|
|
||||||
Ok(bitmap.cloned())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>> {
|
fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>> {
|
||||||
let words: Vec<_> = words.iter().map(|s| s.as_ref().to_owned()).collect();
|
let words: Vec<_> = words.iter().map(|s| s.as_ref().to_owned()).collect();
|
||||||
Ok(self.synonyms.get(&words).cloned())
|
Ok(self.synonyms.get(&words).cloned())
|
||||||
|
Loading…
Reference in New Issue
Block a user