mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-11-23 10:37:41 +08:00
Simplify query_term module a bit
This commit is contained in:
parent
3f13608002
commit
4129d657e2
@ -1,3 +1,4 @@
|
|||||||
|
use std::borrow::Cow;
|
||||||
use std::collections::hash_map::Entry;
|
use std::collections::hash_map::Entry;
|
||||||
use std::hash::Hash;
|
use std::hash::Hash;
|
||||||
|
|
||||||
@ -24,6 +25,8 @@ pub struct DatabaseCache<'ctx> {
|
|||||||
pub word_docids: FxHashMap<Interned<String>, Option<&'ctx [u8]>>,
|
pub word_docids: FxHashMap<Interned<String>, Option<&'ctx [u8]>>,
|
||||||
pub exact_word_docids: FxHashMap<Interned<String>, Option<&'ctx [u8]>>,
|
pub exact_word_docids: FxHashMap<Interned<String>, Option<&'ctx [u8]>>,
|
||||||
pub word_prefix_docids: FxHashMap<Interned<String>, Option<&'ctx [u8]>>,
|
pub word_prefix_docids: FxHashMap<Interned<String>, Option<&'ctx [u8]>>,
|
||||||
|
|
||||||
|
pub words_fst: Option<fst::Set<Cow<'ctx, [u8]>>>,
|
||||||
}
|
}
|
||||||
impl<'ctx> DatabaseCache<'ctx> {
|
impl<'ctx> DatabaseCache<'ctx> {
|
||||||
fn get_value<'v, K1, KC>(
|
fn get_value<'v, K1, KC>(
|
||||||
@ -49,6 +52,16 @@ impl<'ctx> DatabaseCache<'ctx> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
impl<'ctx> SearchContext<'ctx> {
|
impl<'ctx> SearchContext<'ctx> {
|
||||||
|
pub fn get_words_fst(&mut self) -> Result<fst::Set<Cow<'ctx, [u8]>>> {
|
||||||
|
if let Some(fst) = self.db_cache.words_fst.clone() {
|
||||||
|
Ok(fst)
|
||||||
|
} else {
|
||||||
|
let fst = self.index.words_fst(self.txn)?;
|
||||||
|
self.db_cache.words_fst = Some(fst.clone());
|
||||||
|
Ok(fst)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Retrieve or insert the given value in the `word_docids` database.
|
/// Retrieve or insert the given value in the `word_docids` database.
|
||||||
pub fn get_db_word_docids(&mut self, word: Interned<String>) -> Result<Option<&'ctx [u8]>> {
|
pub fn get_db_word_docids(&mut self, word: Interned<String>) -> Result<Option<&'ctx [u8]>> {
|
||||||
DatabaseCache::get_value(
|
DatabaseCache::get_value(
|
||||||
|
@ -7,14 +7,14 @@ use charabia::{SeparatorKind, TokenKind};
|
|||||||
use fst::automaton::Str;
|
use fst::automaton::Str;
|
||||||
use fst::{Automaton, IntoStreamer, Streamer};
|
use fst::{Automaton, IntoStreamer, Streamer};
|
||||||
use heed::types::DecodeIgnore;
|
use heed::types::DecodeIgnore;
|
||||||
use heed::RoTxn;
|
use heed::BytesDecode;
|
||||||
use itertools::Itertools;
|
use itertools::Itertools;
|
||||||
|
|
||||||
use super::interner::{DedupInterner, Interned};
|
use super::interner::{DedupInterner, Interned};
|
||||||
use super::{limits, SearchContext};
|
use super::{limits, SearchContext};
|
||||||
use crate::search::fst_utils::{Complement, Intersection, StartsWith, Union};
|
use crate::search::fst_utils::{Complement, Intersection, StartsWith, Union};
|
||||||
use crate::search::{build_dfa, get_first};
|
use crate::search::{build_dfa, get_first};
|
||||||
use crate::{CboRoaringBitmapLenCodec, Index, Result, MAX_WORD_LENGTH};
|
use crate::{CboRoaringBitmapLenCodec, Result, MAX_WORD_LENGTH};
|
||||||
|
|
||||||
/// A phrase in the user's search query, consisting of several words
|
/// A phrase in the user's search query, consisting of several words
|
||||||
/// that must appear side-by-side in the search results.
|
/// that must appear side-by-side in the search results.
|
||||||
@ -191,18 +191,13 @@ impl QueryTermSubset {
|
|||||||
&self,
|
&self,
|
||||||
ctx: &mut SearchContext,
|
ctx: &mut SearchContext,
|
||||||
) -> Result<BTreeSet<Interned<String>>> {
|
) -> Result<BTreeSet<Interned<String>>> {
|
||||||
let original = ctx.term_interner.get_mut(self.original);
|
|
||||||
let mut result = BTreeSet::default();
|
let mut result = BTreeSet::default();
|
||||||
// TODO: a compute_partially function
|
// TODO: a compute_partially function
|
||||||
if !self.one_typo_subset.is_empty() || !self.two_typo_subset.is_empty() {
|
if !self.one_typo_subset.is_empty() || !self.two_typo_subset.is_empty() {
|
||||||
original.compute_fully_if_needed(
|
self.original.compute_fully_if_needed(ctx)?;
|
||||||
ctx.index,
|
|
||||||
ctx.txn,
|
|
||||||
&mut ctx.word_interner,
|
|
||||||
&mut ctx.phrase_interner,
|
|
||||||
)?;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let original = ctx.term_interner.get_mut(self.original);
|
||||||
if !self.zero_typo_subset.is_empty() {
|
if !self.zero_typo_subset.is_empty() {
|
||||||
let ZeroTypoTerm { phrase: _, zero_typo, prefix_of, synonyms: _, use_prefix_db: _ } =
|
let ZeroTypoTerm { phrase: _, zero_typo, prefix_of, synonyms: _, use_prefix_db: _ } =
|
||||||
&original.zero_typo;
|
&original.zero_typo;
|
||||||
@ -245,18 +240,13 @@ impl QueryTermSubset {
|
|||||||
Ok(result)
|
Ok(result)
|
||||||
}
|
}
|
||||||
pub fn all_phrases(&self, ctx: &mut SearchContext) -> Result<BTreeSet<Interned<Phrase>>> {
|
pub fn all_phrases(&self, ctx: &mut SearchContext) -> Result<BTreeSet<Interned<Phrase>>> {
|
||||||
let original = ctx.term_interner.get_mut(self.original);
|
|
||||||
let mut result = BTreeSet::default();
|
let mut result = BTreeSet::default();
|
||||||
|
|
||||||
if !self.one_typo_subset.is_empty() {
|
if !self.one_typo_subset.is_empty() {
|
||||||
// TODO: compute less than fully if possible
|
// TODO: compute less than fully if possible
|
||||||
original.compute_fully_if_needed(
|
self.original.compute_fully_if_needed(ctx)?;
|
||||||
ctx.index,
|
|
||||||
ctx.txn,
|
|
||||||
&mut ctx.word_interner,
|
|
||||||
&mut ctx.phrase_interner,
|
|
||||||
)?;
|
|
||||||
}
|
}
|
||||||
|
let original = ctx.term_interner.get_mut(self.original);
|
||||||
|
|
||||||
let ZeroTypoTerm { phrase, zero_typo: _, prefix_of: _, synonyms, use_prefix_db: _ } =
|
let ZeroTypoTerm { phrase, zero_typo: _, prefix_of: _, synonyms, use_prefix_db: _ } =
|
||||||
&original.zero_typo;
|
&original.zero_typo;
|
||||||
@ -274,26 +264,23 @@ impl QueryTermSubset {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl QueryTerm {
|
impl Interned<QueryTerm> {
|
||||||
pub fn compute_fully_if_needed(
|
pub fn compute_fully_if_needed(self, ctx: &mut SearchContext) -> Result<()> {
|
||||||
&mut self,
|
let s = ctx.term_interner.get_mut(self);
|
||||||
index: &Index,
|
if s.max_nbr_typos == 0 {
|
||||||
txn: &RoTxn,
|
s.one_typo = Lazy::Init(OneTypoTerm::default());
|
||||||
word_interner: &mut DedupInterner<String>,
|
s.two_typo = Lazy::Init(TwoTypoTerm::default());
|
||||||
phrase_interner: &mut DedupInterner<Phrase>,
|
} else if s.max_nbr_typos == 1 && s.one_typo.is_uninit() {
|
||||||
) -> Result<()> {
|
assert!(s.two_typo.is_uninit());
|
||||||
if self.max_nbr_typos == 0 {
|
self.initialize_one_typo_subterm(ctx)?;
|
||||||
self.one_typo = Lazy::Init(OneTypoTerm::default());
|
let s = ctx.term_interner.get_mut(self);
|
||||||
self.two_typo = Lazy::Init(TwoTypoTerm::default());
|
assert!(s.one_typo.is_init());
|
||||||
} else if self.max_nbr_typos == 1 && self.one_typo.is_uninit() {
|
s.two_typo = Lazy::Init(TwoTypoTerm::default());
|
||||||
assert!(self.two_typo.is_uninit());
|
} else if s.max_nbr_typos > 1 && s.two_typo.is_uninit() {
|
||||||
self.initialize_one_typo_subterm(index, txn, word_interner, phrase_interner)?;
|
assert!(s.two_typo.is_uninit());
|
||||||
assert!(self.one_typo.is_init());
|
self.initialize_one_and_two_typo_subterm(ctx)?;
|
||||||
self.two_typo = Lazy::Init(TwoTypoTerm::default());
|
let s = ctx.term_interner.get_mut(self);
|
||||||
} else if self.max_nbr_typos > 1 && self.two_typo.is_uninit() {
|
assert!(s.one_typo.is_init() && s.two_typo.is_init());
|
||||||
assert!(self.two_typo.is_uninit());
|
|
||||||
self.initialize_one_and_two_typo_subterm(index, txn, word_interner, phrase_interner)?;
|
|
||||||
assert!(self.one_typo.is_init() && self.two_typo.is_init());
|
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
@ -302,7 +289,7 @@ impl QueryTerm {
|
|||||||
#[derive(Clone, PartialEq, Eq, Hash)]
|
#[derive(Clone, PartialEq, Eq, Hash)]
|
||||||
pub struct QueryTerm {
|
pub struct QueryTerm {
|
||||||
pub original: Interned<String>,
|
pub original: Interned<String>,
|
||||||
pub is_multiple_words: bool,
|
pub ngram_words: Option<Vec<Interned<String>>>,
|
||||||
pub max_nbr_typos: u8,
|
pub max_nbr_typos: u8,
|
||||||
pub is_prefix: bool,
|
pub is_prefix: bool,
|
||||||
pub zero_typo: ZeroTypoTerm,
|
pub zero_typo: ZeroTypoTerm,
|
||||||
@ -363,39 +350,6 @@ impl TwoTypoTerm {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl QueryTerm {
|
impl QueryTerm {
|
||||||
pub fn phrase(
|
|
||||||
word_interner: &mut DedupInterner<String>,
|
|
||||||
phrase_interner: &mut DedupInterner<Phrase>,
|
|
||||||
phrase: Phrase,
|
|
||||||
) -> Self {
|
|
||||||
Self {
|
|
||||||
original: word_interner.insert(phrase.description(word_interner)),
|
|
||||||
is_multiple_words: false,
|
|
||||||
max_nbr_typos: 0,
|
|
||||||
is_prefix: false,
|
|
||||||
zero_typo: ZeroTypoTerm {
|
|
||||||
phrase: Some(phrase_interner.insert(phrase)),
|
|
||||||
zero_typo: None,
|
|
||||||
prefix_of: BTreeSet::default(),
|
|
||||||
synonyms: BTreeSet::default(),
|
|
||||||
use_prefix_db: None,
|
|
||||||
},
|
|
||||||
one_typo: Lazy::Uninit,
|
|
||||||
two_typo: Lazy::Uninit,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
pub fn empty(word_interner: &mut DedupInterner<String>, original: &str) -> Self {
|
|
||||||
Self {
|
|
||||||
original: word_interner.insert(original.to_owned()),
|
|
||||||
is_multiple_words: false,
|
|
||||||
is_prefix: false,
|
|
||||||
max_nbr_typos: 0,
|
|
||||||
zero_typo: <_>::default(),
|
|
||||||
one_typo: Lazy::Init(<_>::default()),
|
|
||||||
two_typo: Lazy::Init(<_>::default()),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn is_empty(&self) -> bool {
|
pub fn is_empty(&self) -> bool {
|
||||||
let Lazy::Init(one_typo) = &self.one_typo else {
|
let Lazy::Init(one_typo) = &self.one_typo else {
|
||||||
return false;
|
return false;
|
||||||
@ -438,13 +392,13 @@ fn find_zero_typo_prefix_derivations(
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn find_zero_one_typo_derivations(
|
fn find_zero_one_typo_derivations(
|
||||||
|
ctx: &mut SearchContext,
|
||||||
word_interned: Interned<String>,
|
word_interned: Interned<String>,
|
||||||
is_prefix: bool,
|
is_prefix: bool,
|
||||||
fst: fst::Set<Cow<[u8]>>,
|
|
||||||
word_interner: &mut DedupInterner<String>,
|
|
||||||
mut visit: impl FnMut(Interned<String>, ZeroOrOneTypo) -> Result<ControlFlow<()>>,
|
mut visit: impl FnMut(Interned<String>, ZeroOrOneTypo) -> Result<ControlFlow<()>>,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
let word = word_interner.get(word_interned).to_owned();
|
let fst = ctx.get_words_fst()?;
|
||||||
|
let word = ctx.word_interner.get(word_interned).to_owned();
|
||||||
let word = word.as_str();
|
let word = word.as_str();
|
||||||
|
|
||||||
let dfa = build_dfa(word, 1, is_prefix);
|
let dfa = build_dfa(word, 1, is_prefix);
|
||||||
@ -453,7 +407,7 @@ fn find_zero_one_typo_derivations(
|
|||||||
|
|
||||||
while let Some((derived_word, state)) = stream.next() {
|
while let Some((derived_word, state)) = stream.next() {
|
||||||
let derived_word = std::str::from_utf8(derived_word)?;
|
let derived_word = std::str::from_utf8(derived_word)?;
|
||||||
let derived_word = word_interner.insert(derived_word.to_owned());
|
let derived_word = ctx.word_interner.insert(derived_word.to_owned());
|
||||||
let d = dfa.distance(state.1);
|
let d = dfa.distance(state.1);
|
||||||
match d.to_u8() {
|
match d.to_u8() {
|
||||||
0 => {
|
0 => {
|
||||||
@ -553,7 +507,17 @@ fn partially_initialized_term_from_word(
|
|||||||
let word_interned = ctx.word_interner.insert(word.to_owned());
|
let word_interned = ctx.word_interner.insert(word.to_owned());
|
||||||
|
|
||||||
if word.len() > MAX_WORD_LENGTH {
|
if word.len() > MAX_WORD_LENGTH {
|
||||||
return Ok(QueryTerm::empty(&mut ctx.word_interner, word));
|
return Ok({
|
||||||
|
QueryTerm {
|
||||||
|
original: ctx.word_interner.insert(word.to_owned()),
|
||||||
|
ngram_words: None,
|
||||||
|
is_prefix: false,
|
||||||
|
max_nbr_typos: 0,
|
||||||
|
zero_typo: <_>::default(),
|
||||||
|
one_typo: Lazy::Init(<_>::default()),
|
||||||
|
two_typo: Lazy::Init(<_>::default()),
|
||||||
|
}
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
let fst = ctx.index.words_fst(ctx.txn)?;
|
let fst = ctx.index.words_fst(ctx.txn)?;
|
||||||
@ -610,7 +574,7 @@ fn partially_initialized_term_from_word(
|
|||||||
|
|
||||||
Ok(QueryTerm {
|
Ok(QueryTerm {
|
||||||
original: word_interned,
|
original: word_interned,
|
||||||
is_multiple_words: false,
|
ngram_words: None,
|
||||||
max_nbr_typos: max_typo,
|
max_nbr_typos: max_typo,
|
||||||
is_prefix,
|
is_prefix,
|
||||||
zero_typo,
|
zero_typo,
|
||||||
@ -619,72 +583,52 @@ fn partially_initialized_term_from_word(
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
fn find_split_words(
|
fn find_split_words(ctx: &mut SearchContext, word: &str) -> Result<Option<Interned<Phrase>>> {
|
||||||
index: &Index,
|
if let Some((l, r)) = split_best_frequency(ctx, word)? {
|
||||||
txn: &RoTxn,
|
Ok(Some(ctx.phrase_interner.insert(Phrase { words: vec![Some(l), Some(r)] })))
|
||||||
word_interner: &mut DedupInterner<String>,
|
} else {
|
||||||
phrase_interner: &mut DedupInterner<Phrase>,
|
Ok(None)
|
||||||
word: &str,
|
}
|
||||||
) -> Result<Option<Interned<Phrase>>> {
|
|
||||||
let split_words = split_best_frequency(index, txn, word)?.map(|(l, r)| {
|
|
||||||
phrase_interner.insert(Phrase {
|
|
||||||
words: vec![Some(word_interner.insert(l)), Some(word_interner.insert(r))],
|
|
||||||
})
|
|
||||||
});
|
|
||||||
Ok(split_words)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl QueryTerm {
|
impl Interned<QueryTerm> {
|
||||||
fn initialize_one_typo_subterm(
|
fn initialize_one_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> {
|
||||||
&mut self,
|
let self_mut = ctx.term_interner.get_mut(self);
|
||||||
index: &Index,
|
let QueryTerm { original, is_prefix, one_typo, .. } = self_mut;
|
||||||
txn: &RoTxn,
|
let original = *original;
|
||||||
word_interner: &mut DedupInterner<String>,
|
let is_prefix = *is_prefix;
|
||||||
phrase_interner: &mut DedupInterner<Phrase>,
|
// let original_str = ctx.word_interner.get(*original).to_owned();
|
||||||
) -> Result<()> {
|
|
||||||
let QueryTerm { original, is_prefix, one_typo, .. } = self;
|
|
||||||
let original_str = word_interner.get(*original).to_owned();
|
|
||||||
if one_typo.is_init() {
|
if one_typo.is_init() {
|
||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
let mut one_typo_words = BTreeSet::new();
|
let mut one_typo_words = BTreeSet::new();
|
||||||
|
|
||||||
find_zero_one_typo_derivations(
|
find_zero_one_typo_derivations(ctx, original, is_prefix, |derived_word, nbr_typos| {
|
||||||
*original,
|
match nbr_typos {
|
||||||
*is_prefix,
|
ZeroOrOneTypo::Zero => {}
|
||||||
index.words_fst(txn)?,
|
ZeroOrOneTypo::One => {
|
||||||
word_interner,
|
if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT {
|
||||||
|derived_word, nbr_typos| {
|
one_typo_words.insert(derived_word);
|
||||||
match nbr_typos {
|
} else {
|
||||||
ZeroOrOneTypo::Zero => {}
|
return Ok(ControlFlow::Break(()));
|
||||||
ZeroOrOneTypo::One => {
|
|
||||||
if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT {
|
|
||||||
one_typo_words.insert(derived_word);
|
|
||||||
} else {
|
|
||||||
return Ok(ControlFlow::Break(()));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Ok(ControlFlow::Continue(()))
|
}
|
||||||
},
|
Ok(ControlFlow::Continue(()))
|
||||||
)?;
|
})?;
|
||||||
let split_words =
|
let original_str = ctx.word_interner.get(original).to_owned();
|
||||||
find_split_words(index, txn, word_interner, phrase_interner, original_str.as_str())?;
|
let split_words = find_split_words(ctx, original_str.as_str())?;
|
||||||
let one_typo = OneTypoTerm { split_words, one_typo: one_typo_words };
|
let one_typo = OneTypoTerm { split_words, one_typo: one_typo_words };
|
||||||
|
|
||||||
self.one_typo = Lazy::Init(one_typo);
|
let self_mut = ctx.term_interner.get_mut(self);
|
||||||
|
self_mut.one_typo = Lazy::Init(one_typo);
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
fn initialize_one_and_two_typo_subterm(
|
fn initialize_one_and_two_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> {
|
||||||
&mut self,
|
let self_mut = ctx.term_interner.get_mut(self);
|
||||||
index: &Index,
|
let QueryTerm { original, is_prefix, two_typo, .. } = self_mut;
|
||||||
txn: &RoTxn,
|
let original_str = ctx.word_interner.get(*original).to_owned();
|
||||||
word_interner: &mut DedupInterner<String>,
|
|
||||||
phrase_interner: &mut DedupInterner<Phrase>,
|
|
||||||
) -> Result<()> {
|
|
||||||
let QueryTerm { original, is_prefix, two_typo, .. } = self;
|
|
||||||
let original_str = word_interner.get(*original).to_owned();
|
|
||||||
if two_typo.is_init() {
|
if two_typo.is_init() {
|
||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
@ -694,8 +638,8 @@ impl QueryTerm {
|
|||||||
find_zero_one_two_typo_derivations(
|
find_zero_one_two_typo_derivations(
|
||||||
*original,
|
*original,
|
||||||
*is_prefix,
|
*is_prefix,
|
||||||
index.words_fst(txn)?,
|
ctx.index.words_fst(ctx.txn)?,
|
||||||
word_interner,
|
&mut ctx.word_interner,
|
||||||
|derived_word, nbr_typos| {
|
|derived_word, nbr_typos| {
|
||||||
if one_typo_words.len() >= limits::MAX_ONE_TYPO_COUNT
|
if one_typo_words.len() >= limits::MAX_ONE_TYPO_COUNT
|
||||||
&& two_typo_words.len() >= limits::MAX_TWO_TYPOS_COUNT
|
&& two_typo_words.len() >= limits::MAX_TWO_TYPOS_COUNT
|
||||||
@ -719,14 +663,15 @@ impl QueryTerm {
|
|||||||
Ok(ControlFlow::Continue(()))
|
Ok(ControlFlow::Continue(()))
|
||||||
},
|
},
|
||||||
)?;
|
)?;
|
||||||
let split_words =
|
let split_words = find_split_words(ctx, original_str.as_str())?;
|
||||||
find_split_words(index, txn, word_interner, phrase_interner, original_str.as_str())?;
|
let self_mut = ctx.term_interner.get_mut(self);
|
||||||
|
|
||||||
let one_typo = OneTypoTerm { one_typo: one_typo_words, split_words };
|
let one_typo = OneTypoTerm { one_typo: one_typo_words, split_words };
|
||||||
|
|
||||||
let two_typo = TwoTypoTerm { two_typos: two_typo_words };
|
let two_typo = TwoTypoTerm { two_typos: two_typo_words };
|
||||||
|
|
||||||
self.one_typo = Lazy::Init(one_typo);
|
self_mut.one_typo = Lazy::Init(one_typo);
|
||||||
self.two_typo = Lazy::Init(two_typo);
|
self_mut.two_typo = Lazy::Init(two_typo);
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
@ -737,38 +682,37 @@ impl QueryTerm {
|
|||||||
///
|
///
|
||||||
/// Return `None` if the original word cannot be split.
|
/// Return `None` if the original word cannot be split.
|
||||||
fn split_best_frequency(
|
fn split_best_frequency(
|
||||||
index: &Index,
|
ctx: &mut SearchContext,
|
||||||
txn: &RoTxn,
|
|
||||||
original: &str,
|
original: &str,
|
||||||
) -> Result<Option<(String, String)>> {
|
) -> Result<Option<(Interned<String>, Interned<String>)>> {
|
||||||
let chars = original.char_indices().skip(1);
|
let chars = original.char_indices().skip(1);
|
||||||
let mut best = None;
|
let mut best = None;
|
||||||
|
|
||||||
for (i, _) in chars {
|
for (i, _) in chars {
|
||||||
let (left, right) = original.split_at(i);
|
let (left, right) = original.split_at(i);
|
||||||
|
let left = ctx.word_interner.insert(left.to_owned());
|
||||||
|
let right = ctx.word_interner.insert(right.to_owned());
|
||||||
|
|
||||||
let key = (1, left, right);
|
if let Some(docid_bytes) = ctx.get_db_word_pair_proximity_docids(left, right, 1)? {
|
||||||
let frequency = index
|
let frequency =
|
||||||
.word_pair_proximity_docids
|
CboRoaringBitmapLenCodec::bytes_decode(docid_bytes).ok_or(heed::Error::Decoding)?;
|
||||||
.remap_data_type::<CboRoaringBitmapLenCodec>()
|
if best.map_or(true, |(old, _, _)| frequency > old) {
|
||||||
.get(txn, &key)?
|
best = Some((frequency, left, right));
|
||||||
.unwrap_or(0);
|
}
|
||||||
|
|
||||||
if frequency != 0 && best.map_or(true, |(old, _, _)| frequency > old) {
|
|
||||||
best = Some((frequency, left, right));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(best.map(|(_, left, right)| (left.to_owned(), right.to_owned())))
|
Ok(best.map(|(_, left, right)| (left, right)))
|
||||||
}
|
}
|
||||||
|
|
||||||
impl QueryTerm {
|
impl Interned<QueryTerm> {
|
||||||
/// Return the original word from the given query term
|
/// Return the original word from the given query term
|
||||||
pub fn original_single_word(&self) -> Option<Interned<String>> {
|
pub fn original_single_word(self, ctx: &SearchContext) -> Option<Interned<String>> {
|
||||||
if self.is_multiple_words {
|
let self_ = ctx.term_interner.get(self);
|
||||||
|
if self_.ngram_words.is_some() {
|
||||||
None
|
None
|
||||||
} else {
|
} else {
|
||||||
Some(self.original)
|
Some(self_.original)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -824,11 +768,25 @@ impl PhraseBuilder {
|
|||||||
return None;
|
return None;
|
||||||
}
|
}
|
||||||
Some(LocatedQueryTerm {
|
Some(LocatedQueryTerm {
|
||||||
value: ctx.term_interner.push(QueryTerm::phrase(
|
value: ctx.term_interner.push({
|
||||||
&mut ctx.word_interner,
|
let phrase = Phrase { words: self.words };
|
||||||
&mut ctx.phrase_interner,
|
let phrase_desc = phrase.description(&ctx.word_interner);
|
||||||
Phrase { words: self.words },
|
QueryTerm {
|
||||||
)),
|
original: ctx.word_interner.insert(phrase_desc),
|
||||||
|
ngram_words: None,
|
||||||
|
max_nbr_typos: 0,
|
||||||
|
is_prefix: false,
|
||||||
|
zero_typo: ZeroTypoTerm {
|
||||||
|
phrase: Some(ctx.phrase_interner.insert(phrase)),
|
||||||
|
zero_typo: None,
|
||||||
|
prefix_of: BTreeSet::default(),
|
||||||
|
synonyms: BTreeSet::default(),
|
||||||
|
use_prefix_db: None,
|
||||||
|
},
|
||||||
|
one_typo: Lazy::Uninit,
|
||||||
|
two_typo: Lazy::Uninit,
|
||||||
|
}
|
||||||
|
}),
|
||||||
positions: self.start..=self.end,
|
positions: self.start..=self.end,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
@ -1001,7 +959,7 @@ pub fn make_ngram(
|
|||||||
}
|
}
|
||||||
let mut words_interned = vec![];
|
let mut words_interned = vec![];
|
||||||
for term in terms {
|
for term in terms {
|
||||||
if let Some(original_term_word) = ctx.term_interner.get(term.value).original_single_word() {
|
if let Some(original_term_word) = term.value.original_single_word(ctx) {
|
||||||
words_interned.push(original_term_word);
|
words_interned.push(original_term_word);
|
||||||
} else {
|
} else {
|
||||||
return Ok(None);
|
return Ok(None);
|
||||||
@ -1036,7 +994,7 @@ pub fn make_ngram(
|
|||||||
|
|
||||||
let term = QueryTerm {
|
let term = QueryTerm {
|
||||||
original: ngram_str_interned,
|
original: ngram_str_interned,
|
||||||
is_multiple_words: true,
|
ngram_words: Some(words_interned),
|
||||||
is_prefix,
|
is_prefix,
|
||||||
max_nbr_typos,
|
max_nbr_typos,
|
||||||
zero_typo: term.zero_typo,
|
zero_typo: term.zero_typo,
|
||||||
|
Loading…
Reference in New Issue
Block a user