Merge branch 'search-refactor-typo-attributes' into search-refactor

This commit is contained in:
Loïc Lecrenier 2023-04-12 16:47:31 +02:00
commit 644e136aee
10 changed files with 254 additions and 95 deletions

View File

@ -4,10 +4,14 @@ use std::hash::Hash;
use fxhash::FxHashMap; use fxhash::FxHashMap;
use heed::types::ByteSlice; use heed::types::ByteSlice;
use heed::{BytesEncode, Database, RoTxn}; use heed::{BytesDecode, BytesEncode, Database, RoTxn};
use roaring::RoaringBitmap;
use super::interner::Interned; use super::interner::Interned;
use crate::{Result, SearchContext}; use super::Word;
use crate::{
CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, RoaringBitmapCodec, SearchContext,
};
/// A cache storing pointers to values in the LMDB databases. /// A cache storing pointers to values in the LMDB databases.
/// ///
@ -25,6 +29,7 @@ pub struct DatabaseCache<'ctx> {
pub word_docids: FxHashMap<Interned<String>, Option<&'ctx [u8]>>, pub word_docids: FxHashMap<Interned<String>, Option<&'ctx [u8]>>,
pub exact_word_docids: FxHashMap<Interned<String>, Option<&'ctx [u8]>>, pub exact_word_docids: FxHashMap<Interned<String>, Option<&'ctx [u8]>>,
pub word_prefix_docids: FxHashMap<Interned<String>, Option<&'ctx [u8]>>, pub word_prefix_docids: FxHashMap<Interned<String>, Option<&'ctx [u8]>>,
pub exact_word_prefix_docids: FxHashMap<Interned<String>, Option<&'ctx [u8]>>,
pub words_fst: Option<fst::Set<Cow<'ctx, [u8]>>>, pub words_fst: Option<fst::Set<Cow<'ctx, [u8]>>>,
pub word_position_docids: FxHashMap<(Interned<String>, u16), Option<&'ctx [u8]>>, pub word_position_docids: FxHashMap<(Interned<String>, u16), Option<&'ctx [u8]>>,
@ -64,28 +69,103 @@ impl<'ctx> SearchContext<'ctx> {
} }
} }
/// Fetch the ids of the documents containing `word`.
///
/// A [`Word::Original`] is looked up in both the `exact_word_docids` and the `word_docids`
/// databases and the two bitmaps are merged. A [`Word::Derived`] is looked up in the
/// `word_docids` database only.
///
/// Returns `Ok(None)` when none of the consulted databases has an entry for the word.
pub fn word_docids(&mut self, word: Word) -> Result<Option<RoaringBitmap>> {
    let original = match word {
        Word::Derived(derived) => return self.get_db_word_docids(derived),
        Word::Original(original) => original,
    };

    // Query the exact database first, then the regular one.
    let exact_docids = self.get_db_exact_word_docids(original)?;
    let tolerant_docids = self.get_db_word_docids(original)?;

    Ok(match (exact_docids, tolerant_docids) {
        (Some(mut merged), Some(tolerant_docids)) => {
            merged |= tolerant_docids;
            Some(merged)
        }
        // At most one side is present here: keep whichever one it is.
        (exact_docids, tolerant_docids) => exact_docids.or(tolerant_docids),
    })
}
/// Retrieve or insert the given value in the `word_docids` database. /// Retrieve or insert the given value in the `word_docids` database.
pub fn get_db_word_docids(&mut self, word: Interned<String>) -> Result<Option<&'ctx [u8]>> { fn get_db_word_docids(&mut self, word: Interned<String>) -> Result<Option<RoaringBitmap>> {
DatabaseCache::get_value( DatabaseCache::get_value(
self.txn, self.txn,
word, word,
self.word_interner.get(word).as_str(), self.word_interner.get(word).as_str(),
&mut self.db_cache.word_docids, &mut self.db_cache.word_docids,
self.index.word_docids.remap_data_type::<ByteSlice>(), self.index.word_docids.remap_data_type::<ByteSlice>(),
) )?
.map(|bytes| RoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into()))
.transpose()
} }
fn get_db_exact_word_docids(
&mut self,
word: Interned<String>,
) -> Result<Option<RoaringBitmap>> {
DatabaseCache::get_value(
self.txn,
word,
self.word_interner.get(word).as_str(),
&mut self.db_cache.exact_word_docids,
self.index.exact_word_docids.remap_data_type::<ByteSlice>(),
)?
.map(|bytes| RoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into()))
.transpose()
}
/// Fetch the ids of the documents containing a word starting with `prefix`.
///
/// A [`Word::Original`] is looked up in both the `exact_word_prefix_docids` and the
/// `word_prefix_docids` databases and the two bitmaps are merged. A [`Word::Derived`] is
/// looked up in the `word_prefix_docids` database only.
///
/// Returns `Ok(None)` when none of the consulted databases has an entry for the prefix.
pub fn word_prefix_docids(&mut self, prefix: Word) -> Result<Option<RoaringBitmap>> {
    let original = match prefix {
        Word::Derived(derived) => return self.get_db_word_prefix_docids(derived),
        Word::Original(original) => original,
    };

    // Query the exact database first, then the regular one.
    let exact_docids = self.get_db_exact_word_prefix_docids(original)?;
    let tolerant_docids = self.get_db_word_prefix_docids(original)?;

    Ok(match (exact_docids, tolerant_docids) {
        (Some(mut merged), Some(tolerant_docids)) => {
            merged |= tolerant_docids;
            Some(merged)
        }
        // At most one side is present here: keep whichever one it is.
        (exact_docids, tolerant_docids) => exact_docids.or(tolerant_docids),
    })
}
/// Retrieve or insert the given value in the `word_prefix_docids` database. /// Retrieve or insert the given value in the `word_prefix_docids` database.
pub fn get_db_word_prefix_docids( fn get_db_word_prefix_docids(
&mut self, &mut self,
prefix: Interned<String>, prefix: Interned<String>,
) -> Result<Option<&'ctx [u8]>> { ) -> Result<Option<RoaringBitmap>> {
DatabaseCache::get_value( DatabaseCache::get_value(
self.txn, self.txn,
prefix, prefix,
self.word_interner.get(prefix).as_str(), self.word_interner.get(prefix).as_str(),
&mut self.db_cache.word_prefix_docids, &mut self.db_cache.word_prefix_docids,
self.index.word_prefix_docids.remap_data_type::<ByteSlice>(), self.index.word_prefix_docids.remap_data_type::<ByteSlice>(),
) )?
.map(|bytes| RoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into()))
.transpose()
}
/// Look up `prefix` in the `exact_word_prefix_docids` database, going through the
/// `db_cache.exact_word_prefix_docids` cache, and decode the stored bytes into a bitmap.
///
/// Returns `Ok(None)` when the lookup yields no value, and a `heed::Error::Decoding`
/// error when the stored bytes cannot be decoded by `RoaringBitmapCodec`.
fn get_db_exact_word_prefix_docids(
&mut self,
prefix: Interned<String>,
) -> Result<Option<RoaringBitmap>> {
DatabaseCache::get_value(
self.txn,
prefix,
self.word_interner.get(prefix).as_str(),
&mut self.db_cache.exact_word_prefix_docids,
self.index.exact_word_prefix_docids.remap_data_type::<ByteSlice>(),
)?
// `Option<Result<_>>` -> `Result<Option<_>>`: a missing entry stays `None`, a failed decode is an error.
.map(|bytes| RoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into()))
.transpose()
} }
pub fn get_db_word_pair_proximity_docids( pub fn get_db_word_pair_proximity_docids(
@ -93,7 +173,7 @@ impl<'ctx> SearchContext<'ctx> {
word1: Interned<String>, word1: Interned<String>,
word2: Interned<String>, word2: Interned<String>,
proximity: u8, proximity: u8,
) -> Result<Option<&'ctx [u8]>> { ) -> Result<Option<RoaringBitmap>> {
DatabaseCache::get_value( DatabaseCache::get_value(
self.txn, self.txn,
(proximity, word1, word2), (proximity, word1, word2),
@ -104,7 +184,32 @@ impl<'ctx> SearchContext<'ctx> {
), ),
&mut self.db_cache.word_pair_proximity_docids, &mut self.db_cache.word_pair_proximity_docids,
self.index.word_pair_proximity_docids.remap_data_type::<ByteSlice>(), self.index.word_pair_proximity_docids.remap_data_type::<ByteSlice>(),
) )?
.map(|bytes| CboRoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into()))
.transpose()
}
/// Look up the `(proximity, word1, word2)` entry of the `word_pair_proximity_docids` database
/// and decode it with `CboRoaringBitmapLenCodec` instead of building the full bitmap.
///
/// The lookup goes through the same `db_cache.word_pair_proximity_docids` cache as
/// `get_db_word_pair_proximity_docids`.
///
/// Returns `Ok(None)` when the lookup yields no value, and a `heed::Error::Decoding` error
/// when the stored bytes cannot be decoded.
// NOTE(review): the returned `u64` is presumably the number of document ids in the stored
// bitmap (callers use it as a frequency) — confirm against `CboRoaringBitmapLenCodec`.
pub fn get_db_word_pair_proximity_docids_len(
&mut self,
word1: Interned<String>,
word2: Interned<String>,
proximity: u8,
) -> Result<Option<u64>> {
DatabaseCache::get_value(
self.txn,
(proximity, word1, word2),
&(
proximity,
self.word_interner.get(word1).as_str(),
self.word_interner.get(word2).as_str(),
),
&mut self.db_cache.word_pair_proximity_docids,
self.index.word_pair_proximity_docids.remap_data_type::<ByteSlice>(),
)?
// `Option<Result<_>>` -> `Result<Option<_>>`: a missing entry stays `None`, a failed decode is an error.
.map(|bytes| {
CboRoaringBitmapLenCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into())
})
.transpose()
} }
pub fn get_db_word_prefix_pair_proximity_docids( pub fn get_db_word_prefix_pair_proximity_docids(
@ -112,7 +217,7 @@ impl<'ctx> SearchContext<'ctx> {
word1: Interned<String>, word1: Interned<String>,
prefix2: Interned<String>, prefix2: Interned<String>,
proximity: u8, proximity: u8,
) -> Result<Option<&'ctx [u8]>> { ) -> Result<Option<RoaringBitmap>> {
DatabaseCache::get_value( DatabaseCache::get_value(
self.txn, self.txn,
(proximity, word1, prefix2), (proximity, word1, prefix2),
@ -123,14 +228,16 @@ impl<'ctx> SearchContext<'ctx> {
), ),
&mut self.db_cache.word_prefix_pair_proximity_docids, &mut self.db_cache.word_prefix_pair_proximity_docids,
self.index.word_prefix_pair_proximity_docids.remap_data_type::<ByteSlice>(), self.index.word_prefix_pair_proximity_docids.remap_data_type::<ByteSlice>(),
) )?
.map(|bytes| CboRoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into()))
.transpose()
} }
pub fn get_db_prefix_word_pair_proximity_docids( pub fn get_db_prefix_word_pair_proximity_docids(
&mut self, &mut self,
left_prefix: Interned<String>, left_prefix: Interned<String>,
right: Interned<String>, right: Interned<String>,
proximity: u8, proximity: u8,
) -> Result<Option<&'ctx [u8]>> { ) -> Result<Option<RoaringBitmap>> {
DatabaseCache::get_value( DatabaseCache::get_value(
self.txn, self.txn,
(proximity, left_prefix, right), (proximity, left_prefix, right),
@ -141,34 +248,40 @@ impl<'ctx> SearchContext<'ctx> {
), ),
&mut self.db_cache.prefix_word_pair_proximity_docids, &mut self.db_cache.prefix_word_pair_proximity_docids,
self.index.prefix_word_pair_proximity_docids.remap_data_type::<ByteSlice>(), self.index.prefix_word_pair_proximity_docids.remap_data_type::<ByteSlice>(),
) )?
.map(|bytes| CboRoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into()))
.transpose()
} }
pub fn get_db_word_position_docids( pub fn get_db_word_position_docids(
&mut self, &mut self,
word: Interned<String>, word: Interned<String>,
position: u16, position: u16,
) -> Result<Option<&'ctx [u8]>> { ) -> Result<Option<RoaringBitmap>> {
DatabaseCache::get_value( DatabaseCache::get_value(
self.txn, self.txn,
(word, position), (word, position),
&(self.word_interner.get(word).as_str(), position), &(self.word_interner.get(word).as_str(), position),
&mut self.db_cache.word_position_docids, &mut self.db_cache.word_position_docids,
self.index.word_position_docids.remap_data_type::<ByteSlice>(), self.index.word_position_docids.remap_data_type::<ByteSlice>(),
) )?
.map(|bytes| CboRoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into()))
.transpose()
} }
pub fn get_db_word_fid_docids( pub fn get_db_word_fid_docids(
&mut self, &mut self,
word: Interned<String>, word: Interned<String>,
fid: u16, fid: u16,
) -> Result<Option<&'ctx [u8]>> { ) -> Result<Option<RoaringBitmap>> {
DatabaseCache::get_value( DatabaseCache::get_value(
self.txn, self.txn,
(word, fid), (word, fid),
&(self.word_interner.get(word).as_str(), fid), &(self.word_interner.get(word).as_str(), fid),
&mut self.db_cache.word_fid_docids, &mut self.db_cache.word_fid_docids,
self.index.word_fid_docids.remap_data_type::<ByteSlice>(), self.index.word_fid_docids.remap_data_type::<ByteSlice>(),
) )?
.map(|bytes| CboRoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into()))
.transpose()
} }
} }

View File

@ -1,11 +1,10 @@
use heed::BytesDecode;
use roaring::{MultiOps, RoaringBitmap}; use roaring::{MultiOps, RoaringBitmap};
use super::query_graph::QueryGraph; use super::query_graph::QueryGraph;
use super::ranking_rules::{RankingRule, RankingRuleOutput}; use super::ranking_rules::{RankingRule, RankingRuleOutput};
use crate::search::new::query_graph::QueryNodeData; use crate::search::new::query_graph::QueryNodeData;
use crate::search::new::query_term::ExactTerm; use crate::search::new::query_term::ExactTerm;
use crate::{CboRoaringBitmapCodec, Result, SearchContext, SearchLogger}; use crate::{Result, SearchContext, SearchLogger};
/// A ranking rule that produces 3 disjoint buckets: /// A ranking rule that produces 3 disjoint buckets:
/// ///
@ -161,10 +160,8 @@ impl State {
// Note: Since the position is stored bucketed in word_position_docids, for queries with a lot of // Note: Since the position is stored bucketed in word_position_docids, for queries with a lot of
// longer phrases we'll be losing on precision here. // longer phrases we'll be losing on precision here.
let bucketed_position = crate::bucketed_position(position + offset); let bucketed_position = crate::bucketed_position(position + offset);
let word_position_docids = CboRoaringBitmapCodec::bytes_decode( let word_position_docids =
ctx.get_db_word_position_docids(*word, bucketed_position)?.unwrap_or_default(), ctx.get_db_word_position_docids(*word, bucketed_position)?.unwrap_or_default();
)
.unwrap_or_default();
candidates &= word_position_docids; candidates &= word_position_docids;
if candidates.is_empty() { if candidates.is_empty() {
return Ok(State::Empty(query_graph.clone())); return Ok(State::Empty(query_graph.clone()));
@ -191,11 +188,7 @@ impl State {
// ignore stop words in phrases // ignore stop words in phrases
.flatten() .flatten()
.map(|word| -> Result<_> { .map(|word| -> Result<_> {
Ok(ctx Ok(ctx.get_db_word_fid_docids(*word, fid)?.unwrap_or_default())
.get_db_word_fid_docids(*word, fid)?
.map(CboRoaringBitmapCodec::bytes_decode)
.unwrap_or_default()
.unwrap_or_default())
}), }),
)?; )?;
intersection &= &candidates; intersection &= &candidates;

View File

@ -427,14 +427,14 @@ fill: \"#B6E2D3\"
)?; )?;
for w in term_subset.all_single_words_except_prefix_db(ctx)? { for w in term_subset.all_single_words_except_prefix_db(ctx)? {
let w = ctx.word_interner.get(w); let w = ctx.word_interner.get(w.interned());
writeln!(file, "{w}: word")?; writeln!(file, "{w}: word")?;
} }
for p in term_subset.all_phrases(ctx)? { for p in term_subset.all_phrases(ctx)? {
writeln!(file, "{}: phrase", p.description(ctx))?; writeln!(file, "{}: phrase", p.description(ctx))?;
} }
if let Some(w) = term_subset.use_prefix_db(ctx) { if let Some(w) = term_subset.use_prefix_db(ctx) {
let w = ctx.word_interner.get(w); let w = ctx.word_interner.get(w.interned());
writeln!(file, "{w}: prefix db")?; writeln!(file, "{w}: prefix db")?;
} }

View File

@ -50,6 +50,8 @@ use ranking_rules::{BoxRankingRule, RankingRule};
use resolve_query_graph::compute_query_graph_docids; use resolve_query_graph::compute_query_graph_docids;
use sort::Sort; use sort::Sort;
use self::interner::Interned;
/// A structure used throughout the execution of a search query. /// A structure used throughout the execution of a search query.
pub struct SearchContext<'ctx> { pub struct SearchContext<'ctx> {
pub index: &'ctx Index, pub index: &'ctx Index,
@ -75,6 +77,21 @@ impl<'ctx> SearchContext<'ctx> {
} }
} }
/// An interned word, tagged with how it relates to the text of the search query.
///
/// The tag selects which databases `SearchContext::word_docids` and
/// `SearchContext::word_prefix_docids` consult.
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum Word {
    /// Looked up in both the exact and the regular word databases.
    ///
    /// Used for the zero-typo words of terms that are not ngrams, and for words of phrases.
    Original(Interned<String>),
    /// Looked up in the regular word databases only.
    ///
    /// Used for one-typo and two-typo derivations, and for the words of ngram terms.
    Derived(Interned<String>),
}
impl Word {
    /// Return the interned string carried by this word, whichever variant it is.
    pub fn interned(&self) -> Interned<String> {
        // Both variants wrap the same payload type, so a single or-pattern binds it.
        let (Word::Original(interned) | Word::Derived(interned)) = self;
        *interned
    }
}
/// Apply the [`TermsMatchingStrategy`] to the query graph and resolve it. /// Apply the [`TermsMatchingStrategy`] to the query graph and resolve it.
#[allow(clippy::too_many_arguments)] #[allow(clippy::too_many_arguments)]
fn resolve_maximally_reduced_query_graph( fn resolve_maximally_reduced_query_graph(

View File

@ -1,17 +1,17 @@
use fst::automaton::Str;
use fst::{Automaton, IntoStreamer, Streamer};
use heed::types::DecodeIgnore;
use heed::BytesDecode;
use std::borrow::Cow; use std::borrow::Cow;
use std::collections::BTreeSet; use std::collections::BTreeSet;
use std::ops::ControlFlow; use std::ops::ControlFlow;
use fst::automaton::Str;
use fst::{Automaton, IntoStreamer, Streamer};
use heed::types::DecodeIgnore;
use super::*; use super::*;
use crate::search::fst_utils::{Complement, Intersection, StartsWith, Union}; use crate::search::fst_utils::{Complement, Intersection, StartsWith, Union};
use crate::search::new::query_term::TwoTypoTerm; use crate::search::new::query_term::TwoTypoTerm;
use crate::search::new::{limits, SearchContext}; use crate::search::new::{limits, SearchContext};
use crate::search::{build_dfa, get_first}; use crate::search::{build_dfa, get_first};
use crate::{CboRoaringBitmapLenCodec, Result, MAX_WORD_LENGTH}; use crate::{Result, MAX_WORD_LENGTH};
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum NumberOfTypos { pub enum NumberOfTypos {
@ -177,6 +177,7 @@ pub fn partially_initialized_term_from_word(
word: &str, word: &str,
max_typo: u8, max_typo: u8,
is_prefix: bool, is_prefix: bool,
is_ngram: bool,
) -> Result<QueryTerm> { ) -> Result<QueryTerm> {
let word_interned = ctx.word_interner.insert(word.to_owned()); let word_interned = ctx.word_interner.insert(word.to_owned());
@ -197,12 +198,19 @@ pub fn partially_initialized_term_from_word(
let fst = ctx.index.words_fst(ctx.txn)?; let fst = ctx.index.words_fst(ctx.txn)?;
let use_prefix_db = is_prefix let use_prefix_db = is_prefix
&& ctx && (ctx
.index .index
.word_prefix_docids .word_prefix_docids
.remap_data_type::<DecodeIgnore>() .remap_data_type::<DecodeIgnore>()
.get(ctx.txn, word)? .get(ctx.txn, word)?
.is_some(); .is_some()
|| (!is_ngram
&& ctx
.index
.exact_word_prefix_docids
.remap_data_type::<DecodeIgnore>()
.get(ctx.txn, word)?
.is_some()));
let use_prefix_db = if use_prefix_db { Some(word_interned) } else { None }; let use_prefix_db = if use_prefix_db { Some(word_interned) } else { None };
let mut zero_typo = None; let mut zero_typo = None;
@ -385,9 +393,7 @@ fn split_best_frequency(
let left = ctx.word_interner.insert(left.to_owned()); let left = ctx.word_interner.insert(left.to_owned());
let right = ctx.word_interner.insert(right.to_owned()); let right = ctx.word_interner.insert(right.to_owned());
if let Some(docid_bytes) = ctx.get_db_word_pair_proximity_docids(left, right, 1)? { if let Some(frequency) = ctx.get_db_word_pair_proximity_docids_len(left, right, 1)? {
let frequency =
CboRoaringBitmapLenCodec::bytes_decode(docid_bytes).ok_or(heed::Error::Decoding)?;
if best.map_or(true, |(old, _, _)| frequency > old) { if best.map_or(true, |(old, _, _)| frequency > old) {
best = Some((frequency, left, right)); best = Some((frequency, left, right));
} }

View File

@ -3,18 +3,18 @@ mod ntypo_subset;
mod parse_query; mod parse_query;
mod phrase; mod phrase;
use super::interner::{DedupInterner, Interned};
use super::{limits, SearchContext};
use crate::Result;
use std::collections::BTreeSet; use std::collections::BTreeSet;
use std::ops::RangeInclusive; use std::ops::RangeInclusive;
use compute_derivations::partially_initialized_term_from_word;
use either::Either; use either::Either;
pub use ntypo_subset::NTypoTermSubset; pub use ntypo_subset::NTypoTermSubset;
pub use parse_query::{located_query_terms_from_string, make_ngram, number_of_typos_allowed}; pub use parse_query::{located_query_terms_from_string, make_ngram, number_of_typos_allowed};
pub use phrase::Phrase; pub use phrase::Phrase;
use compute_derivations::partially_initialized_term_from_word; use super::interner::{DedupInterner, Interned};
use super::{limits, SearchContext, Word};
use crate::Result;
/// A set of word derivations attached to a location in the search query. /// A set of word derivations attached to a location in the search query.
#[derive(Clone, PartialEq, Eq, Hash)] #[derive(Clone, PartialEq, Eq, Hash)]
@ -159,12 +159,12 @@ impl QueryTermSubset {
self.two_typo_subset.intersect(&other.two_typo_subset); self.two_typo_subset.intersect(&other.two_typo_subset);
} }
pub fn use_prefix_db(&self, ctx: &SearchContext) -> Option<Interned<String>> { pub fn use_prefix_db(&self, ctx: &SearchContext) -> Option<Word> {
let original = ctx.term_interner.get(self.original); let original = ctx.term_interner.get(self.original);
let Some(use_prefix_db) = original.zero_typo.use_prefix_db else { let Some(use_prefix_db) = original.zero_typo.use_prefix_db else {
return None return None
}; };
match &self.zero_typo_subset { let word = match &self.zero_typo_subset {
NTypoTermSubset::All => Some(use_prefix_db), NTypoTermSubset::All => Some(use_prefix_db),
NTypoTermSubset::Subset { words, phrases: _ } => { NTypoTermSubset::Subset { words, phrases: _ } => {
// TODO: use a subset of prefix words instead // TODO: use a subset of prefix words instead
@ -175,12 +175,19 @@ impl QueryTermSubset {
} }
} }
NTypoTermSubset::Nothing => None, NTypoTermSubset::Nothing => None,
};
word.map(|word| {
if original.ngram_words.is_some() {
Word::Derived(word)
} else {
Word::Original(word)
} }
})
} }
pub fn all_single_words_except_prefix_db( pub fn all_single_words_except_prefix_db(
&self, &self,
ctx: &mut SearchContext, ctx: &mut SearchContext,
) -> Result<BTreeSet<Interned<String>>> { ) -> Result<BTreeSet<Word>> {
let mut result = BTreeSet::default(); let mut result = BTreeSet::default();
// TODO: a compute_partially function // TODO: a compute_partially function
if !self.one_typo_subset.is_empty() || !self.two_typo_subset.is_empty() { if !self.one_typo_subset.is_empty() || !self.two_typo_subset.is_empty() {
@ -197,8 +204,20 @@ impl QueryTermSubset {
synonyms: _, synonyms: _,
use_prefix_db: _, use_prefix_db: _,
} = &original.zero_typo; } = &original.zero_typo;
result.extend(zero_typo.iter().copied()); result.extend(zero_typo.iter().copied().map(|w| {
result.extend(prefix_of.iter().copied()); if original.ngram_words.is_some() {
Word::Derived(w)
} else {
Word::Original(w)
}
}));
result.extend(prefix_of.iter().copied().map(|w| {
if original.ngram_words.is_some() {
Word::Derived(w)
} else {
Word::Original(w)
}
}));
} }
NTypoTermSubset::Subset { words, phrases: _ } => { NTypoTermSubset::Subset { words, phrases: _ } => {
let ZeroTypoTerm { let ZeroTypoTerm {
@ -210,10 +229,20 @@ impl QueryTermSubset {
} = &original.zero_typo; } = &original.zero_typo;
if let Some(zero_typo) = zero_typo { if let Some(zero_typo) = zero_typo {
if words.contains(zero_typo) { if words.contains(zero_typo) {
result.insert(*zero_typo); if original.ngram_words.is_some() {
result.insert(Word::Derived(*zero_typo));
} else {
result.insert(Word::Original(*zero_typo));
} }
} }
result.extend(prefix_of.intersection(words).copied()); }
result.extend(prefix_of.intersection(words).copied().map(|w| {
if original.ngram_words.is_some() {
Word::Derived(w)
} else {
Word::Original(w)
}
}));
} }
NTypoTermSubset::Nothing => {} NTypoTermSubset::Nothing => {}
} }
@ -223,13 +252,13 @@ impl QueryTermSubset {
let Lazy::Init(OneTypoTerm { split_words: _, one_typo }) = &original.one_typo else { let Lazy::Init(OneTypoTerm { split_words: _, one_typo }) = &original.one_typo else {
panic!() panic!()
}; };
result.extend(one_typo.iter().copied()) result.extend(one_typo.iter().copied().map(Word::Derived))
} }
NTypoTermSubset::Subset { words, phrases: _ } => { NTypoTermSubset::Subset { words, phrases: _ } => {
let Lazy::Init(OneTypoTerm { split_words: _, one_typo }) = &original.one_typo else { let Lazy::Init(OneTypoTerm { split_words: _, one_typo }) = &original.one_typo else {
panic!() panic!()
}; };
result.extend(one_typo.intersection(words)); result.extend(one_typo.intersection(words).copied().map(Word::Derived));
} }
NTypoTermSubset::Nothing => {} NTypoTermSubset::Nothing => {}
}; };
@ -239,13 +268,13 @@ impl QueryTermSubset {
let Lazy::Init(TwoTypoTerm { two_typos }) = &original.two_typo else { let Lazy::Init(TwoTypoTerm { two_typos }) = &original.two_typo else {
panic!() panic!()
}; };
result.extend(two_typos.iter().copied()); result.extend(two_typos.iter().copied().map(Word::Derived));
} }
NTypoTermSubset::Subset { words, phrases: _ } => { NTypoTermSubset::Subset { words, phrases: _ } => {
let Lazy::Init(TwoTypoTerm { two_typos }) = &original.two_typo else { let Lazy::Init(TwoTypoTerm { two_typos }) = &original.two_typo else {
panic!() panic!()
}; };
result.extend(two_typos.intersection(words)); result.extend(two_typos.intersection(words).copied().map(Word::Derived));
} }
NTypoTermSubset::Nothing => {} NTypoTermSubset::Nothing => {}
}; };

View File

@ -1,8 +1,8 @@
use charabia::{normalizer::NormalizedTokenIter, SeparatorKind, TokenKind}; use charabia::normalizer::NormalizedTokenIter;
use charabia::{SeparatorKind, TokenKind};
use crate::{Result, SearchContext, MAX_WORD_LENGTH};
use super::*; use super::*;
use crate::{Result, SearchContext, MAX_WORD_LENGTH};
/// Convert the tokenised search query into a list of located query terms. /// Convert the tokenised search query into a list of located query terms.
// TODO: checking if the positions are correct for phrases, separators, ngrams // TODO: checking if the positions are correct for phrases, separators, ngrams
@ -51,6 +51,7 @@ pub fn located_query_terms_from_string(
word, word,
nbr_typos(word), nbr_typos(word),
false, false,
false,
)?; )?;
let located_term = LocatedQueryTerm { let located_term = LocatedQueryTerm {
value: ctx.term_interner.push(term), value: ctx.term_interner.push(term),
@ -62,8 +63,13 @@ pub fn located_query_terms_from_string(
} }
} else { } else {
let word = token.lemma(); let word = token.lemma();
let term = let term = partially_initialized_term_from_word(
partially_initialized_term_from_word(ctx, word, nbr_typos(word), true)?; ctx,
word,
nbr_typos(word),
true,
false,
)?;
let located_term = LocatedQueryTerm { let located_term = LocatedQueryTerm {
value: ctx.term_interner.push(term), value: ctx.term_interner.push(term),
positions: position..=position, positions: position..=position,
@ -195,7 +201,8 @@ pub fn make_ngram(
let max_nbr_typos = let max_nbr_typos =
number_of_typos_allowed(ngram_str.as_str()).saturating_sub(terms.len() as u8 - 1); number_of_typos_allowed(ngram_str.as_str()).saturating_sub(terms.len() as u8 - 1);
let mut term = partially_initialized_term_from_word(ctx, &ngram_str, max_nbr_typos, is_prefix)?; let mut term =
partially_initialized_term_from_word(ctx, &ngram_str, max_nbr_typos, is_prefix, true)?;
// Now add the synonyms // Now add the synonyms
let index_synonyms = ctx.index.synonyms(ctx.txn)?; let index_synonyms = ctx.index.synonyms(ctx.txn)?;

View File

@ -1,10 +1,10 @@
use heed::BytesDecode;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use super::{ComputedCondition, RankingRuleGraphTrait}; use super::{ComputedCondition, RankingRuleGraphTrait};
use crate::search::new::interner::{DedupInterner, Interned}; use crate::search::new::interner::{DedupInterner, Interned};
use crate::search::new::query_term::{ExactTerm, LocatedQueryTermSubset}; use crate::search::new::query_term::{ExactTerm, LocatedQueryTermSubset};
use crate::{Result, RoaringBitmapCodec, SearchContext}; use crate::search::new::Word;
use crate::{Result, SearchContext};
#[derive(Clone, PartialEq, Eq, Hash)] #[derive(Clone, PartialEq, Eq, Hash)]
pub enum ExactnessCondition { pub enum ExactnessCondition {
@ -27,8 +27,8 @@ fn compute_docids(
let mut candidates = match exact_term { let mut candidates = match exact_term {
ExactTerm::Phrase(phrase) => ctx.get_phrase_docids(phrase)?.clone(), ExactTerm::Phrase(phrase) => ctx.get_phrase_docids(phrase)?.clone(),
ExactTerm::Word(word) => { ExactTerm::Word(word) => {
if let Some(word_candidates) = ctx.get_db_word_docids(word)? { if let Some(word_candidates) = ctx.word_docids(Word::Original(word))? {
RoaringBitmapCodec::bytes_decode(word_candidates).ok_or(heed::Error::Decoding)? word_candidates
} else { } else {
return Ok(Default::default()); return Ok(Default::default());
} }

View File

@ -2,7 +2,6 @@
use std::collections::BTreeSet; use std::collections::BTreeSet;
use heed::BytesDecode;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use super::ProximityCondition; use super::ProximityCondition;
@ -10,8 +9,8 @@ use crate::search::new::interner::Interned;
use crate::search::new::query_term::{Phrase, QueryTermSubset}; use crate::search::new::query_term::{Phrase, QueryTermSubset};
use crate::search::new::ranking_rule_graph::ComputedCondition; use crate::search::new::ranking_rule_graph::ComputedCondition;
use crate::search::new::resolve_query_graph::compute_query_term_subset_docids; use crate::search::new::resolve_query_graph::compute_query_term_subset_docids;
use crate::search::new::SearchContext; use crate::search::new::{SearchContext, Word};
use crate::{CboRoaringBitmapCodec, Result, RoaringBitmapCodec}; use crate::Result;
pub fn compute_docids( pub fn compute_docids(
ctx: &mut SearchContext, ctx: &mut SearchContext,
@ -55,8 +54,8 @@ pub fn compute_docids(
{ {
compute_prefix_edges( compute_prefix_edges(
ctx, ctx,
left_word, left_word.interned(),
right_prefix, right_prefix.interned(),
left_phrase, left_phrase,
forward_proximity, forward_proximity,
backward_proximity, backward_proximity,
@ -92,9 +91,7 @@ pub fn compute_docids(
if universe.is_disjoint(ctx.get_phrase_docids(left_phrase)?) { if universe.is_disjoint(ctx.get_phrase_docids(left_phrase)?) {
continue; continue;
} }
} else if let Some(lw_bytes) = ctx.get_db_word_docids(left_word)? { } else if let Some(left_word_docids) = ctx.word_docids(left_word)? {
let left_word_docids =
RoaringBitmapCodec::bytes_decode(lw_bytes).ok_or(heed::Error::Decoding)?;
if universe.is_disjoint(&left_word_docids) { if universe.is_disjoint(&left_word_docids) {
continue; continue;
} }
@ -104,7 +101,7 @@ pub fn compute_docids(
for (right_word, right_phrase) in right_derivs { for (right_word, right_phrase) in right_derivs {
compute_non_prefix_edges( compute_non_prefix_edges(
ctx, ctx,
left_word, left_word.interned(),
right_word, right_word,
left_phrase, left_phrase,
right_phrase, right_phrase,
@ -155,7 +152,7 @@ fn compute_prefix_edges(
if let Some(new_docids) = if let Some(new_docids) =
ctx.get_db_word_prefix_pair_proximity_docids(left_word, right_prefix, forward_proximity)? ctx.get_db_word_prefix_pair_proximity_docids(left_word, right_prefix, forward_proximity)?
{ {
let new_docids = &universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?; let new_docids = &universe & new_docids;
if !new_docids.is_empty() { if !new_docids.is_empty() {
used_left_words.insert(left_word); used_left_words.insert(left_word);
used_right_prefix.insert(right_prefix); used_right_prefix.insert(right_prefix);
@ -170,7 +167,7 @@ fn compute_prefix_edges(
left_word, left_word,
backward_proximity, backward_proximity,
)? { )? {
let new_docids = &universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?; let new_docids = &universe & new_docids;
if !new_docids.is_empty() { if !new_docids.is_empty() {
used_left_words.insert(left_word); used_left_words.insert(left_word);
used_right_prefix.insert(right_prefix); used_right_prefix.insert(right_prefix);
@ -217,7 +214,7 @@ fn compute_non_prefix_edges(
if let Some(new_docids) = if let Some(new_docids) =
ctx.get_db_word_pair_proximity_docids(word1, word2, forward_proximity)? ctx.get_db_word_pair_proximity_docids(word1, word2, forward_proximity)?
{ {
let new_docids = &universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?; let new_docids = &universe & new_docids;
if !new_docids.is_empty() { if !new_docids.is_empty() {
used_left_words.insert(word1); used_left_words.insert(word1);
used_right_words.insert(word2); used_right_words.insert(word2);
@ -231,7 +228,7 @@ fn compute_non_prefix_edges(
if let Some(new_docids) = if let Some(new_docids) =
ctx.get_db_word_pair_proximity_docids(word2, word1, backward_proximity)? ctx.get_db_word_pair_proximity_docids(word2, word1, backward_proximity)?
{ {
let new_docids = &universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?; let new_docids = &universe & new_docids;
if !new_docids.is_empty() { if !new_docids.is_empty() {
used_left_words.insert(word2); used_left_words.insert(word2);
used_right_words.insert(word1); used_right_words.insert(word1);
@ -246,7 +243,7 @@ fn compute_non_prefix_edges(
fn last_words_of_term_derivations( fn last_words_of_term_derivations(
ctx: &mut SearchContext, ctx: &mut SearchContext,
t: &QueryTermSubset, t: &QueryTermSubset,
) -> Result<BTreeSet<(Option<Interned<Phrase>>, Interned<String>)>> { ) -> Result<BTreeSet<(Option<Interned<Phrase>>, Word)>> {
let mut result = BTreeSet::new(); let mut result = BTreeSet::new();
for w in t.all_single_words_except_prefix_db(ctx)? { for w in t.all_single_words_except_prefix_db(ctx)? {
@ -256,7 +253,7 @@ fn last_words_of_term_derivations(
let phrase = ctx.phrase_interner.get(p); let phrase = ctx.phrase_interner.get(p);
let last_term_of_phrase = phrase.words.last().unwrap(); let last_term_of_phrase = phrase.words.last().unwrap();
if let Some(last_word) = last_term_of_phrase { if let Some(last_word) = last_term_of_phrase {
result.insert((Some(p), *last_word)); result.insert((Some(p), Word::Original(*last_word)));
} }
} }
@ -269,7 +266,7 @@ fn first_word_of_term_iter(
let mut result = BTreeSet::new(); let mut result = BTreeSet::new();
let all_words = t.all_single_words_except_prefix_db(ctx)?; let all_words = t.all_single_words_except_prefix_db(ctx)?;
for w in all_words { for w in all_words {
result.insert((w, None)); result.insert((w.interned(), None));
} }
for p in t.all_phrases(ctx)? { for p in t.all_phrases(ctx)? {
let phrase = ctx.phrase_interner.get(p); let phrase = ctx.phrase_interner.get(p);

View File

@ -3,16 +3,15 @@
use std::collections::VecDeque; use std::collections::VecDeque;
use fxhash::FxHashMap; use fxhash::FxHashMap;
use heed::BytesDecode;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use super::interner::Interned; use super::interner::Interned;
use super::query_graph::QueryNodeData; use super::query_graph::QueryNodeData;
use super::query_term::{Phrase, QueryTermSubset}; use super::query_term::{Phrase, QueryTermSubset};
use super::small_bitmap::SmallBitmap; use super::small_bitmap::SmallBitmap;
use super::{QueryGraph, SearchContext}; use super::{QueryGraph, SearchContext, Word};
use crate::search::new::query_term::LocatedQueryTermSubset; use crate::search::new::query_term::LocatedQueryTermSubset;
use crate::{CboRoaringBitmapCodec, Result, RoaringBitmapCodec}; use crate::Result;
#[derive(Default)] #[derive(Default)]
pub struct PhraseDocIdsCache { pub struct PhraseDocIdsCache {
@ -36,8 +35,8 @@ pub fn compute_query_term_subset_docids(
) -> Result<RoaringBitmap> { ) -> Result<RoaringBitmap> {
let mut docids = RoaringBitmap::new(); let mut docids = RoaringBitmap::new();
for word in term.all_single_words_except_prefix_db(ctx)? { for word in term.all_single_words_except_prefix_db(ctx)? {
if let Some(word_docids) = ctx.get_db_word_docids(word)? { if let Some(word_docids) = ctx.word_docids(word)? {
docids |= RoaringBitmapCodec::bytes_decode(word_docids).ok_or(heed::Error::Decoding)?; docids |= word_docids;
} }
} }
for phrase in term.all_phrases(ctx)? { for phrase in term.all_phrases(ctx)? {
@ -45,9 +44,8 @@ pub fn compute_query_term_subset_docids(
} }
if let Some(prefix) = term.use_prefix_db(ctx) { if let Some(prefix) = term.use_prefix_db(ctx) {
if let Some(prefix_docids) = ctx.get_db_word_prefix_docids(prefix)? { if let Some(prefix_docids) = ctx.word_prefix_docids(prefix)? {
docids |= docids |= prefix_docids;
RoaringBitmapCodec::bytes_decode(prefix_docids).ok_or(heed::Error::Decoding)?;
} }
} }
@ -127,9 +125,8 @@ pub fn compute_phrase_docids(
} }
if words.len() == 1 { if words.len() == 1 {
if let Some(word) = &words[0] { if let Some(word) = &words[0] {
if let Some(word_docids) = ctx.get_db_word_docids(*word)? { if let Some(word_docids) = ctx.word_docids(Word::Original(*word))? {
return RoaringBitmapCodec::bytes_decode(word_docids) return Ok(word_docids);
.ok_or(heed::Error::Decoding.into());
} else { } else {
return Ok(RoaringBitmap::new()); return Ok(RoaringBitmap::new());
} }
@ -158,7 +155,7 @@ pub fn compute_phrase_docids(
{ {
if dist == 0 { if dist == 0 {
match ctx.get_db_word_pair_proximity_docids(s1, s2, 1)? { match ctx.get_db_word_pair_proximity_docids(s1, s2, 1)? {
Some(m) => bitmaps.push(CboRoaringBitmapCodec::deserialize_from(m)?), Some(m) => bitmaps.push(m),
// If there are no documents for this pair, there will be no // If there are no documents for this pair, there will be no
// results for the phrase query. // results for the phrase query.
None => return Ok(RoaringBitmap::new()), None => return Ok(RoaringBitmap::new()),
@ -169,7 +166,7 @@ pub fn compute_phrase_docids(
if let Some(m) = if let Some(m) =
ctx.get_db_word_pair_proximity_docids(s1, s2, dist as u8 + 1)? ctx.get_db_word_pair_proximity_docids(s1, s2, dist as u8 + 1)?
{ {
bitmap |= CboRoaringBitmapCodec::deserialize_from(m)?; bitmap |= m;
} }
} }
if bitmap.is_empty() { if bitmap.is_empty() {