From 244003e36f4ef872f6b96bdb1d870a5b344c9d18 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 11 Apr 2023 15:26:47 +0200 Subject: [PATCH 1/7] Refactor DB cache to return Roaring Bitmaps directly instead of byte slices --- milli/src/search/new/db_cache.rs | 72 +++++++++++++++++++++++++------- 1 file changed, 56 insertions(+), 16 deletions(-) diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs index a0dde4686..c1862244a 100644 --- a/milli/src/search/new/db_cache.rs +++ b/milli/src/search/new/db_cache.rs @@ -4,10 +4,13 @@ use std::hash::Hash; use fxhash::FxHashMap; use heed::types::ByteSlice; -use heed::{BytesEncode, Database, RoTxn}; +use heed::{BytesDecode, BytesEncode, Database, RoTxn}; +use roaring::RoaringBitmap; use super::interner::Interned; -use crate::{Result, SearchContext}; +use crate::{ + CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, RoaringBitmapCodec, SearchContext, +}; /// A cache storing pointers to values in the LMDB databases. /// @@ -65,27 +68,31 @@ impl<'ctx> SearchContext<'ctx> { } /// Retrieve or insert the given value in the `word_docids` database. - pub fn get_db_word_docids(&mut self, word: Interned) -> Result> { + pub fn get_db_word_docids(&mut self, word: Interned) -> Result> { DatabaseCache::get_value( self.txn, word, self.word_interner.get(word).as_str(), &mut self.db_cache.word_docids, self.index.word_docids.remap_data_type::(), - ) + )? + .map(|bytes| RoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into())) + .transpose() } /// Retrieve or insert the given value in the `word_prefix_docids` database. pub fn get_db_word_prefix_docids( &mut self, prefix: Interned, - ) -> Result> { + ) -> Result> { DatabaseCache::get_value( self.txn, prefix, self.word_interner.get(prefix).as_str(), &mut self.db_cache.word_prefix_docids, self.index.word_prefix_docids.remap_data_type::(), - ) + )? + .map(|bytes| RoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into())) + .transpose() } pub fn get_db_word_pair_proximity_docids( @@ -93,7 +100,7 @@ impl<'ctx> SearchContext<'ctx> { word1: Interned, word2: Interned, proximity: u8, - ) -> Result> { + ) -> Result> { DatabaseCache::get_value( self.txn, (proximity, word1, word2), @@ -104,7 +111,32 @@ impl<'ctx> SearchContext<'ctx> { ), &mut self.db_cache.word_pair_proximity_docids, self.index.word_pair_proximity_docids.remap_data_type::(), - ) + )? + .map(|bytes| CboRoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into())) + .transpose() + } + + pub fn get_db_word_pair_proximity_docids_len( + &mut self, + word1: Interned, + word2: Interned, + proximity: u8, + ) -> Result> { + DatabaseCache::get_value( + self.txn, + (proximity, word1, word2), + &( + proximity, + self.word_interner.get(word1).as_str(), + self.word_interner.get(word2).as_str(), + ), + &mut self.db_cache.word_pair_proximity_docids, + self.index.word_pair_proximity_docids.remap_data_type::(), + )? + .map(|bytes| { + CboRoaringBitmapLenCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into()) + }) + .transpose() } pub fn get_db_word_prefix_pair_proximity_docids( @@ -112,7 +144,7 @@ impl<'ctx> SearchContext<'ctx> { word1: Interned, prefix2: Interned, proximity: u8, - ) -> Result> { + ) -> Result> { DatabaseCache::get_value( self.txn, (proximity, word1, prefix2), @@ -123,14 +155,16 @@ impl<'ctx> SearchContext<'ctx> { ), &mut self.db_cache.word_prefix_pair_proximity_docids, self.index.word_prefix_pair_proximity_docids.remap_data_type::(), - ) + )? + .map(|bytes| CboRoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into())) + .transpose() } pub fn get_db_prefix_word_pair_proximity_docids( &mut self, left_prefix: Interned, right: Interned, proximity: u8, - ) -> Result> { + ) -> Result> { DatabaseCache::get_value( self.txn, (proximity, left_prefix, right), @@ -141,34 +175,40 @@ impl<'ctx> SearchContext<'ctx> { ), &mut self.db_cache.prefix_word_pair_proximity_docids, self.index.prefix_word_pair_proximity_docids.remap_data_type::(), - ) + )? + .map(|bytes| CboRoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into())) + .transpose() } pub fn get_db_word_position_docids( &mut self, word: Interned, position: u16, - ) -> Result> { + ) -> Result> { DatabaseCache::get_value( self.txn, (word, position), &(self.word_interner.get(word).as_str(), position), &mut self.db_cache.word_position_docids, self.index.word_position_docids.remap_data_type::(), - ) + )? + .map(|bytes| CboRoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into())) + .transpose() } pub fn get_db_word_fid_docids( &mut self, word: Interned, fid: u16, - ) -> Result> { + ) -> Result> { DatabaseCache::get_value( self.txn, (word, fid), &(self.word_interner.get(word).as_str(), fid), &mut self.db_cache.word_fid_docids, self.index.word_fid_docids.remap_data_type::(), - ) + )? + .map(|bytes| CboRoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into())) + .transpose() } } From e7ff987c46ee5d28caeb7c2c53435a3f4524510f Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 11 Apr 2023 15:31:40 +0200 Subject: [PATCH 2/7] Update call sites --- milli/src/search/new/exact_attribute.rs | 15 ++++----------- .../search/new/query_term/compute_derivations.rs | 14 ++++++-------- .../new/ranking_rule_graph/exactness/mod.rs | 3 +-- .../proximity/compute_docids.rs | 15 ++++++--------- milli/src/search/new/resolve_query_graph.rs | 15 ++++++--------- 5 files changed, 23 insertions(+), 39 deletions(-) diff --git a/milli/src/search/new/exact_attribute.rs b/milli/src/search/new/exact_attribute.rs index 3a31f6a75..bc0195ebc 100644 --- a/milli/src/search/new/exact_attribute.rs +++ b/milli/src/search/new/exact_attribute.rs @@ -1,11 +1,10 @@ -use heed::BytesDecode; use roaring::{MultiOps, RoaringBitmap}; use super::query_graph::QueryGraph; use super::ranking_rules::{RankingRule, RankingRuleOutput}; use crate::search::new::query_graph::QueryNodeData; use crate::search::new::query_term::ExactTerm; -use crate::{CboRoaringBitmapCodec, Result, SearchContext, SearchLogger}; +use crate::{Result, SearchContext, SearchLogger}; /// A ranking rule that produces 3 disjoint buckets: /// @@ -161,10 +160,8 @@ impl State { // Note: Since the position is stored bucketed in word_position_docids, for queries with a lot of // longer phrases we'll be losing on precision here. let bucketed_position = crate::bucketed_position(position + offset); - let word_position_docids = CboRoaringBitmapCodec::bytes_decode( - ctx.get_db_word_position_docids(*word, bucketed_position)?.unwrap_or_default(), - ) - .unwrap_or_default(); + let word_position_docids = + ctx.get_db_word_position_docids(*word, bucketed_position)?.unwrap_or_default(); candidates &= word_position_docids; if candidates.is_empty() { return Ok(State::Empty(query_graph.clone())); @@ -191,11 +188,7 @@ impl State { // ignore stop words words in phrases .flatten() .map(|word| -> Result<_> { - Ok(ctx - .get_db_word_fid_docids(*word, fid)? - .map(CboRoaringBitmapCodec::bytes_decode) - .unwrap_or_default() - .unwrap_or_default()) + Ok(ctx.get_db_word_fid_docids(*word, fid)?.unwrap_or_default()) }), )?; intersection &= &candidates; diff --git a/milli/src/search/new/query_term/compute_derivations.rs b/milli/src/search/new/query_term/compute_derivations.rs index 03d92572e..12b8c3832 100644 --- a/milli/src/search/new/query_term/compute_derivations.rs +++ b/milli/src/search/new/query_term/compute_derivations.rs @@ -1,17 +1,17 @@ -use fst::automaton::Str; -use fst::{Automaton, IntoStreamer, Streamer}; -use heed::types::DecodeIgnore; -use heed::BytesDecode; use std::borrow::Cow; use std::collections::BTreeSet; use std::ops::ControlFlow; +use fst::automaton::Str; +use fst::{Automaton, IntoStreamer, Streamer}; +use heed::types::DecodeIgnore; + use super::*; use crate::search::fst_utils::{Complement, Intersection, StartsWith, Union}; use crate::search::new::query_term::TwoTypoTerm; use crate::search::new::{limits, SearchContext}; use crate::search::{build_dfa, get_first}; -use crate::{CboRoaringBitmapLenCodec, Result, MAX_WORD_LENGTH}; +use crate::{Result, MAX_WORD_LENGTH}; #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub enum NumberOfTypos { @@ -385,9 +385,7 @@ fn split_best_frequency( let left = ctx.word_interner.insert(left.to_owned()); let right = ctx.word_interner.insert(right.to_owned()); - if let Some(docid_bytes) = ctx.get_db_word_pair_proximity_docids(left, right, 1)? { - let frequency = - CboRoaringBitmapLenCodec::bytes_decode(docid_bytes).ok_or(heed::Error::Decoding)?; + if let Some(frequency) = ctx.get_db_word_pair_proximity_docids_len(left, right, 1)? { if best.map_or(true, |(old, _, _)| frequency > old) { best = Some((frequency, left, right)); } diff --git a/milli/src/search/new/ranking_rule_graph/exactness/mod.rs b/milli/src/search/new/ranking_rule_graph/exactness/mod.rs index 55c4497dd..4a3dd6549 100644 --- a/milli/src/search/new/ranking_rule_graph/exactness/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/exactness/mod.rs @@ -1,4 +1,3 @@ -use heed::BytesDecode; use roaring::RoaringBitmap; use super::{ComputedCondition, RankingRuleGraphTrait}; @@ -28,7 +27,7 @@ fn compute_docids( ExactTerm::Phrase(phrase) => ctx.get_phrase_docids(phrase)?.clone(), ExactTerm::Word(word) => { if let Some(word_candidates) = ctx.get_db_word_docids(word)? { - RoaringBitmapCodec::bytes_decode(word_candidates).ok_or(heed::Error::Decoding)? + word_candidates } else { return Ok(Default::default()); } diff --git a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs index 07bd102ca..b6f164f16 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs @@ -2,7 +2,6 @@ use std::collections::BTreeSet; -use heed::BytesDecode; use roaring::RoaringBitmap; use super::ProximityCondition; @@ -11,7 +10,7 @@ use crate::search::new::query_term::{Phrase, QueryTermSubset}; use crate::search::new::ranking_rule_graph::ComputedCondition; use crate::search::new::resolve_query_graph::compute_query_term_subset_docids; use crate::search::new::SearchContext; -use crate::{CboRoaringBitmapCodec, Result, RoaringBitmapCodec}; +use crate::Result; pub fn compute_docids( ctx: &mut SearchContext, @@ -92,9 +91,7 @@ pub fn compute_docids( if universe.is_disjoint(ctx.get_phrase_docids(left_phrase)?) { continue; } - } else if let Some(lw_bytes) = ctx.get_db_word_docids(left_word)? { - let left_word_docids = - RoaringBitmapCodec::bytes_decode(lw_bytes).ok_or(heed::Error::Decoding)?; + } else if let Some(left_word_docids) = ctx.get_db_word_docids(left_word)? { if universe.is_disjoint(&left_word_docids) { continue; } @@ -155,7 +152,7 @@ fn compute_prefix_edges( if let Some(new_docids) = ctx.get_db_word_prefix_pair_proximity_docids(left_word, right_prefix, forward_proximity)? { - let new_docids = &universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?; + let new_docids = &universe & new_docids; if !new_docids.is_empty() { used_left_words.insert(left_word); used_right_prefix.insert(right_prefix); @@ -170,7 +167,7 @@ fn compute_prefix_edges( left_word, backward_proximity, )? { - let new_docids = &universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?; + let new_docids = &universe & new_docids; if !new_docids.is_empty() { used_left_words.insert(left_word); used_right_prefix.insert(right_prefix); @@ -217,7 +214,7 @@ fn compute_non_prefix_edges( if let Some(new_docids) = ctx.get_db_word_pair_proximity_docids(word1, word2, forward_proximity)? { - let new_docids = &universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?; + let new_docids = &universe & new_docids; if !new_docids.is_empty() { used_left_words.insert(word1); used_right_words.insert(word2); @@ -231,7 +228,7 @@ fn compute_non_prefix_edges( if let Some(new_docids) = ctx.get_db_word_pair_proximity_docids(word2, word1, backward_proximity)? { - let new_docids = &universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?; + let new_docids = &universe & new_docids; if !new_docids.is_empty() { used_left_words.insert(word2); used_right_words.insert(word1); diff --git a/milli/src/search/new/resolve_query_graph.rs b/milli/src/search/new/resolve_query_graph.rs index ef7adad14..bca8b6268 100644 --- a/milli/src/search/new/resolve_query_graph.rs +++ b/milli/src/search/new/resolve_query_graph.rs @@ -3,7 +3,6 @@ use std::collections::VecDeque; use fxhash::FxHashMap; -use heed::BytesDecode; use roaring::RoaringBitmap; use super::interner::Interned; @@ -12,7 +11,7 @@ use super::query_term::{Phrase, QueryTermSubset}; use super::small_bitmap::SmallBitmap; use super::{QueryGraph, SearchContext}; use crate::search::new::query_term::LocatedQueryTermSubset; -use crate::{CboRoaringBitmapCodec, Result, RoaringBitmapCodec}; +use crate::Result; #[derive(Default)] pub struct PhraseDocIdsCache { @@ -37,7 +36,7 @@ pub fn compute_query_term_subset_docids( let mut docids = RoaringBitmap::new(); for word in term.all_single_words_except_prefix_db(ctx)? { if let Some(word_docids) = ctx.get_db_word_docids(word)? { - docids |= RoaringBitmapCodec::bytes_decode(word_docids).ok_or(heed::Error::Decoding)?; + docids |= word_docids; } } for phrase in term.all_phrases(ctx)? { @@ -46,8 +45,7 @@ pub fn compute_query_term_subset_docids( if let Some(prefix) = term.use_prefix_db(ctx) { if let Some(prefix_docids) = ctx.get_db_word_prefix_docids(prefix)? { - docids |= - RoaringBitmapCodec::bytes_decode(prefix_docids).ok_or(heed::Error::Decoding)?; + docids |= prefix_docids; } } @@ -128,8 +126,7 @@ pub fn compute_phrase_docids( if words.len() == 1 { if let Some(word) = &words[0] { if let Some(word_docids) = ctx.get_db_word_docids(*word)? { - return RoaringBitmapCodec::bytes_decode(word_docids) - .ok_or(heed::Error::Decoding.into()); + return Ok(word_docids); } else { return Ok(RoaringBitmap::new()); } @@ -158,7 +155,7 @@ pub fn compute_phrase_docids( { if dist == 0 { match ctx.get_db_word_pair_proximity_docids(s1, s2, 1)? { - Some(m) => bitmaps.push(CboRoaringBitmapCodec::deserialize_from(m)?), + Some(m) => bitmaps.push(m), // If there are no documents for this pair, there will be no // results for the phrase query. None => return Ok(RoaringBitmap::new()), @@ -169,7 +166,7 @@ pub fn compute_phrase_docids( if let Some(m) = ctx.get_db_word_pair_proximity_docids(s1, s2, dist as u8 + 1)? { - bitmap |= CboRoaringBitmapCodec::deserialize_from(m)?; + bitmap |= m; } } if bitmap.is_empty() { From 325f17488aa142d634b3aa3eb537b5eeee3e0b9c Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 11 Apr 2023 18:26:44 +0200 Subject: [PATCH 3/7] Add SearchContext::word_docids() method --- milli/src/search/new/db_cache.rs | 37 ++++++++++++++++++++++++++++++++ milli/src/search/new/mod.rs | 15 +++++++++++++ 2 files changed, 52 insertions(+) diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs index c1862244a..09845377c 100644 --- a/milli/src/search/new/db_cache.rs +++ b/milli/src/search/new/db_cache.rs @@ -8,6 +8,7 @@ use heed::{BytesDecode, BytesEncode, Database, RoTxn}; use roaring::RoaringBitmap; use super::interner::Interned; +use super::Word; use crate::{ CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, RoaringBitmapCodec, SearchContext, }; @@ -67,6 +68,26 @@ impl<'ctx> SearchContext<'ctx> { } } + pub fn word_docids(&mut self, word: Word) -> Result> { + match word { + Word::Original(word) => { + let exact = self.get_db_exact_word_docids(word)?; + let tolerant = self.get_db_word_docids(word)?; + Ok(match (exact, tolerant) { + (None, None) => None, + (None, Some(tolerant)) => Some(tolerant), + (Some(exact), None) => Some(exact), + (Some(exact), Some(tolerant)) => { + let mut both = exact; + both |= tolerant; + Some(both) + } + }) + } + Word::Derived(word) => self.get_db_word_docids(word), + } + } + /// Retrieve or insert the given value in the `word_docids` database. pub fn get_db_word_docids(&mut self, word: Interned) -> Result> { DatabaseCache::get_value( @@ -79,6 +100,22 @@ impl<'ctx> SearchContext<'ctx> { .map(|bytes| RoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into())) .transpose() } + + fn get_db_exact_word_docids( + &mut self, + word: Interned, + ) -> Result> { + DatabaseCache::get_value( + self.txn, + word, + self.word_interner.get(word).as_str(), + &mut self.db_cache.exact_word_docids, + self.index.exact_word_docids.remap_data_type::(), + )? + .map(|bytes| RoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into())) + .transpose() + } + /// Retrieve or insert the given value in the `word_prefix_docids` database. pub fn get_db_word_prefix_docids( &mut self, diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 7b15bcaab..f51d3771d 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -75,6 +75,21 @@ impl<'ctx> SearchContext<'ctx> { } } +#[derive(Clone, Copy, PartialEq, PartialOrd, Ord, Eq)] +pub enum Word { + Original(Interned), + Derived(Interned), +} + +impl Word { + pub fn interned(&self) -> Interned { + match self { + Word::Original(word) => *word, + Word::Derived(word) => *word, + } + } +} + /// Apply the [`TermsMatchingStrategy`] to the query graph and resolve it. #[allow(clippy::too_many_arguments)] fn resolve_maximally_reduced_query_graph( From 5ab46324c4fc4bcce2d7b250f82ad4175a1bb6b0 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 11 Apr 2023 18:27:41 +0200 Subject: [PATCH 4/7] Everyone uses the SearchContext::word_docids instead of get_db_word_docids make get_db_word_docids private --- milli/src/search/new/db_cache.rs | 2 +- milli/src/search/new/logger/visual.rs | 2 +- milli/src/search/new/mod.rs | 2 + milli/src/search/new/query_term/mod.rs | 42 +++++++++++++------ .../new/ranking_rule_graph/exactness/mod.rs | 5 ++- .../proximity/compute_docids.rs | 14 +++---- milli/src/search/new/resolve_query_graph.rs | 6 +-- 7 files changed, 46 insertions(+), 27 deletions(-) diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs index 09845377c..aa1c11773 100644 --- a/milli/src/search/new/db_cache.rs +++ b/milli/src/search/new/db_cache.rs @@ -89,7 +89,7 @@ impl<'ctx> SearchContext<'ctx> { } /// Retrieve or insert the given value in the `word_docids` database. - pub fn get_db_word_docids(&mut self, word: Interned) -> Result> { + fn get_db_word_docids(&mut self, word: Interned) -> Result> { DatabaseCache::get_value( self.txn, word, diff --git a/milli/src/search/new/logger/visual.rs b/milli/src/search/new/logger/visual.rs index 17f7ef76c..068b5ad68 100644 --- a/milli/src/search/new/logger/visual.rs +++ b/milli/src/search/new/logger/visual.rs @@ -427,7 +427,7 @@ fill: \"#B6E2D3\" )?; for w in term_subset.all_single_words_except_prefix_db(ctx)? { - let w = ctx.word_interner.get(w); + let w = ctx.word_interner.get(w.interned()); writeln!(file, "{w}: word")?; } for p in term_subset.all_phrases(ctx)? { diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index f51d3771d..9f8d8699f 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -50,6 +50,8 @@ use ranking_rules::{BoxRankingRule, RankingRule}; use resolve_query_graph::compute_query_graph_docids; use sort::Sort; +use self::interner::Interned; + /// A structure used throughout the execution of a search query. pub struct SearchContext<'ctx> { pub index: &'ctx Index, diff --git a/milli/src/search/new/query_term/mod.rs b/milli/src/search/new/query_term/mod.rs index 83320139b..0a0d1a7eb 100644 --- a/milli/src/search/new/query_term/mod.rs +++ b/milli/src/search/new/query_term/mod.rs @@ -3,18 +3,18 @@ mod ntypo_subset; mod parse_query; mod phrase; -use super::interner::{DedupInterner, Interned}; -use super::{limits, SearchContext}; -use crate::Result; use std::collections::BTreeSet; use std::ops::RangeInclusive; +use compute_derivations::partially_initialized_term_from_word; use either::Either; pub use ntypo_subset::NTypoTermSubset; pub use parse_query::{located_query_terms_from_string, make_ngram, number_of_typos_allowed}; pub use phrase::Phrase; -use compute_derivations::partially_initialized_term_from_word; +use super::interner::{DedupInterner, Interned}; +use super::{limits, SearchContext, Word}; +use crate::Result; /// A set of word derivations attached to a location in the search query. #[derive(Clone, PartialEq, Eq, Hash)] @@ -180,7 +180,7 @@ impl QueryTermSubset { pub fn all_single_words_except_prefix_db( &self, ctx: &mut SearchContext, - ) -> Result>> { + ) -> Result> { let mut result = BTreeSet::default(); // TODO: a compute_partially funtion if !self.one_typo_subset.is_empty() || !self.two_typo_subset.is_empty() { @@ -197,8 +197,20 @@ impl QueryTermSubset { synonyms: _, use_prefix_db: _, } = &original.zero_typo; - result.extend(zero_typo.iter().copied()); - result.extend(prefix_of.iter().copied()); + result.extend(zero_typo.iter().copied().map(|w| { + if original.ngram_words.is_some() { + Word::Derived(w) + } else { + Word::Original(w) + } + })); + result.extend(prefix_of.iter().copied().map(|w| { + if original.ngram_words.is_some() { + Word::Derived(w) + } else { + Word::Original(w) + } + })); } NTypoTermSubset::Subset { words, phrases: _ } => { let ZeroTypoTerm { @@ -210,10 +222,14 @@ impl QueryTermSubset { } = &original.zero_typo; if let Some(zero_typo) = zero_typo { if words.contains(zero_typo) { - result.insert(*zero_typo); + if original.ngram_words.is_some() { + result.insert(Word::Derived(*zero_typo)); + } else { + result.insert(Word::Original(*zero_typo)); + } } } - result.extend(prefix_of.intersection(words).copied()); + result.extend(prefix_of.intersection(words).copied().map(Word::Derived)); } NTypoTermSubset::Nothing => {} } @@ -223,13 +239,13 @@ impl QueryTermSubset { let Lazy::Init(OneTypoTerm { split_words: _, one_typo }) = &original.one_typo else { panic!() }; - result.extend(one_typo.iter().copied()) + result.extend(one_typo.iter().copied().map(Word::Derived)) } NTypoTermSubset::Subset { words, phrases: _ } => { let Lazy::Init(OneTypoTerm { split_words: _, one_typo }) = &original.one_typo else { panic!() }; - result.extend(one_typo.intersection(words)); + result.extend(one_typo.intersection(words).copied().map(Word::Derived)); } NTypoTermSubset::Nothing => {} }; @@ -239,13 +255,13 @@ impl QueryTermSubset { let Lazy::Init(TwoTypoTerm { two_typos }) = &original.two_typo else { panic!() }; - result.extend(two_typos.iter().copied()); + result.extend(two_typos.iter().copied().map(Word::Derived)); } NTypoTermSubset::Subset { words, phrases: _ } => { let Lazy::Init(TwoTypoTerm { two_typos }) = &original.two_typo else { panic!() }; - result.extend(two_typos.intersection(words)); + result.extend(two_typos.intersection(words).copied().map(Word::Derived)); } NTypoTermSubset::Nothing => {} }; diff --git a/milli/src/search/new/ranking_rule_graph/exactness/mod.rs b/milli/src/search/new/ranking_rule_graph/exactness/mod.rs index 4a3dd6549..7455a7a17 100644 --- a/milli/src/search/new/ranking_rule_graph/exactness/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/exactness/mod.rs @@ -3,7 +3,8 @@ use roaring::RoaringBitmap; use super::{ComputedCondition, RankingRuleGraphTrait}; use crate::search::new::interner::{DedupInterner, Interned}; use crate::search::new::query_term::{ExactTerm, LocatedQueryTermSubset}; -use crate::{Result, RoaringBitmapCodec, SearchContext}; +use crate::search::new::Word; +use crate::{Result, SearchContext}; #[derive(Clone, PartialEq, Eq, Hash)] pub enum ExactnessCondition { @@ -26,7 +27,7 @@ fn compute_docids( let mut candidates = match exact_term { ExactTerm::Phrase(phrase) => ctx.get_phrase_docids(phrase)?.clone(), ExactTerm::Word(word) => { - if let Some(word_candidates) = ctx.get_db_word_docids(word)? { + if let Some(word_candidates) = ctx.word_docids(Word::Original(word))? { word_candidates } else { return Ok(Default::default()); diff --git a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs index b6f164f16..760c7272c 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs @@ -9,7 +9,7 @@ use crate::search::new::interner::Interned; use crate::search::new::query_term::{Phrase, QueryTermSubset}; use crate::search::new::ranking_rule_graph::ComputedCondition; use crate::search::new::resolve_query_graph::compute_query_term_subset_docids; -use crate::search::new::SearchContext; +use crate::search::new::{SearchContext, Word}; use crate::Result; pub fn compute_docids( @@ -54,7 +54,7 @@ pub fn compute_docids( { compute_prefix_edges( ctx, - left_word, + left_word.interned(), right_prefix, left_phrase, forward_proximity, @@ -91,7 +91,7 @@ pub fn compute_docids( if universe.is_disjoint(ctx.get_phrase_docids(left_phrase)?) { continue; } - } else if let Some(left_word_docids) = ctx.get_db_word_docids(left_word)? { + } else if let Some(left_word_docids) = ctx.word_docids(left_word)? { if universe.is_disjoint(&left_word_docids) { continue; } @@ -101,7 +101,7 @@ pub fn compute_docids( for (right_word, right_phrase) in right_derivs { compute_non_prefix_edges( ctx, - left_word, + left_word.interned(), right_word, left_phrase, right_phrase, @@ -243,7 +243,7 @@ fn compute_non_prefix_edges( fn last_words_of_term_derivations( ctx: &mut SearchContext, t: &QueryTermSubset, -) -> Result>, Interned)>> { +) -> Result>, Word)>> { let mut result = BTreeSet::new(); for w in t.all_single_words_except_prefix_db(ctx)? { @@ -253,7 +253,7 @@ fn last_words_of_term_derivations( let phrase = ctx.phrase_interner.get(p); let last_term_of_phrase = phrase.words.last().unwrap(); if let Some(last_word) = last_term_of_phrase { - result.insert((Some(p), *last_word)); + result.insert((Some(p), Word::Original(*last_word))); } } @@ -266,7 +266,7 @@ fn first_word_of_term_iter( let mut result = BTreeSet::new(); let all_words = t.all_single_words_except_prefix_db(ctx)?; for w in all_words { - result.insert((w, None)); + result.insert((w.interned(), None)); } for p in t.all_phrases(ctx)? { let phrase = ctx.phrase_interner.get(p); diff --git a/milli/src/search/new/resolve_query_graph.rs b/milli/src/search/new/resolve_query_graph.rs index bca8b6268..c78f0c5ee 100644 --- a/milli/src/search/new/resolve_query_graph.rs +++ b/milli/src/search/new/resolve_query_graph.rs @@ -9,7 +9,7 @@ use super::interner::Interned; use super::query_graph::QueryNodeData; use super::query_term::{Phrase, QueryTermSubset}; use super::small_bitmap::SmallBitmap; -use super::{QueryGraph, SearchContext}; +use super::{QueryGraph, SearchContext, Word}; use crate::search::new::query_term::LocatedQueryTermSubset; use crate::Result; @@ -35,7 +35,7 @@ pub fn compute_query_term_subset_docids( ) -> Result { let mut docids = RoaringBitmap::new(); for word in term.all_single_words_except_prefix_db(ctx)? { - if let Some(word_docids) = ctx.get_db_word_docids(word)? { + if let Some(word_docids) = ctx.word_docids(word)? { docids |= word_docids; } } @@ -125,7 +125,7 @@ pub fn compute_phrase_docids( } if words.len() == 1 { if let Some(word) = &words[0] { - if let Some(word_docids) = ctx.get_db_word_docids(*word)? { + if let Some(word_docids) = ctx.word_docids(Word::Original(*word))? { return Ok(word_docids); } else { return Ok(RoaringBitmap::new()); From c20c38a7fa37cf9babc79018f1410958ca329f07 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 11 Apr 2023 22:04:38 +0200 Subject: [PATCH 5/7] Add SearchContext::word_prefix_docids() method --- milli/src/search/new/db_cache.rs | 36 ++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs index aa1c11773..fb36c0d9f 100644 --- a/milli/src/search/new/db_cache.rs +++ b/milli/src/search/new/db_cache.rs @@ -29,6 +29,7 @@ pub struct DatabaseCache<'ctx> { pub word_docids: FxHashMap, Option<&'ctx [u8]>>, pub exact_word_docids: FxHashMap, Option<&'ctx [u8]>>, pub word_prefix_docids: FxHashMap, Option<&'ctx [u8]>>, + pub exact_word_prefix_docids: FxHashMap, Option<&'ctx [u8]>>, pub words_fst: Option>>, pub word_position_docids: FxHashMap<(Interned, u16), Option<&'ctx [u8]>>, @@ -116,6 +117,26 @@ impl<'ctx> SearchContext<'ctx> { .transpose() } + pub fn word_prefix_docids(&mut self, prefix: Word) -> Result> { + match prefix { + Word::Original(prefix) => { + let exact = self.get_db_exact_word_prefix_docids(prefix)?; + let tolerant = self.get_db_word_prefix_docids(prefix)?; + Ok(match (exact, tolerant) { + (None, None) => None, + (None, Some(tolerant)) => Some(tolerant), + (Some(exact), None) => Some(exact), + (Some(exact), Some(tolerant)) => { + let mut both = exact; + both |= tolerant; + Some(both) + } + }) + } + Word::Derived(prefix) => self.get_db_word_prefix_docids(prefix), + } + } + /// Retrieve or insert the given value in the `word_prefix_docids` database. pub fn get_db_word_prefix_docids( &mut self, @@ -132,6 +153,21 @@ impl<'ctx> SearchContext<'ctx> { .transpose() } + fn get_db_exact_word_prefix_docids( + &mut self, + prefix: Interned, + ) -> Result> { + DatabaseCache::get_value( + self.txn, + prefix, + self.word_interner.get(prefix).as_str(), + &mut self.db_cache.exact_word_prefix_docids, + self.index.exact_word_prefix_docids.remap_data_type::(), + )? + .map(|bytes| RoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into())) + .transpose() + } + pub fn get_db_word_pair_proximity_docids( &mut self, word1: Interned, From 7a01f20df746997886aa57a1e74d9f9ffef82a5d Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 11 Apr 2023 22:06:10 +0200 Subject: [PATCH 6/7] Use word_prefix_docids, make get_word_prefix_docids private --- milli/src/search/new/db_cache.rs | 2 +- milli/src/search/new/logger/visual.rs | 2 +- milli/src/search/new/query_term/mod.rs | 13 ++++++++++--- .../ranking_rule_graph/proximity/compute_docids.rs | 2 +- milli/src/search/new/resolve_query_graph.rs | 2 +- 5 files changed, 14 insertions(+), 7 deletions(-) diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs index fb36c0d9f..6193f4c58 100644 --- a/milli/src/search/new/db_cache.rs +++ b/milli/src/search/new/db_cache.rs @@ -138,7 +138,7 @@ impl<'ctx> SearchContext<'ctx> { } /// Retrieve or insert the given value in the `word_prefix_docids` database. - pub fn get_db_word_prefix_docids( + fn get_db_word_prefix_docids( &mut self, prefix: Interned, ) -> Result> { diff --git a/milli/src/search/new/logger/visual.rs b/milli/src/search/new/logger/visual.rs index 068b5ad68..72e33f339 100644 --- a/milli/src/search/new/logger/visual.rs +++ b/milli/src/search/new/logger/visual.rs @@ -434,7 +434,7 @@ fill: \"#B6E2D3\" writeln!(file, "{}: phrase", p.description(ctx))?; } if let Some(w) = term_subset.use_prefix_db(ctx) { - let w = ctx.word_interner.get(w); + let w = ctx.word_interner.get(w.interned()); writeln!(file, "{w}: prefix db")?; } diff --git a/milli/src/search/new/query_term/mod.rs b/milli/src/search/new/query_term/mod.rs index 0a0d1a7eb..d8c2bb0c7 100644 --- a/milli/src/search/new/query_term/mod.rs +++ b/milli/src/search/new/query_term/mod.rs @@ -159,12 +159,12 @@ impl QueryTermSubset { self.two_typo_subset.intersect(&other.two_typo_subset); } - pub fn use_prefix_db(&self, ctx: &SearchContext) -> Option> { + pub fn use_prefix_db(&self, ctx: &SearchContext) -> Option { let original = ctx.term_interner.get(self.original); let Some(use_prefix_db) = original.zero_typo.use_prefix_db else { return None }; - match &self.zero_typo_subset { + let word = match &self.zero_typo_subset { NTypoTermSubset::All => Some(use_prefix_db), NTypoTermSubset::Subset { words, phrases: _ } => { // TODO: use a subset of prefix words instead @@ -175,7 +175,14 @@ impl QueryTermSubset { } } NTypoTermSubset::Nothing => None, - } + }; + word.map(|word| { + if original.ngram_words.is_some() { + Word::Derived(word) + } else { + Word::Original(word) + } + }) } pub fn all_single_words_except_prefix_db( &self, diff --git a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs index 760c7272c..3e75f948e 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs @@ -55,7 +55,7 @@ pub fn compute_docids( compute_prefix_edges( ctx, left_word.interned(), - right_prefix, + right_prefix.interned(), left_phrase, forward_proximity, backward_proximity, diff --git a/milli/src/search/new/resolve_query_graph.rs b/milli/src/search/new/resolve_query_graph.rs index c78f0c5ee..f4938ca12 100644 --- a/milli/src/search/new/resolve_query_graph.rs +++ b/milli/src/search/new/resolve_query_graph.rs @@ -44,7 +44,7 @@ pub fn compute_query_term_subset_docids( } if let Some(prefix) = term.use_prefix_db(ctx) { - if let Some(prefix_docids) = ctx.get_db_word_prefix_docids(prefix)? { + if let Some(prefix_docids) = ctx.word_prefix_docids(prefix)? { docids |= prefix_docids; } } From 38b7b31beb4e1e0631aee05f97367d38593508d9 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 12 Apr 2023 15:14:00 +0200 Subject: [PATCH 7/7] Decide to use prefix DB if the word is not an ngram --- .../new/query_term/compute_derivations.rs | 12 ++++++++++-- .../src/search/new/query_term/parse_query.rs | 19 +++++++++++++------ 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/milli/src/search/new/query_term/compute_derivations.rs b/milli/src/search/new/query_term/compute_derivations.rs index 12b8c3832..0da841890 100644 --- a/milli/src/search/new/query_term/compute_derivations.rs +++ b/milli/src/search/new/query_term/compute_derivations.rs @@ -177,6 +177,7 @@ pub fn partially_initialized_term_from_word( word: &str, max_typo: u8, is_prefix: bool, + is_ngram: bool, ) -> Result { let word_interned = ctx.word_interner.insert(word.to_owned()); @@ -197,12 +198,19 @@ pub fn partially_initialized_term_from_word( let fst = ctx.index.words_fst(ctx.txn)?; let use_prefix_db = is_prefix - && ctx + && (ctx .index .word_prefix_docids .remap_data_type::() .get(ctx.txn, word)? - .is_some(); + .is_some() + || (!is_ngram + && ctx + .index + .exact_word_prefix_docids + .remap_data_type::() + .get(ctx.txn, word)? + .is_some())); let use_prefix_db = if use_prefix_db { Some(word_interned) } else { None }; let mut zero_typo = None; diff --git a/milli/src/search/new/query_term/parse_query.rs b/milli/src/search/new/query_term/parse_query.rs index 5663f6b4b..91b888dcf 100644 --- a/milli/src/search/new/query_term/parse_query.rs +++ b/milli/src/search/new/query_term/parse_query.rs @@ -1,8 +1,8 @@ -use charabia::{normalizer::NormalizedTokenIter, SeparatorKind, TokenKind}; - -use crate::{Result, SearchContext, MAX_WORD_LENGTH}; +use charabia::normalizer::NormalizedTokenIter; +use charabia::{SeparatorKind, TokenKind}; use super::*; +use crate::{Result, SearchContext, MAX_WORD_LENGTH}; /// Convert the tokenised search query into a list of located query terms. // TODO: checking if the positions are correct for phrases, separators, ngrams @@ -51,6 +51,7 @@ pub fn located_query_terms_from_string( word, nbr_typos(word), false, + false, )?; let located_term = LocatedQueryTerm { value: ctx.term_interner.push(term), @@ -62,8 +63,13 @@ pub fn located_query_terms_from_string( } } else { let word = token.lemma(); - let term = - partially_initialized_term_from_word(ctx, word, nbr_typos(word), true)?; + let term = partially_initialized_term_from_word( + ctx, + word, + nbr_typos(word), + true, + false, + )?; let located_term = LocatedQueryTerm { value: ctx.term_interner.push(term), positions: position..=position, @@ -195,7 +201,8 @@ pub fn make_ngram( let max_nbr_typos = number_of_typos_allowed(ngram_str.as_str()).saturating_sub(terms.len() as u8 - 1); - let mut term = partially_initialized_term_from_word(ctx, &ngram_str, max_nbr_typos, is_prefix)?; + let mut term = + partially_initialized_term_from_word(ctx, &ngram_str, max_nbr_typos, is_prefix, true)?; // Now add the synonyms let index_synonyms = ctx.index.synonyms(ctx.txn)?;