From 406b8bd2489931afb2373ed60011e73d9530922e Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 4 Apr 2023 17:04:46 +0200 Subject: [PATCH 01/10] Add new db caches --- milli/src/search/new/db_cache.rs | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs index af94108e2..effd123be 100644 --- a/milli/src/search/new/db_cache.rs +++ b/milli/src/search/new/db_cache.rs @@ -24,6 +24,8 @@ pub struct DatabaseCache<'ctx> { pub word_docids: FxHashMap, Option<&'ctx [u8]>>, pub exact_word_docids: FxHashMap, Option<&'ctx [u8]>>, pub word_prefix_docids: FxHashMap, Option<&'ctx [u8]>>, + pub word_position_docids: FxHashMap<(Interned, u16), Option<&'ctx [u8]>>, + pub word_fid_docids: FxHashMap<(Interned, u16), Option<&'ctx [u8]>>, } impl<'ctx> DatabaseCache<'ctx> { fn get_value<'v, K1, KC>( @@ -128,4 +130,32 @@ impl<'ctx> SearchContext<'ctx> { self.index.prefix_word_pair_proximity_docids.remap_data_type::(), ) } + + pub fn get_db_word_position_docids( + &mut self, + word: Interned, + position: u16, + ) -> Result> { + DatabaseCache::get_value( + self.txn, + (word, position), + &(self.word_interner.get(word).as_str(), position), + &mut self.db_cache.word_position_docids, + self.index.word_position_docids.remap_data_type::(), + ) + } + + pub fn get_db_word_fid_docids( + &mut self, + word: Interned, + fid: u16, + ) -> Result> { + DatabaseCache::get_value( + self.txn, + (word, fid), + &(self.word_interner.get(word).as_str(), fid), + &mut self.db_cache.word_fid_docids, + self.index.word_fid_docids.remap_data_type::(), + ) + } } From ec2f8e804003f9ece9d49f3f616a60152b5c0ed2 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 4 Apr 2023 17:06:07 +0200 Subject: [PATCH 02/10] Rename `is_multiple_words` to `is_ngram` and `zero_typo` to `exact` --- milli/src/search/new/logger/detailed.rs | 4 ++-- milli/src/search/new/query_term.rs | 32 +++++++++++++++---------- 2 files changed, 21 
insertions(+), 15 deletions(-) diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs index 3a02950a8..3c4779ad9 100644 --- a/milli/src/search/new/logger/detailed.rs +++ b/milli/src/search/new/logger/detailed.rs @@ -441,7 +441,7 @@ results.{cur_ranking_rule}{cur_activated_id} {{ }) => { let QueryTerm { original, - is_multiple_words: _, + is_ngram: _, is_prefix: _, max_nbr_typos, zero_typo, @@ -458,7 +458,7 @@ results.{cur_ranking_rule}{cur_activated_id} {{ ) .unwrap(); - let ZeroTypoTerm { phrase, zero_typo, prefix_of, synonyms, use_prefix_db } = + let ZeroTypoTerm { phrase, exact: zero_typo, prefix_of, synonyms, use_prefix_db } = zero_typo; for w in zero_typo.iter().copied() { diff --git a/milli/src/search/new/query_term.rs b/milli/src/search/new/query_term.rs index d19ab6135..90b03d194 100644 --- a/milli/src/search/new/query_term.rs +++ b/milli/src/search/new/query_term.rs @@ -204,8 +204,13 @@ impl QueryTermSubset { } if !self.zero_typo_subset.is_empty() { - let ZeroTypoTerm { phrase: _, zero_typo, prefix_of, synonyms: _, use_prefix_db: _ } = - &original.zero_typo; + let ZeroTypoTerm { + phrase: _, + exact: zero_typo, + prefix_of, + synonyms: _, + use_prefix_db: _, + } = &original.zero_typo; result.extend(zero_typo.iter().copied()); result.extend(prefix_of.iter().copied()); }; @@ -258,7 +263,7 @@ impl QueryTermSubset { )?; } - let ZeroTypoTerm { phrase, zero_typo: _, prefix_of: _, synonyms, use_prefix_db: _ } = + let ZeroTypoTerm { phrase, exact: _, prefix_of: _, synonyms, use_prefix_db: _ } = &original.zero_typo; result.extend(phrase.iter().copied()); result.extend(synonyms.iter().copied()); @@ -302,7 +307,7 @@ impl QueryTerm { #[derive(Clone, PartialEq, Eq, Hash)] pub struct QueryTerm { pub original: Interned, - pub is_multiple_words: bool, + pub is_ngram: bool, pub max_nbr_typos: u8, pub is_prefix: bool, pub zero_typo: ZeroTypoTerm, @@ -318,7 +323,7 @@ pub struct ZeroTypoTerm { /// The original phrase, if any pub phrase: 
Option>, /// A single word equivalent to the original term, with zero typos - pub zero_typo: Option>, + pub exact: Option>, /// All the words that contain the original word as prefix pub prefix_of: BTreeSet>, /// All the synonyms of the original word or phrase @@ -341,7 +346,7 @@ pub struct TwoTypoTerm { impl ZeroTypoTerm { fn is_empty(&self) -> bool { - let ZeroTypoTerm { phrase, zero_typo, prefix_of, synonyms, use_prefix_db } = self; + let ZeroTypoTerm { phrase, exact: zero_typo, prefix_of, synonyms, use_prefix_db } = self; phrase.is_none() && zero_typo.is_none() && prefix_of.is_empty() @@ -370,12 +375,12 @@ impl QueryTerm { ) -> Self { Self { original: word_interner.insert(phrase.description(word_interner)), - is_multiple_words: false, + is_ngram: false, max_nbr_typos: 0, is_prefix: false, zero_typo: ZeroTypoTerm { phrase: Some(phrase_interner.insert(phrase)), - zero_typo: None, + exact: None, prefix_of: BTreeSet::default(), synonyms: BTreeSet::default(), use_prefix_db: None, @@ -387,7 +392,7 @@ impl QueryTerm { pub fn empty(word_interner: &mut DedupInterner, original: &str) -> Self { Self { original: word_interner.insert(original.to_owned()), - is_multiple_words: false, + is_ngram: false, is_prefix: false, max_nbr_typos: 0, zero_typo: <_>::default(), @@ -606,11 +611,12 @@ fn partially_initialized_term_from_word( Some(ctx.phrase_interner.insert(Phrase { words })) }) .collect(); - let zero_typo = ZeroTypoTerm { phrase: None, zero_typo, prefix_of, synonyms, use_prefix_db }; + let zero_typo = + ZeroTypoTerm { phrase: None, exact: zero_typo, prefix_of, synonyms, use_prefix_db }; Ok(QueryTerm { original: word_interned, - is_multiple_words: false, + is_ngram: false, max_nbr_typos: max_typo, is_prefix, zero_typo, @@ -765,7 +771,7 @@ fn split_best_frequency( impl QueryTerm { /// Return the original word from the given query term pub fn original_single_word(&self) -> Option> { - if self.is_multiple_words { + if self.is_ngram { None } else { Some(self.original) @@ -1039,7 
+1045,7 @@ pub fn make_ngram( let term = QueryTerm { original, - is_multiple_words: true, + is_ngram: true, is_prefix, max_nbr_typos, zero_typo: term.zero_typo, From 4d5bc9df4c4f3145ebc72ce73d3e51325f6fba1c Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 4 Apr 2023 17:07:26 +0200 Subject: [PATCH 03/10] Increase position by 8 on hard separator when building query terms --- milli/src/search/new/query_term.rs | 2 +- .../index_documents/extract/extract_docid_word_positions.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/milli/src/search/new/query_term.rs b/milli/src/search/new/query_term.rs index 90b03d194..005c0a2e3 100644 --- a/milli/src/search/new/query_term.rs +++ b/milli/src/search/new/query_term.rs @@ -907,7 +907,7 @@ pub fn located_query_terms_from_string( TokenKind::Separator(separator_kind) => { match separator_kind { SeparatorKind::Hard => { - position += 1; + position += 8; } SeparatorKind::Soft => { position += 0; diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index 2d51fcc1a..c362f8f1b 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -153,7 +153,7 @@ fn json_to_string<'a>(value: &'a Value, buffer: &'a mut String) -> Option<&'a st /// take an iterator on tokens and compute their relative position depending on separator kinds /// if it's an `Hard` separator we add an additional relative proximity of 8 between words, -/// else we keep the standart proximity of 1 between words. +/// else we keep the standard proximity of 1 between words. 
fn process_tokens<'a>( tokens: impl Iterator>, ) -> impl Iterator)> { From 3951fe22ab72e9d9e44498c9d95ad29a0449a8dc Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 4 Apr 2023 17:09:32 +0200 Subject: [PATCH 04/10] Add ExactTerm and helper method --- milli/src/search/new/query_term.rs | 37 ++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/milli/src/search/new/query_term.rs b/milli/src/search/new/query_term.rs index 005c0a2e3..4e3922980 100644 --- a/milli/src/search/new/query_term.rs +++ b/milli/src/search/new/query_term.rs @@ -4,6 +4,7 @@ use std::ops::{ControlFlow, RangeInclusive}; use charabia::normalizer::NormalizedTokenIter; use charabia::{SeparatorKind, TokenKind}; +use either::Either; use fst::automaton::Str; use fst::{Automaton, IntoStreamer, Streamer}; use heed::types::DecodeIgnore; @@ -138,7 +139,43 @@ pub struct LocatedQueryTermSubset { pub term_ids: RangeInclusive, } +#[derive(Clone, Copy)] +pub enum ExactTerm { + Phrase(Interned), + Word(Interned), +} + +impl ExactTerm { + pub fn interned_words<'ctx>( + &self, + ctx: &'ctx SearchContext<'ctx>, + ) -> impl Iterator>> + 'ctx { + match *self { + ExactTerm::Phrase(phrase) => { + let phrase = ctx.phrase_interner.get(phrase); + Either::Left(phrase.words.iter().copied()) + } + ExactTerm::Word(word) => Either::Right(std::iter::once(Some(word))), + } + } +} + impl QueryTermSubset { + pub fn exact_term(&self, ctx: &SearchContext) -> Option { + let full_query_term = ctx.term_interner.get(self.original); + if full_query_term.is_ngram { + return None; + } + // TODO: included in subset + if let Some(phrase) = full_query_term.zero_typo.phrase { + self.zero_typo_subset.contains_phrase(phrase).then_some(ExactTerm::Phrase(phrase)) + } else if let Some(word) = full_query_term.zero_typo.exact { + self.zero_typo_subset.contains_word(word).then_some(ExactTerm::Word(word)) + } else { + None + } + } + pub fn empty(for_term: Interned) -> Self { Self { original: for_term, From 
4b4ffb8ec993729fb53467a2899b198a14d320f9 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 4 Apr 2023 17:12:07 +0200 Subject: [PATCH 05/10] Add exactness ranking rules --- milli/src/search/new/exact_attribute.rs | 175 ++++++++++++++++++ .../search/new/graph_based_ranking_rule.rs | 10 +- milli/src/search/new/mod.rs | 12 +- .../new/ranking_rule_graph/exactness/mod.rs | 107 +++++++++++ .../src/search/new/ranking_rule_graph/mod.rs | 3 + 5 files changed, 301 insertions(+), 6 deletions(-) create mode 100644 milli/src/search/new/exact_attribute.rs create mode 100644 milli/src/search/new/ranking_rule_graph/exactness/mod.rs diff --git a/milli/src/search/new/exact_attribute.rs b/milli/src/search/new/exact_attribute.rs new file mode 100644 index 000000000..bb6299e28 --- /dev/null +++ b/milli/src/search/new/exact_attribute.rs @@ -0,0 +1,175 @@ +use heed::BytesDecode; +use roaring::MultiOps; + +use super::query_graph::QueryGraph; +use super::ranking_rules::{RankingRule, RankingRuleOutput}; +use crate::search::new::query_graph::QueryNodeData; +use crate::search::new::query_term::ExactTerm; +use crate::{CboRoaringBitmapCodec, Result, SearchContext, SearchLogger}; + +/// FIXME: +/// +/// - A lot of work done in next_bucket that start_iteration could do. +/// - Consider calling the graph based rule directly from this one. 
+/// - currently we did exact term, don't forget about prefix +/// - some tests +pub struct ExactAttribute { + query_graph: Option, +} + +impl ExactAttribute { + pub fn new() -> Self { + Self { query_graph: None } + } +} + +impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute { + fn id(&self) -> String { + "exact_attribute".to_owned() + } + + fn start_iteration( + &mut self, + _ctx: &mut SearchContext<'ctx>, + _logger: &mut dyn SearchLogger, + _universe: &roaring::RoaringBitmap, + query: &QueryGraph, + ) -> Result<()> { + self.query_graph = Some(query.clone()); + Ok(()) + } + + fn next_bucket( + &mut self, + ctx: &mut SearchContext<'ctx>, + _logger: &mut dyn SearchLogger, + universe: &roaring::RoaringBitmap, + ) -> Result>> { + // iterate on the nodes of the graph, retain LocatedQueryTermSubset + let query_graph = self.query_graph.as_ref().unwrap(); + let mut exact_term_position_ids: Vec<(ExactTerm, u16, u8)> = + Vec::with_capacity(query_graph.nodes.len() as usize); + for (_, node) in query_graph.nodes.iter() { + match &node.data { + QueryNodeData::Term(term) => { + let exact_term = if let Some(exact_term) = term.term_subset.exact_term(ctx) { + exact_term + } else { + // FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules + return Ok(Some(RankingRuleOutput { + query: query_graph.clone(), + candidates: universe.clone(), + })); + }; + exact_term_position_ids.push(( + exact_term, + *term.positions.start(), + *term.term_ids.start(), + )) + } + QueryNodeData::Deleted | QueryNodeData::Start | QueryNodeData::End => continue, + } + } + + exact_term_position_ids.sort_by_key(|(_, _, id)| *id); + // bail if there is a "hole" (missing word) in remaining query graph + let mut previous_id = 0; + for (_, _, id) in exact_term_position_ids.iter().copied() { + if id < previous_id || id - previous_id > 1 { + // FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules + return 
Ok(Some(RankingRuleOutput { + query: query_graph.clone(), + candidates: universe.clone(), + })); + } else { + previous_id = id; + } + } + + // sample query: "sunflower are pretty" + // sunflower at pos 0 in attr A + // are at pos 1 in attr B + // pretty at pos 2 in attr C + // We want to eliminate such document + + // first check that for each term, there exists some attribute that has this term at the correct position + //"word-position-docids"; + let mut candidates = universe.clone(); + let words_positions: Vec<(Vec<_>, _)> = exact_term_position_ids + .iter() + .copied() + .map(|(term, position, _)| (term.interned_words(ctx).collect(), position)) + .collect(); + for (words, position) in &words_positions { + if candidates.is_empty() { + // FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules + return Ok(Some(RankingRuleOutput { + query: query_graph.clone(), + candidates: universe.clone(), + })); + } + + 'words: for (offset, word) in words.iter().enumerate() { + let offset = offset as u16; + let word = if let Some(word) = word { + word + } else { + continue 'words; + }; + let word_position_docids = CboRoaringBitmapCodec::bytes_decode( + ctx.get_db_word_position_docids(*word, position + offset)?.unwrap_or_default(), + ) + .unwrap_or_default(); + candidates &= word_position_docids; + } + } + + let candidates = candidates; + + if candidates.is_empty() { + // FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules + return Ok(Some(RankingRuleOutput { + query: query_graph.clone(), + candidates: universe.clone(), + })); + } + + let searchable_fields_ids = ctx.index.searchable_fields_ids(ctx.txn)?.unwrap_or_default(); + + let mut candidates_per_attributes = Vec::with_capacity(searchable_fields_ids.len()); + + // then check that there exists at least one attribute that has all of the terms + for fid in searchable_fields_ids { + let mut intersection = MultiOps::intersection( + 
words_positions + .iter() + .flat_map(|(words, ..)| words.iter()) + // ignore stop words words in phrases + .flatten() + .map(|word| -> Result<_> { + Ok(ctx + .get_db_word_fid_docids(*word, fid)? + .map(CboRoaringBitmapCodec::bytes_decode) + .unwrap_or_default() + .unwrap_or_default()) + }), + )?; + intersection &= &candidates; + if !intersection.is_empty() { + candidates_per_attributes.push(intersection); + } + } + // note we could have "false positives" where there both exist different attributes that collectively + // have the terms in the correct order and a single attribute that have all the terms, but in the incorrect order. + + let candidates = MultiOps::union(candidates_per_attributes.into_iter()); + Ok(Some(RankingRuleOutput { query: query_graph.clone(), candidates })) + } + + fn end_iteration( + &mut self, + _ctx: &mut SearchContext<'ctx>, + _logger: &mut dyn SearchLogger, + ) { + } +} diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs index b8c58c726..28b4ed1f4 100644 --- a/milli/src/search/new/graph_based_ranking_rule.rs +++ b/milli/src/search/new/graph_based_ranking_rule.rs @@ -44,8 +44,8 @@ use super::interner::{Interned, MappedInterner}; use super::logger::SearchLogger; use super::query_graph::QueryNode; use super::ranking_rule_graph::{ - ConditionDocIdsCache, DeadEndsCache, ProximityGraph, RankingRuleGraph, RankingRuleGraphTrait, - TypoGraph, + ConditionDocIdsCache, DeadEndsCache, ExactnessGraph, ProximityGraph, RankingRuleGraph, + RankingRuleGraphTrait, TypoGraph, }; use super::small_bitmap::SmallBitmap; use super::{QueryGraph, RankingRule, RankingRuleOutput, SearchContext}; @@ -65,6 +65,12 @@ impl GraphBasedRankingRule { Self::new_with_id("typo".to_owned(), terms_matching_strategy) } } +pub type Exactness = GraphBasedRankingRule; +impl GraphBasedRankingRule { + pub fn new() -> Self { + Self::new_with_id("exactness".to_owned(), None) + } +} /// A generic graph-based ranking rule pub 
struct GraphBasedRankingRule { diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 4d561d25b..779e589b3 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -9,8 +9,9 @@ mod query_term; mod ranking_rule_graph; mod ranking_rules; mod resolve_query_graph; -// TODO: documentation + comments mod small_bitmap; + +mod exact_attribute; // TODO: documentation + comments // implementation is currently an adaptation of the previous implementation to fit with the new model mod sort; @@ -33,6 +34,8 @@ use resolve_query_graph::PhraseDocIdsCache; use roaring::RoaringBitmap; use words::Words; +use self::exact_attribute::ExactAttribute; +use self::graph_based_ranking_rule::Exactness; use self::interner::Interner; use self::ranking_rules::{BoxRankingRule, RankingRule}; use self::resolve_query_graph::compute_query_graph_docids; @@ -150,7 +153,7 @@ fn get_ranking_rules_for_query_graph_search<'ctx>( let mut proximity = false; let mut sort = false; let attribute = false; - let exactness = false; + let mut exactness = false; let mut asc = HashSet::new(); let mut desc = HashSet::new(); @@ -211,8 +214,9 @@ fn get_ranking_rules_for_query_graph_search<'ctx>( if exactness { continue; } - // todo!(); - // exactness = false; + ranking_rules.push(Box::new(ExactAttribute::new())); + ranking_rules.push(Box::new(Exactness::new())); + exactness = true; } crate::Criterion::Asc(field_name) => { if asc.contains(&field_name) { diff --git a/milli/src/search/new/ranking_rule_graph/exactness/mod.rs b/milli/src/search/new/ranking_rule_graph/exactness/mod.rs new file mode 100644 index 000000000..a1e19a015 --- /dev/null +++ b/milli/src/search/new/ranking_rule_graph/exactness/mod.rs @@ -0,0 +1,107 @@ +use roaring::RoaringBitmap; + +use super::{ComputedCondition, DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait}; +use crate::search::new::interner::{DedupInterner, Interned, MappedInterner}; +use crate::search::new::query_graph::{QueryGraph, QueryNode}; 
+use crate::search::new::query_term::{ExactTerm, LocatedQueryTermSubset}; +use crate::{CboRoaringBitmapCodec, Result, SearchContext, SearchLogger}; + +/// - Exactness as first ranking rule: TermsMatchingStrategy? prefer a document that matches 1 word exactly and no other +/// word than a doc that matches 9 words non exactly but none exactly +/// - `TermsMatchingStrategy` as a word + exactness optimization: we could consider +/// +/// "naive vision" +/// condition from one node to another: +/// - word exactly present: cost 0 +/// - word typo/ngram/prefix/missing: cost 1, not remove from query graph, edge btwn the two nodes, return the universe without condition when resolving, destination query term is inside +/// +/// Three strategies: +/// 1. ExactAttribute: word position / word_fid_docid +/// 2. AttributeStart: +/// 3. AttributeContainsExact => implementable via `RankingRuleGraphTrait` + +#[derive(Clone, PartialEq, Eq, Hash)] +pub enum ExactnessCondition { + ExactInAttribute(LocatedQueryTermSubset), + Skip(LocatedQueryTermSubset), +} + +pub enum ExactnessGraph {} + +fn compute_docids( + ctx: &mut SearchContext, + dest_node: &LocatedQueryTermSubset, + universe: &RoaringBitmap, +) -> Result { + let exact_term = if let Some(exact_term) = dest_node.term_subset.exact_term(ctx) { + exact_term + } else { + return Ok(Default::default()); + }; + let mut candidates = match exact_term { + ExactTerm::Phrase(phrase) => ctx.get_phrase_docids(phrase)?.clone(), + ExactTerm::Word(word) => { + if let Some(word_candidates) = ctx.get_db_word_docids(word)? { + CboRoaringBitmapCodec::deserialize_from(word_candidates)? + } else { + return Ok(Default::default()); + } + } + }; + // TODO: synonyms? 
+ candidates &= universe; + Ok(candidates) +} + +impl RankingRuleGraphTrait for ExactnessGraph { + type Condition = ExactnessCondition; + + fn resolve_condition( + ctx: &mut SearchContext, + condition: &Self::Condition, + universe: &RoaringBitmap, + ) -> Result { + let (docids, dest_node) = match condition { + ExactnessCondition::ExactInAttribute(dest_node) => { + (compute_docids(ctx, dest_node, universe)?, dest_node) + } + ExactnessCondition::Skip(dest_node) => (universe.clone(), dest_node), + }; + Ok(ComputedCondition { + docids, + universe_len: universe.len(), + start_term_subset: None, + end_term_subset: dest_node.clone(), + }) + } + + fn build_edges( + _ctx: &mut SearchContext, + conditions_interner: &mut DedupInterner, + _source_node: Option<&LocatedQueryTermSubset>, + dest_node: &LocatedQueryTermSubset, + ) -> Result)>> { + let exact_condition = ExactnessCondition::ExactInAttribute(dest_node.clone()); + let exact_condition = conditions_interner.insert(exact_condition); + + let skip_condition = ExactnessCondition::Skip(dest_node.clone()); + let skip_condition = conditions_interner.insert(skip_condition); + Ok(vec![(0, exact_condition), (1, skip_condition)]) + } + + fn log_state( + graph: &RankingRuleGraph, + paths: &[Vec>], + dead_ends_cache: &DeadEndsCache, + universe: &RoaringBitmap, + costs: &MappedInterner>, + cost: u64, + logger: &mut dyn SearchLogger, + ) { + todo!() + } + + fn label_for_condition(ctx: &mut SearchContext, condition: &Self::Condition) -> Result { + todo!() + } +} diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs index 7c40008c8..936c3e942 100644 --- a/milli/src/search/new/ranking_rule_graph/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/mod.rs @@ -10,6 +10,8 @@ mod cheapest_paths; mod condition_docids_cache; mod dead_ends_cache; +/// Implementation of the `exactness` ranking rule +mod exactness; /// Implementation of the `proximity` ranking rule mod proximity; /// 
Implementation of the `typo` ranking rule @@ -20,6 +22,7 @@ use std::hash::Hash; pub use cheapest_paths::PathVisitor; pub use condition_docids_cache::ConditionDocIdsCache; pub use dead_ends_cache::DeadEndsCache; +pub use exactness::{ExactnessCondition, ExactnessGraph}; pub use proximity::{ProximityCondition, ProximityGraph}; use roaring::RoaringBitmap; pub use typo::{TypoCondition, TypoGraph}; From 337e75b0e4fa63566ed5ac516f504a723117ecb2 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 5 Apr 2023 14:42:51 +0200 Subject: [PATCH 06/10] Exact attribute with state --- milli/src/search/new/exact_attribute.rs | 166 +++++++++++++++++------- 1 file changed, 122 insertions(+), 44 deletions(-) diff --git a/milli/src/search/new/exact_attribute.rs b/milli/src/search/new/exact_attribute.rs index bb6299e28..fa837272b 100644 --- a/milli/src/search/new/exact_attribute.rs +++ b/milli/src/search/new/exact_attribute.rs @@ -1,5 +1,5 @@ use heed::BytesDecode; -use roaring::MultiOps; +use roaring::{MultiOps, RoaringBitmap}; use super::query_graph::QueryGraph; use super::ranking_rules::{RankingRule, RankingRuleOutput}; @@ -7,19 +7,18 @@ use crate::search::new::query_graph::QueryNodeData; use crate::search::new::query_term::ExactTerm; use crate::{CboRoaringBitmapCodec, Result, SearchContext, SearchLogger}; -/// FIXME: +/// A ranking rule that produces 3 disjoint buckets: /// -/// - A lot of work done in next_bucket that start_iteration could do. -/// - Consider calling the graph based rule directly from this one. -/// - currently we did exact term, don't forget about prefix -/// - some tests +/// 1. Documents from the universe whose value is exactly the query. +/// 2. Documents from the universe not in (1) whose value starts with the query. +/// 3. Documents from the universe not in (1) or (2). 
pub struct ExactAttribute { - query_graph: Option, + state: State, } impl ExactAttribute { pub fn new() -> Self { - Self { query_graph: None } + Self { state: Default::default() } } } @@ -30,23 +29,69 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute { fn start_iteration( &mut self, - _ctx: &mut SearchContext<'ctx>, + ctx: &mut SearchContext<'ctx>, _logger: &mut dyn SearchLogger, - _universe: &roaring::RoaringBitmap, + universe: &roaring::RoaringBitmap, query: &QueryGraph, ) -> Result<()> { - self.query_graph = Some(query.clone()); + self.state = State::start_iteration(ctx, universe, query)?; + Ok(()) } fn next_bucket( &mut self, - ctx: &mut SearchContext<'ctx>, + _ctx: &mut SearchContext<'ctx>, _logger: &mut dyn SearchLogger, universe: &roaring::RoaringBitmap, ) -> Result>> { - // iterate on the nodes of the graph, retain LocatedQueryTermSubset - let query_graph = self.query_graph.as_ref().unwrap(); + let state = std::mem::take(&mut self.state); + let (state, output) = State::next(state, universe); + self.state = state; + + Ok(output) + } + + fn end_iteration( + &mut self, + _ctx: &mut SearchContext<'ctx>, + _logger: &mut dyn SearchLogger, + ) { + self.state = Default::default(); + } +} + +/// Inner state of the ranking rule. +#[derive(Default)] +enum State { + /// State between two iterations + #[default] + Uninitialized, + /// The next call to `next` will output the documents in the universe that have an attribute that is the exact query + ExactAttribute(QueryGraph, Vec), + /// The next call to `next` will output the documents in the universe that have an attribute that starts with the exact query, + /// but isn't the exact query. + AttributeStarts(QueryGraph, Vec), + /// The next calls to `next` will output the input universe. + Empty(QueryGraph), +} + +/// The candidates sorted by attributes +/// +/// Each of the bitmap in a single `FieldCandidates` struct applies to the same field. 
+struct FieldCandidates { + /// The candidates that start with all the words of the query in the field + start_with_exact: RoaringBitmap, + /// The candidates that have the same number of words as the query in the field + exact_word_count: RoaringBitmap, +} + +impl State { + fn start_iteration( + ctx: &mut SearchContext<'_>, + universe: &RoaringBitmap, + query_graph: &QueryGraph, + ) -> Result { let mut exact_term_position_ids: Vec<(ExactTerm, u16, u8)> = Vec::with_capacity(query_graph.nodes.len() as usize); for (_, node) in query_graph.nodes.iter() { @@ -55,11 +100,7 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute { let exact_term = if let Some(exact_term) = term.term_subset.exact_term(ctx) { exact_term } else { - // FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules - return Ok(Some(RankingRuleOutput { - query: query_graph.clone(), - candidates: universe.clone(), - })); + continue; }; exact_term_position_ids.push(( exact_term, @@ -73,14 +114,17 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute { exact_term_position_ids.sort_by_key(|(_, _, id)| *id); // bail if there is a "hole" (missing word) in remaining query graph + if let Some((_, _, first_id)) = exact_term_position_ids.first() { + if *first_id != 0 { + return Ok(State::Empty(query_graph.clone())); + } + } else { + return Ok(State::Empty(query_graph.clone())); + } let mut previous_id = 0; for (_, _, id) in exact_term_position_ids.iter().copied() { if id < previous_id || id - previous_id > 1 { - // FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules - return Ok(Some(RankingRuleOutput { - query: query_graph.clone(), - candidates: universe.clone(), - })); + return Ok(State::Empty(query_graph.clone())); } else { previous_id = id; } @@ -102,11 +146,7 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute { .collect(); for (words, position) in &words_positions { if 
candidates.is_empty() { - // FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules - return Ok(Some(RankingRuleOutput { - query: query_graph.clone(), - candidates: universe.clone(), - })); + return Ok(State::Empty(query_graph.clone())); } 'words: for (offset, word) in words.iter().enumerate() { @@ -116,8 +156,11 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute { } else { continue 'words; }; + // Note: Since the position is stored bucketed in word_position_docids, for queries with a lot of + // longer phrases we'll be losing on precision here. + let bucketed_position = crate::bucketed_position(position + offset); let word_position_docids = CboRoaringBitmapCodec::bytes_decode( - ctx.get_db_word_position_docids(*word, position + offset)?.unwrap_or_default(), + ctx.get_db_word_position_docids(*word, bucketed_position)?.unwrap_or_default(), ) .unwrap_or_default(); candidates &= word_position_docids; @@ -127,16 +170,12 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute { let candidates = candidates; if candidates.is_empty() { - // FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules - return Ok(Some(RankingRuleOutput { - query: query_graph.clone(), - candidates: universe.clone(), - })); + return Ok(State::Empty(query_graph.clone())); } let searchable_fields_ids = ctx.index.searchable_fields_ids(ctx.txn)?.unwrap_or_default(); - let mut candidates_per_attributes = Vec::with_capacity(searchable_fields_ids.len()); + let mut candidates_per_attribute = Vec::with_capacity(searchable_fields_ids.len()); // then check that there exists at least one attribute that has all of the terms for fid in searchable_fields_ids { @@ -156,20 +195,59 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute { )?; intersection &= &candidates; if !intersection.is_empty() { - candidates_per_attributes.push(intersection); + let candidates_with_exact_word_count = ctx + .index + 
.field_id_word_count_docids + .get(ctx.txn, &(fid, exact_term_position_ids.len() as u8))? + .unwrap_or_default(); + candidates_per_attribute.push(FieldCandidates { + start_with_exact: intersection, + exact_word_count: candidates_with_exact_word_count, + }); } } // note we could have "false positives" where there both exist different attributes that collectively // have the terms in the correct order and a single attribute that have all the terms, but in the incorrect order. - let candidates = MultiOps::union(candidates_per_attributes.into_iter()); - Ok(Some(RankingRuleOutput { query: query_graph.clone(), candidates })) + Ok(State::ExactAttribute(query_graph.clone(), candidates_per_attribute)) } - fn end_iteration( - &mut self, - _ctx: &mut SearchContext<'ctx>, - _logger: &mut dyn SearchLogger, - ) { + fn next( + state: State, + universe: &RoaringBitmap, + ) -> (State, Option>) { + let (state, output) = match state { + State::Uninitialized => (state, None), + State::ExactAttribute(query_graph, candidates_per_attribute) => { + let mut candidates = MultiOps::union(candidates_per_attribute.iter().map( + |FieldCandidates { start_with_exact, exact_word_count }| { + start_with_exact & exact_word_count + }, + )); + candidates &= universe; + ( + State::AttributeStarts(query_graph.clone(), candidates_per_attribute), + Some(RankingRuleOutput { query: query_graph, candidates }), + ) + } + State::AttributeStarts(query_graph, candidates_per_attribute) => { + let mut candidates = MultiOps::union(candidates_per_attribute.into_iter().map( + |FieldCandidates { mut start_with_exact, exact_word_count }| { + start_with_exact -= exact_word_count; + start_with_exact + }, + )); + candidates &= universe; + ( + State::Empty(query_graph.clone()), + Some(RankingRuleOutput { query: query_graph, candidates }), + ) + } + State::Empty(query_graph) => ( + State::Empty(query_graph.clone()), + Some(RankingRuleOutput { query: query_graph, candidates: universe.clone() }), + ), + }; + (state, output) } 
} From f7ecea142ec3c3d1403ad00969e37d211c861125 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 5 Apr 2023 14:43:16 +0200 Subject: [PATCH 07/10] Fix panics and issues in exactness graph ranking rule --- .../new/ranking_rule_graph/exactness/mod.rs | 48 ++++++++----------- .../extract/extract_word_position_docids.rs | 5 +- 2 files changed, 22 insertions(+), 31 deletions(-) diff --git a/milli/src/search/new/ranking_rule_graph/exactness/mod.rs b/milli/src/search/new/ranking_rule_graph/exactness/mod.rs index a1e19a015..3d558e87b 100644 --- a/milli/src/search/new/ranking_rule_graph/exactness/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/exactness/mod.rs @@ -1,24 +1,11 @@ +use heed::BytesDecode; use roaring::RoaringBitmap; use super::{ComputedCondition, DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait}; use crate::search::new::interner::{DedupInterner, Interned, MappedInterner}; use crate::search::new::query_graph::{QueryGraph, QueryNode}; use crate::search::new::query_term::{ExactTerm, LocatedQueryTermSubset}; -use crate::{CboRoaringBitmapCodec, Result, SearchContext, SearchLogger}; - -/// - Exactness as first ranking rule: TermsMatchingStrategy? prefer a document that matches 1 word exactly and no other -/// word than a doc that matches 9 words non exactly but none exactly -/// - `TermsMatchingStrategy` as a word + exactness optimization: we could consider -/// -/// "naive vision" -/// condition from one node to another: -/// - word exactly present: cost 0 -/// - word typo/ngram/prefix/missing: cost 1, not remove from query graph, edge btwn the two nodes, return the universe without condition when resolving, destination query term is inside -/// -/// Three strategies: -/// 1. ExactAttribute: word position / word_fid_docid -/// 2. AttributeStart: -/// 3. 
AttributeContainsExact => implementable via `RankingRuleGraphTrait` +use crate::{Result, RoaringBitmapCodec, SearchContext, SearchLogger}; #[derive(Clone, PartialEq, Eq, Hash)] pub enum ExactnessCondition { @@ -42,7 +29,7 @@ fn compute_docids( ExactTerm::Phrase(phrase) => ctx.get_phrase_docids(phrase)?.clone(), ExactTerm::Word(word) => { if let Some(word_candidates) = ctx.get_db_word_docids(word)? { - CboRoaringBitmapCodec::deserialize_from(word_candidates)? + RoaringBitmapCodec::bytes_decode(word_candidates).ok_or(heed::Error::Decoding)? } else { return Ok(Default::default()); } @@ -86,22 +73,29 @@ impl RankingRuleGraphTrait for ExactnessGraph { let skip_condition = ExactnessCondition::Skip(dest_node.clone()); let skip_condition = conditions_interner.insert(skip_condition); - Ok(vec![(0, exact_condition), (1, skip_condition)]) + + Ok(vec![(0, exact_condition), (dest_node.term_ids.len() as u32, skip_condition)]) } fn log_state( - graph: &RankingRuleGraph, - paths: &[Vec>], - dead_ends_cache: &DeadEndsCache, - universe: &RoaringBitmap, - costs: &MappedInterner>, - cost: u64, - logger: &mut dyn SearchLogger, + _graph: &RankingRuleGraph, + _paths: &[Vec>], + _dead_ends_cache: &DeadEndsCache, + _universe: &RoaringBitmap, + _costs: &MappedInterner>, + _cost: u64, + _logger: &mut dyn SearchLogger, ) { - todo!() } - fn label_for_condition(ctx: &mut SearchContext, condition: &Self::Condition) -> Result { - todo!() + fn label_for_condition( + _ctx: &mut SearchContext, + condition: &Self::Condition, + ) -> Result { + Ok(match condition { + ExactnessCondition::ExactInAttribute(_) => "exact", + ExactnessCondition::Skip(_) => "skip", + } + .to_owned()) } } diff --git a/milli/src/update/index_documents/extract/extract_word_position_docids.rs b/milli/src/update/index_documents/extract/extract_word_position_docids.rs index cd3ec691b..eef5089bc 100644 --- a/milli/src/update/index_documents/extract/extract_word_position_docids.rs +++ 
b/milli/src/update/index_documents/extract/extract_word_position_docids.rs @@ -7,10 +7,7 @@ use super::helpers::{ }; use crate::error::SerializationError; use crate::index::db_name::DOCID_WORD_POSITIONS; -use crate::{ - absolute_from_relative_position, bucketed_position, relative_from_absolute_position, - DocumentId, Result, -}; +use crate::{bucketed_position, relative_from_absolute_position, DocumentId, Result}; /// Extracts the word positions and the documents ids where this word appear. /// From d1ddaa223d39b7ba74fde7f9a72b04662931935f Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 5 Apr 2023 18:05:44 +0200 Subject: [PATCH 08/10] Use correct codec in proximity --- .../ranking_rule_graph/proximity/compute_docids.rs | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs index 8496054b7..07bd102ca 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs @@ -1,14 +1,17 @@ #![allow(clippy::too_many_arguments)] +use std::collections::BTreeSet; + +use heed::BytesDecode; +use roaring::RoaringBitmap; + use super::ProximityCondition; use crate::search::new::interner::Interned; use crate::search::new::query_term::{Phrase, QueryTermSubset}; use crate::search::new::ranking_rule_graph::ComputedCondition; use crate::search::new::resolve_query_graph::compute_query_term_subset_docids; use crate::search::new::SearchContext; -use crate::{CboRoaringBitmapCodec, Result}; -use roaring::RoaringBitmap; -use std::collections::BTreeSet; +use crate::{CboRoaringBitmapCodec, Result, RoaringBitmapCodec}; pub fn compute_docids( ctx: &mut SearchContext, @@ -90,7 +93,8 @@ pub fn compute_docids( continue; } } else if let Some(lw_bytes) = ctx.get_db_word_docids(left_word)? 
{ - let left_word_docids = CboRoaringBitmapCodec::deserialize_from(lw_bytes)?; + let left_word_docids = + RoaringBitmapCodec::bytes_decode(lw_bytes).ok_or(heed::Error::Decoding)?; if universe.is_disjoint(&left_word_docids) { continue; } From d9460a76f43fa7b35cb2e4d423148c2a0ab174e5 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 5 Apr 2023 14:43:42 +0200 Subject: [PATCH 09/10] Fix word_position_docids indexing --- .../index_documents/extract/extract_word_position_docids.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_word_position_docids.rs b/milli/src/update/index_documents/extract/extract_word_position_docids.rs index eef5089bc..734cf8778 100644 --- a/milli/src/update/index_documents/extract/extract_word_position_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_position_docids.rs @@ -39,9 +39,8 @@ pub fn extract_word_fid_and_position_docids( for position in read_u32_ne_bytes(value) { key_buffer.clear(); key_buffer.extend_from_slice(word_bytes); - let (fid, position) = relative_from_absolute_position(position); + let (_fid, position) = relative_from_absolute_position(position); let position = bucketed_position(position); - let position = absolute_from_relative_position(fid, position); key_buffer.extend_from_slice(&position.to_be_bytes()); word_position_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; } From 5440f43fd3be28981933bd55e126ee88ed3324e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 5 Apr 2023 14:55:02 +0200 Subject: [PATCH 10/10] Fix indexing of word_position_docid and fid --- milli/src/snapshot_tests.rs | 8 +++ .../extract/extract_word_fid_docids.rs | 48 ++++++++++++++++ .../extract/extract_word_position_docids.rs | 4 +- .../src/update/index_documents/extract/mod.rs | 17 +++++- milli/src/update/index_documents/mod.rs | 57 +++++++++++++++++++ .../src/update/index_documents/typed_chunk.rs | 12 ++++ 6 files 
changed, 141 insertions(+), 5 deletions(-) create mode 100644 milli/src/update/index_documents/extract/extract_word_fid_docids.rs diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index f7f1a97e6..eb94c4be9 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -248,6 +248,11 @@ pub fn snap_word_position_docids(index: &Index) -> String { &format!("{word:<16} {position:<6} {}", display_bitmap(&b)) }) } +pub fn snap_word_fid_docids(index: &Index) -> String { + make_db_snap_from_iter!(index, word_fid_docids, |((word, fid), b)| { + &format!("{word:<16} {fid:<3} {}", display_bitmap(&b)) + }) +} pub fn snap_field_id_word_count_docids(index: &Index) -> String { make_db_snap_from_iter!(index, field_id_word_count_docids, |((field_id, word_count), b)| { &format!("{field_id:<3} {word_count:<6} {}", display_bitmap(&b)) @@ -477,6 +482,9 @@ macro_rules! full_snap_of_db { ($index:ident, word_position_docids) => {{ $crate::snapshot_tests::snap_word_position_docids(&$index) }}; + ($index:ident, word_fid_docids) => {{ + $crate::snapshot_tests::snap_word_fid_docids(&$index) + }}; ($index:ident, field_id_word_count_docids) => {{ $crate::snapshot_tests::snap_field_id_word_count_docids(&$index) }}; diff --git a/milli/src/update/index_documents/extract/extract_word_fid_docids.rs b/milli/src/update/index_documents/extract/extract_word_fid_docids.rs new file mode 100644 index 000000000..72b30cddf --- /dev/null +++ b/milli/src/update/index_documents/extract/extract_word_fid_docids.rs @@ -0,0 +1,48 @@ +use std::fs::File; +use std::io; + +use super::helpers::{ + create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader, + try_split_array_at, GrenadParameters, +}; +use crate::error::SerializationError; +use crate::index::db_name::DOCID_WORD_POSITIONS; +use crate::{relative_from_absolute_position, DocumentId, Result}; + +/// Extracts the word, field id, and the documents ids where this word appear at this field id. 
+#[logging_timer::time] +pub fn extract_word_fid_docids( + docid_word_positions: grenad::Reader, + indexer: GrenadParameters, +) -> Result> { + let max_memory = indexer.max_memory_by_thread(); + + let mut word_fid_docids_sorter = create_sorter( + grenad::SortAlgorithm::Unstable, + merge_cbo_roaring_bitmaps, + indexer.chunk_compression_type, + indexer.chunk_compression_level, + indexer.max_nb_chunks, + max_memory, + ); + + let mut key_buffer = Vec::new(); + let mut cursor = docid_word_positions.into_cursor()?; + while let Some((key, value)) = cursor.move_on_next()? { + let (document_id_bytes, word_bytes) = try_split_array_at(key) + .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; + let document_id = DocumentId::from_be_bytes(document_id_bytes); + + for position in read_u32_ne_bytes(value) { + key_buffer.clear(); + key_buffer.extend_from_slice(word_bytes); + let (fid, _) = relative_from_absolute_position(position); + key_buffer.extend_from_slice(&fid.to_be_bytes()); + word_fid_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; + } + } + + let word_fid_docids_reader = sorter_into_reader(word_fid_docids_sorter, indexer)?; + + Ok(word_fid_docids_reader) +} diff --git a/milli/src/update/index_documents/extract/extract_word_position_docids.rs b/milli/src/update/index_documents/extract/extract_word_position_docids.rs index 734cf8778..80a36c308 100644 --- a/milli/src/update/index_documents/extract/extract_word_position_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_position_docids.rs @@ -14,7 +14,7 @@ use crate::{bucketed_position, relative_from_absolute_position, DocumentId, Resu /// Returns a grenad reader with the list of extracted words at positions and /// documents ids from the given chunk of docid word positions. 
#[logging_timer::time] -pub fn extract_word_fid_and_position_docids( +pub fn extract_word_position_docids( docid_word_positions: grenad::Reader, indexer: GrenadParameters, ) -> Result> { @@ -39,7 +39,7 @@ pub fn extract_word_fid_and_position_docids( for position in read_u32_ne_bytes(value) { key_buffer.clear(); key_buffer.extend_from_slice(word_bytes); - let (_fid, position) = relative_from_absolute_position(position); + let (_, position) = relative_from_absolute_position(position); let position = bucketed_position(position); key_buffer.extend_from_slice(&position.to_be_bytes()); word_position_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 844efed36..db041de6f 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -5,6 +5,7 @@ mod extract_fid_docid_facet_values; mod extract_fid_word_count_docids; mod extract_geo_points; mod extract_word_docids; +mod extract_word_fid_docids; mod extract_word_pair_proximity_docids; mod extract_word_position_docids; @@ -22,8 +23,9 @@ use self::extract_fid_docid_facet_values::extract_fid_docid_facet_values; use self::extract_fid_word_count_docids::extract_fid_word_count_docids; use self::extract_geo_points::extract_geo_points; use self::extract_word_docids::extract_word_docids; +use self::extract_word_fid_docids::extract_word_fid_docids; use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids; -use self::extract_word_position_docids::extract_word_fid_and_position_docids; +use self::extract_word_position_docids::extract_word_position_docids; use super::helpers::{ as_cloneable_grenad, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, CursorClonableMmap, GrenadParameters, MergeFn, MergeableReader, @@ -130,14 +132,23 @@ pub(crate) fn data_from_obkv_documents( ); spawn_extraction_task::<_, _, Vec>>( - 
docid_word_positions_chunks, + docid_word_positions_chunks.clone(), indexer, lmdb_writer_sx.clone(), - extract_word_fid_and_position_docids, + extract_word_position_docids, merge_cbo_roaring_bitmaps, TypedChunk::WordPositionDocids, "word-position-docids", ); + spawn_extraction_task::<_, _, Vec>>( + docid_word_positions_chunks, + indexer, + lmdb_writer_sx.clone(), + extract_word_fid_docids, + merge_cbo_roaring_bitmaps, + TypedChunk::WordFidDocids, + "word-fid-docids", + ); spawn_extraction_task::<_, _, Vec>>( docid_fid_facet_strings_chunks, diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index ade217beb..235b35fc8 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -2255,4 +2255,61 @@ mod tests { {"id":1,"catto":"jorts"} "###); } + + #[test] + fn test_word_fid_position() { + let index = TempIndex::new(); + + index + .add_documents(documents!([ + {"id": 0, "text": "sun flowers are looking at the sun" }, + {"id": 1, "text": "sun flowers are looking at the sun" }, + {"id": 2, "text": "the sun is shining today" }, + { + "id": 3, + "text": "a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a " + } + ])) + .unwrap(); + + db_snap!(index, word_fid_docids, 1, @"bf3355e493330de036c8823ddd1dbbd9"); + db_snap!(index, word_position_docids, 1, @"896d54b29ed79c4c6f14084f326dcf6f"); + + index + .add_documents(documents!([ + {"id": 4, "text": "sun flowers are looking at the sun" }, + {"id": 5, "text2": "sun flowers are looking at the sun" }, + {"id": 6, "text": "b b b" }, + { + "id": 7, + "text2": "a a a a" + } + ])) + .unwrap(); + + db_snap!(index, word_fid_docids, 2, 
@"a48d3f88db33f94bc23110a673ea49e4"); + db_snap!(index, word_position_docids, 2, @"3c9e66c6768ae2cf42b46b2c46e46a83"); + + let mut wtxn = index.write_txn().unwrap(); + + // Delete not all of the documents but some of them. + let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + builder.strategy(DeletionStrategy::AlwaysHard); + builder.delete_external_id("0"); + builder.delete_external_id("3"); + let result = builder.execute().unwrap(); + println!("{result:?}"); + + wtxn.commit().unwrap(); + + db_snap!(index, word_fid_docids, 3, @"4c2e2a1832e5802796edc1638136d933"); + db_snap!(index, word_position_docids, 3, @"74f556b91d161d997a89468b4da1cb8f"); + db_snap!(index, docid_word_positions, 3, @"5287245332627675740b28bd46e1cde1"); + } } diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index b9b11cfa8..14ba021bd 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -35,6 +35,7 @@ pub(crate) enum TypedChunk { exact_word_docids_reader: grenad::Reader, }, WordPositionDocids(grenad::Reader), + WordFidDocids(grenad::Reader), WordPairProximityDocids(grenad::Reader), FieldIdFacetStringDocids(grenad::Reader), FieldIdFacetNumberDocids(grenad::Reader), @@ -140,6 +141,17 @@ pub(crate) fn write_typed_chunk_into_index( )?; is_merged_database = true; } + TypedChunk::WordFidDocids(word_fid_docids_iter) => { + append_entries_into_database( + word_fid_docids_iter, + &index.word_fid_docids, + wtxn, + index_is_empty, + |value, _buffer| Ok(value), + merge_cbo_roaring_bitmaps, + )?; + is_merged_database = true; + } TypedChunk::FieldIdFacetNumberDocids(facet_id_number_docids_iter) => { let indexer = FacetsUpdate::new(index, FacetType::Number, facet_id_number_docids_iter); indexer.execute(wtxn)?;