From 3296bb243c817c16c0b47603b6760f69f1409c56 Mon Sep 17 00:00:00 2001 From: many Date: Tue, 5 Oct 2021 11:18:42 +0200 Subject: [PATCH] Simplify word level position DB into a word position DB --- milli/src/heed_codec/mod.rs | 4 +- milli/src/heed_codec/str_beu32_codec.rs | 38 +++ .../heed_codec/str_level_position_codec.rs | 47 --- milli/src/index.rs | 23 +- milli/src/lib.rs | 4 +- milli/src/search/criteria/attribute.rs | 57 ++-- milli/src/search/criteria/exactness.rs | 7 +- milli/src/search/criteria/mod.rs | 104 ++----- milli/src/tree_level.rs | 51 ---- milli/src/update/clear_documents.rs | 8 +- milli/src/update/delete_documents.rs | 9 +- ...ids.rs => extract_word_position_docids.rs} | 12 +- .../src/update/index_documents/extract/mod.rs | 10 +- milli/src/update/index_documents/mod.rs | 6 +- .../src/update/index_documents/typed_chunk.rs | 8 +- milli/src/update/mod.rs | 4 +- milli/src/update/words_level_positions.rs | 268 ------------------ .../update/words_prefix_position_docids.rs | 105 +++++++ 18 files changed, 220 insertions(+), 545 deletions(-) create mode 100644 milli/src/heed_codec/str_beu32_codec.rs delete mode 100644 milli/src/heed_codec/str_level_position_codec.rs delete mode 100644 milli/src/tree_level.rs rename milli/src/update/index_documents/extract/{extract_word_level_position_docids.rs => extract_word_position_docids.rs} (76%) delete mode 100644 milli/src/update/words_level_positions.rs create mode 100644 milli/src/update/words_prefix_position_docids.rs diff --git a/milli/src/heed_codec/mod.rs b/milli/src/heed_codec/mod.rs index 7bd7dff2d..2f2a01192 100644 --- a/milli/src/heed_codec/mod.rs +++ b/milli/src/heed_codec/mod.rs @@ -4,7 +4,7 @@ mod field_id_word_count_codec; mod obkv_codec; mod roaring_bitmap; mod roaring_bitmap_length; -mod str_level_position_codec; +mod str_beu32_codec; mod str_str_u8_codec; pub use self::beu32_str_codec::BEU32StrCodec; @@ -14,5 +14,5 @@ pub use self::roaring_bitmap::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, Roar pub use self::roaring_bitmap_length::{ BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec, }; -pub use self::str_level_position_codec::StrLevelPositionCodec; +pub use self::str_beu32_codec::StrBEU32Codec; pub use self::str_str_u8_codec::StrStrU8Codec; diff --git a/milli/src/heed_codec/str_beu32_codec.rs b/milli/src/heed_codec/str_beu32_codec.rs new file mode 100644 index 000000000..d1f379bdc --- /dev/null +++ b/milli/src/heed_codec/str_beu32_codec.rs @@ -0,0 +1,38 @@ +use std::borrow::Cow; +use std::convert::TryInto; +use std::mem::size_of; +use std::str; + +pub struct StrBEU32Codec; + +impl<'a> heed::BytesDecode<'a> for StrBEU32Codec { + type DItem = (&'a str, u32); + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let footer_len = size_of::(); + + if bytes.len() < footer_len { + return None; + } + + let (word, bytes) = bytes.split_at(bytes.len() - footer_len); + let word = str::from_utf8(word).ok()?; + let pos = bytes.try_into().map(u32::from_be_bytes).ok()?; + + Some((word, pos)) + } +} + +impl<'a> heed::BytesEncode<'a> for StrBEU32Codec { + type EItem = (&'a str, u32); + + fn bytes_encode((word, pos): &Self::EItem) -> Option> { + let pos = pos.to_be_bytes(); + + let mut bytes = Vec::with_capacity(word.len() + pos.len()); + bytes.extend_from_slice(word.as_bytes()); + bytes.extend_from_slice(&pos[..]); + + Some(Cow::Owned(bytes)) + } +} diff --git a/milli/src/heed_codec/str_level_position_codec.rs b/milli/src/heed_codec/str_level_position_codec.rs deleted file mode 100644 index 5be45bbeb..000000000 --- 
a/milli/src/heed_codec/str_level_position_codec.rs +++ /dev/null @@ -1,47 +0,0 @@ -use std::borrow::Cow; -use std::convert::{TryFrom, TryInto}; -use std::mem::size_of; -use std::str; - -use crate::TreeLevel; - -pub struct StrLevelPositionCodec; - -impl<'a> heed::BytesDecode<'a> for StrLevelPositionCodec { - type DItem = (&'a str, TreeLevel, u32, u32); - - fn bytes_decode(bytes: &'a [u8]) -> Option { - let footer_len = size_of::() + size_of::() * 2; - - if bytes.len() < footer_len { - return None; - } - - let (word, bytes) = bytes.split_at(bytes.len() - footer_len); - let word = str::from_utf8(word).ok()?; - - let (level, bytes) = bytes.split_first()?; - let left = bytes[..4].try_into().map(u32::from_be_bytes).ok()?; - let right = bytes[4..].try_into().map(u32::from_be_bytes).ok()?; - let level = TreeLevel::try_from(*level).ok()?; - - Some((word, level, left, right)) - } -} - -impl<'a> heed::BytesEncode<'a> for StrLevelPositionCodec { - type EItem = (&'a str, TreeLevel, u32, u32); - - fn bytes_encode((word, level, left, right): &Self::EItem) -> Option> { - let left = left.to_be_bytes(); - let right = right.to_be_bytes(); - - let mut bytes = Vec::with_capacity(word.len() + 1 + left.len() + right.len()); - bytes.extend_from_slice(word.as_bytes()); - bytes.push((*level).into()); - bytes.extend_from_slice(&left[..]); - bytes.extend_from_slice(&right[..]); - - Some(Cow::Owned(bytes)) - } -} diff --git a/milli/src/index.rs b/milli/src/index.rs index dd5851ccc..6ce693fbe 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -20,7 +20,7 @@ use crate::{ default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId, FieldIdWordCountCodec, GeoPoint, ObkvCodec, Result, RoaringBitmapCodec, RoaringBitmapLenCodec, - Search, StrLevelPositionCodec, StrStrU8Codec, BEU32, + Search, StrBEU32Codec, StrStrU8Codec, BEU32, }; pub mod main_key { @@ -55,8 +55,8 @@ pub mod db_name { pub const DOCID_WORD_POSITIONS: &str = "docid-word-positions"; pub const WORD_PAIR_PROXIMITY_DOCIDS: &str = "word-pair-proximity-docids"; pub const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS: &str = "word-prefix-pair-proximity-docids"; - pub const WORD_LEVEL_POSITION_DOCIDS: &str = "word-level-position-docids"; - pub const WORD_PREFIX_LEVEL_POSITION_DOCIDS: &str = "word-prefix-level-position-docids"; + pub const WORD_POSITION_DOCIDS: &str = "word-position-docids"; + pub const WORD_PREFIX_POSITION_DOCIDS: &str = "word-prefix-position-docids"; pub const FIELD_ID_WORD_COUNT_DOCIDS: &str = "field-id-word-count-docids"; pub const FACET_ID_F64_DOCIDS: &str = "facet-id-f64-docids"; pub const FACET_ID_STRING_DOCIDS: &str = "facet-id-string-docids"; @@ -86,12 +86,12 @@ pub struct Index { /// Maps the proximity between a pair of word and prefix with all the docids where this relation appears. pub word_prefix_pair_proximity_docids: Database, - /// Maps the word, level and position range with the docids that corresponds to it. - pub word_level_position_docids: Database, + /// Maps the word and the position with the docids that corresponds to it. + pub word_position_docids: Database, /// Maps the field id and the word count with the docids that corresponds to it. pub field_id_word_count_docids: Database, - /// Maps the level positions of a word prefix with all the docids where this prefix appears. - pub word_prefix_level_position_docids: Database, + /// Maps the position of a word prefix with all the docids where this prefix appears. 
+ pub word_prefix_position_docids: Database, /// Maps the facet field id, level and the number with the docids that corresponds to it. pub facet_id_f64_docids: Database, @@ -122,10 +122,9 @@ impl Index { let word_pair_proximity_docids = env.create_database(Some(WORD_PAIR_PROXIMITY_DOCIDS))?; let word_prefix_pair_proximity_docids = env.create_database(Some(WORD_PREFIX_PAIR_PROXIMITY_DOCIDS))?; - let word_level_position_docids = env.create_database(Some(WORD_LEVEL_POSITION_DOCIDS))?; + let word_position_docids = env.create_database(Some(WORD_POSITION_DOCIDS))?; let field_id_word_count_docids = env.create_database(Some(FIELD_ID_WORD_COUNT_DOCIDS))?; - let word_prefix_level_position_docids = - env.create_database(Some(WORD_PREFIX_LEVEL_POSITION_DOCIDS))?; + let word_prefix_position_docids = env.create_database(Some(WORD_PREFIX_POSITION_DOCIDS))?; let facet_id_f64_docids = env.create_database(Some(FACET_ID_F64_DOCIDS))?; let facet_id_string_docids = env.create_database(Some(FACET_ID_STRING_DOCIDS))?; let field_id_docid_facet_f64s = env.create_database(Some(FIELD_ID_DOCID_FACET_F64S))?; @@ -143,8 +142,8 @@ impl Index { docid_word_positions, word_pair_proximity_docids, word_prefix_pair_proximity_docids, - word_level_position_docids, - word_prefix_level_position_docids, + word_position_docids, + word_prefix_position_docids, field_id_word_count_docids, facet_id_f64_docids, facet_id_string_docids, diff --git a/milli/src/lib.rs b/milli/src/lib.rs index bb0a32528..838817d98 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -14,7 +14,6 @@ pub mod heed_codec; pub mod index; pub mod proximity; mod search; -pub mod tree_level; pub mod update; use std::collections::{BTreeMap, HashMap}; @@ -35,11 +34,10 @@ pub use self::fields_ids_map::FieldsIdsMap; pub use self::heed_codec::{ BEU32StrCodec, BoRoaringBitmapCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, FieldIdWordCountCodec, ObkvCodec, RoaringBitmapCodec, - RoaringBitmapLenCodec, StrLevelPositionCodec, StrStrU8Codec, + RoaringBitmapLenCodec, StrBEU32Codec, StrStrU8Codec, }; pub use self::index::Index; pub use self::search::{FacetDistribution, FilterCondition, MatchingWords, Search, SearchResult}; -pub use self::tree_level::TreeLevel; pub type Result = std::result::Result; diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 0e589dd92..07b3cf95c 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -10,7 +10,7 @@ use super::{resolve_query_tree, Context, Criterion, CriterionParameters, Criteri use crate::search::criteria::Query; use crate::search::query_tree::{Operation, QueryKind}; use crate::search::{build_dfa, word_derivations, WordDerivationsCache}; -use crate::{Result, TreeLevel}; +use crate::Result; /// To be able to divide integers by the number of words in the query /// we want to find a multiplier that allow us to divide by any number between 1 and 10. @@ -176,20 +176,14 @@ impl<'t> Criterion for Attribute<'t> { } } -/// QueryLevelIterator is an pseudo-Iterator for a Query, -/// It contains WordLevelIterators and is chainned with other QueryLevelIterator. -struct QueryLevelIterator<'t> { - inner: Vec< - Peekable< - Box< - dyn Iterator> - + 't, - >, - >, - >, +/// QueryPositionIterator is an Iterator over positions of a Query, +/// It contains iterators over words positions. 
+struct QueryPositionIterator<'t> { + inner: + Vec> + 't>>>, } -impl<'t> QueryLevelIterator<'t> { +impl<'t> QueryPositionIterator<'t> { fn new( ctx: &'t dyn Context<'t>, queries: &[Query], @@ -201,25 +195,14 @@ impl<'t> QueryLevelIterator<'t> { match &query.kind { QueryKind::Exact { word, .. } => { if !query.prefix || in_prefix_cache { - let iter = ctx.word_position_iterator( - query.kind.word(), - TreeLevel::min_value(), - in_prefix_cache, - None, - None, - )?; + let iter = + ctx.word_position_iterator(query.kind.word(), in_prefix_cache)?; inner.push(iter.peekable()); } else { for (word, _) in word_derivations(&word, true, 0, ctx.words_fst(), wdcache)? { - let iter = ctx.word_position_iterator( - &word, - TreeLevel::min_value(), - in_prefix_cache, - None, - None, - )?; + let iter = ctx.word_position_iterator(&word, in_prefix_cache)?; inner.push(iter.peekable()); } @@ -229,13 +212,7 @@ impl<'t> QueryLevelIterator<'t> { for (word, _) in word_derivations(&word, query.prefix, *typo, ctx.words_fst(), wdcache)? { - let iter = ctx.word_position_iterator( - &word, - TreeLevel::min_value(), - in_prefix_cache, - None, - None, - )?; + let iter = ctx.word_position_iterator(&word, in_prefix_cache)?; inner.push(iter.peekable()); } @@ -247,7 +224,7 @@ impl<'t> QueryLevelIterator<'t> { } } -impl<'t> Iterator for QueryLevelIterator<'t> { +impl<'t> Iterator for QueryPositionIterator<'t> { type Item = heed::Result<(u32, RoaringBitmap)>; fn next(&mut self) -> Option { @@ -256,14 +233,14 @@ impl<'t> Iterator for QueryLevelIterator<'t> { .inner .iter_mut() .filter_map(|wli| match wli.peek() { - Some(Ok(((_, _, pos, _), _))) => Some(*pos), + Some(Ok(((_, pos), _))) => Some(*pos), _ => None, }) .min()?; let mut candidates = None; for wli in self.inner.iter_mut() { - if let Some(Ok(((_, _, pos, _), _))) = wli.peek() { + if let Some(Ok(((_, pos), _))) = wli.peek() { if *pos > expected_pos { continue; } @@ -286,9 +263,9 @@ impl<'t> Iterator for QueryLevelIterator<'t> { } /// A Branch is represent a possible alternative of the original query and is build with the Query Tree, -/// This branch allows us to iterate over meta-interval of position and to dig in it if it contains interesting candidates. +/// This branch allows us to iterate over meta-interval of positions. 
struct Branch<'t> { - query_level_iterator: Vec<(u32, RoaringBitmap, Peekable>)>, + query_level_iterator: Vec<(u32, RoaringBitmap, Peekable>)>, last_result: (u32, RoaringBitmap), branch_size: u32, } @@ -302,7 +279,7 @@ impl<'t> Branch<'t> { ) -> Result { let mut query_level_iterator = Vec::new(); for queries in flatten_branch { - let mut qli = QueryLevelIterator::new(ctx, queries, wdcache)?.peekable(); + let mut qli = QueryPositionIterator::new(ctx, queries, wdcache)?.peekable(); let (pos, docids) = qli.next().transpose()?.unwrap_or((0, RoaringBitmap::new())); query_level_iterator.push((pos, docids & allowed_candidates, qli)); } diff --git a/milli/src/search/criteria/exactness.rs b/milli/src/search/criteria/exactness.rs index 1e4d4e7a2..8e56b3649 100644 --- a/milli/src/search/criteria/exactness.rs +++ b/milli/src/search/criteria/exactness.rs @@ -10,7 +10,7 @@ use crate::search::criteria::{ resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult, }; use crate::search::query_tree::{Operation, PrimitiveQueryPart}; -use crate::{Result, TreeLevel}; +use crate::Result; pub struct Exactness<'t> { ctx: &'t dyn Context<'t>, @@ -293,7 +293,6 @@ fn attribute_start_with_docids( attribute_id: u32, query: &[ExactQueryPart], ) -> heed::Result> { - let lowest_level = TreeLevel::min_value(); let mut attribute_candidates_array = Vec::new(); // start from attribute first position let mut pos = attribute_id * 1000; @@ -303,7 +302,7 @@ fn attribute_start_with_docids( Synonyms(synonyms) => { let mut synonyms_candidates = RoaringBitmap::new(); for word in synonyms { - let wc = ctx.word_level_position_docids(word, lowest_level, pos, pos)?; + let wc = ctx.word_position_docids(word, pos)?; if let Some(word_candidates) = wc { synonyms_candidates |= word_candidates; } @@ -313,7 +312,7 @@ fn attribute_start_with_docids( } Phrase(phrase) => { for word in phrase { - let wc = ctx.word_level_position_docids(word, lowest_level, pos, pos)?; + let wc = ctx.word_position_docids(word, pos)?; if let Some(word_candidates) = wc { attribute_candidates_array.push(word_candidates); } diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index a23e5acf9..0cad7c013 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -14,7 +14,7 @@ use self::words::Words; use super::query_tree::{Operation, PrimitiveQueryPart, Query, QueryKind}; use crate::search::criteria::geo::Geo; use crate::search::{word_derivations, WordDerivationsCache}; -use crate::{AscDesc as AscDescName, DocumentId, FieldId, Index, Member, Result, TreeLevel}; +use crate::{AscDesc as AscDescName, DocumentId, FieldId, Index, Member, Result}; mod asc_desc; mod attribute; @@ -90,20 +90,8 @@ pub trait Context<'c> { fn word_position_iterator( &self, word: &str, - level: TreeLevel, in_prefix_cache: bool, - left: Option, - right: Option, - ) -> heed::Result< - Box< - dyn Iterator> + 'c, - >, - >; - fn word_position_last_level( - &self, - word: &str, - in_prefix_cache: bool, - ) -> heed::Result>; + ) -> heed::Result> + 'c>>; fn synonyms(&self, word: &str) -> heed::Result>>>; fn searchable_fields_ids(&self) -> Result>; fn field_id_word_count_docids( @@ -111,13 +99,7 @@ pub trait Context<'c> { field_id: FieldId, word_count: u8, ) -> heed::Result>; - fn word_level_position_docids( - &self, - word: &str, - level: TreeLevel, - left: u32, - right: u32, - ) -> heed::Result>; + fn word_position_docids(&self, word: &str, pos: u32) -> heed::Result>; } pub struct CriteriaBuilder<'t> { @@ -183,54 +165,24 @@ impl<'c> 
Context<'c> for CriteriaBuilder<'c> { fn word_position_iterator( &self, word: &str, - level: TreeLevel, in_prefix_cache: bool, - left: Option, - right: Option, - ) -> heed::Result< - Box< - dyn Iterator> + 'c, - >, - > { + ) -> heed::Result> + 'c>> + { let range = { - let left = left.unwrap_or(u32::min_value()); - let right = right.unwrap_or(u32::max_value()); - let left = (word, level, left, left); - let right = (word, level, right, right); + let left = u32::min_value(); + let right = u32::max_value(); + let left = (word, left); + let right = (word, right); left..=right }; let db = match in_prefix_cache { - true => self.index.word_prefix_level_position_docids, - false => self.index.word_level_position_docids, + true => self.index.word_prefix_position_docids, + false => self.index.word_position_docids, }; Ok(Box::new(db.range(self.rtxn, &range)?)) } - fn word_position_last_level( - &self, - word: &str, - in_prefix_cache: bool, - ) -> heed::Result> { - let range = { - let left = (word, TreeLevel::min_value(), u32::min_value(), u32::min_value()); - let right = (word, TreeLevel::max_value(), u32::max_value(), u32::max_value()); - left..=right - }; - let db = match in_prefix_cache { - true => self.index.word_prefix_level_position_docids, - false => self.index.word_level_position_docids, - }; - let last_level = db - .remap_data_type::() - .range(self.rtxn, &range)? - .last() - .transpose()? - .map(|((_, level, _, _), _)| level); - - Ok(last_level) - } - fn synonyms(&self, word: &str) -> heed::Result>>> { self.index.words_synonyms(self.rtxn, &[word]) } @@ -251,15 +203,9 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> { self.index.field_id_word_count_docids.get(self.rtxn, &key) } - fn word_level_position_docids( - &self, - word: &str, - level: TreeLevel, - left: u32, - right: u32, - ) -> heed::Result> { - let key = (word, level, left, right); - self.index.word_level_position_docids.get(self.rtxn, &key) + fn word_position_docids(&self, word: &str, pos: u32) -> heed::Result> { + let key = (word, pos); + self.index.word_position_docids.get(self.rtxn, &key) } } @@ -616,27 +562,13 @@ pub mod test { fn word_position_iterator( &self, _word: &str, - _level: TreeLevel, _in_prefix_cache: bool, - _left: Option, - _right: Option, ) -> heed::Result< - Box< - dyn Iterator> - + 'c, - >, + Box> + 'c>, > { todo!() } - fn word_position_last_level( - &self, - _word: &str, - _in_prefix_cache: bool, - ) -> heed::Result> { - todo!() - } - fn synonyms(&self, _word: &str) -> heed::Result>>> { todo!() } @@ -645,12 +577,10 @@ pub mod test { todo!() } - fn word_level_position_docids( + fn word_position_docids( &self, _word: &str, - _level: TreeLevel, - _left: u32, - _right: u32, + _pos: u32, ) -> heed::Result> { todo!() } diff --git a/milli/src/tree_level.rs b/milli/src/tree_level.rs deleted file mode 100644 index b69316cf6..000000000 --- a/milli/src/tree_level.rs +++ /dev/null @@ -1,51 +0,0 @@ -use std::convert::TryFrom; -use std::fmt; - -/// This is just before the lowest printable character (space, sp, 32) -const MAX_VALUE: u8 = 31; - -#[derive(Debug, Copy, Clone)] -pub enum Error { - LevelTooHigh(u8), -} - -#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] -#[repr(transparent)] -pub struct TreeLevel(u8); - -impl TreeLevel { - pub const fn max_value() -> TreeLevel { - TreeLevel(MAX_VALUE) - } - - pub const fn min_value() -> TreeLevel { - TreeLevel(0) - } - - pub fn saturating_sub(&self, lhs: u8) -> TreeLevel { - TreeLevel(self.0.saturating_sub(lhs)) - } -} - -impl Into for TreeLevel { - fn into(self) 
-> u8 { - self.0 - } -} - -impl TryFrom for TreeLevel { - type Error = Error; - - fn try_from(value: u8) -> Result { - match value { - 0..=MAX_VALUE => Ok(TreeLevel(value)), - _ => Err(Error::LevelTooHigh(value)), - } - } -} - -impl fmt::Display for TreeLevel { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "{}", self.0) - } -} diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index ea4193eaf..a820c2a49 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -28,9 +28,9 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { docid_word_positions, word_pair_proximity_docids, word_prefix_pair_proximity_docids, - word_level_position_docids, + word_position_docids, field_id_word_count_docids, - word_prefix_level_position_docids, + word_prefix_position_docids, facet_id_f64_docids, facet_id_string_docids, field_id_docid_facet_f64s, @@ -64,9 +64,9 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { docid_word_positions.clear(self.wtxn)?; word_pair_proximity_docids.clear(self.wtxn)?; word_prefix_pair_proximity_docids.clear(self.wtxn)?; - word_level_position_docids.clear(self.wtxn)?; + word_position_docids.clear(self.wtxn)?; field_id_word_count_docids.clear(self.wtxn)?; - word_prefix_level_position_docids.clear(self.wtxn)?; + word_prefix_position_docids.clear(self.wtxn)?; facet_id_f64_docids.clear(self.wtxn)?; facet_id_string_docids.clear(self.wtxn)?; field_id_docid_facet_f64s.clear(self.wtxn)?; diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 1b16ba9bf..207aed63c 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -102,8 +102,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { word_pair_proximity_docids, field_id_word_count_docids, word_prefix_pair_proximity_docids, - word_level_position_docids, - word_prefix_level_position_docids, + word_position_docids, + word_prefix_position_docids, facet_id_f64_docids, facet_id_string_docids, field_id_docid_facet_f64s, @@ -326,8 +326,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { drop(iter); // We delete the documents ids that are under the word level position docids. - let mut iter = - word_level_position_docids.iter_mut(self.wtxn)?.remap_key_type::(); + let mut iter = word_position_docids.iter_mut(self.wtxn)?.remap_key_type::(); while let Some(result) = iter.next() { let (bytes, mut docids) = result?; let previous_len = docids.len(); @@ -346,7 +345,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { // We delete the documents ids that are under the word prefix level position docids. 
let mut iter = - word_prefix_level_position_docids.iter_mut(self.wtxn)?.remap_key_type::(); + word_prefix_position_docids.iter_mut(self.wtxn)?.remap_key_type::(); while let Some(result) = iter.next() { let (bytes, mut docids) = result?; let previous_len = docids.len(); diff --git a/milli/src/update/index_documents/extract/extract_word_level_position_docids.rs b/milli/src/update/index_documents/extract/extract_word_position_docids.rs similarity index 76% rename from milli/src/update/index_documents/extract/extract_word_level_position_docids.rs rename to milli/src/update/index_documents/extract/extract_word_position_docids.rs index 04cedf5c7..4ca8537ac 100644 --- a/milli/src/update/index_documents/extract/extract_word_level_position_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_position_docids.rs @@ -14,13 +14,13 @@ use crate::{DocumentId, Result}; /// Returns a grenad reader with the list of extracted words at positions and /// documents ids from the given chunk of docid word positions. #[logging_timer::time] -pub fn extract_word_level_position_docids( +pub fn extract_word_position_docids( mut docid_word_positions: grenad::Reader, indexer: GrenadParameters, ) -> Result> { let max_memory = indexer.max_memory_by_thread(); - let mut word_level_position_docids_sorter = create_sorter( + let mut word_position_docids_sorter = create_sorter( merge_cbo_roaring_bitmaps, indexer.chunk_compression_type, indexer.chunk_compression_level, @@ -37,15 +37,11 @@ pub fn extract_word_level_position_docids( for position in read_u32_ne_bytes(value) { key_buffer.clear(); key_buffer.extend_from_slice(word_bytes); - key_buffer.push(0); // tree level - - // Levels are composed of left and right bounds. - key_buffer.extend_from_slice(&position.to_be_bytes()); key_buffer.extend_from_slice(&position.to_be_bytes()); - word_level_position_docids_sorter.insert(&key_buffer, &document_id.to_ne_bytes())?; + word_position_docids_sorter.insert(&key_buffer, &document_id.to_ne_bytes())?; } } - sorter_into_reader(word_level_position_docids_sorter, indexer) + sorter_into_reader(word_position_docids_sorter, indexer) } diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 47a62be67..0406e8ef4 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -5,8 +5,8 @@ mod extract_fid_docid_facet_values; mod extract_fid_word_count_docids; mod extract_geo_points; mod extract_word_docids; -mod extract_word_level_position_docids; mod extract_word_pair_proximity_docids; +mod extract_word_position_docids; use std::collections::HashSet; use std::fs::File; @@ -22,8 +22,8 @@ use self::extract_fid_docid_facet_values::extract_fid_docid_facet_values; use self::extract_fid_word_count_docids::extract_fid_word_count_docids; use self::extract_geo_points::extract_geo_points; use self::extract_word_docids::extract_word_docids; -use self::extract_word_level_position_docids::extract_word_level_position_docids; use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids; +use self::extract_word_position_docids::extract_word_position_docids; use super::helpers::{ into_clonable_grenad, keep_first_prefix_value_merge_roaring_bitmaps, merge_cbo_roaring_bitmaps, merge_readers, merge_roaring_bitmaps, CursorClonableMmap, GrenadParameters, MergeFn, @@ -98,10 +98,10 @@ pub(crate) fn data_from_obkv_documents( docid_word_positions_chunks.clone(), indexer.clone(), lmdb_writer_sx.clone(), - 
extract_word_level_position_docids, + extract_word_position_docids, merge_cbo_roaring_bitmaps, - TypedChunk::WordLevelPositionDocids, - "word-level-position-docids", + TypedChunk::WordPositionDocids, + "word-position-docids", ); spawn_extraction_task( diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 30ee49893..b0dbd9c3e 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -27,7 +27,7 @@ pub use self::transform::{Transform, TransformOutput}; use crate::documents::DocumentBatchReader; use crate::update::{ Facets, UpdateBuilder, UpdateIndexingStep, WordPrefixDocids, WordPrefixPairProximityDocids, - WordsLevelPositions, WordsPrefixesFst, + WordPrefixPositionDocids, WordsPrefixesFst, }; use crate::{Index, Result}; @@ -412,8 +412,8 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { total_databases: TOTAL_POSTING_DATABASE_COUNT, }); - // Run the words level positions update operation. - let mut builder = WordsLevelPositions::new(self.wtxn, self.index); + // Run the words prefix position docids update operation. + let mut builder = WordPrefixPositionDocids::new(self.wtxn, self.index); builder.chunk_compression_type = self.chunk_compression_type; builder.chunk_compression_level = self.chunk_compression_level; builder.max_nb_chunks = self.max_nb_chunks; diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index b17f28b66..b24a03ff6 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -22,7 +22,7 @@ pub(crate) enum TypedChunk { FieldIdWordcountDocids(grenad::Reader), NewDocumentsIds(RoaringBitmap), WordDocids(grenad::Reader), - WordLevelPositionDocids(grenad::Reader), + WordPositionDocids(grenad::Reader), WordPairProximityDocids(grenad::Reader), FieldIdFacetStringDocids(grenad::Reader), FieldIdFacetNumberDocids(grenad::Reader), @@ -110,10 +110,10 @@ pub(crate) fn write_typed_chunk_into_index( index.put_words_fst(wtxn, &fst)?; is_merged_database = true; } - TypedChunk::WordLevelPositionDocids(word_level_position_docids_iter) => { + TypedChunk::WordPositionDocids(word_position_docids_iter) => { append_entries_into_database( - word_level_position_docids_iter, - &index.word_level_position_docids, + word_position_docids_iter, + &index.word_position_docids, wtxn, index_is_empty, |value, _buffer| Ok(value), diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs index d80437ec7..3b6edb0a3 100644 --- a/milli/src/update/mod.rs +++ b/milli/src/update/mod.rs @@ -8,7 +8,7 @@ pub use self::update_builder::UpdateBuilder; pub use self::update_step::UpdateIndexingStep; pub use self::word_prefix_docids::WordPrefixDocids; pub use self::word_prefix_pair_proximity_docids::WordPrefixPairProximityDocids; -pub use self::words_level_positions::WordsLevelPositions; +pub use self::words_prefix_position_docids::WordPrefixPositionDocids; pub use self::words_prefixes_fst::WordsPrefixesFst; mod available_documents_ids; @@ -21,5 +21,5 @@ mod update_builder; mod update_step; mod word_prefix_docids; mod word_prefix_pair_proximity_docids; -mod words_level_positions; +mod words_prefix_position_docids; mod words_prefixes_fst; diff --git a/milli/src/update/words_level_positions.rs b/milli/src/update/words_level_positions.rs deleted file mode 100644 index 0af51fbb2..000000000 --- a/milli/src/update/words_level_positions.rs +++ /dev/null @@ -1,268 +0,0 @@ -use std::convert::TryFrom; -use std::fs::File; 
-use std::num::NonZeroU32; -use std::{cmp, str}; - -use fst::Streamer; -use grenad::{CompressionType, Reader, Writer}; -use heed::types::{ByteSlice, DecodeIgnore, Str}; -use heed::{BytesEncode, Error}; -use log::debug; -use roaring::RoaringBitmap; - -use crate::error::{InternalError, SerializationError}; -use crate::heed_codec::{CboRoaringBitmapCodec, StrLevelPositionCodec}; -use crate::index::main_key::WORDS_PREFIXES_FST_KEY; -use crate::update::index_documents::{ - create_sorter, create_writer, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, - write_into_lmdb_database, writer_into_reader, WriteMethod, -}; -use crate::{Index, Result, TreeLevel}; - -pub struct WordsLevelPositions<'t, 'u, 'i> { - wtxn: &'t mut heed::RwTxn<'i, 'u>, - index: &'i Index, - pub(crate) chunk_compression_type: CompressionType, - pub(crate) chunk_compression_level: Option, - pub(crate) max_nb_chunks: Option, - pub(crate) max_memory: Option, - level_group_size: NonZeroU32, - min_level_size: NonZeroU32, -} - -impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { - pub fn new( - wtxn: &'t mut heed::RwTxn<'i, 'u>, - index: &'i Index, - ) -> WordsLevelPositions<'t, 'u, 'i> { - WordsLevelPositions { - wtxn, - index, - chunk_compression_type: CompressionType::None, - chunk_compression_level: None, - max_nb_chunks: None, - max_memory: None, - level_group_size: NonZeroU32::new(4).unwrap(), - min_level_size: NonZeroU32::new(5).unwrap(), - } - } - - pub fn level_group_size(&mut self, value: NonZeroU32) -> &mut Self { - self.level_group_size = NonZeroU32::new(cmp::max(value.get(), 2)).unwrap(); - self - } - - pub fn min_level_size(&mut self, value: NonZeroU32) -> &mut Self { - self.min_level_size = value; - self - } - - #[logging_timer::time("WordsLevelPositions::{}")] - pub fn execute(self) -> Result<()> { - debug!("Computing and writing the word levels positions docids into LMDB on disk..."); - - let entries = compute_positions_levels( - self.wtxn, - self.index.word_docids.remap_data_type::(), - self.index.word_level_position_docids, - self.chunk_compression_type, - self.chunk_compression_level, - self.level_group_size, - self.min_level_size, - )?; - - // The previously computed entries also defines the level 0 entries - // so we can clear the database and append all of these entries. - self.index.word_level_position_docids.clear(self.wtxn)?; - - write_into_lmdb_database( - self.wtxn, - *self.index.word_level_position_docids.as_polymorph(), - entries, - |_, _| Err(InternalError::IndexingMergingKeys { process: "word level position" })?, - WriteMethod::Append, - )?; - - // We compute the word prefix level positions database. - self.index.word_prefix_level_position_docids.clear(self.wtxn)?; - - let mut word_prefix_level_positions_docids_sorter = create_sorter( - merge_cbo_roaring_bitmaps, - self.chunk_compression_type, - self.chunk_compression_level, - self.max_nb_chunks, - self.max_memory, - ); - - // We insert the word prefix level positions where the level is equal to 0 and - // corresponds to the word-prefix level positions where the prefixes appears - // in the prefix FST previously constructed. - let prefix_fst = self.index.words_prefixes_fst(self.wtxn)?; - let db = self.index.word_level_position_docids.remap_data_type::(); - // iter over all prefixes in the prefix fst. 
- let mut word_stream = prefix_fst.stream(); - while let Some(prefix_bytes) = word_stream.next() { - let prefix = str::from_utf8(prefix_bytes).map_err(|_| { - SerializationError::Decoding { db_name: Some(WORDS_PREFIXES_FST_KEY) } - })?; - - // iter over all lines of the DB where the key is prefixed by the current prefix. - let mut iter = db - .remap_key_type::() - .prefix_iter(self.wtxn, &prefix_bytes)? - .remap_key_type::(); - while let Some(((_word, level, left, right), data)) = iter.next().transpose()? { - // if level is 0, we push the line in the sorter - // replacing the complete word by the prefix. - if level == TreeLevel::min_value() { - let key = (prefix, level, left, right); - let bytes = StrLevelPositionCodec::bytes_encode(&key).unwrap(); - word_prefix_level_positions_docids_sorter.insert(bytes, data)?; - } - } - } - - // We finally write all the word prefix level positions docids with - // a level equal to 0 into the LMDB database. - sorter_into_lmdb_database( - self.wtxn, - *self.index.word_prefix_level_position_docids.as_polymorph(), - word_prefix_level_positions_docids_sorter, - merge_cbo_roaring_bitmaps, - WriteMethod::Append, - )?; - - let entries = compute_positions_levels( - self.wtxn, - self.index.word_prefix_docids.remap_data_type::(), - self.index.word_prefix_level_position_docids, - self.chunk_compression_type, - self.chunk_compression_level, - self.level_group_size, - self.min_level_size, - )?; - - // The previously computed entries also defines the level 0 entries - // so we can clear the database and append all of these entries. - self.index.word_prefix_level_position_docids.clear(self.wtxn)?; - - write_into_lmdb_database( - self.wtxn, - *self.index.word_prefix_level_position_docids.as_polymorph(), - entries, - |_, _| { - Err(InternalError::IndexingMergingKeys { process: "word prefix level position" })? - }, - WriteMethod::Append, - )?; - - Ok(()) - } -} - -/// Returns the next number after or equal to `x` that is divisible by `d`. -fn next_divisible(x: u32, d: u32) -> u32 { - (x.saturating_sub(1) | (d - 1)) + 1 -} - -/// Returns the previous number after or equal to `x` that is divisible by `d`, -/// saturates on zero. -fn previous_divisible(x: u32, d: u32) -> u32 { - match x.checked_sub(d - 1) { - Some(0) | None => 0, - Some(x) => next_divisible(x, d), - } -} - -/// Generates all the words positions levels based on the levels zero (including the level zero). -fn compute_positions_levels( - rtxn: &heed::RoTxn, - words_db: heed::Database, - words_positions_db: heed::Database, - compression_type: CompressionType, - compression_level: Option, - level_group_size: NonZeroU32, - min_level_size: NonZeroU32, -) -> Result> { - // It is forbidden to keep a cursor and write in a database at the same time with LMDB - // therefore we write the facet levels entries into a grenad file before transfering them. - let mut writer = tempfile::tempfile() - .and_then(|file| create_writer(compression_type, compression_level, file))?; - - for result in words_db.iter(rtxn)? { - let (word, ()) = result?; - - let level_0_range = { - let left = (word, TreeLevel::min_value(), u32::min_value(), u32::min_value()); - let right = (word, TreeLevel::min_value(), u32::max_value(), u32::max_value()); - left..=right - }; - - let first_level_size = words_positions_db - .remap_data_type::() - .range(rtxn, &level_0_range)? 
- .fold(Ok(0u32), |count, result| result.and(count).map(|c| c + 1))?; - - // Groups sizes are always a power of the original level_group_size and therefore a group - // always maps groups of the previous level and never splits previous levels groups in half. - let group_size_iter = (1u8..) - .map(|l| (TreeLevel::try_from(l).unwrap(), level_group_size.get().pow(l as u32))) - .take_while(|(_, s)| first_level_size / *s >= min_level_size.get()); - - // As specified in the documentation, we also write the level 0 entries. - for result in words_positions_db.range(rtxn, &level_0_range)? { - let ((word, level, left, right), docids) = result?; - write_level_entry(&mut writer, word, level, left, right, &docids)?; - } - - for (level, group_size) in group_size_iter { - let mut left = 0; - let mut right = 0; - let mut group_docids = RoaringBitmap::new(); - - for (i, result) in words_positions_db.range(rtxn, &level_0_range)?.enumerate() { - let ((_word, _level, value, _right), docids) = result?; - - if i == 0 { - left = previous_divisible(value, group_size); - right = left + (group_size - 1); - } - - if value > right { - // we found the first bound of the next group, we must store the left - // and right bounds associated with the docids. - write_level_entry(&mut writer, word, level, left, right, &group_docids)?; - - // We save the left bound for the new group and also reset the docids. - group_docids = RoaringBitmap::new(); - left = previous_divisible(value, group_size); - right = left + (group_size - 1); - } - - // The right bound is always the bound we run through. - group_docids |= docids; - } - - if !group_docids.is_empty() { - write_level_entry(&mut writer, word, level, left, right, &group_docids)?; - } - } - } - - writer_into_reader(writer) -} - -fn write_level_entry( - writer: &mut Writer, - word: &str, - level: TreeLevel, - left: u32, - right: u32, - ids: &RoaringBitmap, -) -> Result<()> { - let key = (word, level, left, right); - let key = StrLevelPositionCodec::bytes_encode(&key).ok_or(Error::Encoding)?; - let data = CboRoaringBitmapCodec::bytes_encode(&ids).ok_or(Error::Encoding)?; - writer.insert(&key, &data)?; - Ok(()) -} diff --git a/milli/src/update/words_prefix_position_docids.rs b/milli/src/update/words_prefix_position_docids.rs new file mode 100644 index 000000000..a8346a1cb --- /dev/null +++ b/milli/src/update/words_prefix_position_docids.rs @@ -0,0 +1,105 @@ +use std::num::NonZeroU32; +use std::{cmp, str}; + +use fst::Streamer; +use grenad::CompressionType; +use heed::types::ByteSlice; +use heed::BytesEncode; +use log::debug; + +use crate::error::SerializationError; +use crate::heed_codec::StrBEU32Codec; +use crate::index::main_key::WORDS_PREFIXES_FST_KEY; +use crate::update::index_documents::{ + create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, WriteMethod, +}; +use crate::{Index, Result}; + +pub struct WordPrefixPositionDocids<'t, 'u, 'i> { + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + pub(crate) chunk_compression_type: CompressionType, + pub(crate) chunk_compression_level: Option, + pub(crate) max_nb_chunks: Option, + pub(crate) max_memory: Option, + level_group_size: NonZeroU32, + min_level_size: NonZeroU32, +} + +impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> { + pub fn new( + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + ) -> WordPrefixPositionDocids<'t, 'u, 'i> { + WordPrefixPositionDocids { + wtxn, + index, + chunk_compression_type: CompressionType::None, + chunk_compression_level: None, + max_nb_chunks: None, + max_memory: 
None, + level_group_size: NonZeroU32::new(4).unwrap(), + min_level_size: NonZeroU32::new(5).unwrap(), + } + } + + pub fn level_group_size(&mut self, value: NonZeroU32) -> &mut Self { + self.level_group_size = NonZeroU32::new(cmp::max(value.get(), 2)).unwrap(); + self + } + + pub fn min_level_size(&mut self, value: NonZeroU32) -> &mut Self { + self.min_level_size = value; + self + } + + #[logging_timer::time("WordPrefixPositionDocids::{}")] + pub fn execute(self) -> Result<()> { + debug!("Computing and writing the word levels positions docids into LMDB on disk..."); + + self.index.word_prefix_position_docids.clear(self.wtxn)?; + + let mut word_prefix_positions_docids_sorter = create_sorter( + merge_cbo_roaring_bitmaps, + self.chunk_compression_type, + self.chunk_compression_level, + self.max_nb_chunks, + self.max_memory, + ); + + // We insert the word prefix position and + // corresponds to the word-prefix position where the prefixes appears + // in the prefix FST previously constructed. + let prefix_fst = self.index.words_prefixes_fst(self.wtxn)?; + let db = self.index.word_position_docids.remap_data_type::(); + // iter over all prefixes in the prefix fst. + let mut word_stream = prefix_fst.stream(); + while let Some(prefix_bytes) = word_stream.next() { + let prefix = str::from_utf8(prefix_bytes).map_err(|_| { + SerializationError::Decoding { db_name: Some(WORDS_PREFIXES_FST_KEY) } + })?; + + // iter over all lines of the DB where the key is prefixed by the current prefix. + let mut iter = db + .remap_key_type::() + .prefix_iter(self.wtxn, &prefix_bytes)? + .remap_key_type::(); + while let Some(((_word, pos), data)) = iter.next().transpose()? { + let key = (prefix, pos); + let bytes = StrBEU32Codec::bytes_encode(&key).unwrap(); + word_prefix_positions_docids_sorter.insert(bytes, data)?; + } + } + + // We finally write all the word prefix position docids into the LMDB database. + sorter_into_lmdb_database( + self.wtxn, + *self.index.word_prefix_position_docids.as_polymorph(), + word_prefix_positions_docids_sorter, + merge_cbo_roaring_bitmaps, + WriteMethod::Append, + )?; + + Ok(()) + } +}
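
Below is a minimal usage sketch (not part of the commit itself) illustrating the new key layout handled by StrBEU32Codec: the word's UTF-8 bytes followed by a single big-endian u32 position, replacing the old (word, TreeLevel, left, right) keys. It assumes the milli crate is available as a dependency and that StrBEU32Codec is re-exported from the crate root, as the lib.rs hunk above shows.

use heed::{BytesDecode, BytesEncode};
use milli::StrBEU32Codec;

fn main() {
    // Encode ("hello", 42): the word bytes are followed by the 4-byte
    // big-endian position, so keys sort by word first, then by position.
    let key = StrBEU32Codec::bytes_encode(&("hello", 42)).unwrap();
    assert_eq!(&key[..5], b"hello");
    assert_eq!(&key[5..], &42u32.to_be_bytes());

    // Decoding splits the trailing 4 bytes back off as the position.
    let (word, pos) = StrBEU32Codec::bytes_decode(&key).unwrap();
    assert_eq!((word, pos), ("hello", 42));
}

Because absolute positions (attribute_id * 1000 + position, as used in exactness.rs above) are now stored directly under these flat keys, the level-tree grouping that the removed WordsLevelPositions update used to compute is no longer needed; only the word-prefix variant of the database still has to be rebuilt, which is what the new WordPrefixPositionDocids update does.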