diff --git a/Cargo.lock b/Cargo.lock index bbe86a2a7..6a30891ec 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -122,6 +122,12 @@ version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "904dfeac50f3cdaba28fc6f57fdcddb75f49ed61346676a78c4ffe55877802fd" +[[package]] +name = "big_s" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "199edb7b90631283b10c2422e6a0bc8b7d987bf732995ba1de53b576c97e51a8" + [[package]] name = "bincode" version = "1.3.1" @@ -1251,6 +1257,7 @@ name = "milli" version = "0.1.1" dependencies = [ "anyhow", + "big_s", "bstr", "byteorder", "chrono", @@ -1957,9 +1964,9 @@ checksum = "53552c6c49e1e13f1a203ef0080ab3bbef0beb570a528993e83df057a9d9bba1" [[package]] name = "roaring" -version = "0.6.5" +version = "0.6.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c6744a4a918e91359ad1d356a91e2e943a86d9fb9ae77f715d617032ea2af88f" +checksum = "a4b2e7ab0bbb2d144558ae3f4761a0db06d21463b45756fc64c3393cdba3d447" dependencies = [ "bytemuck", "byteorder", diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index b091985f3..00618f58a 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -3,7 +3,7 @@ use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; use std::fmt::Display; use std::fs::{create_dir_all, File}; use std::net::SocketAddr; -use std::num::NonZeroUsize; +use std::num::{NonZeroU32, NonZeroUsize}; use std::path::PathBuf; use std::str::FromStr; use std::sync::Arc; @@ -228,7 +228,6 @@ enum UpdateMeta { ClearDocuments, Settings(Settings), Facets(Facets), - WordsPrefixes(WordsPrefixes), } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -281,6 +280,14 @@ struct WordsPrefixes { max_prefix_length: Option, } +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] +#[serde(rename_all = "camelCase")] +struct WordsLevelPositions { + level_group_size: Option, + min_level_size: Option, +} + #[tokio::main] async fn main() -> anyhow::Result<()> { let opt = Opt::from_args(); @@ -479,21 +486,6 @@ async fn main() -> anyhow::Result<()> { Err(e) => Err(e) } } - UpdateMeta::WordsPrefixes(settings) => { - // We must use the write transaction of the update here. 
- let mut wtxn = index_cloned.write_txn()?; - let mut builder = update_builder.words_prefixes(&mut wtxn, &index_cloned); - if let Some(value) = settings.threshold { - builder.threshold(value); - } - if let Some(value) = settings.max_prefix_length { - builder.max_prefix_length(value); - } - match builder.execute() { - Ok(()) => wtxn.commit().map_err(Into::into), - Err(e) => Err(e) - } - } }; let meta = match result { @@ -910,19 +902,6 @@ async fn main() -> anyhow::Result<()> { warp::reply() }); - let update_store_cloned = update_store.clone(); - let update_status_sender_cloned = update_status_sender.clone(); - let change_words_prefixes_route = warp::filters::method::post() - .and(warp::path!("words-prefixes")) - .and(warp::body::json()) - .map(move |settings: WordsPrefixes| { - let meta = UpdateMeta::WordsPrefixes(settings); - let update_id = update_store_cloned.register_update(&meta, &[]).unwrap(); - let _ = update_status_sender_cloned.send(UpdateStatus::Pending { update_id, meta }); - eprintln!("update {} registered", update_id); - warp::reply() - }); - let update_store_cloned = update_store.clone(); let update_status_sender_cloned = update_status_sender.clone(); let abort_update_id_route = warp::filters::method::delete() @@ -997,7 +976,6 @@ async fn main() -> anyhow::Result<()> { .or(clearing_route) .or(change_settings_route) .or(change_facet_levels_route) - .or(change_words_prefixes_route) .or(update_ws_route); let addr = SocketAddr::from_str(&opt.http_listen_addr)?; diff --git a/infos/Cargo.toml b/infos/Cargo.toml index 59cfbd661..8b5867fde 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -11,7 +11,7 @@ csv = "1.1.5" heed = "0.10.6" jemallocator = "0.3.2" milli = { path = "../milli" } -roaring = "0.6.5" +roaring = "0.6.6" serde_json = "1.0.62" stderrlog = "0.5.1" structopt = { version = "0.3.21", default-features = false } diff --git a/infos/src/main.rs b/infos/src/main.rs index cc1727a68..902394af8 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -5,7 +5,7 @@ use std::{str, io, fmt}; use anyhow::Context; use byte_unit::Byte; use heed::EnvOpenOptions; -use milli::Index; +use milli::{Index, TreeLevel}; use structopt::StructOpt; use Command::*; @@ -19,9 +19,11 @@ const WORD_DOCIDS_DB_NAME: &str = "word-docids"; const WORD_PREFIX_DOCIDS_DB_NAME: &str = "word-prefix-docids"; const DOCID_WORD_POSITIONS_DB_NAME: &str = "docid-word-positions"; const WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-pair-proximity-docids"; -const FACET_FIELD_ID_VALUE_DOCIDS_NAME: &str = "facet-field-id-value-docids"; -const FIELD_ID_DOCID_FACET_VALUES_NAME: &str = "field-id-docid-facet-values"; const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-prefix-pair-proximity-docids"; +const WORD_LEVEL_POSITION_DOCIDS_DB_NAME: &str = "word-level-position-docids"; +const WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME: &str = "word-prefix-level-position-docids"; +const FACET_FIELD_ID_VALUE_DOCIDS_DB_NAME: &str = "facet-field-id-value-docids"; +const FIELD_ID_DOCID_FACET_VALUES_DB_NAME: &str = "field-id-docid-facet-values"; const DOCUMENTS_DB_NAME: &str = "documents"; const ALL_DATABASE_NAMES: &[&str] = &[ @@ -31,8 +33,10 @@ const ALL_DATABASE_NAMES: &[&str] = &[ DOCID_WORD_POSITIONS_DB_NAME, WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME, WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME, - FACET_FIELD_ID_VALUE_DOCIDS_NAME, - FIELD_ID_DOCID_FACET_VALUES_NAME, + WORD_LEVEL_POSITION_DOCIDS_DB_NAME, + WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME, + FACET_FIELD_ID_VALUE_DOCIDS_DB_NAME, + FIELD_ID_DOCID_FACET_VALUES_DB_NAME, 
DOCUMENTS_DB_NAME, ]; @@ -114,6 +118,27 @@ enum Command { field_name: String, },
+    /// Outputs a CSV with the documents ids along with the word level positions where they appear.
+    WordsLevelPositionsDocids {
+        /// Display the whole documents ids in detail.
+        #[structopt(long)]
+        full_display: bool,
+
+        /// Words appearing in the documents.
+        words: Vec<String>,
+    },
+
+    /// Outputs a CSV with the documents ids along with
+    /// the word prefix level positions where they appear.
+    WordPrefixesLevelPositionsDocids {
+        /// Display the whole documents ids in detail.
+        #[structopt(long)]
+        full_display: bool,
+
+        /// Prefixes of words appearing in the documents.
+        prefixes: Vec<String>,
+    },
+
 /// Outputs a CSV with the documents ids, words and the positions where this word appears. DocidsWordsPositions { /// Display the whole positions in detail. @@ -221,6 +246,12 @@ fn main() -> anyhow::Result<()> { FacetValuesDocids { full_display, field_name } => { facet_values_docids(&index, &rtxn, !full_display, field_name) },
+        WordsLevelPositionsDocids { full_display, words } => {
+            words_level_positions_docids(&index, &rtxn, !full_display, words)
+        },
+        WordPrefixesLevelPositionsDocids { full_display, prefixes } => {
+            word_prefixes_level_positions_docids(&index, &rtxn, !full_display, prefixes)
+        },
 DocidsWordsPositions { full_display, internal_documents_ids } => { docids_words_positions(&index, &rtxn, !full_display, internal_documents_ids) }, @@ -319,9 +350,11 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho docid_word_positions, word_pair_proximity_docids, word_prefix_pair_proximity_docids,
+        word_level_position_docids,
+        word_prefix_level_position_docids,
 facet_field_id_value_docids, field_id_docid_facet_values: _,
-        documents,
+        documents
 } = index; let main_name = "main"; @@ -330,6 +363,8 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho let docid_word_positions_name = "docid_word_positions"; let word_prefix_pair_proximity_docids_name = "word_prefix_pair_proximity_docids"; let word_pair_proximity_docids_name = "word_pair_proximity_docids";
+    let word_level_position_docids_name = "word_level_position_docids";
+    let word_prefix_level_position_docids_name = "word_prefix_level_position_docids";
 let facet_field_id_value_docids_name = "facet_field_id_value_docids"; let documents_name = "documents"; @@ -386,6 +421,20 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho if heap.len() > limit { heap.pop(); } }
+    for result in word_level_position_docids.remap_data_type::<ByteSlice>().iter(rtxn)? {
+        let ((word, level, left, right), value) = result?;
+        let key = format!("{} {} {:?}", word, level, left..=right);
+        heap.push(Reverse((value.len(), key, word_level_position_docids_name)));
+        if heap.len() > limit { heap.pop(); }
+    }
+
+    for result in word_prefix_level_position_docids.remap_data_type::<ByteSlice>().iter(rtxn)? {
+        let ((word, level, left, right), value) = result?;
+        let key = format!("{} {} {:?}", word, level, left..=right);
+        heap.push(Reverse((value.len(), key, word_prefix_level_position_docids_name)));
+        if heap.len() > limit { heap.pop(); }
+    }
+
 let faceted_fields = index.faceted_fields_ids(rtxn)?; let fields_ids_map = index.fields_ids_map(rtxn)?; for (field_id, field_type) in faceted_fields { @@ -524,6 +573,84 @@ fn facet_values_docids(index: &Index, rtxn: &heed::RoTxn, debug: bool, field_nam Ok(wtr.flush()?)
}

+fn words_level_positions_docids(
+    index: &Index,
+    rtxn: &heed::RoTxn,
+    debug: bool,
+    words: Vec<String>,
+) -> anyhow::Result<()>
+{
+    let stdout = io::stdout();
+    let mut wtr = csv::Writer::from_writer(stdout.lock());
+    wtr.write_record(&["word", "level", "positions", "documents_count", "documents_ids"])?;
+
+    for word in words.iter().map(AsRef::as_ref) {
+        let range = {
+            let left = (word, TreeLevel::min_value(), u32::min_value(), u32::min_value());
+            let right = (word, TreeLevel::max_value(), u32::max_value(), u32::max_value());
+            left..=right
+        };
+        for result in index.word_level_position_docids.range(rtxn, &range)? {
+            let ((w, level, left, right), docids) = result?;
+
+            let count = docids.len().to_string();
+            let docids = if debug {
+                format!("{:?}", docids)
+            } else {
+                format!("{:?}", docids.iter().collect::<Vec<_>>())
+            };
+            let position_range = if level == TreeLevel::min_value() {
+                format!("{:?}", left)
+            } else {
+                format!("{:?}", left..=right)
+            };
+            let level = level.to_string();
+            wtr.write_record(&[w, &level, &position_range, &count, &docids])?;
+        }
+    }
+
+    Ok(wtr.flush()?)
+}
+
+fn word_prefixes_level_positions_docids(
+    index: &Index,
+    rtxn: &heed::RoTxn,
+    debug: bool,
+    prefixes: Vec<String>,
+) -> anyhow::Result<()>
+{
+    let stdout = io::stdout();
+    let mut wtr = csv::Writer::from_writer(stdout.lock());
+    wtr.write_record(&["prefix", "level", "positions", "documents_count", "documents_ids"])?;
+
+    for word in prefixes.iter().map(AsRef::as_ref) {
+        let range = {
+            let left = (word, TreeLevel::min_value(), u32::min_value(), u32::min_value());
+            let right = (word, TreeLevel::max_value(), u32::max_value(), u32::max_value());
+            left..=right
+        };
+        for result in index.word_prefix_level_position_docids.range(rtxn, &range)? {
+            let ((w, level, left, right), docids) = result?;
+
+            let count = docids.len().to_string();
+            let docids = if debug {
+                format!("{:?}", docids)
+            } else {
+                format!("{:?}", docids.iter().collect::<Vec<_>>())
+            };
+            let position_range = if level == TreeLevel::min_value() {
+                format!("{:?}", left)
+            } else {
+                format!("{:?}", left..=right)
+            };
+            let level = level.to_string();
+            wtr.write_record(&[w, &level, &position_range, &count, &docids])?;
+        }
+    }
+
+    Ok(wtr.flush()?)
+} + fn docids_words_positions( index: &Index, rtxn: &heed::RoTxn, @@ -715,6 +842,21 @@ fn average_number_of_positions_by_word(index: &Index, rtxn: &heed::RoTxn) -> any fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec) -> anyhow::Result<()> { use heed::types::ByteSlice; + let Index { + env: _, + main, + word_docids, + word_prefix_docids, + docid_word_positions, + word_pair_proximity_docids, + word_prefix_pair_proximity_docids, + word_level_position_docids, + word_prefix_level_position_docids, + facet_field_id_value_docids, + field_id_docid_facet_values, + documents, + } = index; + let names = if names.is_empty() { ALL_DATABASE_NAMES.iter().map(|s| s.to_string()).collect() } else { @@ -723,30 +865,35 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec) -> a for name in names { let database = match name.as_str() { - MAIN_DB_NAME => &index.main, - WORD_PREFIX_DOCIDS_DB_NAME => index.word_prefix_docids.as_polymorph(), - WORD_DOCIDS_DB_NAME => index.word_docids.as_polymorph(), - DOCID_WORD_POSITIONS_DB_NAME => index.docid_word_positions.as_polymorph(), - WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME => index.word_pair_proximity_docids.as_polymorph(), - WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME => index.word_prefix_pair_proximity_docids.as_polymorph(), - FACET_FIELD_ID_VALUE_DOCIDS_NAME => index.facet_field_id_value_docids.as_polymorph(), - FIELD_ID_DOCID_FACET_VALUES_NAME => index.field_id_docid_facet_values.as_polymorph(), - DOCUMENTS_DB_NAME => index.documents.as_polymorph(), + MAIN_DB_NAME => &main, + WORD_PREFIX_DOCIDS_DB_NAME => word_prefix_docids.as_polymorph(), + WORD_DOCIDS_DB_NAME => word_docids.as_polymorph(), + DOCID_WORD_POSITIONS_DB_NAME => docid_word_positions.as_polymorph(), + WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME => word_pair_proximity_docids.as_polymorph(), + WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME => word_prefix_pair_proximity_docids.as_polymorph(), + WORD_LEVEL_POSITION_DOCIDS_DB_NAME => word_level_position_docids.as_polymorph(), + WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME => word_prefix_level_position_docids.as_polymorph(), + FACET_FIELD_ID_VALUE_DOCIDS_DB_NAME => facet_field_id_value_docids.as_polymorph(), + FIELD_ID_DOCID_FACET_VALUES_DB_NAME => field_id_docid_facet_values.as_polymorph(), + DOCUMENTS_DB_NAME => documents.as_polymorph(), unknown => anyhow::bail!("unknown database {:?}", unknown), }; let mut key_size: u64 = 0; let mut val_size: u64 = 0; + let mut number_entries: u64 = 0; for result in database.iter::<_, ByteSlice, ByteSlice>(rtxn)? 
{
            let (k, v) = result?;
            key_size += k.len() as u64;
            val_size += v.len() as u64;
+            number_entries += 1;
        }

        println!("The {} database weighs:", name);
        println!("\ttotal key size: {}", Byte::from(key_size).get_appropriate_unit(true));
        println!("\ttotal val size: {}", Byte::from(val_size).get_appropriate_unit(true));
        println!("\ttotal size: {}", Byte::from(key_size + val_size).get_appropriate_unit(true));
+        println!("\tnumber of entries: {}", number_entries);
    }

    Ok(())
diff --git a/milli/Cargo.toml b/milli/Cargo.toml index b198131c1..8b359a09b 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml
@@ -27,7 +27,7 @@ once_cell = "1.5.2"
 ordered-float = "2.1.1"
 rayon = "1.5.0"
 regex = "1.4.3"
-roaring = "0.6.5"
+roaring = "0.6.6"
 serde = { version = "1.0.123", features = ["derive"] }
 serde_json = { version = "1.0.62", features = ["preserve_order"] }
 slice-group-by = "0.2.6"
@@ -52,13 +52,11 @@ logging_timer = "1.0.0"
 tinytemplate = "=1.1.0"

 [dev-dependencies]
+big_s = "1.0.2"
 criterion = "0.3.4"
 maplit = "1.0.2"
 rand = "0.8.3"

-[build-dependencies]
-fst = "0.4.5"
-
 [features]
 default = []
diff --git a/milli/src/heed_codec/mod.rs b/milli/src/heed_codec/mod.rs index a070c66eb..cc73cdc65 100644 --- a/milli/src/heed_codec/mod.rs +++ b/milli/src/heed_codec/mod.rs
@@ -2,6 +2,7 @@ mod beu32_str_codec;
 mod obkv_codec;
 mod roaring_bitmap;
 mod roaring_bitmap_length;
+mod str_level_position_codec;
 mod str_str_u8_codec;

 pub mod facet;
@@ -9,4 +10,5 @@ pub use self::beu32_str_codec::BEU32StrCodec;
 pub use self::obkv_codec::ObkvCodec;
 pub use self::roaring_bitmap::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, RoaringBitmapCodec};
 pub use self::roaring_bitmap_length::{BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec};
+pub use self::str_level_position_codec::StrLevelPositionCodec;
 pub use self::str_str_u8_codec::StrStrU8Codec;
diff --git a/milli/src/heed_codec/str_level_position_codec.rs b/milli/src/heed_codec/str_level_position_codec.rs new file mode 100644 index 000000000..810e91940 --- /dev/null +++ b/milli/src/heed_codec/str_level_position_codec.rs
@@ -0,0 +1,45 @@
+use std::borrow::Cow;
+use std::convert::{TryFrom, TryInto};
+use std::mem::size_of;
+use std::str;
+
+use crate::TreeLevel;
+
+pub struct StrLevelPositionCodec;
+
+impl<'a> heed::BytesDecode<'a> for StrLevelPositionCodec {
+    type DItem = (&'a str, TreeLevel, u32, u32);
+
+    fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
+        let footer_len = size_of::<u8>() + size_of::<u32>() * 2;
+
+        if bytes.len() < footer_len { return None }
+
+        let (word, bytes) = bytes.split_at(bytes.len() - footer_len);
+        let word = str::from_utf8(word).ok()?;
+
+        let (level, bytes) = bytes.split_first()?;
+        let left = bytes[..4].try_into().map(u32::from_be_bytes).ok()?;
+        let right = bytes[4..].try_into().map(u32::from_be_bytes).ok()?;
+        let level = TreeLevel::try_from(*level).ok()?;
+
+        Some((word, level, left, right))
+    }
+}
+
+impl<'a> heed::BytesEncode<'a> for StrLevelPositionCodec {
+    type EItem = (&'a str, TreeLevel, u32, u32);
+
+    fn bytes_encode((word, level, left, right): &Self::EItem) -> Option<Cow<[u8]>> {
+        let left = left.to_be_bytes();
+        let right = right.to_be_bytes();
+
+        let mut bytes = Vec::with_capacity(word.len() + 1 + left.len() + right.len());
+        bytes.extend_from_slice(word.as_bytes());
+        bytes.push((*level).into());
+        bytes.extend_from_slice(&left[..]);
+        bytes.extend_from_slice(&right[..]);
+
+        Some(Cow::Owned(bytes))
+    }
+}
diff --git a/milli/src/index.rs b/milli/src/index.rs index 045eabc3c..ba7747250 100644 --- a/milli/src/index.rs +++ 
b/milli/src/index.rs
@@ -12,7 +12,7 @@ use crate::{Criterion, default_criteria, FacetDistribution, FieldsDistribution,
 use crate::{BEU32, DocumentId, ExternalDocumentsIds, FieldId};
 use crate::{
     BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec,
-    ObkvCodec, RoaringBitmapCodec, RoaringBitmapLenCodec, StrStrU8Codec,
+    ObkvCodec, RoaringBitmapCodec, RoaringBitmapLenCodec, StrLevelPositionCodec, StrStrU8Codec,
 };
 use crate::facet::FacetType;
 use crate::fields_ids_map::FieldsIdsMap;
@@ -52,6 +52,10 @@ pub struct Index {
     pub word_pair_proximity_docids: Database<StrStrU8Codec, CboRoaringBitmapCodec>,
     /// Maps the proximity between a word and a prefix with all the docids where this relation appears.
     pub word_prefix_pair_proximity_docids: Database<StrStrU8Codec, CboRoaringBitmapCodec>,
+    /// Maps the word, level and position range with the docids that correspond to it.
+    pub word_level_position_docids: Database<StrLevelPositionCodec, CboRoaringBitmapCodec>,
+    /// Maps the level positions of a word prefix with all the docids where this prefix appears.
+    pub word_prefix_level_position_docids: Database<StrLevelPositionCodec, CboRoaringBitmapCodec>,
     /// Maps the facet field id and the globally ordered value with the docids that correspond to it.
     pub facet_field_id_value_docids: Database<ByteSlice, CboRoaringBitmapCodec>,
     /// Maps the document id, the facet field id and the globally ordered value.
@@ -62,7 +66,7 @@ impl Index {
     pub fn new<P: AsRef<Path>>(mut options: heed::EnvOpenOptions, path: P) -> anyhow::Result<Index> {
-        options.max_dbs(9);
+        options.max_dbs(11);
         let env = options.open(path)?;
         let main = env.create_poly_database(Some("main"))?;
@@ -71,6 +75,8 @@ impl Index {
         let docid_word_positions = env.create_database(Some("docid-word-positions"))?;
         let word_pair_proximity_docids = env.create_database(Some("word-pair-proximity-docids"))?;
         let word_prefix_pair_proximity_docids = env.create_database(Some("word-prefix-pair-proximity-docids"))?;
+        let word_level_position_docids = env.create_database(Some("word-level-position-docids"))?;
+        let word_prefix_level_position_docids = env.create_database(Some("word-prefix-level-position-docids"))?;
         let facet_field_id_value_docids = env.create_database(Some("facet-field-id-value-docids"))?;
         let field_id_docid_facet_values = env.create_database(Some("field-id-docid-facet-values"))?;
         let documents = env.create_database(Some("documents"))?;
@@ -94,6 +100,8 @@ impl Index {
             docid_word_positions,
             word_pair_proximity_docids,
             word_prefix_pair_proximity_docids,
+            word_level_position_docids,
+            word_prefix_level_position_docids,
             facet_field_id_value_docids,
             field_id_docid_facet_values,
             documents,
diff --git a/milli/src/lib.rs b/milli/src/lib.rs index fe9bd828b..03169bce7 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs
@@ -9,6 +9,7 @@ pub mod facet;
 pub mod heed_codec;
 pub mod index;
 pub mod proximity;
+pub mod tree_level;
 pub mod update;

 use std::borrow::Cow;
@@ -22,11 +23,12 @@ use serde_json::{Map, Value};
 pub use self::criterion::{Criterion, default_criteria};
 pub use self::external_documents_ids::ExternalDocumentsIds;
 pub use self::fields_ids_map::FieldsIdsMap;
-pub use self::heed_codec::{BEU32StrCodec, StrStrU8Codec, ObkvCodec};
+pub use self::heed_codec::{BEU32StrCodec, StrStrU8Codec, StrLevelPositionCodec, ObkvCodec};
 pub use self::heed_codec::{RoaringBitmapCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec};
 pub use self::heed_codec::{RoaringBitmapLenCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec};
 pub use self::index::Index;
 pub use self::search::{Search, FacetDistribution, FacetCondition, SearchResult, MatchingWords};
+pub use self::tree_level::TreeLevel;
 pub use self::update_store::UpdateStore;

 pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>;
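// A minimal sketch (not part of the patch) of how a key round-trips through the
// StrLevelPositionCodec introduced above: the UTF-8 word, one level byte, then two
// big-endian u32 bounds. It assumes TreeLevel implements TryFrom<u8> and PartialEq,
// as its use inside the codec suggests.
use std::convert::TryFrom;
use heed::{BytesDecode, BytesEncode};
use milli::{StrLevelPositionCodec, TreeLevel};

fn str_level_position_codec_roundtrip() {
    let level = TreeLevel::try_from(1u8).unwrap();
    let key = ("hello", level, 0u32, 3u32);
    let bytes = StrLevelPositionCodec::bytes_encode(&key).unwrap();
    // "hello" (5 bytes) + 1 level byte + two 4-byte big-endian positions
    assert_eq!(bytes.len(), 5 + 1 + 4 + 4);
    let decoded = StrLevelPositionCodec::bytes_decode(&bytes).unwrap();
    assert_eq!(decoded, key);
}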
diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index 1dc186720..d2841d449 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -31,32 +31,10 @@ pub struct AscDesc<'t> { candidates: Box> + 't>, bucket_candidates: RoaringBitmap, faceted_candidates: RoaringBitmap, - parent: Option>, + parent: Box, } impl<'t> AscDesc<'t> { - pub fn initial_asc( - index: &'t Index, - rtxn: &'t heed::RoTxn, - query_tree: Option, - candidates: Option, - field_name: String, - ) -> anyhow::Result - { - Self::initial(index, rtxn, query_tree, candidates, field_name, true) - } - - pub fn initial_desc( - index: &'t Index, - rtxn: &'t heed::RoTxn, - query_tree: Option, - candidates: Option, - field_name: String, - ) -> anyhow::Result - { - Self::initial(index, rtxn, query_tree, candidates, field_name, false) - } - pub fn asc( index: &'t Index, rtxn: &'t heed::RoTxn, @@ -77,47 +55,6 @@ impl<'t> AscDesc<'t> { Self::new(index, rtxn, parent, field_name, false) } - fn initial( - index: &'t Index, - rtxn: &'t heed::RoTxn, - query_tree: Option, - candidates: Option, - field_name: String, - ascending: bool, - ) -> anyhow::Result - { - let fields_ids_map = index.fields_ids_map(rtxn)?; - let faceted_fields = index.faceted_fields(rtxn)?; - let (field_id, facet_type) = field_id_facet_type(&fields_ids_map, &faceted_fields, &field_name)?; - - let faceted_candidates = index.faceted_documents_ids(rtxn, field_id)?; - let candidates = match &query_tree { - Some(qt) => { - let context = CriteriaBuilder::new(rtxn, index)?; - let mut qt_candidates = resolve_query_tree(&context, qt, &mut HashMap::new(), &mut WordDerivationsCache::new())?; - if let Some(candidates) = candidates { - qt_candidates.intersect_with(&candidates); - } - qt_candidates - }, - None => candidates.unwrap_or(faceted_candidates.clone()), - }; - - Ok(AscDesc { - index, - rtxn, - field_name, - field_id, - facet_type, - ascending, - query_tree, - candidates: facet_ordered(index, rtxn, field_id, facet_type, ascending, candidates)?, - faceted_candidates, - bucket_candidates: RoaringBitmap::new(), - parent: None, - }) - } - fn new( index: &'t Index, rtxn: &'t heed::RoTxn, @@ -141,7 +78,7 @@ impl<'t> AscDesc<'t> { candidates: Box::new(std::iter::empty()), faceted_candidates: index.faceted_documents_ids(rtxn, field_id)?, bucket_candidates: RoaringBitmap::new(), - parent: Some(parent), + parent, }) } } @@ -156,64 +93,56 @@ impl<'t> Criterion for AscDesc<'t> { match self.candidates.next().transpose()? { None => { - let query_tree = self.query_tree.take(); - let bucket_candidates = take(&mut self.bucket_candidates); - match self.parent.as_mut() { - Some(parent) => { - match parent.next(wdcache)? 
{ - Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { - self.query_tree = query_tree; - let candidates = match (&self.query_tree, candidates) { - (_, Some(mut candidates)) => { - candidates.intersect_with(&self.faceted_candidates); - candidates - }, - (Some(qt), None) => { - let context = CriteriaBuilder::new(&self.rtxn, &self.index)?; - let mut candidates = resolve_query_tree(&context, qt, &mut HashMap::new(), wdcache)?; - candidates.intersect_with(&self.faceted_candidates); - candidates - }, - (None, None) => take(&mut self.faceted_candidates), - }; - if bucket_candidates.is_empty() { - self.bucket_candidates.union_with(&candidates); - } else { - self.bucket_candidates.union_with(&bucket_candidates); - } - self.candidates = facet_ordered( - self.index, - self.rtxn, - self.field_id, - self.facet_type, - self.ascending, - candidates, - )?; + match self.parent.next(wdcache)? { + Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { + let candidates_is_some = candidates.is_some(); + self.query_tree = query_tree; + let candidates = match (&self.query_tree, candidates) { + (_, Some(mut candidates)) => { + candidates.intersect_with(&self.faceted_candidates); + candidates }, - None => return Ok(None), - } - }, - None => if query_tree.is_none() && bucket_candidates.is_empty() { - return Ok(None) - }, - } + (Some(qt), None) => { + let context = CriteriaBuilder::new(&self.rtxn, &self.index)?; + let mut candidates = resolve_query_tree(&context, qt, &mut HashMap::new(), wdcache)?; + candidates.intersect_with(&self.faceted_candidates); + candidates + }, + (None, None) => take(&mut self.faceted_candidates), + }; - return Ok(Some(CriterionResult { - query_tree, - candidates: Some(RoaringBitmap::new()), - bucket_candidates, - })); + // If our parent returns candidates it means that the bucket + // candidates were already computed before and we can use them. + // + // If not, we must use the just computed candidates as our bucket + // candidates. 
+ if candidates_is_some { + self.bucket_candidates.union_with(&bucket_candidates); + } else { + self.bucket_candidates.union_with(&candidates); + } + + if candidates.is_empty() { + continue; + } + + self.candidates = facet_ordered( + self.index, + self.rtxn, + self.field_id, + self.facet_type, + self.ascending, + candidates, + )?; + }, + None => return Ok(None), + } }, Some(candidates) => { - let bucket_candidates = match self.parent { - Some(_) => take(&mut self.bucket_candidates), - None => candidates.clone(), - }; - return Ok(Some(CriterionResult { query_tree: self.query_tree.clone(), candidates: Some(candidates), - bucket_candidates, + bucket_candidates: take(&mut self.bucket_candidates), })); }, } diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs new file mode 100644 index 000000000..bbbc0de1a --- /dev/null +++ b/milli/src/search/criteria/attribute.rs @@ -0,0 +1,737 @@ +use std::{borrow::Cow, cmp::{self, Ordering}, collections::BinaryHeap}; +use std::collections::{BTreeMap, HashMap, btree_map}; +use std::collections::binary_heap::PeekMut; +use std::mem::take; + +use roaring::RoaringBitmap; + +use crate::{TreeLevel, search::build_dfa}; +use crate::search::criteria::Query; +use crate::search::query_tree::{Operation, QueryKind}; +use crate::search::{word_derivations, WordDerivationsCache}; +use super::{Criterion, CriterionResult, Context, resolve_query_tree}; + +/// To be able to divide integers by the number of words in the query +/// we want to find a multiplier that allow us to divide by any number between 1 and 10. +/// We chose the LCM of all numbers between 1 and 10 as the multiplier (https://en.wikipedia.org/wiki/Least_common_multiple). +const LCM_10_FIRST_NUMBERS: u32 = 2520; + +/// To compute the interval size of a level, +/// we use 4 as the exponentiation base and the level as the exponent. 
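+/// For example (illustrative): a level 0 entry covers a single position (4^0 = 1),
+/// a level 1 entry covers 4 positions (e.g. 0..=3) and a level 2 entry covers 16 (0..=15).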
+const LEVEL_EXPONENTIATION_BASE: u32 = 4;
+
+pub struct Attribute<'t> {
+    ctx: &'t dyn Context<'t>,
+    query_tree: Option<Operation>,
+    candidates: Option<RoaringBitmap>,
+    bucket_candidates: RoaringBitmap,
+    parent: Box<dyn Criterion + 't>,
+    flattened_query_tree: Option<Vec<Vec<Vec<Query>>>>,
+    current_buckets: Option<btree_map::IntoIter<u64, RoaringBitmap>>,
+}
+
+impl<'t> Attribute<'t> {
+    pub fn new(ctx: &'t dyn Context<'t>, parent: Box<dyn Criterion + 't>) -> Self {
+        Attribute {
+            ctx,
+            query_tree: None,
+            candidates: None,
+            bucket_candidates: RoaringBitmap::new(),
+            parent,
+            flattened_query_tree: None,
+            current_buckets: None,
+        }
+    }
+}
+
+impl<'t> Criterion for Attribute<'t> {
+    #[logging_timer::time("Attribute::{}")]
+    fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result<Option<CriterionResult>> {
+        loop {
+            match (&self.query_tree, &mut self.candidates) {
+                (_, Some(candidates)) if candidates.is_empty() => {
+                    return Ok(Some(CriterionResult {
+                        query_tree: self.query_tree.take(),
+                        candidates: self.candidates.take(),
+                        bucket_candidates: take(&mut self.bucket_candidates),
+                    }));
+                },
+                (Some(qt), Some(candidates)) => {
+                    let flattened_query_tree = self.flattened_query_tree.get_or_insert_with(|| {
+                        flatten_query_tree(&qt)
+                    });
+
+                    let found_candidates = if candidates.len() < 1000 {
+                        let current_buckets = match self.current_buckets.as_mut() {
+                            Some(current_buckets) => current_buckets,
+                            None => {
+                                let new_buckets = linear_compute_candidates(self.ctx, flattened_query_tree, candidates)?;
+                                self.current_buckets.get_or_insert(new_buckets.into_iter())
+                            },
+                        };
+
+                        match current_buckets.next() {
+                            Some((_score, candidates)) => candidates,
+                            None => {
+                                return Ok(Some(CriterionResult {
+                                    query_tree: self.query_tree.take(),
+                                    candidates: self.candidates.take(),
+                                    bucket_candidates: take(&mut self.bucket_candidates),
+                                }));
+                            },
+                        }
+                    } else {
+                        match set_compute_candidates(self.ctx, flattened_query_tree, candidates, wdcache)? {
+                            Some(candidates) => candidates,
+                            None => {
+                                return Ok(Some(CriterionResult {
+                                    query_tree: self.query_tree.take(),
+                                    candidates: self.candidates.take(),
+                                    bucket_candidates: take(&mut self.bucket_candidates),
+                                }));
+                            },
+                        }
+                    };
+
+                    candidates.difference_with(&found_candidates);
+
+                    return Ok(Some(CriterionResult {
+                        query_tree: self.query_tree.clone(),
+                        candidates: Some(found_candidates),
+                        bucket_candidates: take(&mut self.bucket_candidates),
+                    }));
+                },
+                (Some(qt), None) => {
+                    let query_tree_candidates = resolve_query_tree(self.ctx, &qt, &mut HashMap::new(), wdcache)?;
+                    self.bucket_candidates |= &query_tree_candidates;
+                    self.candidates = Some(query_tree_candidates);
+                },
+                (None, Some(_)) => {
+                    return Ok(Some(CriterionResult {
+                        query_tree: self.query_tree.take(),
+                        candidates: self.candidates.take(),
+                        bucket_candidates: take(&mut self.bucket_candidates),
+                    }));
+                },
+                (None, None) => {
+                    match self.parent.next(wdcache)? {
+                        Some(CriterionResult { query_tree: None, candidates: None, bucket_candidates }) => {
+                            return Ok(Some(CriterionResult {
+                                query_tree: None,
+                                candidates: None,
+                                bucket_candidates,
+                            }));
+                        },
+                        Some(CriterionResult { query_tree, candidates, bucket_candidates }) => {
+                            self.query_tree = query_tree;
+                            self.candidates = candidates;
+                            self.bucket_candidates |= bucket_candidates;
+                            self.flattened_query_tree = None;
+                            self.current_buckets = None;
+                        },
+                        None => return Ok(None),
+                    }
+                },
+            }
+        }
+    }
+}
+
+/// WordLevelIterator is a pseudo-iterator over intervals of word positions for one word:
+/// it begins at the first non-empty interval and returns every interval without
+/// jumping over empty intervals.
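+/// For example (illustrative): at level 1, where each interval covers 4 positions,
+/// the stored intervals 0..=3 and 8..=11 are yielded as 0..=3, then a synthetic
+/// empty interval 4..=7, then 8..=11, so callers can line intervals up across words.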
+struct WordLevelIterator<'t, 'q> {
+    inner: Box<dyn Iterator<Item = heed::Result<((&'t str, TreeLevel, u32, u32), RoaringBitmap)>> + 't>,
+    level: TreeLevel,
+    interval_size: u32,
+    word: Cow<'q, str>,
+    in_prefix_cache: bool,
+    inner_next: Option<(u32, u32, RoaringBitmap)>,
+    current_interval: Option<(u32, u32)>,
+}
+
+impl<'t, 'q> WordLevelIterator<'t, 'q> {
+    fn new(ctx: &'t dyn Context<'t>, word: Cow<'q, str>, in_prefix_cache: bool) -> heed::Result<Option<Self>> {
+        match ctx.word_position_last_level(&word, in_prefix_cache)? {
+            Some(level) => {
+                let interval_size = LEVEL_EXPONENTIATION_BASE.pow(Into::<u8>::into(level.clone()) as u32);
+                let inner = ctx.word_position_iterator(&word, level, in_prefix_cache, None, None)?;
+                Ok(Some(Self { inner, level, interval_size, word, in_prefix_cache, inner_next: None, current_interval: None }))
+            },
+            None => Ok(None),
+        }
+    }
+
+    fn dig(&self, ctx: &'t dyn Context<'t>, level: &TreeLevel, left_interval: Option<u32>) -> heed::Result<Self> {
+        let level = level.min(&self.level).clone();
+        let interval_size = LEVEL_EXPONENTIATION_BASE.pow(Into::<u8>::into(level.clone()) as u32);
+        let word = self.word.clone();
+        let in_prefix_cache = self.in_prefix_cache;
+        let inner = ctx.word_position_iterator(&word, level, in_prefix_cache, left_interval, None)?;
+
+        Ok(Self { inner, level, interval_size, word, in_prefix_cache, inner_next: None, current_interval: None })
+    }
+
+    fn next(&mut self) -> heed::Result<Option<(u32, u32, RoaringBitmap)>> {
+        fn is_next_interval(last_right: u32, next_left: u32) -> bool { last_right + 1 == next_left }
+
+        let inner_next = match self.inner_next.take() {
+            Some(inner_next) => Some(inner_next),
+            None => self.inner.next().transpose()?.map(|((_, _, left, right), docids)| (left, right, docids)),
+        };
+
+        match inner_next {
+            Some((left, right, docids)) => {
+                match self.current_interval {
+                    Some((last_left, last_right)) if !is_next_interval(last_right, left) => {
+                        let blank_left = last_left + self.interval_size;
+                        let blank_right = last_right + self.interval_size;
+                        self.current_interval = Some((blank_left, blank_right));
+                        self.inner_next = Some((left, right, docids));
+                        Ok(Some((blank_left, blank_right, RoaringBitmap::new())))
+                    },
+                    _ => {
+                        self.current_interval = Some((left, right));
+                        Ok(Some((left, right, docids)))
+                    }
+                }
+            },
+            None => Ok(None),
+        }
+    }
+}
+
+/// QueryLevelIterator is a pseudo-iterator for a Query:
+/// it contains WordLevelIterators and can be chained with other QueryLevelIterators.
+struct QueryLevelIterator<'t, 'q> {
+    parent: Option<Box<QueryLevelIterator<'t, 'q>>>,
+    inner: Vec<WordLevelIterator<'t, 'q>>,
+    level: TreeLevel,
+    accumulator: Vec<Option<(u32, u32, RoaringBitmap)>>,
+    parent_accumulator: Vec<Option<(u32, u32, RoaringBitmap)>>,
+    interval_to_skip: usize,
+}
+
+impl<'t, 'q> QueryLevelIterator<'t, 'q> {
+    fn new(ctx: &'t dyn Context<'t>, queries: &'q Vec<Query>, wdcache: &mut WordDerivationsCache) -> anyhow::Result<Option<Self>> {
+        let mut inner = Vec::with_capacity(queries.len());
+        for query in queries {
+            match &query.kind {
+                QueryKind::Exact { word, .. } => {
+                    if !query.prefix || ctx.in_prefix_cache(&word) {
+                        let word = Cow::Borrowed(query.kind.word());
+                        if let Some(word_level_iterator) = WordLevelIterator::new(ctx, word, query.prefix)? {
+                            inner.push(word_level_iterator);
+                        }
+                    } else {
+                        for (word, _) in word_derivations(&word, true, 0, ctx.words_fst(), wdcache)? {
+                            let word = Cow::Owned(word.to_owned());
+                            if let Some(word_level_iterator) = WordLevelIterator::new(ctx, word, false)? {
+                                inner.push(word_level_iterator);
+                            }
+                        }
+                    }
+                },
+                QueryKind::Tolerant { typo, word } => {
+                    for (word, _) in word_derivations(&word, query.prefix, *typo, ctx.words_fst(), wdcache)? {
+                        let word = Cow::Owned(word.to_owned());
+                        if let Some(word_level_iterator) = WordLevelIterator::new(ctx, word, false)? {
+                            inner.push(word_level_iterator);
+                        }
+                    }
+                }
+            }
+        }
+
+        let highest = inner.iter().max_by_key(|wli| wli.level).map(|wli| wli.level.clone());
+        match highest {
+            Some(level) => Ok(Some(Self {
+                parent: None,
+                inner,
+                level,
+                accumulator: vec![],
+                parent_accumulator: vec![],
+                interval_to_skip: 0,
+            })),
+            None => Ok(None),
+        }
+    }
+
+    fn parent(&mut self, parent: QueryLevelIterator<'t, 'q>) -> &Self {
+        self.parent = Some(Box::new(parent));
+        self
+    }
+
+    /// create a new QueryLevelIterator with a lower level than the current one.
+    fn dig(&self, ctx: &'t dyn Context<'t>) -> heed::Result<Self> {
+        let (level, parent) = match &self.parent {
+            Some(parent) => {
+                let parent = parent.dig(ctx)?;
+                (parent.level.min(self.level), Some(Box::new(parent)))
+            },
+            None => (self.level.saturating_sub(1), None),
+        };
+
+        let left_interval = self.accumulator.get(self.interval_to_skip).map(|opt| opt.as_ref().map(|(left, _, _)| *left)).flatten();
+        let mut inner = Vec::with_capacity(self.inner.len());
+        for word_level_iterator in self.inner.iter() {
+            inner.push(word_level_iterator.dig(ctx, &level, left_interval)?);
+        }
+
+        Ok(Self { parent, inner, level, accumulator: vec![], parent_accumulator: vec![], interval_to_skip: 0 })
+    }
+
+    fn inner_next(&mut self, level: TreeLevel) -> heed::Result<Option<(u32, u32, RoaringBitmap)>> {
+        let mut accumulated: Option<(u32, u32, RoaringBitmap)> = None;
+        let u8_level = Into::<u8>::into(level);
+        let interval_size = LEVEL_EXPONENTIATION_BASE.pow(u8_level as u32);
+        for wli in self.inner.iter_mut() {
+            let wli_u8_level = Into::<u8>::into(wli.level.clone());
+            let accumulated_count = LEVEL_EXPONENTIATION_BASE.pow((u8_level - wli_u8_level) as u32);
+            for _ in 0..accumulated_count {
+                if let Some((next_left, _, next_docids)) = wli.next()? {
+                    accumulated = match accumulated.take() {
+                        Some((acc_left, acc_right, mut acc_docids)) => {
+                            acc_docids |= next_docids;
+                            Some((acc_left, acc_right, acc_docids))
+                        },
+                        None => Some((next_left, next_left + interval_size, next_docids)),
+                    };
+                }
+            }
+        }
+
+        Ok(accumulated)
+    }
+
+    /// return the next meta-interval created from inner WordLevelIterators,
+    /// and from the chained parent QueryLevelIterator, if any.
+    fn next(&mut self, allowed_candidates: &RoaringBitmap, tree_level: TreeLevel) -> heed::Result<Option<(u32, u32, RoaringBitmap)>> {
+        let parent_result = match self.parent.as_mut() {
+            Some(parent) => Some(parent.next(allowed_candidates, tree_level)?),
+            None => None,
+        };
+
+        match parent_result {
+            Some(parent_next) => {
+                let inner_next = self.inner_next(tree_level)?;
+                self.interval_to_skip += interval_to_skip(
+                    &self.parent_accumulator,
+                    &self.accumulator,
+                    self.interval_to_skip,
+                    allowed_candidates
+                );
+                self.accumulator.push(inner_next);
+                self.parent_accumulator.push(parent_next);
+                let mut merged_interval: Option<(u32, u32, RoaringBitmap)> = None;
+
+                for current in self.accumulator.iter().rev().zip(self.parent_accumulator.iter()).skip(self.interval_to_skip) {
+                    if let (Some((left_a, right_a, a)), Some((left_b, right_b, b))) = current {
+                        match merged_interval.as_mut() {
+                            Some((_, _, merged_docids)) => *merged_docids |= a & b,
+                            None => merged_interval = Some((left_a + left_b, right_a + right_b, a & b)),
+                        }
+                    }
+                }
+                Ok(merged_interval)
+            },
+            None => {
+                let level = self.level;
+                match self.inner_next(level)? {
+                    Some((left, right, mut candidates)) => {
+                        self.accumulator = vec![Some((left, right, RoaringBitmap::new()))];
+                        candidates &= allowed_candidates;
+                        Ok(Some((left, right, candidates)))
+
+                    },
+                    None => {
+                        self.accumulator = vec![None];
+                        Ok(None)
+                    },
+                }
+            }
+        }
+    }
+}
+
+/// Count the number of intervals that can be skipped when we make the cross-intersections
+/// in order to compute the next meta-interval.
+/// A pair of intervals is skipped when neither interval contains any allowed docids.
+fn interval_to_skip(
+    parent_accumulator: &[Option<(u32, u32, RoaringBitmap)>],
+    current_accumulator: &[Option<(u32, u32, RoaringBitmap)>],
+    already_skipped: usize,
+    allowed_candidates: &RoaringBitmap,
+) -> usize {
+    parent_accumulator.into_iter()
+        .zip(current_accumulator.into_iter())
+        .skip(already_skipped)
+        .take_while(|(parent, current)| {
+            let skip_parent = parent.as_ref().map_or(true, |(_, _, docids)| docids.is_empty());
+            let skip_current = current.as_ref().map_or(true, |(_, _, docids)| docids.is_disjoint(allowed_candidates));
+            skip_parent && skip_current
+        })
+        .count()
+}
+
+/// A Branch represents a possible alternative of the original query and is built from the query tree;
+/// it allows us to iterate over meta-intervals of positions and to dig into them when they contain interesting candidates.
+struct Branch<'t, 'q> {
+    query_level_iterator: QueryLevelIterator<'t, 'q>,
+    last_result: (u32, u32, RoaringBitmap),
+    tree_level: TreeLevel,
+    branch_size: u32,
+}
+
+impl<'t, 'q> Branch<'t, 'q> {
+    /// return the next meta-interval of the branch,
+    /// and update the inner interval in order to be ranked by the BinaryHeap.
+    fn next(&mut self, allowed_candidates: &RoaringBitmap) -> heed::Result<bool> {
+        let tree_level = self.query_level_iterator.level;
+        match self.query_level_iterator.next(allowed_candidates, tree_level)? {
+            Some(last_result) => {
+                self.last_result = last_result;
+                self.tree_level = tree_level;
+                Ok(true)
+            },
+            None => Ok(false),
+        }
+    }
+
+    /// make the current Branch iterate over smaller intervals.
+    fn dig(&mut self, ctx: &'t dyn Context<'t>) -> heed::Result<()> {
+        self.query_level_iterator = self.query_level_iterator.dig(ctx)?;
+        Ok(())
+    }
+
+    /// Because the next() method could be time-consuming, this updates the inner interval
+    /// so that the branch can be ranked by the BinaryHeap without computing it;
+    /// the next() method should be called when the real interval is needed.
+    fn lazy_next(&mut self) {
+        let u8_level = Into::<u8>::into(self.tree_level.clone());
+        let interval_size = LEVEL_EXPONENTIATION_BASE.pow(u8_level as u32);
+        let (left, right, _) = self.last_result;
+
+        self.last_result = (left + interval_size, right + interval_size, RoaringBitmap::new());
+    }
+
+    /// return the score of the current inner interval.
+    fn compute_rank(&self) -> u32 {
+        // we compute a rank from the left interval.
+        let (left, _, _) = self.last_result;
+        left.saturating_sub((0..self.branch_size).sum()) * LCM_10_FIRST_NUMBERS / self.branch_size
+    }
+
+    fn cmp(&self, other: &Self) -> Ordering {
+        let self_rank = self.compute_rank();
+        let other_rank = other.compute_rank();
+        let left_cmp = self_rank.cmp(&other_rank).reverse();
+        // on level: lower is better,
+        // we want to dig faster into levels on interesting branches.
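+        // Worked example (illustrative): a three-word branch whose left bounds
+        // sum to 9 gets a rank of (9 - (0 + 1 + 2)) * 2520 / 3 = 5040; since the
+        // comparisons are reversed, the BinaryHeap (a max-heap) pops the branch
+        // with the lowest rank, i.e. the most promising one, first.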
+        let level_cmp = self.tree_level.cmp(&other.tree_level).reverse();
+
+        left_cmp.then(level_cmp).then(self.last_result.2.len().cmp(&other.last_result.2.len()))
+    }
+}
+
+impl<'t, 'q> Ord for Branch<'t, 'q> {
+    fn cmp(&self, other: &Self) -> Ordering {
+        self.cmp(other)
+    }
+}
+
+impl<'t, 'q> PartialOrd for Branch<'t, 'q> {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+impl<'t, 'q> PartialEq for Branch<'t, 'q> {
+    fn eq(&self, other: &Self) -> bool {
+        self.cmp(other) == Ordering::Equal
+    }
+}
+
+impl<'t, 'q> Eq for Branch<'t, 'q> {}
+
+fn initialize_query_level_iterators<'t, 'q>(
+    ctx: &'t dyn Context<'t>,
+    branches: &'q Vec<Vec<Vec<Query>>>,
+    allowed_candidates: &RoaringBitmap,
+    wdcache: &mut WordDerivationsCache,
+) -> anyhow::Result<BinaryHeap<Branch<'t, 'q>>> {
+
+    let mut positions = BinaryHeap::with_capacity(branches.len());
+    for branch in branches {
+        let mut branch_positions = Vec::with_capacity(branch.len());
+        for queries in branch {
+            match QueryLevelIterator::new(ctx, queries, wdcache)? {
+                Some(qli) => branch_positions.push(qli),
+                None => {
+                    // the branch seems to be invalid, so we skip it.
+                    branch_positions.clear();
+                    break;
+                },
+            }
+        }
+        // QueryLevelIterators need to be sorted by level and folded in descending order.
+        branch_positions.sort_unstable_by_key(|qli| qli.level);
+        let folded_query_level_iterators = branch_positions
+            .into_iter()
+            .fold(None, |fold: Option<QueryLevelIterator>, mut qli| match fold {
+                Some(fold) => {
+                    qli.parent(fold);
+                    Some(qli)
+                },
+                None => Some(qli),
+            });
+
+        if let Some(mut folded_query_level_iterators) = folded_query_level_iterators {
+            let tree_level = folded_query_level_iterators.level;
+            let last_result = folded_query_level_iterators.next(allowed_candidates, tree_level)?;
+            if let Some(last_result) = last_result {
+                let branch = Branch {
+                    last_result,
+                    tree_level,
+                    query_level_iterator: folded_query_level_iterators,
+                    branch_size: branch.len() as u32,
+                };
+                positions.push(branch);
+            }
+        }
+    }
+
+    Ok(positions)
+}
+
+fn set_compute_candidates<'t>(
+    ctx: &'t dyn Context<'t>,
+    branches: &Vec<Vec<Vec<Query>>>,
+    allowed_candidates: &RoaringBitmap,
+    wdcache: &mut WordDerivationsCache,
+) -> anyhow::Result<Option<RoaringBitmap>>
+{
+    let mut branches_heap = initialize_query_level_iterators(ctx, branches, allowed_candidates, wdcache)?;
+    let lowest_level = TreeLevel::min_value();
+    let mut final_candidates: Option<(u32, RoaringBitmap)> = None;
+    let mut allowed_candidates = allowed_candidates.clone();
+
+    while let Some(mut branch) = branches_heap.peek_mut() {
+        let is_lowest_level = branch.tree_level == lowest_level;
+        let branch_rank = branch.compute_rank();
+        // if the current branch is worse than the best one, we break to return
+        // the candidates that correspond to the best rank
+        if let Some((best_rank, _)) = final_candidates {
+            if branch_rank > best_rank { break }
+        }
+        let _left = branch.last_result.0;
+        let candidates = take(&mut branch.last_result.2);
+        if candidates.is_empty() {
+            // we don't have candidates, get the next interval.
+            if !branch.next(&allowed_candidates)? { PeekMut::pop(branch); }
+        }
+        else if is_lowest_level {
+            // we have candidates, but we can't dig deeper.
+            allowed_candidates -= &candidates;
+            final_candidates = match final_candidates.take() {
+                // we add the current candidates to the best candidates
+                Some((best_rank, mut best_candidates)) => {
+                    best_candidates |= candidates;
+                    branch.lazy_next();
+                    Some((best_rank, best_candidates))
+                },
+                // we take the current candidates as the best candidates
+                None => {
+                    branch.lazy_next();
+                    Some((branch_rank, candidates))
+                },
+            };
+        } else {
+            // we have candidates, let's dig deeper into the levels.
+            branch.dig(ctx)?;
+            if !branch.next(&allowed_candidates)? { PeekMut::pop(branch); }
+        }
+
+    }
+
+    Ok(final_candidates.map(|(_rank, candidates)| candidates))
+}
+
+fn linear_compute_candidates(
+    ctx: &dyn Context,
+    branches: &Vec<Vec<Vec<Query>>>,
+    allowed_candidates: &RoaringBitmap,
+) -> anyhow::Result<BTreeMap<u64, RoaringBitmap>>
+{
+    fn compute_candidate_rank(branches: &Vec<Vec<Vec<Query>>>, words_positions: HashMap<String, RoaringBitmap>) -> u64 {
+        let mut min_rank = u64::max_value();
+        for branch in branches {
+
+            let branch_len = branch.len();
+            let mut branch_rank = Vec::with_capacity(branch_len);
+            for derivates in branch {
+                let mut position = None;
+                for Query { prefix, kind } in derivates {
+                    // find the best position of the current word in the document.
+                    let current_position = match kind {
+                        QueryKind::Exact { word, .. } => {
+                            if *prefix {
+                                word_derivations(word, true, 0, &words_positions)
+                                    .flat_map(|positions| positions.iter().next()).min()
+                            } else {
+                                words_positions.get(word)
+                                    .map(|positions| positions.iter().next())
+                                    .flatten()
+                            }
+                        },
+                        QueryKind::Tolerant { typo, word } => {
+                            word_derivations(word, *prefix, *typo, &words_positions)
+                                .flat_map(|positions| positions.iter().next()).min()
+                        },
+                    };
+
+                    match (position, current_position) {
+                        (Some(p), Some(cp)) => position = Some(cmp::min(p, cp)),
+                        (None, Some(cp)) => position = Some(cp),
+                        _ => (),
+                    }
+                }
+
+                // if a position is found, we add it to the branch score,
+                // otherwise the branch is considered unfindable in this document and we break.
+                if let Some(position) = position {
+                    branch_rank.push(position as u64);
+                } else {
+                    branch_rank.clear();
+                    break;
+                }
+            }
+
+            if !branch_rank.is_empty() {
+                branch_rank.sort_unstable();
+                // because several words in the same query can't all match at position 0,
+                // we subtract the word index from the position.
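+                // For example (illustrative): sorted positions [3, 4, 5] in a
+                // three-word branch give (3 - 0) + (4 - 1) + (5 - 2) = 9.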
+                let branch_rank: u64 = branch_rank.into_iter().enumerate().map(|(i, r)| r - i as u64).sum();
+                // here we take the mean over the words of the branch
+                min_rank = min_rank.min(branch_rank * LCM_10_FIRST_NUMBERS as u64 / branch_len as u64);
+            }
+        }
+
+        min_rank
+    }
+
+    fn word_derivations<'a>(
+        word: &str,
+        is_prefix: bool,
+        max_typo: u8,
+        words_positions: &'a HashMap<String, RoaringBitmap>,
+    ) -> impl Iterator<Item = &'a RoaringBitmap>
+    {
+        let dfa = build_dfa(word, max_typo, is_prefix);
+        words_positions.iter().filter_map(move |(document_word, positions)| {
+            use levenshtein_automata::Distance;
+            match dfa.eval(document_word) {
+                Distance::Exact(_) => Some(positions),
+                Distance::AtLeast(_) => None,
+            }
+        })
+    }
+
+    let mut candidates = BTreeMap::new();
+    for docid in allowed_candidates {
+        let words_positions = ctx.docid_words_positions(docid)?;
+        let rank = compute_candidate_rank(branches, words_positions);
+        candidates.entry(rank).or_insert_with(RoaringBitmap::new).insert(docid);
+    }
+
+    Ok(candidates)
+}
+
+// TODO can we keep refs of Query
+fn flatten_query_tree(query_tree: &Operation) -> Vec<Vec<Vec<Query>>> {
+    use crate::search::criteria::Operation::{And, Or, Consecutive};
+
+    fn and_recurse(head: &Operation, tail: &[Operation]) -> Vec<Vec<Vec<Query>>> {
+        match tail.split_first() {
+            Some((thead, tail)) => {
+                let tail = and_recurse(thead, tail);
+                let mut out = Vec::new();
+                for array in recurse(head) {
+                    for tail_array in &tail {
+                        let mut array = array.clone();
+                        array.extend(tail_array.iter().cloned());
+                        out.push(array);
+                    }
+                }
+                out
+            },
+            None => recurse(head),
+        }
+    }
+
+    fn recurse(op: &Operation) -> Vec<Vec<Vec<Query>>> {
+        match op {
+            And(ops) | Consecutive(ops) => {
+                ops.split_first().map_or_else(Vec::new, |(h, t)| and_recurse(h, t))
+            },
+            Or(_, ops) => if ops.iter().all(|op| op.query().is_some()) {
+                vec![vec![ops.iter().flat_map(|op| op.query()).cloned().collect()]]
+            } else {
+                ops.into_iter().map(recurse).flatten().collect()
+            },
+            Operation::Query(query) => vec![vec![vec![query.clone()]]],
+        }
+    }
+
+    recurse(query_tree)
+}
+
+#[cfg(test)]
+mod tests {
+    use big_s::S;
+
+    use crate::search::criteria::QueryKind;
+    use super::*;
+
+    #[test]
+    fn simple_flatten_query_tree() {
+        let query_tree = Operation::Or(false, vec![
+            Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("manythefish")) }),
+            Operation::And(vec![
+                Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("manythe")) }),
+                Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("fish")) }),
+            ]),
+            Operation::And(vec![
+                Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("many")) }),
+                Operation::Or(false, vec![
+                    Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("thefish")) }),
+                    Operation::And(vec![
+                        Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("the")) }),
+                        Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("fish")) }),
+                    ]),
+                ]),
+            ]),
+        ]);
+
+        let expected = vec![
+            vec![vec![Query { prefix: false, kind: QueryKind::exact(S("manythefish")) }]],
+            vec![
+                vec![Query { prefix: false, kind: QueryKind::exact(S("manythe")) }],
+                vec![Query { prefix: false, kind: QueryKind::exact(S("fish")) }],
+            ],
+            vec![
+                vec![Query { prefix: false, kind: QueryKind::exact(S("many")) }],
+                vec![Query { prefix: false, kind: QueryKind::exact(S("thefish")) }],
+            ],
+            vec![
+                vec![Query { prefix: false, kind: QueryKind::exact(S("many")) }],
+                vec![Query { prefix: false, kind: QueryKind::exact(S("the")) }],
+                vec![Query { prefix: false, kind: QueryKind::exact(S("fish")) }],
+            ],
+        ];
+
+        let result =
flatten_query_tree(&query_tree); + assert_eq!(expected, result); + } +} diff --git a/milli/src/search/criteria/fetcher.rs b/milli/src/search/criteria/fetcher.rs deleted file mode 100644 index fa204bdf2..000000000 --- a/milli/src/search/criteria/fetcher.rs +++ /dev/null @@ -1,135 +0,0 @@ -use std::collections::HashMap; -use std::mem::take; - -use log::debug; -use roaring::RoaringBitmap; - -use crate::search::query_tree::Operation; -use crate::search::WordDerivationsCache; -use super::{resolve_query_tree, Candidates, Criterion, CriterionResult, Context}; - -/// The result of a call to the fetcher. -#[derive(Debug, Clone, PartialEq)] -pub struct FetcherResult { - /// The query tree corresponding to the current bucket of the last criterion. - pub query_tree: Option, - /// The candidates of the current bucket of the last criterion. - pub candidates: RoaringBitmap, - /// Candidates that comes from the current bucket of the initial criterion. - pub bucket_candidates: RoaringBitmap, -} - -pub struct Fetcher<'t> { - ctx: &'t dyn Context, - query_tree: Option, - candidates: Candidates, - parent: Option>, - should_get_documents_ids: bool, - wdcache: WordDerivationsCache, -} - -impl<'t> Fetcher<'t> { - pub fn initial( - ctx: &'t dyn Context, - query_tree: Option, - candidates: Option, - ) -> Self - { - Fetcher { - ctx, - query_tree, - candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed), - parent: None, - should_get_documents_ids: true, - wdcache: WordDerivationsCache::new(), - } - } - - pub fn new( - ctx: &'t dyn Context, - parent: Box, - ) -> Self - { - Fetcher { - ctx, - query_tree: None, - candidates: Candidates::default(), - parent: Some(parent), - should_get_documents_ids: true, - wdcache: WordDerivationsCache::new(), - } - } - - #[logging_timer::time("Fetcher::{}")] - pub fn next(&mut self) -> anyhow::Result> { - use Candidates::{Allowed, Forbidden}; - loop { - debug!("Fetcher iteration (should_get_documents_ids: {}) ({:?})", - self.should_get_documents_ids, self.candidates, - ); - - let should_get_documents_ids = take(&mut self.should_get_documents_ids); - match &mut self.candidates { - Allowed(_) => { - let candidates = take(&mut self.candidates).into_inner(); - let candidates = match &self.query_tree { - Some(qt) if should_get_documents_ids => { - let mut docids = resolve_query_tree(self.ctx, &qt, &mut HashMap::new(), &mut self.wdcache)?; - docids.intersect_with(&candidates); - docids - }, - _ => candidates, - }; - - return Ok(Some(FetcherResult { - query_tree: self.query_tree.take(), - candidates: candidates.clone(), - bucket_candidates: candidates, - })); - }, - Forbidden(_) => { - match self.parent.as_mut() { - Some(parent) => { - match parent.next(&mut self.wdcache)? 
{ - Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { - let candidates = match (&query_tree, candidates) { - (_, Some(candidates)) => candidates, - (Some(qt), None) => resolve_query_tree(self.ctx, qt, &mut HashMap::new(), &mut self.wdcache)?, - (None, None) => RoaringBitmap::new(), - }; - - return Ok(Some(FetcherResult { query_tree, candidates, bucket_candidates })) - }, - None => if should_get_documents_ids { - let candidates = match &self.query_tree { - Some(qt) => resolve_query_tree(self.ctx, &qt, &mut HashMap::new(), &mut self.wdcache)?, - None => self.ctx.documents_ids()?, - }; - - return Ok(Some(FetcherResult { - query_tree: self.query_tree.clone(), - candidates: candidates.clone(), - bucket_candidates: candidates, - })); - }, - } - }, - None => if should_get_documents_ids { - let candidates = match &self.query_tree { - Some(qt) => resolve_query_tree(self.ctx, &qt, &mut HashMap::new(), &mut self.wdcache)?, - None => self.ctx.documents_ids()?, - }; - - return Ok(Some(FetcherResult { - query_tree: self.query_tree.clone(), - candidates: candidates.clone(), - bucket_candidates: candidates, - })); - }, - } - return Ok(None); - }, - } - } - } -} diff --git a/milli/src/search/criteria/final.rs b/milli/src/search/criteria/final.rs new file mode 100644 index 000000000..f8bc43204 --- /dev/null +++ b/milli/src/search/criteria/final.rs @@ -0,0 +1,53 @@ +use std::collections::HashMap; + +use log::debug; +use roaring::RoaringBitmap; + +use crate::search::query_tree::Operation; +use crate::search::WordDerivationsCache; +use super::{resolve_query_tree, Criterion, CriterionResult, Context}; + +/// The result of a call to the fetcher. +#[derive(Debug, Clone, PartialEq)] +pub struct FinalResult { + /// The query tree corresponding to the current bucket of the last criterion. + pub query_tree: Option, + /// The candidates of the current bucket of the last criterion. + pub candidates: RoaringBitmap, + /// Candidates that comes from the current bucket of the initial criterion. + pub bucket_candidates: RoaringBitmap, +} + +pub struct Final<'t> { + ctx: &'t dyn Context<'t>, + parent: Box, + wdcache: WordDerivationsCache, +} + +impl<'t> Final<'t> { + pub fn new(ctx: &'t dyn Context<'t>, parent: Box) -> Final<'t> { + Final { ctx, parent, wdcache: WordDerivationsCache::new() } + } + + #[logging_timer::time("Final::{}")] + pub fn next(&mut self) -> anyhow::Result> { + loop { + debug!("Final iteration"); + + match self.parent.next(&mut self.wdcache)? 
{ + Some(CriterionResult { query_tree, candidates, mut bucket_candidates }) => { + let candidates = match (&query_tree, candidates) { + (_, Some(candidates)) => candidates, + (Some(qt), None) => resolve_query_tree(self.ctx, qt, &mut HashMap::new(), &mut self.wdcache)?, + (None, None) => self.ctx.documents_ids()?, + }; + + bucket_candidates.union_with(&candidates); + + return Ok(Some(FinalResult { query_tree, candidates, bucket_candidates })); + }, + None => return Ok(None), + } + } + } +} diff --git a/milli/src/search/criteria/initial.rs b/milli/src/search/criteria/initial.rs new file mode 100644 index 000000000..d4b9e1379 --- /dev/null +++ b/milli/src/search/criteria/initial.rs @@ -0,0 +1,28 @@ +use roaring::RoaringBitmap; + +use crate::search::query_tree::Operation; +use crate::search::WordDerivationsCache; + +use super::{Criterion, CriterionResult}; + +pub struct Initial { + answer: Option +} + +impl Initial { + pub fn new(query_tree: Option, mut candidates: Option) -> Initial { + let answer = CriterionResult { + query_tree, + candidates: candidates.clone(), + bucket_candidates: candidates.take().unwrap_or_default(), + }; + Initial { answer: Some(answer) } + } +} + +impl Criterion for Initial { + #[logging_timer::time("Initial::{}")] + fn next(&mut self, _: &mut WordDerivationsCache) -> anyhow::Result> { + Ok(self.answer.take()) + } +} diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 22f081871..01af1ffbd 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -4,21 +4,25 @@ use std::borrow::Cow; use anyhow::bail; use roaring::RoaringBitmap; -use crate::search::{word_derivations, WordDerivationsCache}; +use crate::{TreeLevel, search::{word_derivations, WordDerivationsCache}}; use crate::{Index, DocumentId}; use super::query_tree::{Operation, Query, QueryKind}; +use self::asc_desc::AscDesc; +use self::attribute::Attribute; +use self::r#final::Final; +use self::initial::Initial; +use self::proximity::Proximity; use self::typo::Typo; use self::words::Words; -use self::asc_desc::AscDesc; -use self::proximity::Proximity; -use self::fetcher::Fetcher; +mod asc_desc; +mod attribute; +mod initial; +mod proximity; mod typo; mod words; -mod asc_desc; -mod proximity; -pub mod fetcher; +pub mod r#final; pub trait Criterion { fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result>; @@ -59,7 +63,8 @@ impl Default for Candidates { Self::Forbidden(RoaringBitmap::new()) } } -pub trait Context { + +pub trait Context<'c> { fn documents_ids(&self) -> heed::Result; fn word_docids(&self, word: &str) -> heed::Result>; fn word_prefix_docids(&self, word: &str) -> heed::Result>; @@ -68,6 +73,8 @@ pub trait Context { fn words_fst<'t>(&self) -> &'t fst::Set>; fn in_prefix_cache(&self, word: &str) -> bool; fn docid_words_positions(&self, docid: DocumentId) -> heed::Result>; + fn word_position_iterator(&self, word: &str, level: TreeLevel, in_prefix_cache: bool, left: Option, right: Option) -> heed::Result> + 'c>>; + fn word_position_last_level(&self, word: &str, in_prefix_cache: bool) -> heed::Result>; } pub struct CriteriaBuilder<'t> { rtxn: &'t heed::RoTxn<'t>, @@ -76,7 +83,7 @@ pub struct CriteriaBuilder<'t> { words_prefixes_fst: fst::Set>, } -impl<'a> Context for CriteriaBuilder<'a> { +impl<'c> Context<'c> for CriteriaBuilder<'c> { fn documents_ids(&self) -> heed::Result { self.index.documents_ids(self.rtxn) } @@ -115,6 +122,48 @@ impl<'a> Context for CriteriaBuilder<'a> { } Ok(words_positions) } + + fn word_position_iterator( 
+ &self, + word: &str, + level: TreeLevel, + in_prefix_cache: bool, + left: Option, + right: Option + ) -> heed::Result> + 'c>> + { + let range = { + let left = left.unwrap_or(u32::min_value()); + let right = right.unwrap_or(u32::max_value()); + let left = (word, level, left, left); + let right = (word, level, right, right); + left..=right + }; + let db = match in_prefix_cache { + true => self.index.word_prefix_level_position_docids, + false => self.index.word_level_position_docids, + }; + + Ok(Box::new(db.range(self.rtxn, &range)?)) + } + + fn word_position_last_level(&self, word: &str, in_prefix_cache: bool) -> heed::Result> { + let range = { + let left = (word, TreeLevel::min_value(), u32::min_value(), u32::min_value()); + let right = (word, TreeLevel::max_value(), u32::max_value(), u32::max_value()); + left..=right + }; + let db = match in_prefix_cache { + true => self.index.word_prefix_level_position_docids, + false => self.index.word_level_position_docids, + }; + let last_level = db + .remap_data_type::() + .range(self.rtxn, &range)?.last().transpose()? + .map(|((_, level, _, _), _)| level); + + Ok(last_level) + } } impl<'t> CriteriaBuilder<'t> { @@ -126,42 +175,26 @@ impl<'t> CriteriaBuilder<'t> { pub fn build( &'t self, - mut query_tree: Option, - mut facet_candidates: Option, - ) -> anyhow::Result> + query_tree: Option, + facet_candidates: Option, + ) -> anyhow::Result> { use crate::criterion::Criterion as Name; - let mut criterion = None as Option>; + let mut criterion = Box::new(Initial::new(query_tree, facet_candidates)) as Box; for name in self.index.criteria(&self.rtxn)? { - criterion = Some(match criterion.take() { - Some(father) => match name { - Name::Typo => Box::new(Typo::new(self, father)), - Name::Words => Box::new(Words::new(self, father)), - Name::Proximity => Box::new(Proximity::new(self, father)), - Name::Asc(field) => Box::new(AscDesc::asc(&self.index, &self.rtxn, father, field)?), - Name::Desc(field) => Box::new(AscDesc::desc(&self.index, &self.rtxn, father, field)?), - _otherwise => father, - }, - None => match name { - Name::Typo => Box::new(Typo::initial(self, query_tree.take(), facet_candidates.take())), - Name::Words => Box::new(Words::initial(self, query_tree.take(), facet_candidates.take())), - Name::Proximity => Box::new(Proximity::initial(self, query_tree.take(), facet_candidates.take())), - Name::Asc(field) => { - Box::new(AscDesc::initial_asc(&self.index, &self.rtxn, query_tree.take(), facet_candidates.take(), field)?) - }, - Name::Desc(field) => { - Box::new(AscDesc::initial_desc(&self.index, &self.rtxn, query_tree.take(), facet_candidates.take(), field)?) 
- }, - _otherwise => continue, - }, - }); + criterion = match name { + Name::Typo => Box::new(Typo::new(self, criterion)), + Name::Words => Box::new(Words::new(self, criterion)), + Name::Proximity => Box::new(Proximity::new(self, criterion)), + Name::Attribute => Box::new(Attribute::new(self, criterion)), + Name::Asc(field) => Box::new(AscDesc::asc(&self.index, &self.rtxn, criterion, field)?), + Name::Desc(field) => Box::new(AscDesc::desc(&self.index, &self.rtxn, criterion, field)?), + _otherwise => criterion, + }; } - match criterion { - Some(criterion) => Ok(Fetcher::new(self, criterion)), - None => Ok(Fetcher::initial(self, query_tree, facet_candidates)), - } + Ok(Final::new(self, criterion)) } } @@ -362,9 +395,10 @@ pub mod test { word_prefix_docids: HashMap, word_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>, word_prefix_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>, + docid_words: HashMap>, } - impl<'a> Context for TestContext<'a> { + impl<'c> Context<'c> for TestContext<'c> { fn documents_ids(&self) -> heed::Result { Ok(self.word_docids.iter().fold(RoaringBitmap::new(), |acc, (_, docids)| acc | docids)) } @@ -395,7 +429,24 @@ pub mod test { self.word_prefix_docids.contains_key(&word.to_string()) } - fn docid_words_positions(&self, _docid: DocumentId) -> heed::Result> { + fn docid_words_positions(&self, docid: DocumentId) -> heed::Result> { + if let Some(docid_words) = self.docid_words.get(&docid) { + Ok(docid_words + .iter() + .enumerate() + .map(|(i,w)| (w.clone(), RoaringBitmap::from_sorted_iter(std::iter::once(i as u32)))) + .collect() + ) + } else { + Ok(HashMap::new()) + } + } + + fn word_position_iterator(&self, _word: &str, _level: TreeLevel, _in_prefix_cache: bool, _left: Option, _right: Option) -> heed::Result> + 'c>> { + todo!() + } + + fn word_position_last_level(&self, _word: &str, _in_prefix_cache: bool) -> heed::Result> { todo!() } } @@ -431,50 +482,58 @@ pub mod test { s("morning") => random_postings(rng, 125), }; + let mut docid_words = HashMap::new(); + for (word, docids) in word_docids.iter() { + for docid in docids { + let words = docid_words.entry(docid).or_insert(vec![]); + words.push(word.clone()); + } + } + let word_prefix_docids = hashmap!{ s("h") => &word_docids[&s("hello")] | &word_docids[&s("hi")], s("wor") => &word_docids[&s("word")] | &word_docids[&s("world")], s("20") => &word_docids[&s("2020")] | &word_docids[&s("2021")], }; - let hello_world = &word_docids[&s("hello")] & &word_docids[&s("world")]; - let hello_world_split = (hello_world.len() / 2) as usize; - let hello_world_1 = hello_world.iter().take(hello_world_split).collect(); - let hello_world_2 = hello_world.iter().skip(hello_world_split).collect(); - - let hello_word = &word_docids[&s("hello")] & &word_docids[&s("word")]; - let hello_word_split = (hello_word.len() / 2) as usize; - let hello_word_4 = hello_word.iter().take(hello_word_split).collect(); - let hello_word_6 = hello_word.iter().skip(hello_word_split).take(hello_word_split/2).collect(); - let hello_word_7 = hello_word.iter().skip(hello_word_split + hello_word_split/2).collect(); - let word_pair_proximity_docids = hashmap!{ - (s("good"), s("morning"), 1) => &word_docids[&s("good")] & &word_docids[&s("morning")], - (s("hello"), s("world"), 1) => hello_world_1, - (s("hello"), s("world"), 4) => hello_world_2, - (s("this"), s("is"), 1) => &word_docids[&s("this")] & &word_docids[&s("is")], - (s("is"), s("2021"), 1) => &word_docids[&s("this")] & &word_docids[&s("is")] & &word_docids[&s("2021")], 
- (s("is"), s("2020"), 1) => &word_docids[&s("this")] & &word_docids[&s("is")] & (&word_docids[&s("2020")] - &word_docids[&s("2021")]), - (s("this"), s("2021"), 2) => &word_docids[&s("this")] & &word_docids[&s("is")] & &word_docids[&s("2021")], - (s("this"), s("2020"), 2) => &word_docids[&s("this")] & &word_docids[&s("is")] & (&word_docids[&s("2020")] - &word_docids[&s("2021")]), - (s("word"), s("split"), 1) => &word_docids[&s("word")] & &word_docids[&s("split")], - (s("world"), s("split"), 1) => (&word_docids[&s("world")] & &word_docids[&s("split")]) - &word_docids[&s("word")], - (s("hello"), s("word"), 4) => hello_word_4, - (s("hello"), s("word"), 6) => hello_word_6, - (s("hello"), s("word"), 7) => hello_word_7, - (s("split"), s("ngrams"), 3) => (&word_docids[&s("split")] & &word_docids[&s("ngrams")]) - &word_docids[&s("word")], - (s("split"), s("ngrams"), 5) => &word_docids[&s("split")] & &word_docids[&s("ngrams")] & &word_docids[&s("word")], - (s("this"), s("ngrams"), 1) => (&word_docids[&s("split")] & &word_docids[&s("this")] & &word_docids[&s("ngrams")] ) - &word_docids[&s("word")], - (s("this"), s("ngrams"), 2) => &word_docids[&s("split")] & &word_docids[&s("this")] & &word_docids[&s("ngrams")] & &word_docids[&s("word")], - }; - - let word_prefix_pair_proximity_docids = hashmap!{ - (s("hello"), s("wor"), 1) => word_pair_proximity_docids.get(&(s("hello"), s("world"), 1)).unwrap().clone(), - (s("hello"), s("wor"), 4) => word_pair_proximity_docids.get(&(s("hello"), s("world"), 4)).unwrap() | word_pair_proximity_docids.get(&(s("hello"), s("word"), 4)).unwrap(), - (s("hello"), s("wor"), 6) => word_pair_proximity_docids.get(&(s("hello"), s("word"), 6)).unwrap().clone(), - (s("hello"), s("wor"), 7) => word_pair_proximity_docids.get(&(s("hello"), s("word"), 7)).unwrap().clone(), - (s("is"), s("20"), 1) => word_pair_proximity_docids.get(&(s("is"), s("2020"), 1)).unwrap() | word_pair_proximity_docids.get(&(s("is"), s("2021"), 1)).unwrap(), - (s("this"), s("20"), 2) => word_pair_proximity_docids.get(&(s("this"), s("2020"), 2)).unwrap() | word_pair_proximity_docids.get(&(s("this"), s("2021"), 2)).unwrap(), - }; + let mut word_pair_proximity_docids = HashMap::new(); + let mut word_prefix_pair_proximity_docids = HashMap::new(); + for (lword, lcandidates) in &word_docids { + for (rword, rcandidates) in &word_docids { + if lword == rword { continue } + let candidates = lcandidates & rcandidates; + for candidate in candidates { + if let Some(docid_words) = docid_words.get(&candidate) { + let lposition = docid_words.iter().position(|w| w == lword).unwrap(); + let rposition = docid_words.iter().position(|w| w == rword).unwrap(); + let key = if lposition < rposition { + (s(lword), s(rword), (rposition - lposition) as i32) + } else { + (s(lword), s(rword), (lposition - rposition + 1) as i32) + }; + let docids = word_pair_proximity_docids.entry(key).or_insert(RoaringBitmap::new()); + docids.push(candidate); + } + } + } + for (pword, pcandidates) in &word_prefix_docids { + if lword.starts_with(pword) { continue } + let candidates = lcandidates & pcandidates; + for candidate in candidates { + if let Some(docid_words) = docid_words.get(&candidate) { + let lposition = docid_words.iter().position(|w| w == lword).unwrap(); + let rposition = docid_words.iter().position(|w| w.starts_with(pword)).unwrap(); + let key = if lposition < rposition { + (s(lword), s(pword), (rposition - lposition) as i32) + } else { + (s(lword), s(pword), (lposition - rposition + 1) as i32) + }; + let docids = 
word_prefix_pair_proximity_docids.entry(key).or_insert(RoaringBitmap::new()); + docids.push(candidate); + } + } + } + } let mut keys = word_docids.keys().collect::>(); keys.sort_unstable(); @@ -486,6 +545,7 @@ pub mod test { word_prefix_docids, word_pair_proximity_docids, word_prefix_pair_proximity_docids, + docid_words, } } } diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index decd4c338..4c73d7459 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -8,48 +8,29 @@ use log::debug; use crate::{DocumentId, Position, search::{query_tree::QueryKind}}; use crate::search::query_tree::{maximum_proximity, Operation, Query}; use crate::search::{build_dfa, WordDerivationsCache}; -use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids, resolve_query_tree}; +use super::{Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids, resolve_query_tree}; type Cache = HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>; pub struct Proximity<'t> { - ctx: &'t dyn Context, - query_tree: Option<(usize, Operation)>, + ctx: &'t dyn Context<'t>, + /// ((max_proximity, query_tree), allowed_candidates) + state: Option<(Option<(usize, Operation)>, RoaringBitmap)>, proximity: u8, - candidates: Candidates, bucket_candidates: RoaringBitmap, - parent: Option>, + parent: Box, candidates_cache: Cache, plane_sweep_cache: Option>, } impl<'t> Proximity<'t> { - pub fn initial( - ctx: &'t dyn Context, - query_tree: Option, - candidates: Option, - ) -> Self - { + pub fn new(ctx: &'t dyn Context<'t>, parent: Box) -> Self { Proximity { ctx, - query_tree: query_tree.map(|op| (maximum_proximity(&op), op)), + state: None, proximity: 0, - candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed), bucket_candidates: RoaringBitmap::new(), - parent: None, - candidates_cache: Cache::new(), - plane_sweep_cache: None, - } - } - - pub fn new(ctx: &'t dyn Context, parent: Box) -> Self { - Proximity { - ctx, - query_tree: None, - proximity: 0, - candidates: Candidates::default(), - bucket_candidates: RoaringBitmap::new(), - parent: Some(parent), + parent, candidates_cache: Cache::new(), plane_sweep_cache: None, } @@ -59,27 +40,20 @@ impl<'t> Proximity<'t> { impl<'t> Criterion for Proximity<'t> { #[logging_timer::time("Proximity::{}")] fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result> { - use Candidates::{Allowed, Forbidden}; loop { - debug!("Proximity at iteration {} (max {:?}) ({:?})", + debug!("Proximity at iteration {} (max prox {:?}) ({:?})", self.proximity, - self.query_tree.as_ref().map(|(mp, _)| mp), - self.candidates, + self.state.as_ref().map(|(qt, _)| qt.as_ref().map(|(mp, _)| mp)), + self.state.as_ref().map(|(_, cd)| cd), ); - match (&mut self.query_tree, &mut self.candidates) { - (_, Allowed(candidates)) if candidates.is_empty() => { - return Ok(Some(CriterionResult { - query_tree: self.query_tree.take().map(|(_, qt)| qt), - candidates: Some(take(&mut self.candidates).into_inner()), - bucket_candidates: take(&mut self.bucket_candidates), - })); + match &mut self.state { + Some((_, candidates)) if candidates.is_empty() => { + self.state = None; // reset state }, - (Some((max_prox, query_tree)), Allowed(candidates)) => { + Some((Some((max_prox, query_tree)), candidates)) => { if self.proximity as usize > *max_prox { - // reset state to (None, Forbidden(_)) - self.query_tree = None; - self.candidates = Candidates::default(); 
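+                        // A `None` state means this bucket is exhausted: the next
+                        // loop iteration will pull a fresh (query_tree, candidates)
+                        // pair from the parent criterion.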
+ self.state = None; // reset state } else { let mut new_candidates = if candidates.len() <= 1000 { if let Some(cache) = self.plane_sweep_cache.as_mut() { @@ -89,9 +63,7 @@ impl<'t> Criterion for Proximity<'t> { candidates }, None => { - // reset state to (None, Forbidden(_)) - self.query_tree = None; - self.candidates = Candidates::default(); + self.state = None; // reset state continue }, } @@ -120,79 +92,54 @@ impl<'t> Criterion for Proximity<'t> { candidates.difference_with(&new_candidates); self.proximity += 1; - let bucket_candidates = match self.parent { - Some(_) => take(&mut self.bucket_candidates), - None => new_candidates.clone(), - }; - return Ok(Some(CriterionResult { query_tree: Some(query_tree.clone()), candidates: Some(new_candidates), - bucket_candidates, + bucket_candidates: take(&mut self.bucket_candidates), })); } }, - (Some((max_prox, query_tree)), Forbidden(candidates)) => { - if self.proximity as usize > *max_prox { - self.query_tree = None; - self.candidates = Candidates::default(); - } else { - let mut new_candidates = resolve_candidates( - self.ctx, - &query_tree, - self.proximity, - &mut self.candidates_cache, - wdcache, - )?; - - new_candidates.difference_with(&candidates); - candidates.union_with(&new_candidates); - self.proximity += 1; - - let bucket_candidates = match self.parent { - Some(_) => take(&mut self.bucket_candidates), - None => new_candidates.clone(), - }; - - return Ok(Some(CriterionResult { - query_tree: Some(query_tree.clone()), - candidates: Some(new_candidates), - bucket_candidates, - })); - } - }, - (None, Allowed(_)) => { - let candidates = take(&mut self.candidates).into_inner(); + Some((None, candidates)) => { + let candidates = take(candidates); + self.state = None; // reset state return Ok(Some(CriterionResult { query_tree: None, candidates: Some(candidates.clone()), bucket_candidates: candidates, })); }, - (None, Forbidden(_)) => { - match self.parent.as_mut() { - Some(parent) => { - match parent.next(wdcache)? { - Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { - let candidates = match (&query_tree, candidates) { - (_, Some(candidates)) => candidates, - (Some(qt), None) => resolve_query_tree(self.ctx, qt, &mut HashMap::new(), wdcache)?, - (None, None) => RoaringBitmap::new(), - }; + None => { + match self.parent.next(wdcache)? { + Some(CriterionResult { query_tree: None, candidates: None, bucket_candidates }) => { + return Ok(Some(CriterionResult { + query_tree: None, + candidates: None, + bucket_candidates, + })); + }, + Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { + let candidates_is_some = candidates.is_some(); + let candidates = match (&query_tree, candidates) { + (_, Some(candidates)) => candidates, + (Some(qt), None) => resolve_query_tree(self.ctx, qt, &mut HashMap::new(), wdcache)?, + (None, None) => RoaringBitmap::new(), + }; - if bucket_candidates.is_empty() { - self.bucket_candidates.union_with(&candidates); - } else { - self.bucket_candidates.union_with(&bucket_candidates); - } - - self.query_tree = query_tree.map(|op| (maximum_proximity(&op), op)); - self.proximity = 0; - self.candidates = Candidates::Allowed(candidates); - self.plane_sweep_cache = None; - }, - None => return Ok(None), + // If our parent returns candidates it means that the bucket + // candidates were already computed before and we can use them. + // + // If not, we must use the just computed candidates as our bucket + // candidates. 
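+                            // (e.g. a criterion above us that already resolved this
+                            // bucket passes `candidates` down, so those ids were
+                            // credited to `bucket_candidates` there and must not be
+                            // counted twice here.)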
+ if candidates_is_some { + self.bucket_candidates.union_with(&bucket_candidates); + } else { + self.bucket_candidates.union_with(&candidates); } + + let query_tree = query_tree.map(|op| (maximum_proximity(&op), op)); + self.state = Some((query_tree, candidates)); + self.proximity = 0; + self.plane_sweep_cache = None; }, None => return Ok(None), } diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 3877f53ed..5a3c93ac8 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -9,41 +9,24 @@ use crate::search::{word_derivations, WordDerivationsCache}; use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids}; pub struct Typo<'t> { - ctx: &'t dyn Context, + ctx: &'t dyn Context<'t>, query_tree: Option<(usize, Operation)>, number_typos: u8, candidates: Candidates, bucket_candidates: RoaringBitmap, - parent: Option>, + parent: Box, candidates_cache: HashMap<(Operation, u8), RoaringBitmap>, } impl<'t> Typo<'t> { - pub fn initial( - ctx: &'t dyn Context, - query_tree: Option, - candidates: Option, - ) -> Self - { - Typo { - ctx, - query_tree: query_tree.map(|op| (maximum_typo(&op), op)), - number_typos: 0, - candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed), - bucket_candidates: RoaringBitmap::new(), - parent: None, - candidates_cache: HashMap::new(), - } - } - - pub fn new(ctx: &'t dyn Context, parent: Box) -> Self { + pub fn new(ctx: &'t dyn Context<'t>, parent: Box) -> Self { Typo { ctx, query_tree: None, number_typos: 0, candidates: Candidates::default(), bucket_candidates: RoaringBitmap::new(), - parent: Some(parent), + parent, candidates_cache: HashMap::new(), } } @@ -90,15 +73,10 @@ impl<'t> Criterion for Typo<'t> { candidates.difference_with(&new_candidates); self.number_typos += 1; - let bucket_candidates = match self.parent { - Some(_) => take(&mut self.bucket_candidates), - None => new_candidates.clone(), - }; - return Ok(Some(CriterionResult { query_tree: Some(new_query_tree), candidates: Some(new_candidates), - bucket_candidates, + bucket_candidates: take(&mut self.bucket_candidates), })); } }, @@ -145,17 +123,19 @@ impl<'t> Criterion for Typo<'t> { })); }, (None, Forbidden(_)) => { - match self.parent.as_mut() { - Some(parent) => { - match parent.next(wdcache)? { - Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { - self.query_tree = query_tree.map(|op| (maximum_typo(&op), op)); - self.number_typos = 0; - self.candidates = candidates.map_or_else(Candidates::default, Candidates::Allowed); - self.bucket_candidates.union_with(&bucket_candidates); - }, - None => return Ok(None), - } + match self.parent.next(wdcache)? 
{ + Some(CriterionResult { query_tree: None, candidates: None, bucket_candidates }) => { + return Ok(Some(CriterionResult { + query_tree: None, + candidates: None, + bucket_candidates, + })); + }, + Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { + self.query_tree = query_tree.map(|op| (maximum_typo(&op), op)); + self.number_typos = 0; + self.candidates = candidates.map_or_else(Candidates::default, Candidates::Allowed); + self.bucket_candidates.union_with(&bucket_candidates); }, None => return Ok(None), } @@ -334,8 +314,8 @@ fn resolve_candidates<'t>( #[cfg(test)] mod test { - use super::*; + use super::super::initial::Initial; use super::super::test::TestContext; #[test] @@ -345,8 +325,10 @@ mod test { let facet_candidates = None; let mut wdcache = WordDerivationsCache::new(); - let mut criteria = Typo::initial(&context, query_tree, facet_candidates); + let parent = Initial::new(query_tree, facet_candidates); + let mut criteria = Typo::new(&context, Box::new(parent)); + assert!(criteria.next(&mut wdcache).unwrap().unwrap().candidates.is_none()); assert!(criteria.next(&mut wdcache).unwrap().is_none()); } @@ -364,7 +346,8 @@ mod test { let facet_candidates = None; let mut wdcache = WordDerivationsCache::new(); - let mut criteria = Typo::initial(&context, Some(query_tree), facet_candidates); + let parent = Initial::new(Some(query_tree), facet_candidates); + let mut criteria = Typo::new(&context, Box::new(parent)); let candidates_1 = context.word_docids("split").unwrap().unwrap() & context.word_docids("this").unwrap().unwrap() @@ -413,7 +396,8 @@ mod test { let facet_candidates = context.word_docids("earth").unwrap().unwrap(); let mut wdcache = WordDerivationsCache::new(); - let mut criteria = Typo::initial(&context, query_tree, Some(facet_candidates.clone())); + let parent = Initial::new(query_tree, Some(facet_candidates.clone())); + let mut criteria = Typo::new(&context, Box::new(parent)); let expected = CriterionResult { query_tree: None, @@ -442,7 +426,8 @@ mod test { let facet_candidates = context.word_docids("earth").unwrap().unwrap(); let mut wdcache = WordDerivationsCache::new(); - let mut criteria = Typo::initial(&context, Some(query_tree), Some(facet_candidates.clone())); + let parent = Initial::new(Some(query_tree), Some(facet_candidates.clone())); + let mut criteria = Typo::new(&context, Box::new(parent)); let candidates_1 = context.word_docids("split").unwrap().unwrap() & context.word_docids("this").unwrap().unwrap() @@ -456,7 +441,7 @@ mod test { ]), ])), candidates: Some(&candidates_1 & &facet_candidates), - bucket_candidates: candidates_1 & &facet_candidates, + bucket_candidates: facet_candidates.clone(), }; assert_eq!(criteria.next(&mut wdcache).unwrap(), Some(expected_1)); @@ -478,7 +463,7 @@ mod test { ]), ])), candidates: Some(&candidates_2 & &facet_candidates), - bucket_candidates: candidates_2 & &facet_candidates, + bucket_candidates: RoaringBitmap::new(), }; assert_eq!(criteria.next(&mut wdcache).unwrap(), Some(expected_2)); diff --git a/milli/src/search/criteria/words.rs b/milli/src/search/criteria/words.rs index 0aa3b483a..047b3c5f0 100644 --- a/milli/src/search/criteria/words.rs +++ b/milli/src/search/criteria/words.rs @@ -8,38 +8,22 @@ use crate::search::query_tree::Operation; use super::{resolve_query_tree, Criterion, CriterionResult, Context, WordDerivationsCache}; pub struct Words<'t> { - ctx: &'t dyn Context, + ctx: &'t dyn Context<'t>, query_trees: Vec, candidates: Option, bucket_candidates: RoaringBitmap, - parent: Option>, + parent: 
Box, candidates_cache: HashMap<(Operation, u8), RoaringBitmap>, } impl<'t> Words<'t> { - pub fn initial( - ctx: &'t dyn Context, - query_tree: Option, - candidates: Option, - ) -> Self - { - Words { - ctx, - query_trees: query_tree.map(explode_query_tree).unwrap_or_default(), - candidates, - bucket_candidates: RoaringBitmap::new(), - parent: None, - candidates_cache: HashMap::default(), - } - } - - pub fn new(ctx: &'t dyn Context, parent: Box) -> Self { + pub fn new(ctx: &'t dyn Context<'t>, parent: Box) -> Self { Words { ctx, query_trees: Vec::default(), candidates: None, bucket_candidates: RoaringBitmap::new(), - parent: Some(parent), + parent, candidates_cache: HashMap::default(), } } @@ -65,27 +49,17 @@ impl<'t> Criterion for Words<'t> { found_candidates.intersect_with(&candidates); candidates.difference_with(&found_candidates); - let bucket_candidates = match self.parent { - Some(_) => take(&mut self.bucket_candidates), - None => found_candidates.clone(), - }; - return Ok(Some(CriterionResult { query_tree: Some(qt), candidates: Some(found_candidates), - bucket_candidates, + bucket_candidates: take(&mut self.bucket_candidates), })); }, (Some(qt), None) => { - let bucket_candidates = match self.parent { - Some(_) => take(&mut self.bucket_candidates), - None => RoaringBitmap::new(), - }; - return Ok(Some(CriterionResult { query_tree: Some(qt), candidates: None, - bucket_candidates, + bucket_candidates: take(&mut self.bucket_candidates), })); }, (None, Some(_)) => { @@ -97,16 +71,18 @@ impl<'t> Criterion for Words<'t> { })); }, (None, None) => { - match self.parent.as_mut() { - Some(parent) => { - match parent.next(wdcache)? { - Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { - self.query_trees = query_tree.map(explode_query_tree).unwrap_or_default(); - self.candidates = candidates; - self.bucket_candidates.union_with(&bucket_candidates); - }, - None => return Ok(None), - } + match self.parent.next(wdcache)? { + Some(CriterionResult { query_tree: None, candidates: None, bucket_candidates }) => { + return Ok(Some(CriterionResult { + query_tree: None, + candidates: None, + bucket_candidates, + })); + }, + Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { + self.query_trees = query_tree.map(explode_query_tree).unwrap_or_default(); + self.candidates = candidates; + self.bucket_candidates.union_with(&bucket_candidates); }, None => return Ok(None), } diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 174fff35c..4f0bde422 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -13,9 +13,8 @@ use once_cell::sync::Lazy; use roaring::bitmap::RoaringBitmap; use distinct::{Distinct, DocIter, FacetDistinct, MapDistinct, NoopDistinct}; - -use crate::search::criteria::fetcher::{Fetcher, FetcherResult}; -use crate::{DocumentId, Index}; +use crate::search::criteria::r#final::{Final, FinalResult}; +use crate::{Index, DocumentId}; pub use self::facet::{ FacetCondition, FacetDistribution, FacetIter, FacetNumberOperator, FacetStringOperator, @@ -162,14 +161,14 @@ impl<'a> Search<'a> { &self, mut distinct: impl for<'c> Distinct<'c>, matching_words: MatchingWords, - mut criteria: Fetcher, + mut criteria: Final, ) -> anyhow::Result { let mut offset = self.offset; let mut initial_candidates = RoaringBitmap::new(); let mut excluded_documents = RoaringBitmap::new(); let mut documents_ids = Vec::with_capacity(self.limit); - while let Some(FetcherResult { candidates, bucket_candidates, .. }) = criteria.next()? 
{ + while let Some(FinalResult { candidates, bucket_candidates, .. }) = criteria.next()? { debug!("Number of candidates found {}", candidates.len()); let excluded = take(&mut excluded_documents); diff --git a/milli/src/tree_level.rs b/milli/src/tree_level.rs new file mode 100644 index 000000000..b69316cf6 --- /dev/null +++ b/milli/src/tree_level.rs @@ -0,0 +1,51 @@ +use std::convert::TryFrom; +use std::fmt; + +/// This is just before the lowest printable character (space, sp, 32) +const MAX_VALUE: u8 = 31; + +#[derive(Debug, Copy, Clone)] +pub enum Error { + LevelTooHigh(u8), +} + +#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[repr(transparent)] +pub struct TreeLevel(u8); + +impl TreeLevel { + pub const fn max_value() -> TreeLevel { + TreeLevel(MAX_VALUE) + } + + pub const fn min_value() -> TreeLevel { + TreeLevel(0) + } + + pub fn saturating_sub(&self, lhs: u8) -> TreeLevel { + TreeLevel(self.0.saturating_sub(lhs)) + } +} + +impl Into for TreeLevel { + fn into(self) -> u8 { + self.0 + } +} + +impl TryFrom for TreeLevel { + type Error = Error; + + fn try_from(value: u8) -> Result { + match value { + 0..=MAX_VALUE => Ok(TreeLevel(value)), + _ => Err(Error::LevelTooHigh(value)), + } + } +} + +impl fmt::Display for TreeLevel { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", self.0) + } +} diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index 2c24d9c07..f89c2d00c 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -28,6 +28,8 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { docid_word_positions, word_pair_proximity_docids, word_prefix_pair_proximity_docids, + word_level_position_docids, + word_prefix_level_position_docids, facet_field_id_value_docids, field_id_docid_facet_values, documents, @@ -55,6 +57,8 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { docid_word_positions.clear(self.wtxn)?; word_pair_proximity_docids.clear(self.wtxn)?; word_prefix_pair_proximity_docids.clear(self.wtxn)?; + word_level_position_docids.clear(self.wtxn)?; + word_prefix_level_position_docids.clear(self.wtxn)?; facet_field_id_value_docids.clear(self.wtxn)?; field_id_docid_facet_values.clear(self.wtxn)?; documents.clear(self.wtxn)?; diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 8a2ba9bbf..4c5f8d61a 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -88,6 +88,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { docid_word_positions, word_pair_proximity_docids, word_prefix_pair_proximity_docids, + word_level_position_docids, + word_prefix_level_position_docids, facet_field_id_value_docids, field_id_docid_facet_values, documents, @@ -329,6 +331,36 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { drop(iter); + // We delete the documents ids that are under the word level position docids. + let mut iter = word_level_position_docids.iter_mut(self.wtxn)?.remap_key_type::(); + while let Some(result) = iter.next() { + let (bytes, mut docids) = result?; + let previous_len = docids.len(); + docids.difference_with(&self.documents_ids); + if docids.is_empty() { + iter.del_current()?; + } else if docids.len() != previous_len { + iter.put_current(bytes, &docids)?; + } + } + + drop(iter); + + // We delete the documents ids that are under the word prefix level position docids. 
+ let mut iter = word_prefix_level_position_docids.iter_mut(self.wtxn)?.remap_key_type::(); + while let Some(result) = iter.next() { + let (bytes, mut docids) = result?; + let previous_len = docids.len(); + docids.difference_with(&self.documents_ids); + if docids.is_empty() { + iter.del_current()?; + } else if docids.len() != previous_len { + iter.put_current(bytes, &docids)?; + } + } + + drop(iter); + Ok(self.documents_ids.len()) } } diff --git a/milli/src/update/index_documents/merge_function.rs b/milli/src/update/index_documents/merge_function.rs index 6f24fcad9..a6d008513 100644 --- a/milli/src/update/index_documents/merge_function.rs +++ b/milli/src/update/index_documents/merge_function.rs @@ -52,6 +52,14 @@ pub fn words_pairs_proximities_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) - cbo_roaring_bitmap_merge(values) } +pub fn word_prefix_level_positions_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result> { + cbo_roaring_bitmap_merge(values) +} + +pub fn word_level_position_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result> { + cbo_roaring_bitmap_merge(values) +} + pub fn facet_field_value_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result> { cbo_roaring_bitmap_merge(values) } diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 52949c13c..8ebdf1634 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -2,7 +2,8 @@ use std::borrow::Cow; use std::collections::HashSet; use std::fs::File; use std::io::{self, Seek, SeekFrom}; -use std::num::NonZeroUsize; +use std::num::{NonZeroU32, NonZeroUsize}; +use std::str; use std::sync::mpsc::sync_channel; use std::time::Instant; @@ -13,17 +14,21 @@ use grenad::{MergerIter, Writer, Sorter, Merger, Reader, FileFuse, CompressionTy use heed::types::ByteSlice; use log::{debug, info, error}; use memmap::Mmap; -use rayon::ThreadPool; use rayon::prelude::*; +use rayon::ThreadPool; use serde::{Serialize, Deserialize}; use crate::index::Index; -use crate::update::{Facets, WordsPrefixes, UpdateIndexingStep}; +use crate::update::{ + Facets, WordsLevelPositions, WordPrefixDocids, WordsPrefixesFst, UpdateIndexingStep, + WordPrefixPairProximityDocids, +}; use self::store::{Store, Readers}; pub use self::merge_function::{ main_merge, word_docids_merge, words_pairs_proximities_docids_merge, - docid_word_positions_merge, documents_merge, facet_field_value_docids_merge, - field_id_docid_facet_values_merge, + docid_word_positions_merge, documents_merge, + word_level_position_docids_merge, word_prefix_level_positions_docids_merge, + facet_field_value_docids_merge, field_id_docid_facet_values_merge, }; pub use self::transform::{Transform, TransformOutput}; @@ -262,6 +267,8 @@ pub struct IndexDocuments<'t, 'u, 'i, 'a> { facet_min_level_size: Option, words_prefix_threshold: Option, max_prefix_length: Option, + words_positions_level_group_size: Option, + words_positions_min_level_size: Option, update_method: IndexDocumentsMethod, update_format: UpdateFormat, autogenerate_docids: bool, @@ -289,6 +296,8 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { facet_min_level_size: None, words_prefix_threshold: None, max_prefix_length: None, + words_positions_level_group_size: None, + words_positions_min_level_size: None, update_method: IndexDocumentsMethod::ReplaceDocuments, update_format: UpdateFormat::Json, autogenerate_docids: true, @@ -402,6 +411,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { enum DatabaseType { 
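+    // Each variant routes one set of merged grenad chunks to its LMDB database.
+    // WordLevel0PositionDocids only ever receives level 0 entries: the upper
+    // levels are rebuilt afterwards by the WordsLevelPositions update.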
Main, WordDocids, + WordLevel0PositionDocids, FacetLevel0ValuesDocids, } @@ -467,6 +477,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { let mut word_docids_readers = Vec::with_capacity(readers.len()); let mut docid_word_positions_readers = Vec::with_capacity(readers.len()); let mut words_pairs_proximities_docids_readers = Vec::with_capacity(readers.len()); + let mut word_level_position_docids_readers = Vec::with_capacity(readers.len()); let mut facet_field_value_docids_readers = Vec::with_capacity(readers.len()); let mut field_id_docid_facet_values_readers = Vec::with_capacity(readers.len()); let mut documents_readers = Vec::with_capacity(readers.len()); @@ -476,6 +487,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { word_docids, docid_word_positions, words_pairs_proximities_docids, + word_level_position_docids, facet_field_value_docids, field_id_docid_facet_values, documents @@ -484,6 +496,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { word_docids_readers.push(word_docids); docid_word_positions_readers.push(docid_word_positions); words_pairs_proximities_docids_readers.push(words_pairs_proximities_docids); + word_level_position_docids_readers.push(word_level_position_docids); facet_field_value_docids_readers.push(facet_field_value_docids); field_id_docid_facet_values_readers.push(field_id_docid_facet_values); documents_readers.push(documents); @@ -514,6 +527,11 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { facet_field_value_docids_readers, facet_field_value_docids_merge, ), + ( + DatabaseType::WordLevel0PositionDocids, + word_level_position_docids_readers, + word_level_position_docids_merge, + ), ] .into_par_iter() .for_each(|(dbtype, readers, merge)| { @@ -569,7 +587,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { self.index.put_documents_ids(self.wtxn, &documents_ids)?; let mut database_count = 0; - let total_databases = 7; + let total_databases = 8; progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { databases_seen: 0, @@ -661,7 +679,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { )?; }, DatabaseType::FacetLevel0ValuesDocids => { - debug!("Writing the facet values docids into LMDB on disk..."); + debug!("Writing the facet level 0 values docids into LMDB on disk..."); let db = *self.index.facet_field_id_value_docids.as_polymorph(); write_into_lmdb_database( self.wtxn, @@ -671,6 +689,17 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { write_method, )?; }, + DatabaseType::WordLevel0PositionDocids => { + debug!("Writing the word level 0 positions docids into LMDB on disk..."); + let db = *self.index.word_level_position_docids.as_polymorph(); + write_into_lmdb_database( + self.wtxn, + db, + content, + word_level_position_docids_merge, + write_method, + )?; + } } database_count += 1; @@ -694,10 +723,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { builder.execute()?; // Run the words prefixes update operation. - let mut builder = WordsPrefixes::new(self.wtxn, self.index, self.update_id); - builder.chunk_compression_type = self.chunk_compression_type; - builder.chunk_compression_level = self.chunk_compression_level; - builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; + let mut builder = WordsPrefixesFst::new(self.wtxn, self.index, self.update_id); if let Some(value) = self.words_prefix_threshold { builder.threshold(value); } @@ -706,6 +732,37 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { } builder.execute()?; + // Run the word prefix docids update operation. 
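+        // The order of these follow-up operations matters: the prefix FST written
+        // just above feeds this step and the two below it, and the words level
+        // positions step reads the word prefix docids produced here.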
+ let mut builder = WordPrefixDocids::new(self.wtxn, self.index); + builder.chunk_compression_type = self.chunk_compression_type; + builder.chunk_compression_level = self.chunk_compression_level; + builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; + builder.max_nb_chunks = self.max_nb_chunks; + builder.max_memory = self.max_memory; + builder.execute()?; + + // Run the word prefix pair proximity docids update operation. + let mut builder = WordPrefixPairProximityDocids::new(self.wtxn, self.index); + builder.chunk_compression_type = self.chunk_compression_type; + builder.chunk_compression_level = self.chunk_compression_level; + builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; + builder.max_nb_chunks = self.max_nb_chunks; + builder.max_memory = self.max_memory; + builder.execute()?; + + // Run the words level positions update operation. + let mut builder = WordsLevelPositions::new(self.wtxn, self.index); + builder.chunk_compression_type = self.chunk_compression_type; + builder.chunk_compression_level = self.chunk_compression_level; + builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; + if let Some(value) = self.words_positions_level_group_size { + builder.level_group_size(value); + } + if let Some(value) = self.words_positions_min_level_size { + builder.min_level_size(value); + } + builder.execute()?; + debug_assert_eq!(database_count, total_databases); info!("Transform output indexed in {:.02?}", before_indexing.elapsed()); diff --git a/milli/src/update/index_documents/store.rs b/milli/src/update/index_documents/store.rs index 0bd83b692..0f97476d9 100644 --- a/milli/src/update/index_documents/store.rs +++ b/milli/src/update/index_documents/store.rs @@ -29,7 +29,8 @@ use crate::{json_to_string, SmallVec8, SmallVec32, Position, DocumentId, FieldId use super::{MergeFn, create_writer, create_sorter, writer_into_reader}; use super::merge_function::{ main_merge, word_docids_merge, words_pairs_proximities_docids_merge, - facet_field_value_docids_merge, field_id_docid_facet_values_merge, + word_level_position_docids_merge, facet_field_value_docids_merge, + field_id_docid_facet_values_merge, }; const LMDB_MAX_KEY_LENGTH: usize = 511; @@ -43,6 +44,7 @@ pub struct Readers { pub word_docids: Reader, pub docid_word_positions: Reader, pub words_pairs_proximities_docids: Reader, + pub word_level_position_docids: Reader, pub facet_field_value_docids: Reader, pub field_id_docid_facet_values: Reader, pub documents: Reader, @@ -69,6 +71,7 @@ pub struct Store<'s, A> { main_sorter: Sorter, word_docids_sorter: Sorter, words_pairs_proximities_docids_sorter: Sorter, + word_level_position_docids_sorter: Sorter, facet_field_value_docids_sorter: Sorter, field_id_docid_facet_values_sorter: Sorter, // MTBL writers @@ -94,7 +97,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { ) -> anyhow::Result { // We divide the max memory by the number of sorter the Store have. 
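+    // (the divisor below grows from 4 to 5 to account for the new
+    // word_level_position_docids_sorter)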
- let max_memory = max_memory.map(|mm| cmp::max(ONE_KILOBYTE, mm / 4)); + let max_memory = max_memory.map(|mm| cmp::max(ONE_KILOBYTE, mm / 5)); let linked_hash_map_size = linked_hash_map_size.unwrap_or(500); let main_sorter = create_sorter( @@ -121,6 +124,14 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { max_nb_chunks, max_memory, ); + let word_level_position_docids_sorter = create_sorter( + word_level_position_docids_merge, + chunk_compression_type, + chunk_compression_level, + chunk_fusing_shrink_size, + max_nb_chunks, + max_memory, + ); let facet_field_value_docids_sorter = create_sorter( facet_field_value_docids_merge, chunk_compression_type, @@ -172,6 +183,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { main_sorter, word_docids_sorter, words_pairs_proximities_docids_sorter, + word_level_position_docids_sorter, facet_field_value_docids_sorter, field_id_docid_facet_values_sorter, // MTBL writers @@ -290,6 +302,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { self.documents_writer.insert(document_id.to_be_bytes(), record)?; Self::write_docid_word_positions(&mut self.docid_word_positions_writer, document_id, words_positions)?; + Self::write_word_position_docids(&mut self.word_level_position_docids_sorter, document_id, words_positions)?; words_positions.clear(); @@ -360,6 +373,42 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { Ok(()) } + fn write_word_position_docids( + writer: &mut Sorter, + document_id: DocumentId, + words_positions: &HashMap>, + ) -> anyhow::Result<()> + { + let mut key_buffer = Vec::new(); + let mut data_buffer = Vec::new(); + + for (word, positions) in words_positions { + key_buffer.clear(); + key_buffer.extend_from_slice(word.as_bytes()); + key_buffer.push(0); // level 0 + + for position in positions { + key_buffer.truncate(word.len() + 1); + let position_bytes = position.to_be_bytes(); + key_buffer.extend_from_slice(position_bytes.as_bytes()); + key_buffer.extend_from_slice(position_bytes.as_bytes()); + + data_buffer.clear(); + let positions = RoaringBitmap::from_iter(Some(document_id)); + // We serialize the positions into a buffer. 
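+                    // The bitmap holds this single document id; the key built above
+                    // is `word \0 pos pos`, i.e. level 0 as one zero byte then the
+                    // position twice as big-endian u32 left/right bounds. For example
+                    // "help" at position 3 encodes as
+                    // b"help\x00\x00\x00\x00\x03\x00\x00\x00\x03".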
+ CboRoaringBitmapCodec::serialize_into(&positions, &mut data_buffer) + .with_context(|| "could not serialize positions")?; + + // that we write under the generated key into MTBL + if lmdb_key_valid_size(&key_buffer) { + writer.insert(&key_buffer, &data_buffer)?; + } + } + } + + Ok(()) + } + fn write_facet_field_value_docids( sorter: &mut Sorter, iter: I, @@ -561,6 +610,9 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { let mut words_pairs_proximities_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; self.words_pairs_proximities_docids_sorter.write_into(&mut words_pairs_proximities_docids_wtr)?; + let mut word_level_position_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; + self.word_level_position_docids_sorter.write_into(&mut word_level_position_docids_wtr)?; + let mut facet_field_value_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; self.facet_field_value_docids_sorter.write_into(&mut facet_field_value_docids_wtr)?; @@ -570,6 +622,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { let main = writer_into_reader(main_wtr, shrink_size)?; let word_docids = writer_into_reader(word_docids_wtr, shrink_size)?; let words_pairs_proximities_docids = writer_into_reader(words_pairs_proximities_docids_wtr, shrink_size)?; + let word_level_position_docids = writer_into_reader(word_level_position_docids_wtr, shrink_size)?; let facet_field_value_docids = writer_into_reader(facet_field_value_docids_wtr, shrink_size)?; let field_id_docid_facet_values = writer_into_reader(field_id_docid_facet_values_wtr, shrink_size)?; let docid_word_positions = writer_into_reader(self.docid_word_positions_writer, shrink_size)?; @@ -580,6 +633,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { word_docids, docid_word_positions, words_pairs_proximities_docids, + word_level_position_docids, facet_field_value_docids, field_id_docid_facet_values, documents, diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs index c2df94468..203937e2f 100644 --- a/milli/src/update/mod.rs +++ b/milli/src/update/mod.rs @@ -6,7 +6,10 @@ pub use self::index_documents::{DocumentAdditionResult, IndexDocuments, IndexDoc pub use self::settings::{Setting, Settings}; pub use self::update_builder::UpdateBuilder; pub use self::update_step::UpdateIndexingStep; -pub use self::words_prefixes::WordsPrefixes; +pub use self::word_prefix_docids::WordPrefixDocids; +pub use self::word_prefix_pair_proximity_docids::WordPrefixPairProximityDocids; +pub use self::words_level_positions::WordsLevelPositions; +pub use self::words_prefixes_fst::WordsPrefixesFst; mod available_documents_ids; mod clear_documents; @@ -16,5 +19,7 @@ mod index_documents; mod settings; mod update_builder; mod update_step; -mod words_prefixes; - +mod word_prefix_docids; +mod word_prefix_pair_proximity_docids; +mod words_level_positions; +mod words_prefixes_fst; diff --git a/milli/src/update/update_builder.rs b/milli/src/update/update_builder.rs index c966f72d2..8d6eb034d 100644 --- a/milli/src/update/update_builder.rs +++ b/milli/src/update/update_builder.rs @@ -2,7 +2,7 @@ use grenad::CompressionType; use rayon::ThreadPool; use crate::Index; -use super::{ClearDocuments, DeleteDocuments, IndexDocuments, Settings, Facets, WordsPrefixes}; +use super::{ClearDocuments, DeleteDocuments, IndexDocuments, Settings, Facets}; pub struct UpdateBuilder<'a> { pub(crate) log_every_n: Option, @@ -135,19 +135,4 @@ impl<'a> UpdateBuilder<'a> { builder } - - pub fn words_prefixes<'t, 'u, 'i>( - self, - wtxn: &'t mut 
heed::RwTxn<'i, 'u>, - index: &'i Index, - ) -> WordsPrefixes<'t, 'u, 'i> - { - let mut builder = WordsPrefixes::new(wtxn, index, self.update_id); - - builder.chunk_compression_type = self.chunk_compression_type; - builder.chunk_compression_level = self.chunk_compression_level; - builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; - - builder - } } diff --git a/milli/src/update/word_prefix_docids.rs b/milli/src/update/word_prefix_docids.rs new file mode 100644 index 000000000..58c984212 --- /dev/null +++ b/milli/src/update/word_prefix_docids.rs @@ -0,0 +1,75 @@ +use std::str; + +use crate::Index; +use fst::Streamer; +use grenad::CompressionType; +use heed::types::ByteSlice; + +use crate::update::index_documents::WriteMethod; +use crate::update::index_documents::{create_sorter, word_docids_merge, sorter_into_lmdb_database}; + +pub struct WordPrefixDocids<'t, 'u, 'i> { + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + pub(crate) chunk_compression_type: CompressionType, + pub(crate) chunk_compression_level: Option, + pub(crate) chunk_fusing_shrink_size: Option, + pub(crate) max_nb_chunks: Option, + pub(crate) max_memory: Option, +} + +impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { + pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> WordPrefixDocids<'t, 'u, 'i> { + WordPrefixDocids { + wtxn, + index, + chunk_compression_type: CompressionType::None, + chunk_compression_level: None, + chunk_fusing_shrink_size: None, + max_nb_chunks: None, + max_memory: None, + } + } + + pub fn execute(self) -> anyhow::Result<()> { + // Clear the word prefix docids database. + self.index.word_prefix_docids.clear(self.wtxn)?; + + let prefix_fst = self.index.words_prefixes_fst(self.wtxn)?; + + // It is forbidden to keep a mutable reference into the database + // and write into it at the same time, therefore we write into another file. + let mut prefix_docids_sorter = create_sorter( + word_docids_merge, + self.chunk_compression_type, + self.chunk_compression_level, + self.chunk_fusing_shrink_size, + self.max_nb_chunks, + self.max_memory, + ); + + // We iterate over all the prefixes and retrieve the corresponding docids. + let mut prefix_stream = prefix_fst.stream(); + while let Some(bytes) = prefix_stream.next() { + let prefix = str::from_utf8(bytes)?; + let db = self.index.word_docids.remap_data_type::(); + for result in db.prefix_iter(self.wtxn, prefix)? { + let (_word, data) = result?; + prefix_docids_sorter.insert(prefix, data)?; + } + } + + drop(prefix_fst); + + // We finally write the word prefix docids into the LMDB database. 
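+        // WriteMethod::Append is safe here: the database was cleared at the top
+        // of `execute` and the sorter iterates its keys in sorted order.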
+ sorter_into_lmdb_database( + self.wtxn, + *self.index.word_prefix_docids.as_polymorph(), + prefix_docids_sorter, + word_docids_merge, + WriteMethod::Append, + )?; + + Ok(()) + } +} diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs new file mode 100644 index 000000000..c972efc4f --- /dev/null +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -0,0 +1,89 @@ +use std::str; + +use fst::automaton::{Automaton, Str}; +use fst::{Streamer, IntoStreamer}; +use grenad::CompressionType; +use heed::BytesEncode; +use heed::types::ByteSlice; +use log::debug; + +use crate::Index; +use crate::heed_codec::StrStrU8Codec; +use crate::update::index_documents::{ + WriteMethod, create_sorter, sorter_into_lmdb_database, + words_pairs_proximities_docids_merge, +}; + +pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> { + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + pub(crate) chunk_compression_type: CompressionType, + pub(crate) chunk_compression_level: Option, + pub(crate) chunk_fusing_shrink_size: Option, + pub(crate) max_nb_chunks: Option, + pub(crate) max_memory: Option, +} + +impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { + pub fn new( + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + ) -> WordPrefixPairProximityDocids<'t, 'u, 'i> + { + WordPrefixPairProximityDocids { + wtxn, + index, + chunk_compression_type: CompressionType::None, + chunk_compression_level: None, + chunk_fusing_shrink_size: None, + max_nb_chunks: None, + max_memory: None, + } + } + + pub fn execute(self) -> anyhow::Result<()> { + debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); + + self.index.word_prefix_pair_proximity_docids.clear(self.wtxn)?; + + let prefix_fst = self.index.words_prefixes_fst(self.wtxn)?; + + // Here we create a sorter akin to the previous one. + let mut word_prefix_pair_proximity_docids_sorter = create_sorter( + words_pairs_proximities_docids_merge, + self.chunk_compression_type, + self.chunk_compression_level, + self.chunk_fusing_shrink_size, + self.max_nb_chunks, + self.max_memory, + ); + + // We insert all the word pairs corresponding to the word-prefix pairs + // where the prefixes appears in the prefix FST previously constructed. + let db = self.index.word_pair_proximity_docids.remap_data_type::(); + for result in db.iter(self.wtxn)? { + let ((word1, word2, prox), data) = result?; + let automaton = Str::new(word2).starts_with(); + let mut matching_prefixes = prefix_fst.search(automaton).into_stream(); + while let Some(prefix) = matching_prefixes.next() { + let prefix = str::from_utf8(prefix)?; + let pair = (word1, prefix, prox); + let bytes = StrStrU8Codec::bytes_encode(&pair).unwrap(); + word_prefix_pair_proximity_docids_sorter.insert(bytes, data)?; + } + } + + drop(prefix_fst); + + // We finally write the word prefix pair proximity docids into the LMDB database. 
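+        // Several (word1, word2, prox) entries can collapse onto the same
+        // (word1, prefix, prox) key, e.g. ("hello", "world", 1) and
+        // ("hello", "word", 1) both map to ("hello", "wor", 1); the sorter's
+        // merge function unions their docids before this final write.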
+        sorter_into_lmdb_database(
+            self.wtxn,
+            *self.index.word_prefix_pair_proximity_docids.as_polymorph(),
+            word_prefix_pair_proximity_docids_sorter,
+            words_pairs_proximities_docids_merge,
+            WriteMethod::Append,
+        )?;
+
+        Ok(())
+    }
+}
diff --git a/milli/src/update/words_level_positions.rs b/milli/src/update/words_level_positions.rs
new file mode 100644
index 000000000..1b772c37d
--- /dev/null
+++ b/milli/src/update/words_level_positions.rs
@@ -0,0 +1,261 @@
+use std::{cmp, str};
+use std::convert::TryFrom;
+use std::fs::File;
+use std::num::NonZeroU32;
+
+use fst::automaton::{self, Automaton};
+use fst::{Streamer, IntoStreamer};
+use grenad::{CompressionType, Reader, Writer, FileFuse};
+use heed::types::{ByteSlice, DecodeIgnore, Str};
+use heed::{BytesEncode, Error};
+use log::debug;
+use roaring::RoaringBitmap;
+
+use crate::heed_codec::{StrLevelPositionCodec, CboRoaringBitmapCodec};
+use crate::update::index_documents::WriteMethod;
+use crate::update::index_documents::{
+    create_writer, create_sorter, writer_into_reader, write_into_lmdb_database,
+    word_prefix_level_positions_docids_merge, sorter_into_lmdb_database
+};
+use crate::{Index, TreeLevel};
+
+pub struct WordsLevelPositions<'t, 'u, 'i> {
+    wtxn: &'t mut heed::RwTxn<'i, 'u>,
+    index: &'i Index,
+    pub(crate) chunk_compression_type: CompressionType,
+    pub(crate) chunk_compression_level: Option<u32>,
+    pub(crate) chunk_fusing_shrink_size: Option<u64>,
+    pub(crate) max_nb_chunks: Option<usize>,
+    pub(crate) max_memory: Option<usize>,
+    level_group_size: NonZeroU32,
+    min_level_size: NonZeroU32,
+}
+
+impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> {
+    pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> WordsLevelPositions<'t, 'u, 'i> {
+        WordsLevelPositions {
+            wtxn,
+            index,
+            chunk_compression_type: CompressionType::None,
+            chunk_compression_level: None,
+            chunk_fusing_shrink_size: None,
+            max_nb_chunks: None,
+            max_memory: None,
+            level_group_size: NonZeroU32::new(4).unwrap(),
+            min_level_size: NonZeroU32::new(5).unwrap(),
+        }
+    }
+
+    pub fn level_group_size(&mut self, value: NonZeroU32) -> &mut Self {
+        self.level_group_size = NonZeroU32::new(cmp::max(value.get(), 2)).unwrap();
+        self
+    }
+
+    pub fn min_level_size(&mut self, value: NonZeroU32) -> &mut Self {
+        self.min_level_size = value;
+        self
+    }
+
+    pub fn execute(self) -> anyhow::Result<()> {
+        debug!("Computing and writing the word levels positions docids into LMDB on disk...");
+
+        let entries = compute_positions_levels(
+            self.wtxn,
+            self.index.word_docids.remap_data_type::<DecodeIgnore>(),
+            self.index.word_level_position_docids,
+            self.chunk_compression_type,
+            self.chunk_compression_level,
+            self.chunk_fusing_shrink_size,
+            self.level_group_size,
+            self.min_level_size,
+        )?;
+
+        // The previously computed entries also define the level 0 entries
+        // so we can clear the database and append all of these entries.
+        self.index.word_level_position_docids.clear(self.wtxn)?;
+
+        write_into_lmdb_database(
+            self.wtxn,
+            *self.index.word_level_position_docids.as_polymorph(),
+            entries,
+            |_, _| anyhow::bail!("invalid word level position merging"),
+            WriteMethod::Append,
+        )?;
+
+        // We compute the word prefix level positions database.
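+        // This is done in two phases: the level 0 prefix entries are first derived
+        // from the word level positions through the prefix FST, then the upper
+        // levels are built on top of them by compute_positions_levels.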
+        self.index.word_prefix_level_position_docids.clear(self.wtxn)?;
+
+        let mut word_prefix_level_positions_docids_sorter = create_sorter(
+            word_prefix_level_positions_docids_merge,
+            self.chunk_compression_type,
+            self.chunk_compression_level,
+            self.chunk_fusing_shrink_size,
+            self.max_nb_chunks,
+            self.max_memory,
+        );
+
+        // We insert the word prefix level positions where the level is equal to 0:
+        // the level 0 entries of every word that starts with one of the prefixes
+        // in the prefix FST previously constructed.
+        let prefix_fst = self.index.words_prefixes_fst(self.wtxn)?;
+        let db = self.index.word_level_position_docids.remap_data_type::<ByteSlice>();
+        for result in db.iter(self.wtxn)? {
+            let ((word, level, left, right), data) = result?;
+            if level == TreeLevel::min_value() {
+                let automaton = automaton::Str::new(word).starts_with();
+                let mut matching_prefixes = prefix_fst.search(automaton).into_stream();
+                while let Some(prefix) = matching_prefixes.next() {
+                    let prefix = str::from_utf8(prefix)?;
+                    let key = (prefix, level, left, right);
+                    let bytes = StrLevelPositionCodec::bytes_encode(&key).unwrap();
+                    word_prefix_level_positions_docids_sorter.insert(bytes, data)?;
+                }
+            }
+        }
+
+        // We finally write all the word prefix level positions docids with
+        // a level equal to 0 into the LMDB database.
+        sorter_into_lmdb_database(
+            self.wtxn,
+            *self.index.word_prefix_level_position_docids.as_polymorph(),
+            word_prefix_level_positions_docids_sorter,
+            word_prefix_level_positions_docids_merge,
+            WriteMethod::Append,
+        )?;
+
+        let entries = compute_positions_levels(
+            self.wtxn,
+            self.index.word_prefix_docids.remap_data_type::<DecodeIgnore>(),
+            self.index.word_prefix_level_position_docids,
+            self.chunk_compression_type,
+            self.chunk_compression_level,
+            self.chunk_fusing_shrink_size,
+            self.level_group_size,
+            self.min_level_size,
+        )?;
+
+        // The previously computed entries also define the level 0 entries
+        // so we can clear the database and append all of these entries.
+        self.index.word_prefix_level_position_docids.clear(self.wtxn)?;
+
+        write_into_lmdb_database(
+            self.wtxn,
+            *self.index.word_prefix_level_position_docids.as_polymorph(),
+            entries,
+            |_, _| anyhow::bail!("invalid word prefix level position merging"),
+            WriteMethod::Append,
+        )?;
+
+        Ok(())
+    }
+}
+
+/// Returns the next number after or equal to `x` that is divisible by `d`.
+fn next_divisible(x: u32, d: u32) -> u32 {
+    (x.saturating_sub(1) | (d - 1)) + 1
+}
+
+/// Returns the previous number before or equal to `x` that is divisible by `d`;
+/// saturates at zero.
+fn previous_divisible(x: u32, d: u32) -> u32 {
+    match x.checked_sub(d - 1) {
+        Some(0) | None => 0,
+        Some(x) => next_divisible(x, d),
+    }
+}
+
+/// Generates all the words positions levels based on the level 0 entries
+/// (the level 0 entries themselves are also written).
+fn compute_positions_levels(
+    rtxn: &heed::RoTxn,
+    words_db: heed::Database<Str, DecodeIgnore>,
+    words_positions_db: heed::Database<StrLevelPositionCodec, CboRoaringBitmapCodec>,
+    compression_type: CompressionType,
+    compression_level: Option<u32>,
+    shrink_size: Option<u64>,
+    level_group_size: NonZeroU32,
+    min_level_size: NonZeroU32,
+) -> anyhow::Result<Reader<FileFuse>>
+{
+    // With LMDB it is forbidden to keep a cursor on a database while writing into
+    // it, therefore we write the positions levels entries into a grenad file
+    // before transferring them.
+    let mut writer = tempfile::tempfile().and_then(|file| {
+        create_writer(compression_type, compression_level, file)
+    })?;
+
+    for result in words_db.iter(rtxn)?
+    for result in words_db.iter(rtxn)? {
+        let (word, ()) = result?;
+
+        let level_0_range = {
+            let left = (word, TreeLevel::min_value(), u32::min_value(), u32::min_value());
+            let right = (word, TreeLevel::min_value(), u32::max_value(), u32::max_value());
+            left..=right
+        };
+
+        let first_level_size = words_positions_db.remap_data_type::<DecodeIgnore>()
+            .range(rtxn, &level_0_range)?
+            .fold(Ok(0u32), |count, result| result.and(count).map(|c| c + 1))?;
+
+        // Group sizes are always a power of the original level_group_size, therefore
+        // a group always maps whole groups of the previous level and never splits a
+        // previous level's groups in half.
+        let group_size_iter = (1u8..)
+            .map(|l| (TreeLevel::try_from(l).unwrap(), level_group_size.get().pow(l as u32)))
+            .take_while(|(_, s)| first_level_size / *s >= min_level_size.get());
+
+        // As specified in the documentation, we also write the level 0 entries.
+        for result in words_positions_db.range(rtxn, &level_0_range)? {
+            let ((word, level, left, right), docids) = result?;
+            write_level_entry(&mut writer, word, level, left, right, &docids)?;
+        }
+
+        for (level, group_size) in group_size_iter {
+            let mut left = 0;
+            let mut right = 0;
+            let mut group_docids = RoaringBitmap::new();
+
+            for (i, result) in words_positions_db.range(rtxn, &level_0_range)?.enumerate() {
+                let ((_word, _level, value, _right), docids) = result?;
+
+                if i == 0 {
+                    left = previous_divisible(value, group_size);
+                    right = left + (group_size - 1);
+                }
+
+                if value > right {
+                    // We found the first bound of the next group, we must store the left
+                    // and right bounds associated with the docids.
+                    write_level_entry(&mut writer, word, level, left, right, &group_docids)?;
+
+                    // We save the left bound for the new group and also reset the docids.
+                    group_docids = RoaringBitmap::new();
+                    left = previous_divisible(value, group_size);
+                    right = left + (group_size - 1);
+                }
+
+                // The right bound is always the bound we run through.
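+                // In every case the docids of the current level 0 entry are
+                // accumulated into the group being built.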
+                group_docids.union_with(&docids);
+            }
+
+            if !group_docids.is_empty() {
+                write_level_entry(&mut writer, word, level, left, right, &group_docids)?;
+            }
+        }
+    }
+
+    writer_into_reader(writer, shrink_size)
+}
+
+fn write_level_entry(
+    writer: &mut Writer<File>,
+    word: &str,
+    level: TreeLevel,
+    left: u32,
+    right: u32,
+    ids: &RoaringBitmap,
+) -> anyhow::Result<()>
+{
+    let key = (word, level, left, right);
+    let key = StrLevelPositionCodec::bytes_encode(&key).ok_or(Error::Encoding)?;
+    let data = CboRoaringBitmapCodec::bytes_encode(&ids).ok_or(Error::Encoding)?;
+    writer.insert(&key, &data)?;
+    Ok(())
+}
diff --git a/milli/src/update/words_prefixes.rs b/milli/src/update/words_prefixes.rs
deleted file mode 100644
index f2fe526a2..000000000
--- a/milli/src/update/words_prefixes.rs
+++ /dev/null
@@ -1,196 +0,0 @@
-use std::iter::FromIterator;
-use std::str;
-
-use chrono::Utc;
-use fst::automaton::Str;
-use fst::{Automaton, Streamer, IntoStreamer};
-use grenad::CompressionType;
-use heed::BytesEncode;
-use heed::types::ByteSlice;
-
-use crate::heed_codec::StrStrU8Codec;
-use crate::update::index_documents::WriteMethod;
-use crate::update::index_documents::{create_sorter, sorter_into_lmdb_database};
-use crate::update::index_documents::{word_docids_merge, words_pairs_proximities_docids_merge};
-use crate::{Index, SmallString32};
-
-pub struct WordsPrefixes<'t, 'u, 'i> {
-    wtxn: &'t mut heed::RwTxn<'i, 'u>,
-    index: &'i Index,
-    pub(crate) chunk_compression_type: CompressionType,
-    pub(crate) chunk_compression_level: Option<u32>,
-    pub(crate) chunk_fusing_shrink_size: Option<u64>,
-    pub(crate) max_nb_chunks: Option<usize>,
-    pub(crate) max_memory: Option<usize>,
-    threshold: f64,
-    max_prefix_length: usize,
-    _update_id: u64,
-}
-
-impl<'t, 'u, 'i> WordsPrefixes<'t, 'u, 'i> {
-    pub fn new(
-        wtxn: &'t mut heed::RwTxn<'i, 'u>,
-        index: &'i Index,
-        update_id: u64,
-    ) -> WordsPrefixes<'t, 'u, 'i>
-    {
-        WordsPrefixes {
-            wtxn,
-            index,
-            chunk_compression_type: CompressionType::None,
-            chunk_compression_level: None,
-            chunk_fusing_shrink_size: None,
-            max_nb_chunks: None,
-            max_memory: None,
-            threshold: 0.1 / 100.0, // .01%
-            max_prefix_length: 4,
-            _update_id: update_id,
-        }
-    }
-
-    /// Set the ratio of concerned words required to make a prefix be part of the words prefixes
-    /// database. If a word prefix is supposed to match more than this number of words in the
-    /// dictionnary, therefore this prefix is added to the words prefixes datastructures.
-    ///
-    /// Default value is `0.01` or `1%`. This value must be between 0 and 1 and will be clamped
-    /// to these bounds otherwise.
-    pub fn threshold(&mut self, value: f64) -> &mut Self {
-        self.threshold = value.min(1.0).max(0.0); // clamp [0, 1]
-        self
-    }
-
-    /// Set the maximum length of prefixes in bytes.
-    ///
-    /// Default value is `4` bytes. This value must be between 1 and 25 will be clamped
-    /// to these bounds, otherwise.
-    pub fn max_prefix_length(&mut self, value: usize) -> &mut Self {
-        self.max_prefix_length = value.min(25).max(1); // clamp [1, 25]
-        self
-    }
-
-    pub fn execute(self) -> anyhow::Result<()> {
-        self.index.set_updated_at(self.wtxn, &Utc::now())?;
-        // Clear the words prefixes datastructures.
-        self.index.word_prefix_docids.clear(self.wtxn)?;
-        self.index.word_prefix_pair_proximity_docids.clear(self.wtxn)?;
-
-        let words_fst = self.index.words_fst(&self.wtxn)?;
-        let number_of_words = words_fst.len();
-        let min_number_of_words = (number_of_words as f64 * self.threshold) as usize;
-
-        // It is forbidden to keep a mutable reference into the database
-        // and write into it at the same time, therefore we write into another file.
-        let mut prefix_docids_sorter = create_sorter(
-            word_docids_merge,
-            self.chunk_compression_type,
-            self.chunk_compression_level,
-            self.chunk_fusing_shrink_size,
-            self.max_nb_chunks,
-            self.max_memory,
-        );
-
-        let mut prefix_fsts = Vec::with_capacity(self.max_prefix_length);
-        for n in 1..=self.max_prefix_length {
-
-            let mut current_prefix = SmallString32::new();
-            let mut current_prefix_count = 0;
-            let mut builder = fst::SetBuilder::memory();
-
-            let mut stream = words_fst.stream();
-            while let Some(bytes) = stream.next() {
-                // We try to get the first n bytes out of this string but we only want
-                // to split at valid characters bounds. If we try to split in the middle of
-                // a character we ignore this word and go to the next one.
-                let word = str::from_utf8(bytes)?;
-                let prefix = match word.get(..n) {
-                    Some(prefix) => prefix,
-                    None => continue,
-                };
-
-                // This is the first iteration of the loop,
-                // or the current word doesn't starts with the current prefix.
-                if current_prefix_count == 0 || prefix != current_prefix.as_str() {
-                    current_prefix = SmallString32::from(prefix);
-                    current_prefix_count = 0;
-                }
-
-                current_prefix_count += 1;
-
-                // There is enough words corresponding to this prefix to add it to the cache.
-                if current_prefix_count == min_number_of_words {
-                    builder.insert(prefix)?;
-                }
-            }
-
-            // We construct the final set for prefixes of size n.
-            prefix_fsts.push(builder.into_set());
-        }
-
-        // We merge all of the previously computed prefixes into on final set.
-        let op = fst::set::OpBuilder::from_iter(prefix_fsts.iter());
-        let mut builder = fst::SetBuilder::memory();
-        builder.extend_stream(op.r#union())?;
-        let prefix_fst = builder.into_set();
-
-        // We iterate over all the prefixes and retrieve the corresponding docids.
-        let mut prefix_stream = prefix_fst.stream();
-        while let Some(bytes) = prefix_stream.next() {
-            let prefix = str::from_utf8(bytes)?;
-            let db = self.index.word_docids.remap_data_type::<ByteSlice>();
-            for result in db.prefix_iter(self.wtxn, prefix)? {
-                let (_word, data) = result?;
-                prefix_docids_sorter.insert(prefix, data)?;
-            }
-        }
-
-        // Set the words prefixes FST in the dtabase.
-        self.index.put_words_prefixes_fst(self.wtxn, &prefix_fst)?;
-
-        // We finally write the word prefix docids into the LMDB database.
-        sorter_into_lmdb_database(
-            self.wtxn,
-            *self.index.word_prefix_docids.as_polymorph(),
-            prefix_docids_sorter,
-            word_docids_merge,
-            WriteMethod::Append,
-        )?;
-
-        // We compute the word prefix pair proximity database.
-
-        // Here we create a sorter akin to the previous one.
-        let mut word_prefix_pair_proximity_docids_sorter = create_sorter(
-            words_pairs_proximities_docids_merge,
-            self.chunk_compression_type,
-            self.chunk_compression_level,
-            self.chunk_fusing_shrink_size,
-            self.max_nb_chunks,
-            self.max_memory,
-        );
-
-        // We insert all the word pairs corresponding to the word-prefix pairs
-        // where the prefixes appears in the prefix FST previously constructed.
-        let db = self.index.word_pair_proximity_docids.remap_data_type::<ByteSlice>();
-        for result in db.iter(self.wtxn)? {
-            let ((word1, word2, prox), data) = result?;
-            let automaton = Str::new(word2).starts_with();
-            let mut matching_prefixes = prefix_fst.search(automaton).into_stream();
-            while let Some(prefix) = matching_prefixes.next() {
-                let prefix = str::from_utf8(prefix)?;
-                let pair = (word1, prefix, prox);
-                let bytes = StrStrU8Codec::bytes_encode(&pair).unwrap();
-                word_prefix_pair_proximity_docids_sorter.insert(bytes, data)?;
-            }
-        }
-
-        // We finally write the word prefix pair proximity docids into the LMDB database.
-        sorter_into_lmdb_database(
-            self.wtxn,
-            *self.index.word_prefix_pair_proximity_docids.as_polymorph(),
-            word_prefix_pair_proximity_docids_sorter,
-            words_pairs_proximities_docids_merge,
-            WriteMethod::Append,
-        )?;
-
-        Ok(())
-    }
-}
diff --git a/milli/src/update/words_prefixes_fst.rs b/milli/src/update/words_prefixes_fst.rs
new file mode 100644
index 000000000..f53b0ee00
--- /dev/null
+++ b/milli/src/update/words_prefixes_fst.rs
@@ -0,0 +1,104 @@
+use std::iter::FromIterator;
+use std::str;
+
+use fst::Streamer;
+use crate::{Index, SmallString32};
+
+pub struct WordsPrefixesFst<'t, 'u, 'i> {
+    wtxn: &'t mut heed::RwTxn<'i, 'u>,
+    index: &'i Index,
+    threshold: f64,
+    max_prefix_length: usize,
+    _update_id: u64,
+}
+
+impl<'t, 'u, 'i> WordsPrefixesFst<'t, 'u, 'i> {
+    pub fn new(
+        wtxn: &'t mut heed::RwTxn<'i, 'u>,
+        index: &'i Index,
+        update_id: u64,
+    ) -> WordsPrefixesFst<'t, 'u, 'i>
+    {
+        WordsPrefixesFst {
+            wtxn,
+            index,
+            threshold: 0.1 / 100.0, // 0.1%
+            max_prefix_length: 4,
+            _update_id: update_id,
+        }
+    }
+
+    /// Set the ratio of words a prefix must cover to be part of the words prefixes
+    /// database. If a word prefix matches more than this ratio of the words in the
+    /// dictionary, the prefix is added to the words prefixes datastructures.
+    ///
+    /// Default value is `0.001`, i.e. `0.1%`. This value must be between 0 and 1 and
+    /// will be clamped to these bounds otherwise.
+    pub fn threshold(&mut self, value: f64) -> &mut Self {
+        self.threshold = value.min(1.0).max(0.0); // clamp [0, 1]
+        self
+    }
+
+    /// Set the maximum length of prefixes in bytes.
+    ///
+    /// Default value is `4` bytes. This value must be between 1 and 25 and will be
+    /// clamped to these bounds otherwise.
+    pub fn max_prefix_length(&mut self, value: usize) -> &mut Self {
+        self.max_prefix_length = value.min(25).max(1); // clamp [1, 25]
+        self
+    }
+
+    pub fn execute(self) -> anyhow::Result<()> {
+        let words_fst = self.index.words_fst(&self.wtxn)?;
+        let number_of_words = words_fst.len();
+        let min_number_of_words = (number_of_words as f64 * self.threshold) as usize;
+
+        let mut prefix_fsts = Vec::with_capacity(self.max_prefix_length);
+        for n in 1..=self.max_prefix_length {
+
+            let mut current_prefix = SmallString32::new();
+            let mut current_prefix_count = 0;
+            let mut builder = fst::SetBuilder::memory();
+
+            let mut stream = words_fst.stream();
+            while let Some(bytes) = stream.next() {
+                // We try to get the first n bytes out of this string but we only want
+                // to split at valid character bounds. If we try to split in the middle of
+                // a character we ignore this word and go to the next one.
+                let word = str::from_utf8(bytes)?;
+                let prefix = match word.get(..n) {
+                    Some(prefix) => prefix,
+                    None => continue,
+                };
+
+                // This is the first iteration of the loop,
+                // or the current word doesn't start with the current prefix.
+                if current_prefix_count == 0 || prefix != current_prefix.as_str() {
+                    current_prefix = SmallString32::from(prefix);
+                    current_prefix_count = 0;
+                }
+
+                current_prefix_count += 1;
+
+                // There are enough words corresponding to this prefix to add it to the cache.
+                if current_prefix_count == min_number_of_words {
+                    builder.insert(prefix)?;
+                }
+            }
+
+            // We construct the final set for prefixes of size n.
+            prefix_fsts.push(builder.into_set());
+        }
+
+        // We merge all of the previously computed prefixes into one final set.
+        let op = fst::set::OpBuilder::from_iter(prefix_fsts.iter());
+        let mut builder = fst::SetBuilder::memory();
+        builder.extend_stream(op.r#union())?;
+        let prefix_fst = builder.into_set();
+
+        // Set the words prefixes FST in the database.
+        self.index.put_words_prefixes_fst(self.wtxn, &prefix_fst)?;
+
+        Ok(())
+    }
+}
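A quick way to sanity-check the grouping helpers introduced in words_level_positions.rs is the standalone sketch below (not part of the patch). It shows the left/right bounds that previous_divisible produces for a level 1 group; note that the bit trick in next_divisible only rounds up to a multiple of d when d is a power of two, which the default level_group_size of 4 and its powers satisfy.

    // Standalone sketch: the `next_divisible` and `previous_divisible`
    // helpers are copied verbatim from words_level_positions.rs above.
    fn next_divisible(x: u32, d: u32) -> u32 {
        (x.saturating_sub(1) | (d - 1)) + 1
    }

    fn previous_divisible(x: u32, d: u32) -> u32 {
        match x.checked_sub(d - 1) {
            Some(0) | None => 0,
            Some(x) => next_divisible(x, d),
        }
    }

    fn main() {
        // Level 1 groups with the default `level_group_size` of 4: positions
        // 0..=3 map to the group [0, 3], 4..=7 to [4, 7], and so on.
        let group_size = 4;
        for &position in &[0u32, 3, 4, 7, 9] {
            let left = previous_divisible(position, group_size);
            let right = left + (group_size - 1);
            assert!(left <= position && position <= right);
            println!("position {} falls in group [{}, {}]", position, left, right);
        }
    }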