From 3aaf1d62f307977dbf2264f9919bb86ef14b0b92 Mon Sep 17 00:00:00 2001 From: many Date: Mon, 16 Aug 2021 13:35:03 +0200 Subject: [PATCH 01/15] Publish grenad CompressionType type in milli --- milli/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 5a5f2ac5c..a07303fd2 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -54,7 +54,7 @@ pub type FieldId = u16; pub type Position = u32; pub type FieldDistribution = BTreeMap; -type MergeFn = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> StdResult, E>; +type MergeFn = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> StdResult, E>; /// Transform a raw obkv store into a JSON Object. pub fn obkv_to_json( From 1d314328f0b60344c3cdc2f50efaa0c787d9c48d Mon Sep 17 00:00:00 2001 From: many Date: Mon, 16 Aug 2021 13:36:30 +0200 Subject: [PATCH 02/15] Plug new indexer --- http-ui/src/main.rs | 3 - milli/Cargo.toml | 4 +- .../facet_string_level_zero_value_codec.rs | 51 +- milli/src/heed_codec/facet/mod.rs | 4 +- milli/src/index.rs | 3 +- milli/src/lib.rs | 4 - milli/src/proximity.rs | 4 +- milli/src/search/criteria/exactness.rs | 4 + milli/src/search/facet/facet_string.rs | 22 +- milli/src/search/facet/filter_condition.rs | 5 +- milli/src/update/delete_documents.rs | 7 +- milli/src/update/facets.rs | 27 +- .../extract/extract_docid_word_positions.rs | 130 +++ .../extract/extract_facet_number_docids.rs | 41 + .../extract/extract_facet_string_docids.rs | 57 + .../extract/extract_fid_docid_facet_values.rs | 118 +++ .../extract/extract_fid_word_count_docids.rs | 91 ++ .../extract/extract_word_docids.rs | 42 + .../extract_word_level_position_docids.rs | 46 + .../extract_word_pair_proximity_docids.rs | 196 ++++ .../src/update/index_documents/extract/mod.rs | 199 ++++ .../index_documents/helpers/clonable_mmap.rs | 22 + .../index_documents/helpers/grenad_helpers.rs | 276 +++++ .../helpers/merge_functions.rs | 171 +++ .../src/update/index_documents/helpers/mod.rs | 49 + .../update/index_documents/merge_function.rs | 106 -- milli/src/update/index_documents/mod.rs | 696 ++----------- milli/src/update/index_documents/store.rs | 985 ------------------ milli/src/update/index_documents/transform.rs | 41 +- .../src/update/index_documents/typed_chunk.rs | 272 +++++ milli/src/update/settings.rs | 7 - milli/src/update/update_builder.rs | 17 - milli/src/update/word_prefix_docids.rs | 9 +- .../word_prefix_pair_proximity_docids.rs | 9 +- milli/src/update/words_level_positions.rs | 22 +- milli/tests/search/mod.rs | 6 +- 36 files changed, 1920 insertions(+), 1826 deletions(-) create mode 100644 milli/src/update/index_documents/extract/extract_docid_word_positions.rs create mode 100644 milli/src/update/index_documents/extract/extract_facet_number_docids.rs create mode 100644 milli/src/update/index_documents/extract/extract_facet_string_docids.rs create mode 100644 milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs create mode 100644 milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs create mode 100644 milli/src/update/index_documents/extract/extract_word_docids.rs create mode 100644 milli/src/update/index_documents/extract/extract_word_level_position_docids.rs create mode 100644 milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs create mode 100644 milli/src/update/index_documents/extract/mod.rs create mode 100644 milli/src/update/index_documents/helpers/clonable_mmap.rs create mode 100644 milli/src/update/index_documents/helpers/grenad_helpers.rs create 
mode 100644 milli/src/update/index_documents/helpers/merge_functions.rs create mode 100644 milli/src/update/index_documents/helpers/mod.rs delete mode 100644 milli/src/update/index_documents/merge_function.rs create mode 100644 milli/src/update/index_documents/typed_chunk.rs diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 83995c3e5..fd7dd37de 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -343,10 +343,7 @@ async fn main() -> anyhow::Result<()> { update_builder.thread_pool(GLOBAL_THREAD_POOL.get().unwrap()); update_builder.log_every_n(indexer_opt_cloned.log_every_n); update_builder.max_memory(indexer_opt_cloned.max_memory.get_bytes() as usize); - update_builder.linked_hash_map_size(indexer_opt_cloned.linked_hash_map_size); update_builder.chunk_compression_type(indexer_opt_cloned.chunk_compression_type); - update_builder - .chunk_fusing_shrink_size(indexer_opt_cloned.chunk_fusing_shrink_size.get_bytes()); let before_update = Instant::now(); // we extract the update type and execute the update itself. diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 3baa2213d..edcec4d5b 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -6,15 +6,17 @@ edition = "2018" [dependencies] bstr = "0.2.15" +byte-unit = { version = "4.0.9", default-features = false, features = ["std"] } byteorder = "1.4.2" chrono = { version = "0.4.19", features = ["serde"] } concat-arrays = "0.1.2" +crossbeam-channel = "0.5.1" csv = "1.1.5" either = "1.6.1" flate2 = "1.0.20" fst = "0.4.5" fxhash = "0.2.1" -grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "3adcb26" } +grenad = "0.3.0" heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] } human_format = "1.0.3" levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } diff --git a/milli/src/heed_codec/facet/facet_string_level_zero_value_codec.rs b/milli/src/heed_codec/facet/facet_string_level_zero_value_codec.rs index b2434d453..914d7c3cd 100644 --- a/milli/src/heed_codec/facet/facet_string_level_zero_value_codec.rs +++ b/milli/src/heed_codec/facet/facet_string_level_zero_value_codec.rs @@ -2,51 +2,65 @@ use std::borrow::Cow; use std::convert::TryInto; use std::{marker, str}; -use super::try_split_at; +use crate::error::SerializationError; +use crate::heed_codec::RoaringBitmapCodec; +use crate::{try_split_array_at, try_split_at, Result}; +pub type FacetStringLevelZeroValueCodec = StringValueCodec; -/// A codec that encodes a string in front of the value. +/// A codec that encodes a string in front of a value. /// /// The usecase is for the facet string levels algorithm where we must know the /// original string of a normalized facet value, the original values are stored /// in the value to not break the lexicographical ordering of the LMDB keys. 
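The length-prefixed layout described above (a two-byte big-endian length, the UTF-8 bytes of the original string, then the encoded value) is small enough to exercise on its own. A std-only sketch mirroring encode_prefix_string/decode_prefix_string from this patch, with a plain byte payload standing in for the serialized bitmap:

use std::convert::TryInto;
use std::str;

// Two-byte big-endian length, then the UTF-8 bytes of the original string.
fn encode_prefix_string(string: &str, buffer: &mut Vec<u8>) -> Option<()> {
    let len: u16 = string.len().try_into().ok()?;
    buffer.extend_from_slice(&len.to_be_bytes());
    buffer.extend_from_slice(string.as_bytes());
    Some(())
}

// Reads the prefixed string back and returns it with the remaining bytes,
// which hold whatever value was appended after the prefix.
fn decode_prefix_string(bytes: &[u8]) -> Option<(&str, &[u8])> {
    let len_bytes: [u8; 2] = bytes.get(..2)?.try_into().ok()?;
    let len = u16::from_be_bytes(len_bytes) as usize;
    let string = str::from_utf8(bytes.get(2..2 + len)?).ok()?;
    Some((string, bytes.get(2 + len..)?))
}

fn main() {
    let mut buffer = Vec::new();
    encode_prefix_string("Blue", &mut buffer).unwrap();
    buffer.extend_from_slice(&[1, 2, 3]); // stands in for the serialized bitmap
    let (original, payload) = decode_prefix_string(&buffer).unwrap();
    assert_eq!((original, payload), ("Blue", &[1u8, 2, 3][..]));
}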
-pub struct FacetStringLevelZeroValueCodec(marker::PhantomData); +pub struct StringValueCodec(marker::PhantomData); -impl<'a, C> heed::BytesDecode<'a> for FacetStringLevelZeroValueCodec +impl<'a, C> heed::BytesDecode<'a> for StringValueCodec where C: heed::BytesDecode<'a>, { type DItem = (&'a str, C::DItem); fn bytes_decode(bytes: &'a [u8]) -> Option { - let (string_len, bytes) = try_split_at(bytes, 2)?; - let string_len = string_len.try_into().ok().map(u16::from_be_bytes)?; - - let (string, bytes) = try_split_at(bytes, string_len as usize)?; - let string = str::from_utf8(string).ok()?; + let (string, bytes) = decode_prefix_string(bytes)?; C::bytes_decode(bytes).map(|item| (string, item)) } } -impl<'a, C> heed::BytesEncode<'a> for FacetStringLevelZeroValueCodec +impl<'a, C> heed::BytesEncode<'a> for StringValueCodec where C: heed::BytesEncode<'a>, { type EItem = (&'a str, C::EItem); fn bytes_encode((string, value): &'a Self::EItem) -> Option> { - let string_len: u16 = string.len().try_into().ok()?; let value_bytes = C::bytes_encode(&value)?; let mut bytes = Vec::with_capacity(2 + string.len() + value_bytes.len()); - bytes.extend_from_slice(&string_len.to_be_bytes()); - bytes.extend_from_slice(string.as_bytes()); + encode_prefix_string(string, &mut bytes).ok()?; bytes.extend_from_slice(&value_bytes[..]); Some(Cow::Owned(bytes)) } } +pub fn decode_prefix_string(value: &[u8]) -> Option<(&str, &[u8])> { + let (original_length_bytes, bytes) = try_split_array_at(value)?; + let original_length = u16::from_be_bytes(original_length_bytes) as usize; + let (string, bytes) = try_split_at(bytes, original_length)?; + let string = str::from_utf8(string).ok()?; + + Some((string, bytes)) +} + +pub fn encode_prefix_string(string: &str, buffer: &mut Vec) -> Result<()> { + let string_len: u16 = + string.len().try_into().map_err(|_| SerializationError::InvalidNumberSerialization)?; + buffer.extend_from_slice(&string_len.to_be_bytes()); + buffer.extend_from_slice(string.as_bytes()); + Ok(()) +} + #[cfg(test)] mod tests { use heed::types::Unit; @@ -54,17 +68,15 @@ mod tests { use roaring::RoaringBitmap; use super::*; - use crate::CboRoaringBitmapCodec; #[test] fn deserialize_roaring_bitmaps() { let string = "abc"; let docids: RoaringBitmap = (0..100).chain(3500..4398).collect(); let key = (string, docids.clone()); - let bytes = - FacetStringLevelZeroValueCodec::::bytes_encode(&key).unwrap(); + let bytes = StringValueCodec::::bytes_encode(&key).unwrap(); let (out_string, out_docids) = - FacetStringLevelZeroValueCodec::::bytes_decode(&bytes).unwrap(); + StringValueCodec::::bytes_decode(&bytes).unwrap(); assert_eq!((out_string, out_docids), (string, docids)); } @@ -72,9 +84,8 @@ mod tests { fn deserialize_unit() { let string = "def"; let key = (string, ()); - let bytes = FacetStringLevelZeroValueCodec::::bytes_encode(&key).unwrap(); - let (out_string, out_unit) = - FacetStringLevelZeroValueCodec::::bytes_decode(&bytes).unwrap(); + let bytes = StringValueCodec::::bytes_encode(&key).unwrap(); + let (out_string, out_unit) = StringValueCodec::::bytes_decode(&bytes).unwrap(); assert_eq!((out_string, out_unit), (string, ())); } } diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs index a6a805bf7..e93fb57b9 100644 --- a/milli/src/heed_codec/facet/mod.rs +++ b/milli/src/heed_codec/facet/mod.rs @@ -9,7 +9,9 @@ mod field_doc_id_facet_string_codec; pub use self::facet_level_value_f64_codec::FacetLevelValueF64Codec; pub use self::facet_level_value_u32_codec::FacetLevelValueU32Codec; pub use 
self::facet_string_level_zero_codec::FacetStringLevelZeroCodec; -pub use self::facet_string_level_zero_value_codec::FacetStringLevelZeroValueCodec; +pub use self::facet_string_level_zero_value_codec::{ + decode_prefix_string, encode_prefix_string, FacetStringLevelZeroValueCodec, +}; pub use self::facet_string_zero_bounds_value_codec::FacetStringZeroBoundsValueCodec; pub use self::field_doc_id_facet_f64_codec::FieldDocIdFacetF64Codec; pub use self::field_doc_id_facet_string_codec::FieldDocIdFacetStringCodec; diff --git a/milli/src/index.rs b/milli/src/index.rs index e2ab51a1c..f3a2a3e05 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -93,8 +93,7 @@ pub struct Index { /// Maps the facet field id, level and the number with the docids that corresponds to it. pub facet_id_f64_docids: Database, /// Maps the facet field id and the string with the original string and docids that corresponds to it. - pub facet_id_string_docids: - Database>, + pub facet_id_string_docids: Database, /// Maps the document id, the facet field id and the numbers. pub field_id_docid_facet_f64s: Database, diff --git a/milli/src/lib.rs b/milli/src/lib.rs index a07303fd2..af811fe08 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -13,11 +13,9 @@ mod search; pub mod tree_level; pub mod update; -use std::borrow::Cow; use std::collections::{BTreeMap, HashMap}; use std::convert::{TryFrom, TryInto}; use std::hash::BuildHasherDefault; -use std::result::Result as StdResult; use fxhash::{FxHasher32, FxHasher64}; pub use grenad::CompressionType; @@ -54,8 +52,6 @@ pub type FieldId = u16; pub type Position = u32; pub type FieldDistribution = BTreeMap; -type MergeFn = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> StdResult, E>; - /// Transform a raw obkv store into a JSON Object. pub fn obkv_to_json( displayed_fields: &[FieldId], diff --git a/milli/src/proximity.rs b/milli/src/proximity.rs index db98426a5..083e5a977 100644 --- a/milli/src/proximity.rs +++ b/milli/src/proximity.rs @@ -2,8 +2,8 @@ use std::cmp; use crate::{Attribute, Position}; -const ONE_ATTRIBUTE: u32 = 1000; -const MAX_DISTANCE: u32 = 8; +pub const ONE_ATTRIBUTE: u32 = 1000; +pub const MAX_DISTANCE: u32 = 8; pub fn index_proximity(lhs: u32, rhs: u32) -> u32 { if lhs <= rhs { diff --git a/milli/src/search/criteria/exactness.rs b/milli/src/search/criteria/exactness.rs index 1e4d4e7a2..22dcb9782 100644 --- a/milli/src/search/criteria/exactness.rs +++ b/milli/src/search/criteria/exactness.rs @@ -180,6 +180,10 @@ fn resolve_state( if let Some(attribute_allowed_docids) = ctx.field_id_word_count_docids(id, query_len)? { + println!( + "found candidates that have the good count: {:?}", + attribute_allowed_docids + ); let mut attribute_candidates_array = attribute_start_with_docids(ctx, id as u32, query)?; attribute_candidates_array.push(attribute_allowed_docids); diff --git a/milli/src/search/facet/facet_string.rs b/milli/src/search/facet/facet_string.rs index 927602c98..747b7fd3c 100644 --- a/milli/src/search/facet/facet_string.rs +++ b/milli/src/search/facet/facet_string.rs @@ -269,11 +269,7 @@ impl<'t> Iterator for FacetStringGroupRevRange<'t> { /// /// It yields the facet string and the roaring bitmap associated with it. 
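Why the original string lives in the value rather than the key can be modelled with a plain BTreeMap: keys sorted by (field id, normalized string) keep range iteration lexicographic, while the display string and the documents ids travel together in the value. A toy analogy only, not the heed/LMDB code used here:

use std::collections::BTreeMap;

fn main() {
    // (field_id, normalized string) -> (original string, documents ids)
    let mut db: BTreeMap<(u16, String), (String, Vec<u32>)> = BTreeMap::new();
    db.insert((1, "blue".into()), ("Blue".into(), vec![2, 7]));
    db.insert((1, "red".into()), ("RED".into(), vec![1]));
    db.insert((2, "blue".into()), ("blue".into(), vec![3]));

    // A level-zero range restricted to field 1, iterated in lexicographic
    // order of the normalized strings, yielding the original string and ids.
    for ((_, normalized), (original, docids)) in
        db.range((1u16, String::new())..(2u16, String::new()))
    {
        println!("{} (displayed as {:?}) -> {:?}", normalized, original, docids);
    }
}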
pub struct FacetStringLevelZeroRange<'t> { - iter: RoRange< - 't, - FacetStringLevelZeroCodec, - FacetStringLevelZeroValueCodec, - >, + iter: RoRange<'t, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec>, } impl<'t> FacetStringLevelZeroRange<'t> { @@ -316,10 +312,7 @@ impl<'t> FacetStringLevelZeroRange<'t> { let iter = db .remap_key_type::() .range(rtxn, &(left_bound, right_bound))? - .remap_types::< - FacetStringLevelZeroCodec, - FacetStringLevelZeroValueCodec - >(); + .remap_types::(); Ok(FacetStringLevelZeroRange { iter }) } @@ -340,11 +333,7 @@ impl<'t> Iterator for FacetStringLevelZeroRange<'t> { } pub struct FacetStringLevelZeroRevRange<'t> { - iter: RoRevRange< - 't, - FacetStringLevelZeroCodec, - FacetStringLevelZeroValueCodec, - >, + iter: RoRevRange<'t, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec>, } impl<'t> FacetStringLevelZeroRevRange<'t> { @@ -387,10 +376,7 @@ impl<'t> FacetStringLevelZeroRevRange<'t> { let iter = db .remap_key_type::() .rev_range(rtxn, &(left_bound, right_bound))? - .remap_types::< - FacetStringLevelZeroCodec, - FacetStringLevelZeroValueCodec - >(); + .remap_types::(); Ok(FacetStringLevelZeroRevRange { iter }) } diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index 5ca9f7e5a..a92797e90 100644 --- a/milli/src/search/facet/filter_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -392,10 +392,7 @@ impl FilterCondition { rtxn: &heed::RoTxn, index: &Index, numbers_db: heed::Database, - strings_db: heed::Database< - FacetStringLevelZeroCodec, - FacetStringLevelZeroValueCodec, - >, + strings_db: heed::Database, field_id: FieldId, operator: &Operator, ) -> Result { diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index e18c6bbd1..874eed6ee 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -490,7 +490,7 @@ fn remove_docids_from_facet_field_id_string_docids<'a, C, D>( None => { // The key corresponds to a level zero facet string. let (original_value, mut docids) = - FacetStringLevelZeroValueCodec::::bytes_decode(val) + FacetStringLevelZeroValueCodec::bytes_decode(val) .ok_or_else(|| SerializationError::Decoding { db_name })?; let previous_len = docids.len(); @@ -501,9 +501,8 @@ fn remove_docids_from_facet_field_id_string_docids<'a, C, D>( } else if docids.len() != previous_len { let key = key.to_owned(); let val = &(original_value, docids); - let value_bytes = - FacetStringLevelZeroValueCodec::::bytes_encode(val) - .ok_or_else(|| SerializationError::Encoding { db_name })?; + let value_bytes = FacetStringLevelZeroValueCodec::bytes_encode(val) + .ok_or_else(|| SerializationError::Encoding { db_name })?; // safety: we don't keep references from inside the LMDB database. unsafe { iter.put_current(&key, &value_bytes)? 
}; diff --git a/milli/src/update/facets.rs b/milli/src/update/facets.rs index cb9a90f7e..3ae63f282 100644 --- a/milli/src/update/facets.rs +++ b/milli/src/update/facets.rs @@ -3,7 +3,7 @@ use std::num::{NonZeroU8, NonZeroUsize}; use std::{cmp, mem}; use chrono::Utc; -use grenad::{CompressionType, FileFuse, Reader, Writer}; +use grenad::{CompressionType, Reader, Writer}; use heed::types::{ByteSlice, DecodeIgnore}; use heed::{BytesEncode, Error}; use log::debug; @@ -25,7 +25,6 @@ pub struct Facets<'t, 'u, 'i> { index: &'i Index, pub(crate) chunk_compression_type: CompressionType, pub(crate) chunk_compression_level: Option, - pub(crate) chunk_fusing_shrink_size: Option, level_group_size: NonZeroUsize, min_level_size: NonZeroUsize, _update_id: u64, @@ -42,7 +41,6 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { index, chunk_compression_type: CompressionType::None, chunk_compression_level: None, - chunk_fusing_shrink_size: None, level_group_size: NonZeroUsize::new(4).unwrap(), min_level_size: NonZeroUsize::new(5).unwrap(), _update_id: update_id, @@ -86,7 +84,6 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { self.index.facet_id_string_docids, self.chunk_compression_type, self.chunk_compression_level, - self.chunk_fusing_shrink_size, self.level_group_size, self.min_level_size, field_id, @@ -107,7 +104,6 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { self.index.facet_id_f64_docids, self.chunk_compression_type, self.chunk_compression_level, - self.chunk_fusing_shrink_size, self.level_group_size, self.min_level_size, field_id, @@ -128,7 +124,7 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { self.wtxn, *self.index.facet_id_f64_docids.as_polymorph(), facet_number_levels, - |_, _| Err(InternalError::IndexingMergingKeys { process: "facet number levels" }), + |_, _| Err(InternalError::IndexingMergingKeys { process: "facet number levels" })?, WriteMethod::GetMergePut, )?; @@ -136,7 +132,7 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { self.wtxn, *self.index.facet_id_string_docids.as_polymorph(), facet_string_levels, - |_, _| Err(InternalError::IndexingMergingKeys { process: "facet string levels" }), + |_, _| Err(InternalError::IndexingMergingKeys { process: "facet string levels" })?, WriteMethod::GetMergePut, )?; } @@ -161,11 +157,10 @@ fn compute_facet_number_levels<'t>( db: heed::Database, compression_type: CompressionType, compression_level: Option, - shrink_size: Option, level_group_size: NonZeroUsize, min_level_size: NonZeroUsize, field_id: FieldId, -) -> Result> { +) -> Result> { let first_level_size = db .remap_key_type::() .prefix_iter(rtxn, &field_id.to_be_bytes())? @@ -219,7 +214,7 @@ fn compute_facet_number_levels<'t>( } } - writer_into_reader(writer, shrink_size) + writer_into_reader(writer) } fn write_number_entry( @@ -239,7 +234,7 @@ fn write_number_entry( fn compute_faceted_strings_documents_ids( rtxn: &heed::RoTxn, - db: heed::Database>, + db: heed::Database, field_id: FieldId, ) -> Result { let mut documents_ids = RoaringBitmap::new(); @@ -278,17 +273,13 @@ fn clear_field_string_levels<'t>( fn compute_facet_string_levels<'t>( rtxn: &'t heed::RoTxn, - db: heed::Database< - FacetStringLevelZeroCodec, - FacetStringLevelZeroValueCodec, - >, + db: heed::Database, compression_type: CompressionType, compression_level: Option, - shrink_size: Option, level_group_size: NonZeroUsize, min_level_size: NonZeroUsize, field_id: FieldId, -) -> Result> { +) -> Result> { let first_level_size = db .remap_key_type::() .prefix_iter(rtxn, &field_id.to_be_bytes())? 
@@ -340,7 +331,7 @@ fn compute_facet_string_levels<'t>( } } - writer_into_reader(writer, shrink_size) + writer_into_reader(writer) } fn write_string_entry( diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs new file mode 100644 index 000000000..9a9d7cb85 --- /dev/null +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -0,0 +1,130 @@ +use std::collections::HashSet; +use std::convert::TryInto; +use std::fs::File; +use std::{io, mem, str}; + +use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token}; +use roaring::RoaringBitmap; +use serde_json::Value; + +use super::helpers::{concat_u32s_array, create_sorter, sorter_into_reader, GrenadParameters}; +use crate::error::{InternalError, SerializationError}; +use crate::proximity::ONE_ATTRIBUTE; +use crate::{FieldId, Result}; + +/// Extracts the word and positions where this word appear and +/// prefixes it by the document id. +/// +/// Returns the generated internal documents ids and a grenad reader +/// with the list of extracted words from the given chunk of documents. +pub fn extract_docid_word_positions( + mut obkv_documents: grenad::Reader, + indexer: GrenadParameters, + searchable_fields: &Option>, +) -> Result<(RoaringBitmap, grenad::Reader)> { + let max_memory = indexer.max_memory_by_thread(); + + let mut documents_ids = RoaringBitmap::new(); + let mut docid_word_positions_sorter = create_sorter( + concat_u32s_array, + indexer.chunk_compression_type, + indexer.chunk_compression_level, + indexer.max_nb_chunks, + max_memory, + ); + + let mut key_buffer = Vec::new(); + let mut field_buffer = String::new(); + let analyzer = Analyzer::>::new(AnalyzerConfig::default()); + + while let Some((key, value)) = obkv_documents.next()? { + let document_id = key + .try_into() + .map(u32::from_be_bytes) + .map_err(|_| SerializationError::InvalidNumberSerialization)?; + let obkv = obkv::KvReader::::new(value); + + documents_ids.push(document_id); + key_buffer.clear(); + key_buffer.extend_from_slice(&document_id.to_be_bytes()); + + for (field_id, field_bytes) in obkv.iter() { + if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) { + let value = + serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?; + field_buffer.clear(); + if let Some(field) = json_to_string(&value, &mut field_buffer) { + let analyzed = analyzer.analyze(field); + let tokens = analyzed + .tokens() + .filter(Token::is_word) + .enumerate() + .take_while(|(i, _)| (*i as u32) < ONE_ATTRIBUTE); + + for (index, token) in tokens { + let token = token.text().trim(); + key_buffer.truncate(mem::size_of::()); + key_buffer.extend_from_slice(token.as_bytes()); + + let position: u32 = index + .try_into() + .map_err(|_| SerializationError::InvalidNumberSerialization)?; + let position = field_id as u32 * ONE_ATTRIBUTE + position; + docid_word_positions_sorter.insert(&key_buffer, &position.to_ne_bytes())?; + } + } + } + } + } + + sorter_into_reader(docid_word_positions_sorter, indexer).map(|reader| (documents_ids, reader)) +} + +/// Transform a JSON value into a string that can be indexed. 
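The extractor above packs a field id and an in-field word index into a single u32 with ONE_ATTRIBUTE (1000 positions per attribute); crate::proximity::extract_position is used elsewhere in this patch to split it back, presumably as the matching division and modulo. A standalone round-trip check of that arithmetic:

const ONE_ATTRIBUTE: u32 = 1000;

// Same packing as in the hunk above: one attribute spans 1000 positions.
fn pack(field_id: u32, word_index: u32) -> u32 {
    field_id * ONE_ATTRIBUTE + word_index
}

// The straightforward inverse of the packing.
fn unpack(position: u32) -> (u32, u32) {
    (position / ONE_ATTRIBUTE, position % ONE_ATTRIBUTE)
}

fn main() {
    let position = pack(3, 42);
    assert_eq!(position, 3042);
    assert_eq!(unpack(position), (3, 42));
}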
+fn json_to_string<'a>(value: &'a Value, buffer: &'a mut String) -> Option<&'a str> { + fn inner(value: &Value, output: &mut String) -> bool { + use std::fmt::Write; + match value { + Value::Null => false, + Value::Bool(boolean) => write!(output, "{}", boolean).is_ok(), + Value::Number(number) => write!(output, "{}", number).is_ok(), + Value::String(string) => write!(output, "{}", string).is_ok(), + Value::Array(array) => { + let mut count = 0; + for value in array { + if inner(value, output) { + output.push_str(". "); + count += 1; + } + } + // check that at least one value was written + count != 0 + } + Value::Object(object) => { + let mut buffer = String::new(); + let mut count = 0; + for (key, value) in object { + buffer.clear(); + let _ = write!(&mut buffer, "{}: ", key); + if inner(value, &mut buffer) { + buffer.push_str(". "); + // We write the "key: value. " pair only when + // we are sure that the value can be written. + output.push_str(&buffer); + count += 1; + } + } + // check that at least one value was written + count != 0 + } + } + } + + if let Value::String(string) = value { + Some(&string) + } else if inner(value, buffer) { + Some(buffer) + } else { + None + } +} diff --git a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs new file mode 100644 index 000000000..1734ef028 --- /dev/null +++ b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs @@ -0,0 +1,41 @@ +use std::fs::File; +use std::io; + +use heed::{BytesDecode, BytesEncode}; + +use super::helpers::{ + create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, GrenadParameters, +}; +use crate::heed_codec::facet::{FacetLevelValueF64Codec, FieldDocIdFacetF64Codec}; +use crate::Result; + +/// Extracts the facet number and the documents ids where this facet number appear. +/// +/// Returns a grenad reader with the list of extracted facet numbers and +/// documents ids from the given chunk of docid facet number positions. +pub fn extract_facet_number_docids( + mut docid_fid_facet_number: grenad::Reader, + indexer: GrenadParameters, +) -> Result> { + let max_memory = indexer.max_memory_by_thread(); + + let mut facet_number_docids_sorter = create_sorter( + merge_cbo_roaring_bitmaps, + indexer.chunk_compression_type, + indexer.chunk_compression_level, + indexer.max_nb_chunks, + max_memory, + ); + + while let Some((key_bytes, _)) = docid_fid_facet_number.next()? 
{ + let (field_id, document_id, number) = + FieldDocIdFacetF64Codec::bytes_decode(key_bytes).unwrap(); + + let key = (field_id, 0, number, number); + let key_bytes = FacetLevelValueF64Codec::bytes_encode(&key).unwrap(); + + facet_number_docids_sorter.insert(key_bytes, document_id.to_ne_bytes())?; + } + + sorter_into_reader(facet_number_docids_sorter, indexer) +} diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs new file mode 100644 index 000000000..66ede5f42 --- /dev/null +++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs @@ -0,0 +1,57 @@ +use std::fs::File; +use std::iter::FromIterator; +use std::{io, str}; + +use roaring::RoaringBitmap; + +use super::helpers::{ + create_sorter, keep_first_prefix_value_merge_roaring_bitmaps, sorter_into_reader, + try_split_array_at, GrenadParameters, +}; +use crate::heed_codec::facet::{encode_prefix_string, FacetStringLevelZeroCodec}; +use crate::{FieldId, Result}; + +/// Extracts the facet string and the documents ids where this facet string appear. +/// +/// Returns a grenad reader with the list of extracted facet strings and +/// documents ids from the given chunk of docid facet string positions. +pub fn extract_facet_string_docids( + mut docid_fid_facet_string: grenad::Reader, + indexer: GrenadParameters, +) -> Result> { + let max_memory = indexer.max_memory_by_thread(); + + let mut facet_string_docids_sorter = create_sorter( + keep_first_prefix_value_merge_roaring_bitmaps, + indexer.chunk_compression_type, + indexer.chunk_compression_level, + indexer.max_nb_chunks, + max_memory, + ); + + let mut key_buffer = Vec::new(); + let mut value_buffer = Vec::new(); + while let Some((key, original_value_bytes)) = docid_fid_facet_string.next()? { + let (field_id_bytes, bytes) = try_split_array_at(key).unwrap(); + let field_id = FieldId::from_be_bytes(field_id_bytes); + let (document_id_bytes, normalized_value_bytes) = try_split_array_at(bytes).unwrap(); + let document_id = u32::from_be_bytes(document_id_bytes); + let original_value = str::from_utf8(original_value_bytes)?; + + key_buffer.clear(); + FacetStringLevelZeroCodec::serialize_into( + field_id, + str::from_utf8(normalized_value_bytes)?, + &mut key_buffer, + ); + + value_buffer.clear(); + encode_prefix_string(original_value, &mut value_buffer)?; + let bitmap = RoaringBitmap::from_iter(Some(document_id)); + bitmap.serialize_into(&mut value_buffer)?; + + facet_string_docids_sorter.insert(&key_buffer, &value_buffer)?; + } + + sorter_into_reader(facet_string_docids_sorter, indexer) +} diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs new file mode 100644 index 000000000..e7e56a3c8 --- /dev/null +++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -0,0 +1,118 @@ +use std::collections::HashSet; +use std::fs::File; +use std::io; +use std::mem::size_of; + +use heed::zerocopy::AsBytes; +use serde_json::Value; + +use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters}; +use crate::error::InternalError; +use crate::facet::value_encoding::f64_into_bytes; +use crate::{DocumentId, FieldId, Result}; + +/// Extracts the facet values of each faceted field of each document. 
+/// +/// Returns the generated grenad reader containing the docid the fid and the orginal value as key +/// and the normalized value as value extracted from the given chunk of documents. +pub fn extract_fid_docid_facet_values( + mut obkv_documents: grenad::Reader, + indexer: GrenadParameters, + faceted_fields: &HashSet, +) -> Result<(grenad::Reader, grenad::Reader)> { + let max_memory = indexer.max_memory_by_thread(); + + let mut fid_docid_facet_numbers_sorter = create_sorter( + keep_first, + indexer.chunk_compression_type, + indexer.chunk_compression_level, + indexer.max_nb_chunks, + max_memory.map(|m| m / 2), + ); + + let mut fid_docid_facet_strings_sorter = create_sorter( + keep_first, + indexer.chunk_compression_type, + indexer.chunk_compression_level, + indexer.max_nb_chunks, + max_memory.map(|m| m / 2), + ); + + let mut key_buffer = Vec::new(); + while let Some((docid_bytes, value)) = obkv_documents.next()? { + let obkv = obkv::KvReader::new(value); + + for (field_id, field_bytes) in obkv.iter() { + if faceted_fields.contains(&field_id) { + let value = + serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?; + let (numbers, strings) = extract_facet_values(&value); + + key_buffer.clear(); + + // prefix key with the field_id and the document_id + key_buffer.extend_from_slice(&field_id.to_be_bytes()); + key_buffer.extend_from_slice(&docid_bytes); + + // insert facet numbers in sorter + for number in numbers { + key_buffer.truncate(size_of::() + size_of::()); + let value_bytes = f64_into_bytes(number).unwrap(); // invalid float + key_buffer.extend_from_slice(&value_bytes); + key_buffer.extend_from_slice(&number.to_be_bytes()); + + fid_docid_facet_numbers_sorter.insert(&key_buffer, ().as_bytes())?; + } + + // insert normalized and original facet string in sorter + for (normalized, original) in strings { + key_buffer.truncate(size_of::() + size_of::()); + key_buffer.extend_from_slice(normalized.as_bytes()); + fid_docid_facet_strings_sorter.insert(&key_buffer, original.as_bytes())?; + } + } + } + } + + Ok(( + sorter_into_reader(fid_docid_facet_numbers_sorter, indexer.clone())?, + sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?, + )) +} + +fn extract_facet_values(value: &Value) -> (Vec, Vec<(String, String)>) { + fn inner_extract_facet_values( + value: &Value, + can_recurse: bool, + output_numbers: &mut Vec, + output_strings: &mut Vec<(String, String)>, + ) { + match value { + Value::Null => (), + Value::Bool(b) => output_strings.push((b.to_string(), b.to_string())), + Value::Number(number) => { + if let Some(float) = number.as_f64() { + output_numbers.push(float); + } + } + Value::String(original) => { + let normalized = original.trim().to_lowercase(); + output_strings.push((normalized, original.clone())); + } + Value::Array(values) => { + if can_recurse { + for value in values { + inner_extract_facet_values(value, false, output_numbers, output_strings); + } + } + } + Value::Object(_) => (), + } + } + + let mut facet_number_values = Vec::new(); + let mut facet_string_values = Vec::new(); + inner_extract_facet_values(value, true, &mut facet_number_values, &mut facet_string_values); + + (facet_number_values, facet_string_values) +} diff --git a/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs new file mode 100644 index 000000000..66b179663 --- /dev/null +++ b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs @@ -0,0 +1,91 @@ +use 
std::collections::HashMap; +use std::fs::File; +use std::{cmp, io}; + +use grenad::Sorter; + +use super::helpers::{ + create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader, + try_split_array_at, GrenadParameters, MergeFn, +}; +use crate::proximity::extract_position; +use crate::{DocumentId, FieldId, Result}; + +/// Extracts the field id word count and the documents ids where +/// this field id with this amount of words appear. +/// +/// Returns a grenad reader with the list of extracted field id word counts +/// and documents ids from the given chunk of docid word positions. +pub fn extract_fid_word_count_docids( + mut docid_word_positions: grenad::Reader, + indexer: GrenadParameters, +) -> Result> { + let max_memory = indexer.max_memory_by_thread(); + + let mut fid_word_count_docids_sorter = create_sorter( + merge_cbo_roaring_bitmaps, + indexer.chunk_compression_type, + indexer.chunk_compression_level, + indexer.max_nb_chunks, + max_memory, + ); + + // This map is assumed to not consume a lot of memory. + let mut document_fid_wordcount = HashMap::new(); + let mut current_document_id = None; + + while let Some((key, value)) = docid_word_positions.next()? { + let (document_id_bytes, _word_bytes) = try_split_array_at(key).unwrap(); + let document_id = u32::from_be_bytes(document_id_bytes); + + let curr_document_id = *current_document_id.get_or_insert(document_id); + if curr_document_id != document_id { + drain_document_fid_wordcount_into_sorter( + &mut fid_word_count_docids_sorter, + &mut document_fid_wordcount, + curr_document_id, + )?; + current_document_id = Some(document_id); + } + + for position in read_u32_ne_bytes(value) { + let (field_id, position) = extract_position(position); + let word_count = position + 1; + + let value = document_fid_wordcount.entry(field_id as FieldId).or_insert(0); + *value = cmp::max(*value, word_count); + } + } + + if let Some(document_id) = current_document_id { + // We must make sure that don't lose the current document field id + // word count map if we break because we reached the end of the chunk. + drain_document_fid_wordcount_into_sorter( + &mut fid_word_count_docids_sorter, + &mut document_fid_wordcount, + document_id, + )?; + } + + sorter_into_reader(fid_word_count_docids_sorter, indexer) +} + +fn drain_document_fid_wordcount_into_sorter( + fid_word_count_docids_sorter: &mut Sorter, + document_fid_wordcount: &mut HashMap, + document_id: DocumentId, +) -> Result<()> { + let mut key_buffer = Vec::new(); + + for (fid, count) in document_fid_wordcount.drain() { + if count <= 10 { + key_buffer.clear(); + key_buffer.extend_from_slice(&fid.to_be_bytes()); + key_buffer.push(count as u8); + + fid_word_count_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; + } + } + + Ok(()) +} diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs new file mode 100644 index 000000000..85453e173 --- /dev/null +++ b/milli/src/update/index_documents/extract/extract_word_docids.rs @@ -0,0 +1,42 @@ +use std::fs::File; +use std::io; +use std::iter::FromIterator; + +use roaring::RoaringBitmap; + +use super::helpers::{ + create_sorter, merge_roaring_bitmaps, serialize_roaring_bitmap, sorter_into_reader, + try_split_array_at, GrenadParameters, +}; +use crate::Result; + +/// Extracts the word and the documents ids where this word appear. 
+/// +/// Returns a grenad reader with the list of extracted words and +/// documents ids from the given chunk of docid word positions. +pub fn extract_word_docids( + mut docid_word_positions: grenad::Reader, + indexer: GrenadParameters, +) -> Result> { + let max_memory = indexer.max_memory_by_thread(); + + let mut word_docids_sorter = create_sorter( + merge_roaring_bitmaps, + indexer.chunk_compression_type, + indexer.chunk_compression_level, + indexer.max_nb_chunks, + max_memory, + ); + + let mut value_buffer = Vec::new(); + while let Some((key, _value)) = docid_word_positions.next()? { + let (document_id_bytes, word_bytes) = try_split_array_at(key).unwrap(); + let document_id = u32::from_be_bytes(document_id_bytes); + + let bitmap = RoaringBitmap::from_iter(Some(document_id)); + serialize_roaring_bitmap(&bitmap, &mut value_buffer)?; + word_docids_sorter.insert(word_bytes, &value_buffer)?; + } + + sorter_into_reader(word_docids_sorter, indexer) +} diff --git a/milli/src/update/index_documents/extract/extract_word_level_position_docids.rs b/milli/src/update/index_documents/extract/extract_word_level_position_docids.rs new file mode 100644 index 000000000..c7138b32a --- /dev/null +++ b/milli/src/update/index_documents/extract/extract_word_level_position_docids.rs @@ -0,0 +1,46 @@ +use std::fs::File; +use std::io; + +use super::helpers::{ + create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader, + try_split_array_at, GrenadParameters, +}; +use crate::{DocumentId, Result}; +/// Extracts the word positions and the documents ids where this word appear. +/// +/// Returns a grenad reader with the list of extracted words at positions and +/// documents ids from the given chunk of docid word positions. +pub fn extract_word_level_position_docids( + mut docid_word_positions: grenad::Reader, + indexer: GrenadParameters, +) -> Result> { + let max_memory = indexer.max_memory_by_thread(); + + let mut word_level_position_docids_sorter = create_sorter( + merge_cbo_roaring_bitmaps, + indexer.chunk_compression_type, + indexer.chunk_compression_level, + indexer.max_nb_chunks, + max_memory, + ); + + let mut key_buffer = Vec::new(); + while let Some((key, value)) = docid_word_positions.next()? { + let (document_id_bytes, word_bytes) = try_split_array_at(key).unwrap(); + let document_id = DocumentId::from_be_bytes(document_id_bytes); + + for position in read_u32_ne_bytes(value) { + key_buffer.clear(); + key_buffer.extend_from_slice(word_bytes); + key_buffer.push(0); // tree level + + // Levels are composed of left and right bounds. 
+ key_buffer.extend_from_slice(&position.to_be_bytes()); + key_buffer.extend_from_slice(&position.to_be_bytes()); + + word_level_position_docids_sorter.insert(&key_buffer, &document_id.to_ne_bytes())?; + } + } + + sorter_into_reader(word_level_position_docids_sorter, indexer) +} diff --git a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs new file mode 100644 index 000000000..2bc79aac5 --- /dev/null +++ b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs @@ -0,0 +1,196 @@ +use std::cmp::Ordering; +use std::collections::{BinaryHeap, HashMap}; +use std::fs::File; +use std::time::{Duration, Instant}; +use std::{cmp, io, mem, str, vec}; + +use log::debug; + +use super::helpers::{ + create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader, + try_split_array_at, GrenadParameters, MergeFn, +}; +use crate::proximity::{positions_proximity, MAX_DISTANCE}; +use crate::{DocumentId, Result}; + +/// Extracts the best proximity between pairs of words and the documents ids where this pair appear. +/// +/// Returns a grenad reader with the list of extracted word pairs proximities and +/// documents ids from the given chunk of docid word positions. +pub fn extract_word_pair_proximity_docids( + mut docid_word_positions: grenad::Reader, + indexer: GrenadParameters, +) -> Result> { + let max_memory = indexer.max_memory_by_thread(); + + let mut word_pair_proximity_docids_sorter = create_sorter( + merge_cbo_roaring_bitmaps, + indexer.chunk_compression_type, + indexer.chunk_compression_level, + indexer.max_nb_chunks, + max_memory, + ); + + let mut number_of_documents = 0; + let mut total_time_aggregation = Duration::default(); + let mut total_time_grenad_insert = Duration::default(); + + // This map is assumed to not consume a lot of memory. + let mut document_word_positions_heap = BinaryHeap::new(); + let mut current_document_id = None; + + while let Some((key, value)) = docid_word_positions.next()? { + let (document_id_bytes, word_bytes) = try_split_array_at(key).unwrap(); + let document_id = u32::from_be_bytes(document_id_bytes); + let word = str::from_utf8(word_bytes)?; + + let curr_document_id = *current_document_id.get_or_insert(document_id); + if curr_document_id != document_id { + let document_word_positions_heap = mem::take(&mut document_word_positions_heap); + document_word_positions_into_sorter( + curr_document_id, + document_word_positions_heap, + &mut word_pair_proximity_docids_sorter, + &mut total_time_aggregation, + &mut total_time_grenad_insert, + )?; + number_of_documents += 1; + current_document_id = Some(document_id); + } + + let word = word.to_string(); + let mut iter = read_u32_ne_bytes(value).collect::>().into_iter(); + if let Some(position) = iter.next() { + document_word_positions_heap.push(PeekedWordPosition { word, position, iter }); + } + } + + if let Some(document_id) = current_document_id { + // We must make sure that don't lose the current document field id + // word count map if we break because we reached the end of the chunk. 
+ let document_word_positions_heap = mem::take(&mut document_word_positions_heap); + document_word_positions_into_sorter( + document_id, + document_word_positions_heap, + &mut word_pair_proximity_docids_sorter, + &mut total_time_aggregation, + &mut total_time_grenad_insert, + )?; + } + + debug!( + "Number of documents {} + - we took {:02?} to aggregate proximities + - we took {:02?} to grenad insert those proximities", + number_of_documents, total_time_aggregation, total_time_grenad_insert, + ); + + sorter_into_reader(word_pair_proximity_docids_sorter, indexer) +} + +/// Fills the list of all pairs of words with the shortest proximity between 1 and 7 inclusive. +/// +/// This list is used by the engine to calculate the documents containing words that are +/// close to each other. +fn document_word_positions_into_sorter<'b>( + document_id: DocumentId, + mut word_positions_heap: BinaryHeap>>, + word_pair_proximity_docids_sorter: &mut grenad::Sorter, + total_time_aggregation: &mut Duration, + total_time_grenad_insert: &mut Duration, +) -> Result<()> { + let before_aggregating = Instant::now(); + let mut word_pair_proximity = HashMap::new(); + let mut ordered_peeked_word_positions = Vec::new(); + while !word_positions_heap.is_empty() { + while let Some(peeked_word_position) = word_positions_heap.pop() { + ordered_peeked_word_positions.push(peeked_word_position); + if ordered_peeked_word_positions.len() == 7 { + break; + } + } + + if let Some((head, tail)) = ordered_peeked_word_positions.split_first() { + for PeekedWordPosition { word, position, .. } in tail { + let prox = positions_proximity(head.position, *position); + if prox > 0 && prox < MAX_DISTANCE { + word_pair_proximity + .entry((head.word.clone(), word.clone())) + .and_modify(|p| { + *p = cmp::min(*p, prox); + }) + .or_insert(prox); + + // We also compute the inverse proximity. + let prox = prox + 1; + if prox < MAX_DISTANCE { + word_pair_proximity + .entry((word.clone(), head.word.clone())) + .and_modify(|p| { + *p = cmp::min(*p, prox); + }) + .or_insert(prox); + } + } + } + + // Push the tail in the heap. + let tail_iter = ordered_peeked_word_positions.drain(1..); + word_positions_heap.extend(tail_iter); + + // Advance the head and push it in the heap. 
+ if let Some(mut head) = ordered_peeked_word_positions.pop() { + if let Some(next_position) = head.iter.next() { + word_positions_heap.push(PeekedWordPosition { + word: head.word, + position: next_position, + iter: head.iter, + }); + } + } + } + } + + *total_time_aggregation += before_aggregating.elapsed(); + + let mut key_buffer = Vec::new(); + for ((w1, w2), prox) in word_pair_proximity { + key_buffer.clear(); + key_buffer.extend_from_slice(w1.as_bytes()); + key_buffer.push(0); + key_buffer.extend_from_slice(w2.as_bytes()); + key_buffer.push(prox as u8); + + let before_grenad_insert = Instant::now(); + word_pair_proximity_docids_sorter.insert(&key_buffer, &document_id.to_ne_bytes())?; + *total_time_grenad_insert += before_grenad_insert.elapsed(); + } + + Ok(()) +} + +struct PeekedWordPosition { + word: String, + position: u32, + iter: I, +} + +impl Ord for PeekedWordPosition { + fn cmp(&self, other: &Self) -> Ordering { + self.position.cmp(&other.position).reverse() + } +} + +impl PartialOrd for PeekedWordPosition { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Eq for PeekedWordPosition {} + +impl PartialEq for PeekedWordPosition { + fn eq(&self, other: &Self) -> bool { + self.position == other.position + } +} diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs new file mode 100644 index 000000000..b24c80da4 --- /dev/null +++ b/milli/src/update/index_documents/extract/mod.rs @@ -0,0 +1,199 @@ +mod extract_docid_word_positions; +mod extract_facet_number_docids; +mod extract_facet_string_docids; +mod extract_fid_docid_facet_values; +mod extract_fid_word_count_docids; +mod extract_word_docids; +mod extract_word_level_position_docids; +mod extract_word_pair_proximity_docids; + +use std::collections::HashSet; +use std::fs::File; + +use crossbeam_channel::Sender; +use rayon::prelude::*; + +use self::extract_docid_word_positions::extract_docid_word_positions; +use self::extract_facet_number_docids::extract_facet_number_docids; +use self::extract_facet_string_docids::extract_facet_string_docids; +use self::extract_fid_docid_facet_values::extract_fid_docid_facet_values; +use self::extract_fid_word_count_docids::extract_fid_word_count_docids; +use self::extract_word_docids::extract_word_docids; +use self::extract_word_level_position_docids::extract_word_level_position_docids; +use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids; +use super::helpers::{ + into_clonable_grenad, keep_first_prefix_value_merge_roaring_bitmaps, merge_cbo_roaring_bitmaps, + merge_readers, merge_roaring_bitmaps, CursorClonableMmap, GrenadParameters, MergeFn, +}; +use super::{helpers, TypedChunk}; +use crate::{FieldId, Result}; + +/// Extract data for each databases from obkv documents in parallel. +/// Send data in grenad file over provided Sender. 
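The overall shape of this step, several extraction workers funnelling typed chunks to the single writer that owns the LMDB transaction, can be sketched with std primitives standing in for rayon and crossbeam-channel (the Chunk enum and payloads below are placeholders, not the crate's TypedChunk):

use std::sync::mpsc;
use std::thread;

// Stand-in for TypedChunk: each extractor tags the data it produced.
enum Chunk {
    WordDocids(String),
    FacetNumberDocids(String),
}

fn main() {
    let (sender, receiver) = mpsc::channel();

    // Extraction side: independent workers, each sending its chunks.
    let word_sender = sender.clone();
    let words = thread::spawn(move || {
        word_sender.send(Chunk::WordDocids("serialized word docids".into())).unwrap();
    });
    let facet_sender = sender.clone();
    let facets = thread::spawn(move || {
        facet_sender.send(Chunk::FacetNumberDocids("serialized facet docids".into())).unwrap();
    });
    drop(sender); // the writer loop below ends once every extractor is done

    // Writer side: a single consumer owns the store and applies every chunk.
    for chunk in receiver {
        match chunk {
            Chunk::WordDocids(data) => println!("write word docids: {}", data),
            Chunk::FacetNumberDocids(data) => println!("write facet number docids: {}", data),
        }
    }

    words.join().unwrap();
    facets.join().unwrap();
}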
+pub(crate) fn data_from_obkv_documents( + obkv_chunks: impl Iterator>> + Send, + indexer: GrenadParameters, + lmdb_writer_sx: Sender, + searchable_fields: Option>, + faceted_fields: HashSet, +) -> Result<()> { + let result: Result<(Vec<_>, (Vec<_>, Vec<_>))> = obkv_chunks + .par_bridge() + .map(|result| { + let documents_chunk = result.and_then(|c| unsafe { into_clonable_grenad(c) }).unwrap(); + + lmdb_writer_sx.send(TypedChunk::Documents(documents_chunk.clone())).unwrap(); + + let (docid_word_positions_chunk, docid_fid_facet_values_chunks): ( + Result<_>, + Result<_>, + ) = rayon::join( + || { + let (documents_ids, docid_word_positions_chunk) = extract_docid_word_positions( + documents_chunk.clone(), + indexer.clone(), + &searchable_fields, + )?; + + // send documents_ids to DB writer + lmdb_writer_sx.send(TypedChunk::NewDocumentsIds(documents_ids)).unwrap(); + + // send docid_word_positions_chunk to DB writer + let docid_word_positions_chunk = + unsafe { into_clonable_grenad(docid_word_positions_chunk)? }; + lmdb_writer_sx + .send(TypedChunk::DocidWordPositions(docid_word_positions_chunk.clone())) + .unwrap(); + Ok(docid_word_positions_chunk) + }, + || { + let (docid_fid_facet_numbers_chunk, docid_fid_facet_strings_chunk) = + extract_fid_docid_facet_values( + documents_chunk.clone(), + indexer.clone(), + &faceted_fields, + )?; + + // send docid_fid_facet_numbers_chunk to DB writer + let docid_fid_facet_numbers_chunk = + unsafe { into_clonable_grenad(docid_fid_facet_numbers_chunk)? }; + lmdb_writer_sx + .send(TypedChunk::FieldIdDocidFacetNumbers( + docid_fid_facet_numbers_chunk.clone(), + )) + .unwrap(); + + // send docid_fid_facet_strings_chunk to DB writer + let docid_fid_facet_strings_chunk = + unsafe { into_clonable_grenad(docid_fid_facet_strings_chunk)? 
}; + lmdb_writer_sx + .send(TypedChunk::FieldIdDocidFacetStrings( + docid_fid_facet_strings_chunk.clone(), + )) + .unwrap(); + + Ok((docid_fid_facet_numbers_chunk, docid_fid_facet_strings_chunk)) + }, + ); + Ok((docid_word_positions_chunk?, docid_fid_facet_values_chunks?)) + }) + .collect(); + + let ( + docid_word_positions_chunks, + (docid_fid_facet_numbers_chunks, docid_fid_facet_strings_chunks), + ) = result?; + + spawn_extraction_task( + docid_word_positions_chunks.clone(), + indexer.clone(), + lmdb_writer_sx.clone(), + extract_word_pair_proximity_docids, + merge_cbo_roaring_bitmaps, + TypedChunk::WordPairProximityDocids, + "word-pair-proximity-docids", + ); + + spawn_extraction_task( + docid_word_positions_chunks.clone(), + indexer.clone(), + lmdb_writer_sx.clone(), + extract_fid_word_count_docids, + merge_cbo_roaring_bitmaps, + TypedChunk::FieldIdWordcountDocids, + "field-id-wordcount-docids", + ); + + spawn_extraction_task( + docid_word_positions_chunks.clone(), + indexer.clone(), + lmdb_writer_sx.clone(), + extract_word_docids, + merge_roaring_bitmaps, + TypedChunk::WordDocids, + "word-docids", + ); + + spawn_extraction_task( + docid_word_positions_chunks.clone(), + indexer.clone(), + lmdb_writer_sx.clone(), + extract_word_level_position_docids, + merge_cbo_roaring_bitmaps, + TypedChunk::WordLevelPositionDocids, + "word-level-position-docids", + ); + + spawn_extraction_task( + docid_fid_facet_strings_chunks.clone(), + indexer.clone(), + lmdb_writer_sx.clone(), + extract_facet_string_docids, + keep_first_prefix_value_merge_roaring_bitmaps, + TypedChunk::FieldIdFacetStringDocids, + "field-id-facet-string-docids", + ); + + spawn_extraction_task( + docid_fid_facet_numbers_chunks.clone(), + indexer.clone(), + lmdb_writer_sx.clone(), + extract_facet_number_docids, + merge_cbo_roaring_bitmaps, + TypedChunk::FieldIdFacetNumberDocids, + "field-id-facet-number-docids", + ); + + Ok(()) +} + +/// Spawn a new task to extract data for a specific DB using extract_fn. +/// Generated grenad chunks are merged using the merge_fn. +/// The result of merged chunks is serialized as TypedChunk using the serialize_fn +/// and sent into lmdb_writer_sx. 
+fn spawn_extraction_task( + chunks: Vec>, + indexer: GrenadParameters, + lmdb_writer_sx: Sender, + extract_fn: FE, + merge_fn: MergeFn, + serialize_fn: FS, + name: &'static str, +) where + FE: Fn(grenad::Reader, GrenadParameters) -> Result> + + Sync + + Send + + 'static, + FS: Fn(grenad::Reader) -> TypedChunk + Sync + Send + 'static, +{ + rayon::spawn(move || { + let chunks: Vec<_> = chunks + .into_par_iter() + .map(|chunk| extract_fn(chunk, indexer.clone()).unwrap()) + .collect(); + rayon::spawn(move || { + let reader = merge_readers(chunks, merge_fn, indexer).unwrap(); + lmdb_writer_sx.send(serialize_fn(reader)).unwrap(); + }); + }); +} diff --git a/milli/src/update/index_documents/helpers/clonable_mmap.rs b/milli/src/update/index_documents/helpers/clonable_mmap.rs new file mode 100644 index 000000000..b16c080ff --- /dev/null +++ b/milli/src/update/index_documents/helpers/clonable_mmap.rs @@ -0,0 +1,22 @@ +use std::sync::Arc; + +use memmap::Mmap; + +#[derive(Debug, Clone)] +pub struct ClonableMmap { + inner: Arc, +} + +impl AsRef<[u8]> for ClonableMmap { + fn as_ref(&self) -> &[u8] { + self.inner.as_ref() + } +} + +impl From for ClonableMmap { + fn from(inner: Mmap) -> ClonableMmap { + ClonableMmap { inner: Arc::new(inner) } + } +} + +pub type CursorClonableMmap = std::io::Cursor; diff --git a/milli/src/update/index_documents/helpers/grenad_helpers.rs b/milli/src/update/index_documents/helpers/grenad_helpers.rs new file mode 100644 index 000000000..9dd261f73 --- /dev/null +++ b/milli/src/update/index_documents/helpers/grenad_helpers.rs @@ -0,0 +1,276 @@ +use std::borrow::Cow; +use std::fs::File; +use std::io::{self, Seek, SeekFrom}; +use std::time::Instant; + +use byte_unit::Byte; +use grenad::{CompressionType, MergerIter, Reader, Sorter}; +use heed::types::ByteSlice; +use log::debug; + +use super::{ClonableMmap, MergeFn}; +use crate::error::InternalError; +use crate::update::index_documents::WriteMethod; +use crate::Result; + +pub type CursorClonableMmap = io::Cursor; + +pub fn create_writer( + typ: grenad::CompressionType, + level: Option, + file: R, +) -> io::Result> { + let mut builder = grenad::Writer::builder(); + builder.compression_type(typ); + if let Some(level) = level { + builder.compression_level(level); + } + builder.build(file) +} + +pub fn create_sorter( + merge: MergeFn, + chunk_compression_type: grenad::CompressionType, + chunk_compression_level: Option, + max_nb_chunks: Option, + max_memory: Option, +) -> grenad::Sorter { + let mut builder = grenad::Sorter::builder(merge); + builder.chunk_compression_type(chunk_compression_type); + if let Some(level) = chunk_compression_level { + builder.chunk_compression_level(level); + } + if let Some(nb_chunks) = max_nb_chunks { + builder.max_nb_chunks(nb_chunks); + } + if let Some(memory) = max_memory { + builder.dump_threshold(memory); + builder.allow_realloc(false); + } + builder.build() +} + +pub fn sorter_into_reader( + sorter: grenad::Sorter, + indexer: GrenadParameters, +) -> Result> { + let mut writer = tempfile::tempfile().and_then(|file| { + create_writer(indexer.chunk_compression_type, indexer.chunk_compression_level, file) + })?; + sorter.write_into(&mut writer)?; + Ok(writer_into_reader(writer)?) 
+} + +pub fn writer_into_reader(writer: grenad::Writer) -> Result> { + let mut file = writer.into_inner()?; + file.seek(SeekFrom::Start(0))?; + grenad::Reader::new(file).map_err(Into::into) +} + +pub unsafe fn into_clonable_grenad( + reader: grenad::Reader, +) -> Result> { + let file = reader.into_inner(); + let mmap = memmap::Mmap::map(&file)?; + let cursor = io::Cursor::new(ClonableMmap::from(mmap)); + let reader = grenad::Reader::new(cursor)?; + Ok(reader) +} + +pub fn merge_readers( + readers: Vec>, + merge_fn: MergeFn, + indexer: GrenadParameters, +) -> Result> { + let mut merger_builder = grenad::MergerBuilder::new(merge_fn); + merger_builder.extend(readers); + let merger = merger_builder.build(); + let mut writer = tempfile::tempfile().and_then(|file| { + create_writer(indexer.chunk_compression_type, indexer.chunk_compression_level, file) + })?; + merger.write_into(&mut writer)?; + let reader = writer_into_reader(writer)?; + Ok(reader) +} + +#[derive(Debug, Clone, Copy)] +pub struct GrenadParameters { + pub chunk_compression_type: CompressionType, + pub chunk_compression_level: Option, + pub max_memory: Option, + pub max_nb_chunks: Option, +} + +impl Default for GrenadParameters { + fn default() -> Self { + Self { + chunk_compression_type: CompressionType::None, + chunk_compression_level: None, + max_memory: None, + max_nb_chunks: None, + } + } +} + +impl GrenadParameters { + pub fn max_memory_by_thread(&self) -> Option { + self.max_memory.map(|max_memory| max_memory / rayon::current_num_threads()) + } +} + +/// Returns an iterator that outputs grenad readers of obkv documents +/// with a maximum size of approximately `documents_chunks_size`. +/// +/// The grenad obkv entries are composed of an incremental document id big-endian +/// encoded as the key and an obkv object with an `u8` for the field as the key +/// and a simple UTF-8 encoded string as the value. +pub fn grenad_obkv_into_chunks( + mut reader: grenad::Reader, + indexer: GrenadParameters, + log_frequency: Option, + documents_chunk_size: Byte, +) -> Result>>> { + let mut document_count = 0; + let mut continue_reading = true; + + let indexer_clone = indexer.clone(); + let mut transposer = move || { + if !continue_reading { + return Ok(None); + } + + let mut current_chunk_size = 0u64; + let mut obkv_documents = tempfile::tempfile().and_then(|file| { + create_writer( + indexer_clone.chunk_compression_type, + indexer_clone.chunk_compression_level, + file, + ) + })?; + + while let Some((document_id, obkv)) = reader.next()? 
{ + obkv_documents.insert(document_id, obkv)?; + current_chunk_size += document_id.len() as u64 + obkv.len() as u64; + + document_count += 1; + if log_frequency.map_or(false, |log_frequency| document_count % log_frequency == 0) { + debug!("reached {} chunked documents", document_count); + } + + if current_chunk_size >= documents_chunk_size.get_bytes() { + return writer_into_reader(obkv_documents).map(Some); + } + } + + continue_reading = false; + writer_into_reader(obkv_documents).map(Some) + }; + + Ok(std::iter::from_fn(move || { + let result = transposer().transpose(); + if result.as_ref().map_or(false, |r| r.is_ok()) { + debug!( + "A new chunk of approximately {} has been generated", + documents_chunk_size.get_appropriate_unit(true), + ); + } + result + })) +} + +pub fn write_into_lmdb_database( + wtxn: &mut heed::RwTxn, + database: heed::PolyDatabase, + mut reader: Reader, + merge: MergeFn, + method: WriteMethod, +) -> Result<()> { + debug!("Writing MTBL stores..."); + let before = Instant::now(); + + match method { + WriteMethod::Append => { + let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?; + while let Some((k, v)) = reader.next()? { + // safety: we don't keep references from inside the LMDB database. + unsafe { out_iter.append(k, v)? }; + } + } + WriteMethod::GetMergePut => { + while let Some((k, v)) = reader.next()? { + let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?; + match iter.next().transpose()? { + Some((key, old_val)) if key == k => { + let vals = &[Cow::Borrowed(old_val), Cow::Borrowed(v)][..]; + let val = merge(k, &vals)?; + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.put_current(k, &val)? }; + } + _ => { + drop(iter); + database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?; + } + } + } + } + } + + debug!("MTBL stores merged in {:.02?}!", before.elapsed()); + Ok(()) +} + +pub fn sorter_into_lmdb_database( + wtxn: &mut heed::RwTxn, + database: heed::PolyDatabase, + sorter: Sorter, + merge: MergeFn, + method: WriteMethod, +) -> Result<()> { + debug!("Writing MTBL sorter..."); + let before = Instant::now(); + + merger_iter_into_lmdb_database(wtxn, database, sorter.into_merger_iter()?, merge, method)?; + + debug!("MTBL sorter writen in {:.02?}!", before.elapsed()); + Ok(()) +} + +fn merger_iter_into_lmdb_database( + wtxn: &mut heed::RwTxn, + database: heed::PolyDatabase, + mut sorter: MergerIter, + merge: MergeFn, + method: WriteMethod, +) -> Result<()> { + match method { + WriteMethod::Append => { + let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?; + while let Some((k, v)) = sorter.next()? { + // safety: we don't keep references from inside the LMDB database. + unsafe { out_iter.append(k, v)? }; + } + } + WriteMethod::GetMergePut => { + while let Some((k, v)) = sorter.next()? { + let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?; + match iter.next().transpose()? { + Some((key, old_val)) if key == k => { + let vals = vec![Cow::Borrowed(old_val), Cow::Borrowed(v)]; + let val = merge(k, &vals).map_err(|_| { + // TODO just wrap this error? + InternalError::IndexingMergingKeys { process: "get-put-merge" } + })?; + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.put_current(k, &val)? 
}; + } + _ => { + drop(iter); + database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?; + } + } + } + } + } + + Ok(()) +} diff --git a/milli/src/update/index_documents/helpers/merge_functions.rs b/milli/src/update/index_documents/helpers/merge_functions.rs new file mode 100644 index 000000000..6a592e54d --- /dev/null +++ b/milli/src/update/index_documents/helpers/merge_functions.rs @@ -0,0 +1,171 @@ +use std::borrow::Cow; +use std::io; +use std::result::Result as StdResult; + +use roaring::RoaringBitmap; + +use super::read_u32_ne_bytes; +use crate::heed_codec::facet::{decode_prefix_string, encode_prefix_string}; +use crate::heed_codec::CboRoaringBitmapCodec; +use crate::Result; + +pub type MergeFn = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> Result>; + +pub fn concat_u32s_array<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result> { + if values.len() == 1 { + Ok(values[0].clone()) + } else { + let capacity = values.iter().map(|v| v.len()).sum::(); + let mut output = Vec::with_capacity(capacity); + values.iter().for_each(|integers| output.extend_from_slice(integers)); + Ok(Cow::Owned(output)) + } +} + +pub fn roaring_bitmap_from_u32s_array(slice: &[u8]) -> RoaringBitmap { + read_u32_ne_bytes(slice).collect() +} + +pub fn serialize_roaring_bitmap(bitmap: &RoaringBitmap, buffer: &mut Vec) -> io::Result<()> { + buffer.clear(); + buffer.reserve(bitmap.serialized_size()); + bitmap.serialize_into(buffer) +} + +pub fn merge_roaring_bitmaps<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result> { + if values.len() == 1 { + Ok(values[0].clone()) + } else { + let merged = values + .iter() + .map(AsRef::as_ref) + .map(RoaringBitmap::deserialize_from) + .map(StdResult::unwrap) + .reduce(|a, b| a | b) + .unwrap(); + let mut buffer = Vec::new(); + serialize_roaring_bitmap(&merged, &mut buffer)?; + Ok(Cow::Owned(buffer)) + } +} + +pub fn keep_first_prefix_value_merge_roaring_bitmaps<'a>( + _key: &[u8], + values: &[Cow<'a, [u8]>], +) -> Result> { + if values.len() == 1 { + Ok(values[0].clone()) + } else { + let original = decode_prefix_string(&values[0]).unwrap().0; + let merged_bitmaps = values + .iter() + .map(AsRef::as_ref) + .map(decode_prefix_string) + .map(Option::unwrap) + .map(|(_, bitmap_bytes)| bitmap_bytes) + .map(RoaringBitmap::deserialize_from) + .map(StdResult::unwrap) + .reduce(|a, b| a | b) + .unwrap(); + + let cap = std::mem::size_of::() + original.len() + merged_bitmaps.serialized_size(); + let mut buffer = Vec::with_capacity(cap); + encode_prefix_string(original, &mut buffer)?; + merged_bitmaps.serialize_into(&mut buffer)?; + Ok(Cow::Owned(buffer)) + } +} + +pub fn keep_first<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result> { + Ok(values[0].clone()) +} + +/// Only the last value associated with an id is kept. +pub fn keep_latest_obkv<'a>(_key: &[u8], obkvs: &[Cow<'a, [u8]>]) -> Result> { + Ok(obkvs.last().unwrap().clone()) +} + +/// Merge all the obks in the order we see them. 
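The `MergeFn` alias above is the single contract all of these helpers share: given a key and every value that was collected for it, produce one merged value. As a rough, self-contained illustration of that shape (not code from this patch, and using a plain `io::Result` instead of milli's `Result`), a bitmap-union merge function could look like this:

use std::borrow::Cow;
use std::io;

use roaring::RoaringBitmap;

// Same signature shape as the `MergeFn` alias above.
fn union_bitmaps<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> io::Result<Cow<'a, [u8]>> {
    if values.len() == 1 {
        // Nothing to merge, reuse the single serialized bitmap as-is.
        return Ok(values[0].clone());
    }
    let mut union = RoaringBitmap::new();
    for value in values {
        // Every value is a serialized RoaringBitmap: deserialize and union it.
        union |= RoaringBitmap::deserialize_from(&value[..])?;
    }
    let mut buffer = Vec::with_capacity(union.serialized_size());
    union.serialize_into(&mut buffer)?;
    Ok(Cow::Owned(buffer))
}

This is essentially what `merge_roaring_bitmaps` above does, minus the error plumbing; the obkv and prefix-string variants only differ in how they decode and recombine the values.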
+pub fn merge_obkvs<'a>(_key: &[u8], obkvs: &[Cow<'a, [u8]>]) -> Result> { + Ok(obkvs + .into_iter() + .cloned() + .reduce(|acc, current| { + let first = obkv::KvReader::new(&acc); + let second = obkv::KvReader::new(¤t); + let mut buffer = Vec::new(); + merge_two_obkvs(first, second, &mut buffer); + Cow::from(buffer) + }) + .unwrap()) +} + +pub fn merge_two_obkvs(base: obkv::KvReaderU16, update: obkv::KvReaderU16, buffer: &mut Vec) { + use itertools::merge_join_by; + use itertools::EitherOrBoth::{Both, Left, Right}; + + buffer.clear(); + + let mut writer = obkv::KvWriter::new(buffer); + for eob in merge_join_by(base.iter(), update.iter(), |(b, _), (u, _)| b.cmp(u)) { + match eob { + Both(_, (k, v)) | Left((k, v)) | Right((k, v)) => writer.insert(k, v).unwrap(), + } + } + + writer.finish().unwrap(); +} + +pub fn merge_cbo_roaring_bitmaps<'a>( + _key: &[u8], + values: &[Cow<'a, [u8]>], +) -> Result> { + match values.split_first().unwrap() { + (head, []) => Ok(head.clone()), + (head, tail) => { + let mut head = CboRoaringBitmapCodec::deserialize_from(&head[..])?; + + for value in tail { + head |= CboRoaringBitmapCodec::deserialize_from(&value[..])?; + } + + let mut vec = Vec::new(); + CboRoaringBitmapCodec::serialize_into(&head, &mut vec); + Ok(Cow::from(vec)) + } + } +} + +// /// Uses the FacetStringLevelZeroValueCodec to merge the values. +// pub fn tuple_string_cbo_roaring_bitmap_merge<'a>( +// _key: &[u8], +// values: &[Cow<[u8]>], +// ) -> Result> { +// let (head, tail) = values.split_first().unwrap(); +// let (head_string, mut head_rb) = FacetStringLevelZeroValueCodec::bytes_decode(&head[..]) +// .ok_or(SerializationError::Decoding { db_name: None })?; + +// for value in tail { +// let (_string, rb) = FacetStringLevelZeroValueCodec::bytes_decode(&value[..]) +// .ok_or(SerializationError::Decoding { db_name: None })?; +// head_rb |= rb; +// } + +// FacetStringLevelZeroValueCodec::bytes_encode(&(head_string, head_rb)) +// .map(|cow| cow.into_owned()) +// .ok_or(SerializationError::Encoding { db_name: None }) +// .map_err(Into::into) +// } + +// pub fn cbo_roaring_bitmap_merge<'a>(_key: &[u8], values: &[Cow<[u8]>]) -> Result> { +// let (head, tail) = values.split_first().unwrap(); +// let mut head = CboRoaringBitmapCodec::deserialize_from(&head[..])?; + +// for value in tail { +// head |= CboRoaringBitmapCodec::deserialize_from(&value[..])?; +// } + +// let mut vec = Vec::new(); +// CboRoaringBitmapCodec::serialize_into(&head, &mut vec); +// Ok(vec) +// } diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs new file mode 100644 index 000000000..baacb0a1b --- /dev/null +++ b/milli/src/update/index_documents/helpers/mod.rs @@ -0,0 +1,49 @@ +mod clonable_mmap; +mod grenad_helpers; +mod merge_functions; + +use std::convert::{TryFrom, TryInto}; + +pub use clonable_mmap::{ClonableMmap, CursorClonableMmap}; +pub use grenad_helpers::{ + create_sorter, create_writer, grenad_obkv_into_chunks, into_clonable_grenad, merge_readers, + sorter_into_lmdb_database, sorter_into_reader, write_into_lmdb_database, writer_into_reader, + GrenadParameters, +}; +pub use merge_functions::{ + concat_u32s_array, keep_first, keep_first_prefix_value_merge_roaring_bitmaps, keep_latest_obkv, + merge_cbo_roaring_bitmaps, merge_obkvs, merge_roaring_bitmaps, merge_two_obkvs, + roaring_bitmap_from_u32s_array, serialize_roaring_bitmap, MergeFn, +}; + +pub fn valid_lmdb_key(key: impl AsRef<[u8]>) -> bool { + key.as_ref().len() <= 511 +} + +/// Divides one slice into two 
at an index, returns `None` if mid is out of bounds. +pub fn try_split_at(slice: &[T], mid: usize) -> Option<(&[T], &[T])> { + if mid <= slice.len() { + Some(slice.split_at(mid)) + } else { + None + } +} + +/// Divides one slice into an array and the tail at an index, +/// returns `None` if `N` is out of bounds. +pub fn try_split_array_at(slice: &[T]) -> Option<([T; N], &[T])> +where + [T; N]: for<'a> TryFrom<&'a [T]>, +{ + let (head, tail) = try_split_at(slice, N)?; + let head = head.try_into().ok()?; + Some((head, tail)) +} + +// pub fn pretty_thousands, T: fmt::Display>(number: A) -> String { +// thousands::Separable::separate_with_spaces(number.borrow()) +// } + +pub fn read_u32_ne_bytes(bytes: &[u8]) -> impl Iterator + '_ { + bytes.chunks_exact(4).flat_map(TryInto::try_into).map(u32::from_ne_bytes) +} diff --git a/milli/src/update/index_documents/merge_function.rs b/milli/src/update/index_documents/merge_function.rs deleted file mode 100644 index 7e5d0b581..000000000 --- a/milli/src/update/index_documents/merge_function.rs +++ /dev/null @@ -1,106 +0,0 @@ -use std::borrow::Cow; -use std::result::Result as StdResult; - -use fst::IntoStreamer; -use heed::{BytesDecode, BytesEncode}; -use roaring::RoaringBitmap; - -use crate::error::SerializationError; -use crate::heed_codec::facet::FacetStringLevelZeroValueCodec; -use crate::heed_codec::CboRoaringBitmapCodec; -use crate::Result; - -/// Only the last value associated with an id is kept. -pub fn keep_latest_obkv(_key: &[u8], obkvs: &[Cow<[u8]>]) -> Result> { - Ok(obkvs.last().unwrap().clone().into_owned()) -} - -/// Merge all the obks in the order we see them. -pub fn merge_obkvs(_key: &[u8], obkvs: &[Cow<[u8]>]) -> Result> { - let mut iter = obkvs.iter(); - let first = iter.next().map(|b| b.clone().into_owned()).unwrap(); - Ok(iter.fold(first, |acc, current| { - let first = obkv::KvReader::new(&acc); - let second = obkv::KvReader::new(current); - let mut buffer = Vec::new(); - merge_two_obkvs(first, second, &mut buffer); - buffer - })) -} - -// Union of multiple FSTs -pub fn fst_merge(_key: &[u8], values: &[Cow<[u8]>]) -> Result> { - let fsts = values.iter().map(fst::Set::new).collect::, _>>()?; - let op_builder: fst::set::OpBuilder = fsts.iter().map(|fst| fst.into_stream()).collect(); - let op = op_builder.r#union(); - - let mut build = fst::SetBuilder::memory(); - build.extend_stream(op.into_stream()).unwrap(); - Ok(build.into_inner().unwrap()) -} - -pub fn keep_first(_key: &[u8], values: &[Cow<[u8]>]) -> Result> { - Ok(values.first().unwrap().to_vec()) -} - -pub fn merge_two_obkvs(base: obkv::KvReaderU16, update: obkv::KvReaderU16, buffer: &mut Vec) { - use itertools::merge_join_by; - use itertools::EitherOrBoth::{Both, Left, Right}; - - buffer.clear(); - - let mut writer = obkv::KvWriter::new(buffer); - for eob in merge_join_by(base.iter(), update.iter(), |(b, _), (u, _)| b.cmp(u)) { - match eob { - Both(_, (k, v)) | Left((k, v)) | Right((k, v)) => writer.insert(k, v).unwrap(), - } - } - - writer.finish().unwrap(); -} - -pub fn roaring_bitmap_merge(_key: &[u8], values: &[Cow<[u8]>]) -> Result> { - let (head, tail) = values.split_first().unwrap(); - let mut head = RoaringBitmap::deserialize_from(&head[..])?; - - for value in tail { - head |= RoaringBitmap::deserialize_from(&value[..])?; - } - - let mut vec = Vec::with_capacity(head.serialized_size()); - head.serialize_into(&mut vec)?; - Ok(vec) -} - -/// Uses the FacetStringLevelZeroValueCodec to merge the values. 
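`try_split_array_at` exists so callers can peel a fixed-width prefix (for example a big-endian `u32` document id) off the front of a key without panicking on short input. A small, hypothetical usage sketch:

// Hypothetical key layout: a big-endian u32 document id followed by a word.
fn split_docid_word(key: &[u8]) -> Option<(u32, &[u8])> {
    // Returns None instead of panicking when the key is shorter than 4 bytes.
    let (docid_bytes, word) = try_split_array_at::<u8, 4>(key)?;
    Some((u32::from_be_bytes(docid_bytes), word))
}

// split_docid_word(&[0, 0, 0, 7, b'c', b'a', b't']) == Some((7, &b"cat"[..]))
// split_docid_word(&[0, 0]) == None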
-pub fn tuple_string_cbo_roaring_bitmap_merge(_key: &[u8], values: &[Cow<[u8]>]) -> Result> { - let (head, tail) = values.split_first().unwrap(); - let (head_string, mut head_rb) = - FacetStringLevelZeroValueCodec::::bytes_decode(&head[..]) - .ok_or(SerializationError::Decoding { db_name: None })?; - - for value in tail { - let (_string, rb) = - FacetStringLevelZeroValueCodec::::bytes_decode(&value[..]) - .ok_or(SerializationError::Decoding { db_name: None })?; - head_rb |= rb; - } - - FacetStringLevelZeroValueCodec::::bytes_encode(&(head_string, head_rb)) - .map(|cow| cow.into_owned()) - .ok_or(SerializationError::Encoding { db_name: None }) - .map_err(Into::into) -} - -pub fn cbo_roaring_bitmap_merge(_key: &[u8], values: &[Cow<[u8]>]) -> Result> { - let (head, tail) = values.split_first().unwrap(); - let mut head = CboRoaringBitmapCodec::deserialize_from(&head[..])?; - - for value in tail { - head |= CboRoaringBitmapCodec::deserialize_from(&value[..])?; - } - - let mut vec = Vec::new(); - CboRoaringBitmapCodec::serialize_into(&head, &mut vec); - Ok(vec) -} diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index afae8cae9..4f488337c 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -1,240 +1,41 @@ -use std::borrow::Cow; +mod extract; +mod helpers; +mod transform; +mod typed_chunk; + use std::collections::HashSet; -use std::fs::File; -use std::io::{self, BufRead, BufReader, Seek, SeekFrom}; +use std::io::{self, BufRead, BufReader}; +use std::iter::FromIterator; use std::num::{NonZeroU32, NonZeroUsize}; -use std::result::Result as StdResult; -use std::str; -use std::sync::mpsc::sync_channel; use std::time::Instant; -use bstr::ByteSlice as _; +use byte_unit::Byte; use chrono::Utc; -use grenad::{CompressionType, FileFuse, Merger, MergerIter, Reader, Sorter, Writer}; -use heed::types::ByteSlice; -use log::{debug, error, info}; -use memmap::Mmap; -use rayon::prelude::*; +use crossbeam_channel::{Receiver, Sender}; +use grenad::{self, CompressionType}; +use log::{debug, info}; use rayon::ThreadPool; +use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; +use typed_chunk::{write_typed_chunk_into_index, TypedChunk}; -pub use self::merge_function::{ - cbo_roaring_bitmap_merge, fst_merge, keep_first, roaring_bitmap_merge, - tuple_string_cbo_roaring_bitmap_merge, +pub use self::helpers::{ + create_sorter, create_writer, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, + sorter_into_lmdb_database, write_into_lmdb_database, writer_into_reader, }; -use self::store::{Readers, Store}; +use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; pub use self::transform::{Transform, TransformOutput}; -use super::UpdateBuilder; -use crate::error::{Error, InternalError}; use crate::update::{ - Facets, UpdateIndexingStep, WordPrefixDocids, WordPrefixPairProximityDocids, + Facets, UpdateBuilder, UpdateIndexingStep, WordPrefixDocids, WordPrefixPairProximityDocids, WordsLevelPositions, WordsPrefixesFst, }; -use crate::{Index, MergeFn, Result}; - -mod merge_function; -mod store; -mod transform; +use crate::{Index, Result}; #[derive(Debug, Serialize, Deserialize, Clone)] pub struct DocumentAdditionResult { pub nb_documents: usize, } -#[derive(Debug, Copy, Clone)] -pub enum WriteMethod { - Append, - GetMergePut, -} - -pub fn create_writer( - typ: CompressionType, - level: Option, - file: File, -) -> io::Result> { - let mut builder = Writer::builder(); - builder.compression_type(typ); - if let Some(level) = 
level { - builder.compression_level(level); - } - builder.build(file) -} - -pub fn create_sorter( - merge: MergeFn, - chunk_compression_type: CompressionType, - chunk_compression_level: Option, - chunk_fusing_shrink_size: Option, - max_nb_chunks: Option, - max_memory: Option, -) -> Sorter> { - let mut builder = Sorter::builder(merge); - if let Some(shrink_size) = chunk_fusing_shrink_size { - builder.file_fusing_shrink_size(shrink_size); - } - builder.chunk_compression_type(chunk_compression_type); - if let Some(level) = chunk_compression_level { - builder.chunk_compression_level(level); - } - if let Some(nb_chunks) = max_nb_chunks { - builder.max_nb_chunks(nb_chunks); - } - if let Some(memory) = max_memory { - builder.max_memory(memory); - } - builder.build() -} - -pub fn writer_into_reader( - writer: Writer, - shrink_size: Option, -) -> Result> { - let mut file = writer.into_inner()?; - file.seek(SeekFrom::Start(0))?; - let file = if let Some(shrink_size) = shrink_size { - FileFuse::builder().shrink_size(shrink_size).build(file) - } else { - FileFuse::new(file) - }; - Reader::new(file).map_err(Into::into) -} - -pub fn merge_readers( - sources: Vec>, - merge: MergeFn, -) -> Merger> { - let mut builder = Merger::builder(merge); - builder.extend(sources); - builder.build() -} - -pub fn merge_into_lmdb_database( - wtxn: &mut heed::RwTxn, - database: heed::PolyDatabase, - sources: Vec>, - merge: MergeFn, - method: WriteMethod, -) -> Result<()> -where - Error: From, -{ - debug!("Merging {} MTBL stores...", sources.len()); - let before = Instant::now(); - - let merger = merge_readers(sources, merge); - merger_iter_into_lmdb_database(wtxn, database, merger.into_merge_iter()?, merge, method)?; - - debug!("MTBL stores merged in {:.02?}!", before.elapsed()); - Ok(()) -} - -pub fn write_into_lmdb_database( - wtxn: &mut heed::RwTxn, - database: heed::PolyDatabase, - mut reader: Reader, - merge: MergeFn, - method: WriteMethod, -) -> Result<()> -where - Error: From, -{ - debug!("Writing MTBL stores..."); - let before = Instant::now(); - - match method { - WriteMethod::Append => { - let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?; - while let Some((k, v)) = reader.next()? { - // safety: we don't keep references from inside the LMDB database. - unsafe { out_iter.append(k, v)? }; - } - } - WriteMethod::GetMergePut => { - while let Some((k, v)) = reader.next()? { - let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?; - match iter.next().transpose()? { - Some((key, old_val)) if key == k => { - let vals = &[Cow::Borrowed(old_val), Cow::Borrowed(v)][..]; - let val = merge(k, &vals)?; - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(k, &val)? 
}; - } - _ => { - drop(iter); - database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?; - } - } - } - } - } - - debug!("MTBL stores merged in {:.02?}!", before.elapsed()); - Ok(()) -} - -pub fn sorter_into_lmdb_database( - wtxn: &mut heed::RwTxn, - database: heed::PolyDatabase, - sorter: Sorter>, - merge: MergeFn, - method: WriteMethod, -) -> Result<()> -where - Error: From, - Error: From>, -{ - debug!("Writing MTBL sorter..."); - let before = Instant::now(); - - merger_iter_into_lmdb_database(wtxn, database, sorter.into_iter()?, merge, method)?; - - debug!("MTBL sorter writen in {:.02?}!", before.elapsed()); - Ok(()) -} - -fn merger_iter_into_lmdb_database( - wtxn: &mut heed::RwTxn, - database: heed::PolyDatabase, - mut sorter: MergerIter>, - merge: MergeFn, - method: WriteMethod, -) -> Result<()> -where - Error: From, -{ - match method { - WriteMethod::Append => { - let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?; - while let Some((k, v)) = sorter.next()? { - // safety: we don't keep references from inside the LMDB database. - unsafe { out_iter.append(k, v)? }; - } - } - WriteMethod::GetMergePut => { - while let Some((k, v)) = sorter.next()? { - let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?; - match iter.next().transpose()? { - Some((key, old_val)) if key == k => { - let vals = vec![Cow::Borrowed(old_val), Cow::Borrowed(v)]; - let val = merge(k, &vals).map_err(|_| { - // TODO just wrap this error? - InternalError::IndexingMergingKeys { process: "get-put-merge" } - })?; - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(k, &val)? }; - } - _ => { - drop(iter); - database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?; - } - } - } - } - } - - Ok(()) -} - #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] #[non_exhaustive] pub enum IndexDocumentsMethod { @@ -247,6 +48,12 @@ pub enum IndexDocumentsMethod { UpdateDocuments, } +#[derive(Debug, Copy, Clone)] +pub enum WriteMethod { + Append, + GetMergePut, +} + #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] #[non_exhaustive] pub enum UpdateFormat { @@ -262,12 +69,11 @@ pub struct IndexDocuments<'t, 'u, 'i, 'a> { wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, pub(crate) log_every_n: Option, + pub(crate) documents_chunk_size: Option, pub(crate) max_nb_chunks: Option, pub(crate) max_memory: Option, - pub(crate) linked_hash_map_size: Option, pub(crate) chunk_compression_type: CompressionType, pub(crate) chunk_compression_level: Option, - pub(crate) chunk_fusing_shrink_size: Option, pub(crate) thread_pool: Option<&'a ThreadPool>, facet_level_group_size: Option, facet_min_level_size: Option, @@ -291,12 +97,11 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { wtxn, index, log_every_n: None, + documents_chunk_size: None, max_nb_chunks: None, max_memory: None, - linked_hash_map_size: None, chunk_compression_type: CompressionType::None, chunk_compression_level: None, - chunk_fusing_shrink_size: None, thread_pool: None, facet_level_group_size: None, facet_min_level_size: None, @@ -344,14 +149,12 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { let before_transform = Instant::now(); let update_id = self.update_id; let progress_callback = |step| progress_callback(step, update_id); - let transform = Transform { rtxn: &self.wtxn, index: self.index, log_every_n: self.log_every_n, chunk_compression_type: self.chunk_compression_type, chunk_compression_level: self.chunk_compression_level, - 
chunk_fusing_shrink_size: self.chunk_fusing_shrink_size,
             max_nb_chunks: self.max_nb_chunks,
             max_memory: self.max_memory,
             index_documents_method: self.update_method,
@@ -378,8 +181,6 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
     where
         F: Fn(UpdateIndexingStep) + Sync,
     {
-        let before_indexing = Instant::now();
-
         let TransformOutput {
             primary_key,
             fields_ids_map,
@@ -395,6 +196,65 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
         // up to date field map.
         self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?;

+        let backup_pool;
+        let pool = match self.thread_pool {
+            Some(pool) => pool,
+            #[cfg(not(test))]
+            None => {
+                // We initialize a backup pool with the default
+                // settings if none have already been set.
+                backup_pool = rayon::ThreadPoolBuilder::new().build()?;
+                &backup_pool
+            }
+            #[cfg(test)]
+            None => {
+                // We initialize a backup pool with the default
+                // settings if none have already been set.
+                backup_pool = rayon::ThreadPoolBuilder::new().num_threads(1).build()?;
+                &backup_pool
+            }
+        };
+
+        let documents_file = grenad::Reader::new(documents_file)?;
+
+        // create LMDB writer channel
+        let (lmdb_writer_sx, lmdb_writer_rx): (Sender<TypedChunk>, Receiver<TypedChunk>) =
+            crossbeam_channel::unbounded();
+
+        // get searchable fields for word databases
+        let searchable_fields =
+            self.index.searchable_fields_ids(self.wtxn)?.map(HashSet::from_iter);
+        // get filterable fields for facet databases
+        let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?;
+
+        // Run extraction pipeline in parallel.
+        pool.install(|| {
+            let params = GrenadParameters {
+                chunk_compression_type: self.chunk_compression_type,
+                chunk_compression_level: self.chunk_compression_level,
+                max_memory: self.max_memory,
+                max_nb_chunks: self.max_nb_chunks, // default value, may be chosen.
+            };
+
+            // split obkv file into several chunks
+            let mut chunk_iter = grenad_obkv_into_chunks(
+                documents_file,
+                params.clone(),
+                self.log_every_n,
+                Byte::from_bytes(self.documents_chunk_size.unwrap_or(1024 * 1024 * 128) as u64), // 128MiB
+            )
+            .unwrap();
+            // extract all databases from the chunked obkv documents
+            extract::data_from_obkv_documents(
+                &mut chunk_iter,
+                params,
+                lmdb_writer_sx,
+                searchable_fields,
+                faceted_fields,
+            )
+            .unwrap();
+        });
+
         // We delete the documents that this document addition replaces. This way we are
         // able to simply insert all the documents even if they already exist in the database.
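The block above is a producer/consumer split: the rayon pool runs the extractors over document chunks and pushes finished chunks into an unbounded crossbeam channel, while the single write transaction further down drains the receiver and writes into LMDB. Stripped of the milli-specific types, the pattern looks roughly like this (all names here are illustrative, not from the patch):

use crossbeam_channel::{Receiver, Sender};

// Stand-in for milli's TypedChunk: whatever the extractors produce.
enum ExtractedChunk {
    WordDocids(Vec<u8>),
    Documents(Vec<u8>),
}

fn run_pipeline(pool: &rayon::ThreadPool, inputs: Vec<Vec<u8>>) {
    let (tx, rx): (Sender<ExtractedChunk>, Receiver<ExtractedChunk>) =
        crossbeam_channel::unbounded();

    // Producers: extraction runs inside the pool, in parallel over the inputs.
    pool.install(|| {
        rayon::scope(|s| {
            for input in inputs {
                let tx = tx.clone();
                s.spawn(move |_| {
                    // ... extract something from `input` here ...
                    let _ = tx.send(ExtractedChunk::WordDocids(input));
                });
            }
        });
    });
    // Drop the last sender so the consumer loop below terminates.
    drop(tx);

    // Consumer: a single thread owns the write side, since LMDB allows
    // only one write transaction at a time.
    for chunk in rx {
        match chunk {
            ExtractedChunk::WordDocids(_bytes) => { /* write the word docids database */ }
            ExtractedChunk::Documents(_bytes) => { /* write the documents database */ }
        }
    }
}

Because the channel is unbounded, the extractors never block on the writer; the apparent trade-off is that memory is kept in check by the chunking with `documents_chunk_size` above rather than by backpressure.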
if !replaced_documents_ids.is_empty() {
@@ -402,10 +262,8 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
                 log_every_n: self.log_every_n,
                 max_nb_chunks: self.max_nb_chunks,
                 max_memory: self.max_memory,
-                linked_hash_map_size: self.linked_hash_map_size,
                 chunk_compression_type: self.chunk_compression_type,
                 chunk_compression_level: self.chunk_compression_level,
-                chunk_fusing_shrink_size: self.chunk_fusing_shrink_size,
                 thread_pool: self.thread_pool,
                 update_id: self.update_id,
             };
@@ -416,190 +274,21 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
             debug!("{} documents actually deleted", deleted_documents_count);
         }

-        if documents_count == 0 {
-            return Ok(());
+        let index_documents_ids = self.index.documents_ids(self.wtxn)?;
+        let index_is_empty = index_documents_ids.len() == 0;
+        let mut final_documents_ids = RoaringBitmap::new();
+
+        for typed_chunk in lmdb_writer_rx {
+            let docids =
+                write_typed_chunk_into_index(typed_chunk, &self.index, self.wtxn, index_is_empty)?;
+            final_documents_ids |= docids;
+            debug!(
+                "We have seen {} documents out of {} total documents so far",
+                final_documents_ids.len(),
+                documents_count
+            );
         }
-
-        let bytes = unsafe { Mmap::map(&documents_file)? };
-        let documents = grenad::Reader::new(bytes.as_bytes()).unwrap();
-
-        // The enum which indicates the type of the readers
-        // merges that are potentially done on different threads.
-        enum DatabaseType {
-            Main,
-            WordDocids,
-            WordLevel0PositionDocids,
-            FieldIdWordCountDocids,
-            FacetLevel0NumbersDocids,
-        }
-
-        let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?;
-        let searchable_fields: HashSet<_> = match self.index.searchable_fields_ids(self.wtxn)? {
-            Some(fields) => fields.iter().copied().collect(),
-            None => fields_ids_map.iter().map(|(id, _name)| id).collect(),
-        };
-
-        let stop_words = self.index.stop_words(self.wtxn)?;
-        let stop_words = stop_words.as_ref();
-        let linked_hash_map_size = self.linked_hash_map_size;
-        let max_nb_chunks = self.max_nb_chunks;
-        let max_memory = self.max_memory;
-        let chunk_compression_type = self.chunk_compression_type;
-        let chunk_compression_level = self.chunk_compression_level;
-        let log_every_n = self.log_every_n;
-        let chunk_fusing_shrink_size = self.chunk_fusing_shrink_size;
-
-        let backup_pool;
-        let pool = match self.thread_pool {
-            Some(pool) => pool,
-            None => {
-                // We initialize a bakcup pool with the default
-                // settings if none have already been set.
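`index_is_empty` above feeds the same append-versus-merge decision that `WriteMethod` encodes in the helpers: into an empty database, sorted keys can simply be appended, while an already populated database forces every key through a get/merge/put cycle. In sketch form (mirroring the `WriteMethod` enum defined in this patch; the old code makes the same choice explicitly further down):

// Mirrors the `WriteMethod` enum defined in this patch.
#[derive(Debug, Copy, Clone)]
enum WriteMethod {
    Append,
    GetMergePut,
}

// Sketch: choose the write strategy from the current state of the index.
fn choose_write_method(index_is_empty: bool) -> WriteMethod {
    if index_is_empty {
        // Keys coming out of a grenad sorter/merger are already ordered,
        // so they can be appended straight into LMDB without any lookup.
        WriteMethod::Append
    } else {
        // Existing entries may share keys with the incoming data, so every
        // write goes through a get / merge / put cycle instead.
        WriteMethod::GetMergePut
    }
}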
- backup_pool = rayon::ThreadPoolBuilder::new().build()?; - &backup_pool - } - }; - - let readers = pool.install(|| { - let num_threads = rayon::current_num_threads(); - let max_memory_by_job = max_memory.map(|mm| mm / num_threads); - - let readers = rayon::iter::repeatn(documents, num_threads) - .enumerate() - .map(|(i, documents)| { - let store = Store::new( - searchable_fields.clone(), - faceted_fields.clone(), - linked_hash_map_size, - max_nb_chunks, - max_memory_by_job, - chunk_compression_type, - chunk_compression_level, - chunk_fusing_shrink_size, - stop_words, - )?; - store.index( - documents, - documents_count, - i, - num_threads, - log_every_n, - &progress_callback, - ) - }) - .collect::, _>>()?; - - let mut main_readers = Vec::with_capacity(readers.len()); - let mut word_docids_readers = Vec::with_capacity(readers.len()); - let mut docid_word_positions_readers = Vec::with_capacity(readers.len()); - let mut words_pairs_proximities_docids_readers = Vec::with_capacity(readers.len()); - let mut word_level_position_docids_readers = Vec::with_capacity(readers.len()); - let mut field_id_word_count_docids_readers = Vec::with_capacity(readers.len()); - let mut facet_field_numbers_docids_readers = Vec::with_capacity(readers.len()); - let mut facet_field_strings_docids_readers = Vec::with_capacity(readers.len()); - let mut field_id_docid_facet_numbers_readers = Vec::with_capacity(readers.len()); - let mut field_id_docid_facet_strings_readers = Vec::with_capacity(readers.len()); - let mut documents_readers = Vec::with_capacity(readers.len()); - readers.into_iter().for_each(|readers| { - let Readers { - main, - word_docids, - docid_word_positions, - words_pairs_proximities_docids, - word_level_position_docids, - field_id_word_count_docids, - facet_field_numbers_docids, - facet_field_strings_docids, - field_id_docid_facet_numbers, - field_id_docid_facet_strings, - documents, - } = readers; - main_readers.push(main); - word_docids_readers.push(word_docids); - docid_word_positions_readers.push(docid_word_positions); - words_pairs_proximities_docids_readers.push(words_pairs_proximities_docids); - word_level_position_docids_readers.push(word_level_position_docids); - field_id_word_count_docids_readers.push(field_id_word_count_docids); - facet_field_numbers_docids_readers.push(facet_field_numbers_docids); - facet_field_strings_docids_readers.push(facet_field_strings_docids); - field_id_docid_facet_numbers_readers.push(field_id_docid_facet_numbers); - field_id_docid_facet_strings_readers.push(field_id_docid_facet_strings); - documents_readers.push(documents); - }); - - // This is the function that merge the readers - // by using the given merge function. - let merge_readers = move |readers, merge| { - let mut writer = tempfile::tempfile().and_then(|f| { - create_writer(chunk_compression_type, chunk_compression_level, f) - })?; - let merger = merge_readers(readers, merge); - merger.write_into(&mut writer)?; - writer_into_reader(writer, chunk_fusing_shrink_size) - }; - - // The enum and the channel which is used to transfert - // the readers merges potentially done on another thread. 
- let (sender, receiver) = sync_channel(2); - - debug!("Merging the main, word docids and words pairs proximity docids in parallel..."); - rayon::spawn(move || { - vec![ - (DatabaseType::Main, main_readers, fst_merge as MergeFn<_>), - (DatabaseType::WordDocids, word_docids_readers, roaring_bitmap_merge), - ( - DatabaseType::FacetLevel0NumbersDocids, - facet_field_numbers_docids_readers, - cbo_roaring_bitmap_merge, - ), - ( - DatabaseType::WordLevel0PositionDocids, - word_level_position_docids_readers, - cbo_roaring_bitmap_merge, - ), - ( - DatabaseType::FieldIdWordCountDocids, - field_id_word_count_docids_readers, - cbo_roaring_bitmap_merge, - ), - ] - .into_par_iter() - .for_each(|(dbtype, readers, merge)| { - let result = merge_readers(readers, merge); - if let Err(e) = sender.send((dbtype, result)) { - error!("sender error: {}", e); - } - }); - }); - - Ok(( - receiver, - docid_word_positions_readers, - documents_readers, - words_pairs_proximities_docids_readers, - facet_field_strings_docids_readers, - field_id_docid_facet_numbers_readers, - field_id_docid_facet_strings_readers, - )) as Result<_> - })?; - - let ( - receiver, - docid_word_positions_readers, - documents_readers, - words_pairs_proximities_docids_readers, - facet_field_strings_docids_readers, - field_id_docid_facet_numbers_readers, - field_id_docid_facet_strings_readers, - ) = readers; - - let mut documents_ids = self.index.documents_ids(self.wtxn)?; - let contains_documents = !documents_ids.is_empty(); - let write_method = - if contains_documents { WriteMethod::GetMergePut } else { WriteMethod::Append }; - - debug!("Writing using the write method: {:?}", write_method); - // We write the field distribution into the main database self.index.put_field_distribution(self.wtxn, &field_distribution)?; @@ -609,180 +298,24 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { // We write the external documents ids into the main database. self.index.put_external_documents_ids(self.wtxn, &external_documents_ids)?; - // We merge the new documents ids with the existing ones. 
- documents_ids |= new_documents_ids; - documents_ids |= replaced_documents_ids; - self.index.put_documents_ids(self.wtxn, &documents_ids)?; + let all_documents_ids = index_documents_ids | new_documents_ids; + self.index.put_documents_ids(self.wtxn, &all_documents_ids)?; - let mut database_count = 0; - let total_databases = 11; - - progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { - databases_seen: 0, - total_databases, - }); - - debug!("Inserting the docid word positions into LMDB on disk..."); - merge_into_lmdb_database( - self.wtxn, - *self.index.docid_word_positions.as_polymorph(), - docid_word_positions_readers, - keep_first, - write_method, - )?; - - database_count += 1; - progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { - databases_seen: database_count, - total_databases, - }); - - debug!("Inserting the documents into LMDB on disk..."); - merge_into_lmdb_database( - self.wtxn, - *self.index.documents.as_polymorph(), - documents_readers, - keep_first, - write_method, - )?; - - database_count += 1; - progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { - databases_seen: database_count, - total_databases, - }); - - debug!("Writing the facet id string docids into LMDB on disk..."); - merge_into_lmdb_database( - self.wtxn, - *self.index.facet_id_string_docids.as_polymorph(), - facet_field_strings_docids_readers, - tuple_string_cbo_roaring_bitmap_merge, - write_method, - )?; - - database_count += 1; - progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { - databases_seen: database_count, - total_databases, - }); - - debug!("Writing the field id docid facet numbers into LMDB on disk..."); - merge_into_lmdb_database( - self.wtxn, - *self.index.field_id_docid_facet_f64s.as_polymorph(), - field_id_docid_facet_numbers_readers, - keep_first, - write_method, - )?; - - database_count += 1; - progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { - databases_seen: database_count, - total_databases, - }); - - debug!("Writing the field id docid facet strings into LMDB on disk..."); - merge_into_lmdb_database( - self.wtxn, - *self.index.field_id_docid_facet_strings.as_polymorph(), - field_id_docid_facet_strings_readers, - keep_first, - write_method, - )?; - - database_count += 1; - progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { - databases_seen: database_count, - total_databases, - }); - - debug!("Writing the words pairs proximities docids into LMDB on disk..."); - merge_into_lmdb_database( - self.wtxn, - *self.index.word_pair_proximity_docids.as_polymorph(), - words_pairs_proximities_docids_readers, - cbo_roaring_bitmap_merge, - write_method, - )?; - - database_count += 1; - progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { - databases_seen: database_count, - total_databases, - }); - - for (db_type, result) in receiver { - let content = result?; - match db_type { - DatabaseType::Main => { - debug!("Writing the main elements into LMDB on disk..."); - write_into_lmdb_database( - self.wtxn, - self.index.main, - content, - fst_merge, - WriteMethod::GetMergePut, - )?; - } - DatabaseType::WordDocids => { - debug!("Writing the words docids into LMDB on disk..."); - let db = *self.index.word_docids.as_polymorph(); - write_into_lmdb_database( - self.wtxn, - db, - content, - roaring_bitmap_merge, - write_method, - )?; - } - DatabaseType::FacetLevel0NumbersDocids => { - debug!("Writing the facet numbers docids into LMDB on disk..."); - let db = *self.index.facet_id_f64_docids.as_polymorph(); - 
write_into_lmdb_database( - self.wtxn, - db, - content, - cbo_roaring_bitmap_merge, - write_method, - )?; - } - DatabaseType::FieldIdWordCountDocids => { - debug!("Writing the field id word count docids into LMDB on disk..."); - let db = *self.index.field_id_word_count_docids.as_polymorph(); - write_into_lmdb_database( - self.wtxn, - db, - content, - cbo_roaring_bitmap_merge, - write_method, - )?; - } - DatabaseType::WordLevel0PositionDocids => { - debug!("Writing the word level 0 positions docids into LMDB on disk..."); - let db = *self.index.word_level_position_docids.as_polymorph(); - write_into_lmdb_database( - self.wtxn, - db, - content, - cbo_roaring_bitmap_merge, - write_method, - )?; - } - } - - database_count += 1; - progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { - databases_seen: database_count, - total_databases, - }); - } + self.execute_prefix_databases(progress_callback) + } + pub fn execute_prefix_databases( + self, + // output: TransformOutput, + progress_callback: F, + ) -> Result<()> + where + F: Fn(UpdateIndexingStep) + Sync, + { // Run the facets update operation. let mut builder = Facets::new(self.wtxn, self.index, self.update_id); builder.chunk_compression_type = self.chunk_compression_type; builder.chunk_compression_level = self.chunk_compression_level; - builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; if let Some(value) = self.facet_level_group_size { builder.level_group_size(value); } @@ -805,7 +338,6 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { let mut builder = WordPrefixDocids::new(self.wtxn, self.index); builder.chunk_compression_type = self.chunk_compression_type; builder.chunk_compression_level = self.chunk_compression_level; - builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; builder.max_nb_chunks = self.max_nb_chunks; builder.max_memory = self.max_memory; builder.execute()?; @@ -814,7 +346,6 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { let mut builder = WordPrefixPairProximityDocids::new(self.wtxn, self.index); builder.chunk_compression_type = self.chunk_compression_type; builder.chunk_compression_level = self.chunk_compression_level; - builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; builder.max_nb_chunks = self.max_nb_chunks; builder.max_memory = self.max_memory; builder.execute()?; @@ -823,7 +354,6 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { let mut builder = WordsLevelPositions::new(self.wtxn, self.index); builder.chunk_compression_type = self.chunk_compression_type; builder.chunk_compression_level = self.chunk_compression_level; - builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; if let Some(value) = self.words_positions_level_group_size { builder.level_group_size(value); } @@ -832,10 +362,6 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { } builder.execute()?; - debug_assert_eq!(database_count, total_databases); - - info!("Transform output indexed in {:.02?}", before_indexing.elapsed()); - Ok(()) } } diff --git a/milli/src/update/index_documents/store.rs b/milli/src/update/index_documents/store.rs index a2aa26e19..e69de29bb 100644 --- a/milli/src/update/index_documents/store.rs +++ b/milli/src/update/index_documents/store.rs @@ -1,985 +0,0 @@ -use std::borrow::Cow; -use std::collections::{BTreeMap, HashMap, HashSet}; -use std::convert::{TryFrom, TryInto}; -use std::fs::File; -use std::iter::FromIterator; -use std::time::Instant; -use std::{cmp, iter}; - -use bstr::ByteSlice as _; -use concat_arrays::concat_arrays; -use 
fst::Set; -use grenad::{CompressionType, FileFuse, Reader, Sorter, Writer}; -use heed::BytesEncode; -use linked_hash_map::LinkedHashMap; -use log::{debug, info, warn}; -use meilisearch_tokenizer::token::SeparatorKind; -use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token, TokenKind}; -use ordered_float::OrderedFloat; -use roaring::RoaringBitmap; -use serde_json::Value; -use tempfile::tempfile; - -use super::merge_function::{ - cbo_roaring_bitmap_merge, fst_merge, keep_first, roaring_bitmap_merge, - tuple_string_cbo_roaring_bitmap_merge, -}; -use super::{create_sorter, create_writer, writer_into_reader, MergeFn}; -use crate::error::{Error, InternalError, SerializationError}; -use crate::heed_codec::facet::{ - FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec, - FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, -}; -use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec}; -use crate::update::UpdateIndexingStep; -use crate::{json_to_string, DocumentId, FieldId, Position, Result, SmallVec32}; - -const LMDB_MAX_KEY_LENGTH: usize = 511; -const ONE_KILOBYTE: usize = 1024 * 1024; - -const MAX_POSITION: usize = 1000; -const WORDS_FST_KEY: &[u8] = crate::index::main_key::WORDS_FST_KEY.as_bytes(); - -pub struct Readers { - pub main: Reader, - pub word_docids: Reader, - pub docid_word_positions: Reader, - pub words_pairs_proximities_docids: Reader, - pub word_level_position_docids: Reader, - pub field_id_word_count_docids: Reader, - pub facet_field_numbers_docids: Reader, - pub facet_field_strings_docids: Reader, - pub field_id_docid_facet_numbers: Reader, - pub field_id_docid_facet_strings: Reader, - pub documents: Reader, -} - -pub struct Store<'s, A> { - // Indexing parameters - searchable_fields: HashSet, - filterable_fields: HashSet, - // Caches - word_docids: LinkedHashMap, RoaringBitmap>, - word_docids_limit: usize, - field_id_word_count_docids: HashMap<(FieldId, u8), RoaringBitmap>, - words_pairs_proximities_docids: - LinkedHashMap<(SmallVec32, SmallVec32, u8), RoaringBitmap>, - words_pairs_proximities_docids_limit: usize, - facet_field_number_docids: LinkedHashMap<(FieldId, OrderedFloat), RoaringBitmap>, - facet_field_string_docids: LinkedHashMap<(FieldId, String), (String, RoaringBitmap)>, - facet_field_value_docids_limit: usize, - // MTBL parameters - chunk_compression_type: CompressionType, - chunk_compression_level: Option, - chunk_fusing_shrink_size: Option, - // MTBL sorters - main_sorter: Sorter>, - word_docids_sorter: Sorter>, - words_pairs_proximities_docids_sorter: Sorter>, - word_level_position_docids_sorter: Sorter>, - field_id_word_count_docids_sorter: Sorter>, - facet_field_numbers_docids_sorter: Sorter>, - facet_field_strings_docids_sorter: Sorter>, - field_id_docid_facet_numbers_sorter: Sorter>, - field_id_docid_facet_strings_sorter: Sorter>, - // MTBL writers - docid_word_positions_writer: Writer, - documents_writer: Writer, - // tokenizer - analyzer: Analyzer<'s, A>, -} - -impl<'s, A: AsRef<[u8]>> Store<'s, A> { - pub fn new( - searchable_fields: HashSet, - filterable_fields: HashSet, - linked_hash_map_size: Option, - max_nb_chunks: Option, - max_memory: Option, - chunk_compression_type: CompressionType, - chunk_compression_level: Option, - chunk_fusing_shrink_size: Option, - stop_words: Option<&'s Set>, - ) -> Result { - // We divide the max memory by the number of sorter the Store have. 
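Both the old `Store`'s per-sorter split of `max_memory` and the new `GrenadParameters::max_memory_by_thread` answer the same question: how to share one user-facing memory budget between several in-memory buffers so the total stays under the limit. A minimal sketch of that split, with an illustrative divisor and floor (not the crate's exact numbers):

/// Split a global memory budget between `consumers` sorters or threads,
/// keeping a small floor so tiny budgets still make progress.
fn memory_budget_per_consumer(max_memory: Option<usize>, consumers: usize) -> Option<usize> {
    const FLOOR: usize = 1024 * 1024; // 1 MiB, illustrative only
    max_memory.map(|total| std::cmp::max(FLOOR, total / consumers.max(1)))
}

// With an 800 MiB budget and 8 consumers, each one gets 100 MiB:
// memory_budget_per_consumer(Some(800 * 1024 * 1024), 8) == Some(100 * 1024 * 1024)

Incidentally, the removed `ONE_KILOBYTE` constant above is defined as `1024 * 1024`, so the old per-sorter floor was really 1 MiB despite its name.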
- let max_memory = max_memory.map(|mm| cmp::max(ONE_KILOBYTE, mm / 5)); - let linked_hash_map_size = linked_hash_map_size.unwrap_or(500); - - let main_sorter = create_sorter( - fst_merge, - chunk_compression_type, - chunk_compression_level, - chunk_fusing_shrink_size, - max_nb_chunks, - max_memory, - ); - let word_docids_sorter = create_sorter( - roaring_bitmap_merge, - chunk_compression_type, - chunk_compression_level, - chunk_fusing_shrink_size, - max_nb_chunks, - max_memory, - ); - let words_pairs_proximities_docids_sorter = create_sorter( - cbo_roaring_bitmap_merge, - chunk_compression_type, - chunk_compression_level, - chunk_fusing_shrink_size, - max_nb_chunks, - max_memory, - ); - let word_level_position_docids_sorter = create_sorter( - cbo_roaring_bitmap_merge, - chunk_compression_type, - chunk_compression_level, - chunk_fusing_shrink_size, - max_nb_chunks, - max_memory, - ); - let field_id_word_count_docids_sorter = create_sorter( - cbo_roaring_bitmap_merge, - chunk_compression_type, - chunk_compression_level, - chunk_fusing_shrink_size, - max_nb_chunks, - max_memory, - ); - let facet_field_numbers_docids_sorter = create_sorter( - cbo_roaring_bitmap_merge, - chunk_compression_type, - chunk_compression_level, - chunk_fusing_shrink_size, - max_nb_chunks, - max_memory, - ); - let facet_field_strings_docids_sorter = create_sorter( - tuple_string_cbo_roaring_bitmap_merge, - chunk_compression_type, - chunk_compression_level, - chunk_fusing_shrink_size, - max_nb_chunks, - max_memory, - ); - let field_id_docid_facet_numbers_sorter = create_sorter( - keep_first, - chunk_compression_type, - chunk_compression_level, - chunk_fusing_shrink_size, - max_nb_chunks, - Some(1024 * 1024 * 1024), // 1MB - ); - let field_id_docid_facet_strings_sorter = create_sorter( - keep_first, - chunk_compression_type, - chunk_compression_level, - chunk_fusing_shrink_size, - max_nb_chunks, - Some(1024 * 1024 * 1024), // 1MB - ); - - let documents_writer = tempfile() - .and_then(|f| create_writer(chunk_compression_type, chunk_compression_level, f))?; - let docid_word_positions_writer = tempfile() - .and_then(|f| create_writer(chunk_compression_type, chunk_compression_level, f))?; - - let mut config = AnalyzerConfig::default(); - if let Some(stop_words) = stop_words { - config.stop_words(stop_words); - } - let analyzer = Analyzer::new(config); - - Ok(Store { - // Indexing parameters. 
- searchable_fields, - filterable_fields, - // Caches - word_docids: LinkedHashMap::with_capacity(linked_hash_map_size), - field_id_word_count_docids: HashMap::new(), - word_docids_limit: linked_hash_map_size, - words_pairs_proximities_docids: LinkedHashMap::with_capacity(linked_hash_map_size), - words_pairs_proximities_docids_limit: linked_hash_map_size, - facet_field_number_docids: LinkedHashMap::with_capacity(linked_hash_map_size), - facet_field_string_docids: LinkedHashMap::with_capacity(linked_hash_map_size), - facet_field_value_docids_limit: linked_hash_map_size, - // MTBL parameters - chunk_compression_type, - chunk_compression_level, - chunk_fusing_shrink_size, - // MTBL sorters - main_sorter, - word_docids_sorter, - words_pairs_proximities_docids_sorter, - word_level_position_docids_sorter, - field_id_word_count_docids_sorter, - facet_field_numbers_docids_sorter, - facet_field_strings_docids_sorter, - field_id_docid_facet_numbers_sorter, - field_id_docid_facet_strings_sorter, - // MTBL writers - docid_word_positions_writer, - documents_writer, - // tokenizer - analyzer, - }) - } - - // Save the documents ids under the position and word we have seen it. - fn insert_word_docid(&mut self, word: &str, id: DocumentId) -> Result<()> { - // if get_refresh finds the element it is assured to be at the end of the linked hash map. - match self.word_docids.get_refresh(word.as_bytes()) { - Some(old) => { - old.insert(id); - } - None => { - let word_vec = SmallVec32::from(word.as_bytes()); - // A newly inserted element is append at the end of the linked hash map. - self.word_docids.insert(word_vec, RoaringBitmap::from_iter(Some(id))); - // If the word docids just reached it's capacity we must make sure to remove - // one element, this way next time we insert we doesn't grow the capacity. - if self.word_docids.len() == self.word_docids_limit { - // Removing the front element is equivalent to removing the LRU element. - let lru = self.word_docids.pop_front(); - Self::write_word_docids(&mut self.word_docids_sorter, lru)?; - } - } - } - Ok(()) - } - - fn insert_facet_number_values_docid( - &mut self, - field_id: FieldId, - value: OrderedFloat, - id: DocumentId, - ) -> Result<()> { - let sorter = &mut self.field_id_docid_facet_numbers_sorter; - Self::write_field_id_docid_facet_number_value(sorter, field_id, id, value)?; - - let key = (field_id, value); - // if get_refresh finds the element it is assured to be at the end of the linked hash map. - match self.facet_field_number_docids.get_refresh(&key) { - Some(old) => { - old.insert(id); - } - None => { - // A newly inserted element is append at the end of the linked hash map. - self.facet_field_number_docids.insert(key, RoaringBitmap::from_iter(Some(id))); - // If the word docids just reached it's capacity we must make sure to remove - // one element, this way next time we insert we doesn't grow the capacity. - if self.facet_field_number_docids.len() == self.facet_field_value_docids_limit { - // Removing the front element is equivalent to removing the LRU element. - Self::write_facet_field_number_docids( - &mut self.facet_field_numbers_docids_sorter, - self.facet_field_number_docids.pop_front(), - )?; - } - } - } - - Ok(()) - } - - // Save the documents ids under the facet field id and value we have seen it. 
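The `get_refresh` / `pop_front` dance used by `insert_word_docid` and its siblings above is a bounded LRU cache built on `linked_hash_map::LinkedHashMap`: a hit is moved to the back of the map, and when the map is full the front entry, the least recently used one, is evicted and flushed into the corresponding sorter. A stripped-down sketch of the same idea, assuming the `linked-hash-map` and `roaring` crates:

use linked_hash_map::LinkedHashMap;
use roaring::RoaringBitmap;

struct BoundedCache {
    entries: LinkedHashMap<Vec<u8>, RoaringBitmap>,
    limit: usize,
}

impl BoundedCache {
    /// Records `docid` under `key`; returns an evicted entry that the caller
    /// should flush to its on-disk sorter if the cache went over capacity.
    fn insert(&mut self, key: &[u8], docid: u32) -> Option<(Vec<u8>, RoaringBitmap)> {
        // `get_refresh` moves a hit to the back, marking it most recently used.
        if let Some(bitmap) = self.entries.get_refresh(key) {
            bitmap.insert(docid);
            return None;
        }
        let mut bitmap = RoaringBitmap::new();
        bitmap.insert(docid);
        self.entries.insert(key.to_vec(), bitmap);
        if self.entries.len() > self.limit {
            // The front entry is the least recently used one.
            return self.entries.pop_front();
        }
        None
    }
}

Nothing is lost on eviction: in the code above the evicted pair is written to a grenad sorter right away, so the cache only bounds memory.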
- fn insert_facet_string_values_docid( - &mut self, - field_id: FieldId, - normalized_value: String, - original_value: String, - id: DocumentId, - ) -> Result<()> { - if normalized_value.is_empty() { - return Ok(()); - } - - let sorter = &mut self.field_id_docid_facet_strings_sorter; - Self::write_field_id_docid_facet_string_value( - sorter, - field_id, - id, - &normalized_value, - &original_value, - )?; - - let key = (field_id, normalized_value); - // if get_refresh finds the element it is assured to be at the end of the linked hash map. - match self.facet_field_string_docids.get_refresh(&key) { - Some((_original_value, old)) => { - old.insert(id); - } - None => { - // A newly inserted element is append at the end of the linked hash map. - self.facet_field_string_docids - .insert(key, (original_value, RoaringBitmap::from_iter(Some(id)))); - // If the word docids just reached it's capacity we must make sure to remove - // one element, this way next time we insert we doesn't grow the capacity. - if self.facet_field_string_docids.len() == self.facet_field_value_docids_limit { - // Removing the front element is equivalent to removing the LRU element. - Self::write_facet_field_string_docids( - &mut self.facet_field_strings_docids_sorter, - self.facet_field_string_docids.pop_front(), - )?; - } - } - } - - Ok(()) - } - - // Save the documents ids under the words pairs proximities that it contains. - fn insert_words_pairs_proximities_docids<'a>( - &mut self, - words_pairs_proximities: impl IntoIterator, - id: DocumentId, - ) -> Result<()> { - for ((w1, w2), prox) in words_pairs_proximities { - let w1 = SmallVec32::from(w1.as_bytes()); - let w2 = SmallVec32::from(w2.as_bytes()); - let key = (w1, w2, prox); - // if get_refresh finds the element it is assured - // to be at the end of the linked hash map. - match self.words_pairs_proximities_docids.get_refresh(&key) { - Some(old) => { - old.insert(id); - } - None => { - // A newly inserted element is append at the end of the linked hash map. - let ids = RoaringBitmap::from_iter(Some(id)); - self.words_pairs_proximities_docids.insert(key, ids); - } - } - } - - // If the linked hashmap is over capacity we must remove the overflowing elements. - let len = self.words_pairs_proximities_docids.len(); - let overflow = len.checked_sub(self.words_pairs_proximities_docids_limit); - if let Some(overflow) = overflow { - let mut lrus = Vec::with_capacity(overflow); - // Removing front elements is equivalent to removing the LRUs. - let iter = iter::from_fn(|| self.words_pairs_proximities_docids.pop_front()); - iter.take(overflow).for_each(|x| lrus.push(x)); - Self::write_words_pairs_proximities( - &mut self.words_pairs_proximities_docids_sorter, - lrus, - )?; - } - - Ok(()) - } - - fn write_document( - &mut self, - document_id: DocumentId, - words_positions: &mut HashMap>, - facet_numbers_values: &mut HashMap>, - facet_strings_values: &mut HashMap>, - record: &[u8], - ) -> Result<()> { - // We compute the list of words pairs proximities (self-join) and write it directly to disk. - let words_pair_proximities = compute_words_pair_proximities(&words_positions); - self.insert_words_pairs_proximities_docids(words_pair_proximities, document_id)?; - - // We store document_id associated with all the words the record contains. 
- for (word, _) in words_positions.iter() { - self.insert_word_docid(word, document_id)?; - } - - self.documents_writer.insert(document_id.to_be_bytes(), record)?; - Self::write_docid_word_positions( - &mut self.docid_word_positions_writer, - document_id, - words_positions, - )?; - Self::write_word_position_docids( - &mut self.word_level_position_docids_sorter, - document_id, - words_positions, - )?; - - words_positions.clear(); - - // We store document_id associated with all the facet numbers fields ids and values. - for (field, values) in facet_numbers_values.drain() { - for value in values { - let value = OrderedFloat::from(value); - self.insert_facet_number_values_docid(field, value, document_id)?; - } - } - - // We store document_id associated with all the facet strings fields ids and values. - for (field, values) in facet_strings_values.drain() { - for (normalized, original) in values { - self.insert_facet_string_values_docid(field, normalized, original, document_id)?; - } - } - - Ok(()) - } - - fn write_words_pairs_proximities( - sorter: &mut Sorter>, - iter: impl IntoIterator, SmallVec32, u8), RoaringBitmap)>, - ) -> Result<()> - where - Error: From, - { - let mut key = Vec::new(); - let mut buffer = Vec::new(); - - for ((w1, w2, min_prox), docids) in iter { - key.clear(); - key.extend_from_slice(w1.as_bytes()); - key.push(0); - key.extend_from_slice(w2.as_bytes()); - // Storing the minimun proximity found between those words - key.push(min_prox); - // We serialize the document ids into a buffer - buffer.clear(); - buffer.reserve(CboRoaringBitmapCodec::serialized_size(&docids)); - CboRoaringBitmapCodec::serialize_into(&docids, &mut buffer); - // that we write under the generated key into MTBL - if lmdb_key_valid_size(&key) { - sorter.insert(&key, &buffer)?; - } else { - warn!( - "words pairs proximity ({:?} - {:?}, {:?}) is too large to be saved", - w1, w2, min_prox - ); - } - } - - Ok(()) - } - - fn write_docid_word_positions( - writer: &mut Writer, - id: DocumentId, - words_positions: &HashMap>, - ) -> Result<()> { - // We prefix the words by the document id. - let mut key = id.to_be_bytes().to_vec(); - let mut buffer = Vec::new(); - let base_size = key.len(); - - // We order the words lexicographically, this way we avoid passing by a sorter. - let words_positions = BTreeMap::from_iter(words_positions); - - for (word, positions) in words_positions { - key.truncate(base_size); - key.extend_from_slice(word.as_bytes()); - buffer.clear(); - - // We serialize the positions into a buffer. 
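`write_docid_word_positions` above relies on two small tricks that recur across the indexer: the key buffer is seeded with the document id and truncated back to that prefix for each word instead of reallocating, and the words are passed through a `BTreeMap` so they come out lexicographically ordered and the writer can be fed directly, without a sorter. In isolation:

use std::collections::BTreeMap;

// Sketch: build `docid ++ word` keys, reusing one buffer and emitting the
// words in lexicographic order (the BTreeMap iteration order).
fn docid_word_keys(docid: u32, words: BTreeMap<String, Vec<u32>>) -> Vec<Vec<u8>> {
    let mut key = docid.to_be_bytes().to_vec();
    let base_size = key.len();

    let mut keys = Vec::with_capacity(words.len());
    for (word, _positions) in words {
        // Truncate back to the document id prefix, then append the next word.
        key.truncate(base_size);
        key.extend_from_slice(word.as_bytes());
        keys.push(key.clone());
    }
    keys
}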
- let positions = RoaringBitmap::from_iter(positions.iter().cloned()); - BoRoaringBitmapCodec::serialize_into(&positions, &mut buffer); - - // that we write under the generated key into MTBL - if lmdb_key_valid_size(&key) { - writer.insert(&key, &buffer)?; - } else { - warn!("word {:?} is too large to be saved", word.as_bytes().as_bstr()); - } - } - - Ok(()) - } - - fn write_word_position_docids( - writer: &mut Sorter>, - document_id: DocumentId, - words_positions: &HashMap>, - ) -> Result<()> - where - Error: From, - { - let mut key_buffer = Vec::new(); - let mut data_buffer = Vec::new(); - - for (word, positions) in words_positions { - key_buffer.clear(); - key_buffer.extend_from_slice(word.as_bytes()); - key_buffer.push(0); // level 0 - - for position in positions { - key_buffer.truncate(word.len() + 1); - let position_bytes = position.to_be_bytes(); - key_buffer.extend_from_slice(position_bytes.as_bytes()); - key_buffer.extend_from_slice(position_bytes.as_bytes()); - - data_buffer.clear(); - let positions = RoaringBitmap::from_iter(Some(document_id)); - // We serialize the positions into a buffer. - CboRoaringBitmapCodec::serialize_into(&positions, &mut data_buffer); - - // that we write under the generated key into MTBL - if lmdb_key_valid_size(&key_buffer) { - writer.insert(&key_buffer, &data_buffer)?; - } else { - warn!("word {:?} is too large to be saved", word.as_bytes().as_bstr()); - } - } - } - - Ok(()) - } - - fn write_facet_field_string_docids(sorter: &mut Sorter>, iter: I) -> Result<()> - where - I: IntoIterator, - Error: From, - { - let mut key_buffer = Vec::new(); - - for ((field_id, normalized_value), (original_value, docids)) in iter { - key_buffer.clear(); - - FacetStringLevelZeroCodec::serialize_into(field_id, &normalized_value, &mut key_buffer); - - let data = (original_value.as_str(), docids); - let data = FacetStringLevelZeroValueCodec::::bytes_encode(&data) - .ok_or(SerializationError::Encoding { db_name: Some("facet-id-string-docids") })?; - - if lmdb_key_valid_size(&key_buffer) { - sorter.insert(&key_buffer, &data)?; - } else { - warn!( - "facet value {:?} is too large to be saved", - original_value.as_bytes().as_bstr() - ); - } - } - - Ok(()) - } - - fn write_facet_field_number_docids(sorter: &mut Sorter>, iter: I) -> Result<()> - where - I: IntoIterator), RoaringBitmap)>, - Error: From, - { - let mut data_buffer = Vec::new(); - - for ((field_id, value), docids) in iter { - data_buffer.clear(); - - let key = FacetLevelValueF64Codec::bytes_encode(&(field_id, 0, *value, *value)) - .map(Cow::into_owned) - .ok_or(SerializationError::Encoding { db_name: Some("facet level value") })?; - - CboRoaringBitmapCodec::serialize_into(&docids, &mut data_buffer); - - if lmdb_key_valid_size(&key) { - sorter.insert(&key, &data_buffer)?; - } - } - - Ok(()) - } - - fn write_field_id_docid_facet_number_value( - sorter: &mut Sorter>, - field_id: FieldId, - document_id: DocumentId, - value: OrderedFloat, - ) -> Result<()> - where - Error: From, - { - let key = FieldDocIdFacetF64Codec::bytes_encode(&(field_id, document_id, *value)) - .map(Cow::into_owned) - .ok_or(SerializationError::Encoding { db_name: Some("facet level value") })?; - - if lmdb_key_valid_size(&key) { - sorter.insert(&key, &[])?; - } - - Ok(()) - } - - fn write_field_id_docid_facet_string_value( - sorter: &mut Sorter>, - field_id: FieldId, - document_id: DocumentId, - normalized_value: &str, - original_value: &str, - ) -> Result<()> - where - Error: From, - { - let mut buffer = Vec::new(); - 
FieldDocIdFacetStringCodec::serialize_into( - field_id, - document_id, - normalized_value, - &mut buffer, - ); - - if lmdb_key_valid_size(&buffer) { - sorter.insert(&buffer, original_value.as_bytes())?; - } else { - warn!("facet value {:?} is too large to be saved", original_value.as_bytes().as_bstr()); - } - - Ok(()) - } - - fn write_word_docids(sorter: &mut Sorter>, iter: I) -> Result<()> - where - I: IntoIterator, RoaringBitmap)>, - Error: From, - { - let mut key = Vec::new(); - let mut buffer = Vec::new(); - - for (word, ids) in iter { - key.clear(); - key.extend_from_slice(&word); - // We serialize the document ids into a buffer - buffer.clear(); - let ids = RoaringBitmap::from_iter(ids); - buffer.reserve(ids.serialized_size()); - ids.serialize_into(&mut buffer)?; - // that we write under the generated key into MTBL - if lmdb_key_valid_size(&key) { - sorter.insert(&key, &buffer)?; - } else { - warn!("word {:?} is too large to be saved", word.as_bytes().as_bstr()); - } - } - - Ok(()) - } - - pub fn index( - mut self, - mut documents: grenad::Reader<&[u8]>, - documents_count: usize, - thread_index: usize, - num_threads: usize, - log_every_n: Option, - mut progress_callback: F, - ) -> Result - where - F: FnMut(UpdateIndexingStep), - { - debug!("{:?}: Indexing in a Store...", thread_index); - - let mut before = Instant::now(); - let mut words_positions = HashMap::new(); - let mut facet_numbers_values = HashMap::new(); - let mut facet_strings_values = HashMap::new(); - - let mut count: usize = 0; - while let Some((key, value)) = documents.next()? { - let document_id = key.try_into().map(u32::from_be_bytes).unwrap(); - let document = obkv::KvReader::new(value); - - // We skip documents that must not be indexed by this thread. - if count % num_threads == thread_index { - // This is a log routine that we do every `log_every_n` documents. 
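The `count % num_threads == thread_index` check above is how the old indexer shares work: every thread reads the whole grenad reader but only indexes the documents whose ordinal falls in its own residue class, so the threads need no coordination beyond agreeing on `num_threads`. In sketch form:

// Sketch: round-robin sharding of a document stream across indexing threads.
// Thread `thread_index` (0-based) out of `num_threads` keeps a document only
// when the document ordinal belongs to its residue class.
fn is_mine(document_ordinal: usize, thread_index: usize, num_threads: usize) -> bool {
    document_ordinal % num_threads == thread_index
}

// With 4 threads, thread 1 indexes documents 1, 5, 9, ... and skips the rest.

The new pipeline drops this scheme in favour of size-bounded chunks handed to a rayon pool, which presumably balances better when document sizes vary a lot.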
- if thread_index == 0 && log_every_n.map_or(false, |len| count % len == 0) { - info!( - "We have seen {} documents so far ({:.02?}).", - format_count(count), - before.elapsed() - ); - progress_callback(UpdateIndexingStep::IndexDocuments { - documents_seen: count, - total_documents: documents_count, - }); - before = Instant::now(); - } - - for (attr, content) in document.iter() { - if self.filterable_fields.contains(&attr) - || self.searchable_fields.contains(&attr) - { - let value = - serde_json::from_slice(content).map_err(InternalError::SerdeJson)?; - - if self.filterable_fields.contains(&attr) { - let (facet_numbers, facet_strings) = extract_facet_values(&value); - facet_numbers_values - .entry(attr) - .or_insert_with(Vec::new) - .extend(facet_numbers); - facet_strings_values - .entry(attr) - .or_insert_with(Vec::new) - .extend(facet_strings); - } - - if self.searchable_fields.contains(&attr) { - let content = match json_to_string(&value) { - Some(content) => content, - None => continue, - }; - - let analyzed = self.analyzer.analyze(&content); - let tokens = process_tokens(analyzed.tokens()); - - let mut last_pos = None; - for (pos, token) in tokens.take_while(|(pos, _)| *pos < MAX_POSITION) { - last_pos = Some(pos); - let position = (attr as usize * MAX_POSITION + pos) as u32; - words_positions - .entry(token.text().to_string()) - .or_insert_with(SmallVec32::new) - .push(position); - } - - if let Some(last_pos) = last_pos.filter(|p| *p <= 10) { - let key = (attr, last_pos as u8 + 1); - self.field_id_word_count_docids - .entry(key) - .or_insert_with(RoaringBitmap::new) - .insert(document_id); - } - } - } - } - - // We write the document in the documents store. - self.write_document( - document_id, - &mut words_positions, - &mut facet_numbers_values, - &mut facet_strings_values, - value, - )?; - } - - // Compute the document id of the next document. - count += 1; - } - - progress_callback(UpdateIndexingStep::IndexDocuments { - documents_seen: count, - total_documents: documents_count, - }); - - let readers = self.finish()?; - debug!("{:?}: Store created!", thread_index); - Ok(readers) - } - - fn finish(mut self) -> Result { - let comp_type = self.chunk_compression_type; - let comp_level = self.chunk_compression_level; - let shrink_size = self.chunk_fusing_shrink_size; - - Self::write_word_docids(&mut self.word_docids_sorter, self.word_docids)?; - Self::write_words_pairs_proximities( - &mut self.words_pairs_proximities_docids_sorter, - self.words_pairs_proximities_docids, - )?; - Self::write_facet_field_number_docids( - &mut self.facet_field_numbers_docids_sorter, - self.facet_field_number_docids, - )?; - - Self::write_facet_field_string_docids( - &mut self.facet_field_strings_docids_sorter, - self.facet_field_string_docids, - )?; - - let mut word_docids_wtr = - tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; - let mut builder = fst::SetBuilder::memory(); - - let mut iter = self.word_docids_sorter.into_iter()?; - while let Some((word, val)) = iter.next()? { - // This is a lexicographically ordered word position - // we use the key to construct the words fst. 
- builder.insert(word)?; - word_docids_wtr.insert(word, val)?; - } - - let mut docids_buffer = Vec::new(); - for ((fid, count), docids) in self.field_id_word_count_docids { - docids_buffer.clear(); - CboRoaringBitmapCodec::serialize_into(&docids, &mut docids_buffer); - let key: [u8; 3] = concat_arrays!(fid.to_be_bytes(), [count]); - self.field_id_word_count_docids_sorter.insert(key, &docids_buffer)?; - } - - let fst = builder.into_set(); - self.main_sorter.insert(WORDS_FST_KEY, fst.as_fst().as_bytes())?; - - let mut main_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; - self.main_sorter.write_into(&mut main_wtr)?; - - let mut words_pairs_proximities_docids_wtr = - tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; - self.words_pairs_proximities_docids_sorter - .write_into(&mut words_pairs_proximities_docids_wtr)?; - - let mut word_level_position_docids_wtr = - tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; - self.word_level_position_docids_sorter.write_into(&mut word_level_position_docids_wtr)?; - - let mut field_id_word_count_docids_wtr = - tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; - self.field_id_word_count_docids_sorter.write_into(&mut field_id_word_count_docids_wtr)?; - - let mut facet_field_numbers_docids_wtr = - tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; - self.facet_field_numbers_docids_sorter.write_into(&mut facet_field_numbers_docids_wtr)?; - - let mut facet_field_strings_docids_wtr = - tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; - self.facet_field_strings_docids_sorter.write_into(&mut facet_field_strings_docids_wtr)?; - - let mut field_id_docid_facet_numbers_wtr = - tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; - self.field_id_docid_facet_numbers_sorter - .write_into(&mut field_id_docid_facet_numbers_wtr)?; - - let mut field_id_docid_facet_strings_wtr = - tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; - self.field_id_docid_facet_strings_sorter - .write_into(&mut field_id_docid_facet_strings_wtr)?; - - let main = writer_into_reader(main_wtr, shrink_size)?; - let word_docids = writer_into_reader(word_docids_wtr, shrink_size)?; - let words_pairs_proximities_docids = - writer_into_reader(words_pairs_proximities_docids_wtr, shrink_size)?; - let word_level_position_docids = - writer_into_reader(word_level_position_docids_wtr, shrink_size)?; - let field_id_word_count_docids = - writer_into_reader(field_id_word_count_docids_wtr, shrink_size)?; - let facet_field_numbers_docids = - writer_into_reader(facet_field_numbers_docids_wtr, shrink_size)?; - let facet_field_strings_docids = - writer_into_reader(facet_field_strings_docids_wtr, shrink_size)?; - let field_id_docid_facet_numbers = - writer_into_reader(field_id_docid_facet_numbers_wtr, shrink_size)?; - let field_id_docid_facet_strings = - writer_into_reader(field_id_docid_facet_strings_wtr, shrink_size)?; - let docid_word_positions = - writer_into_reader(self.docid_word_positions_writer, shrink_size)?; - let documents = writer_into_reader(self.documents_writer, shrink_size)?; - - Ok(Readers { - main, - word_docids, - docid_word_positions, - words_pairs_proximities_docids, - word_level_position_docids, - field_id_word_count_docids, - facet_field_numbers_docids, - facet_field_strings_docids, - field_id_docid_facet_numbers, - field_id_docid_facet_strings, - documents, - }) - } -} - -/// Outputs a list of all pairs of words with the shortest proximity between 1 and 7 
inclusive. -/// -/// This list is used by the engine to calculate the documents containing words that are -/// close to each other. -fn compute_words_pair_proximities( - word_positions: &HashMap>, -) -> HashMap<(&str, &str), u8> { - use itertools::Itertools; - - let mut words_pair_proximities = HashMap::new(); - for ((w1, ps1), (w2, ps2)) in word_positions.iter().cartesian_product(word_positions) { - let mut min_prox = None; - for (ps1, ps2) in ps1.iter().cartesian_product(ps2) { - let prox = crate::proximity::positions_proximity(*ps1, *ps2); - let prox = u8::try_from(prox).unwrap(); - // We don't care about a word that appear at the - // same position or too far from the other. - if prox >= 1 && prox <= 7 && min_prox.map_or(true, |mp| prox < mp) { - min_prox = Some(prox) - } - } - - if let Some(min_prox) = min_prox { - words_pair_proximities.insert((w1.as_str(), w2.as_str()), min_prox); - } - } - - words_pair_proximities -} - -fn format_count(n: usize) -> String { - human_format::Formatter::new().with_decimals(1).with_separator("").format(n as f64) -} - -fn lmdb_key_valid_size(key: &[u8]) -> bool { - !key.is_empty() && key.len() <= LMDB_MAX_KEY_LENGTH -} - -/// take an iterator on tokens and compute their relative position depending on separator kinds -/// if it's an `Hard` separator we add an additional relative proximity of 8 between words, -/// else we keep the standart proximity of 1 between words. -fn process_tokens<'a>( - tokens: impl Iterator>, -) -> impl Iterator)> { - tokens - .skip_while(|token| token.is_separator().is_some()) - .scan((0, None), |(offset, prev_kind), token| { - match token.kind { - TokenKind::Word | TokenKind::StopWord | TokenKind::Unknown => { - *offset += match *prev_kind { - Some(TokenKind::Separator(SeparatorKind::Hard)) => 8, - Some(_) => 1, - None => 0, - }; - *prev_kind = Some(token.kind) - } - TokenKind::Separator(SeparatorKind::Hard) => { - *prev_kind = Some(token.kind); - } - TokenKind::Separator(SeparatorKind::Soft) - if *prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) => - { - *prev_kind = Some(token.kind); - } - _ => (), - } - Some((*offset, token)) - }) - .filter(|(_, t)| t.is_word()) -} - -fn extract_facet_values(value: &Value) -> (Vec, Vec<(String, String)>) { - fn inner_extract_facet_values( - value: &Value, - can_recurse: bool, - output_numbers: &mut Vec, - output_strings: &mut Vec<(String, String)>, - ) { - match value { - Value::Null => (), - Value::Bool(b) => output_strings.push((b.to_string(), b.to_string())), - Value::Number(number) => { - if let Some(float) = number.as_f64() { - output_numbers.push(float); - } - } - Value::String(original) => { - let normalized = original.trim().to_lowercase(); - output_strings.push((normalized, original.clone())); - } - Value::Array(values) => { - if can_recurse { - for value in values { - inner_extract_facet_values(value, false, output_numbers, output_strings); - } - } - } - Value::Object(_) => (), - } - } - - let mut facet_number_values = Vec::new(); - let mut facet_string_values = Vec::new(); - inner_extract_facet_values(value, true, &mut facet_number_values, &mut facet_string_values); - - (facet_number_values, facet_string_values) -} diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index b273460d1..7bfaa6ecd 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -11,15 +11,14 @@ use log::info; use roaring::RoaringBitmap; use serde_json::{Map, Value}; -use 
super::merge_function::merge_two_obkvs; -use super::{create_sorter, create_writer, IndexDocumentsMethod}; -use crate::error::{Error, InternalError, UserError}; -use crate::index::db_name; -use crate::update::index_documents::merge_function::{keep_latest_obkv, merge_obkvs}; -use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; -use crate::{ - ExternalDocumentsIds, FieldDistribution, FieldId, FieldsIdsMap, Index, MergeFn, Result, BEU32, +use super::helpers::{ + create_sorter, create_writer, keep_latest_obkv, merge_obkvs, merge_two_obkvs, MergeFn, }; +use super::IndexDocumentsMethod; +use crate::error::{InternalError, UserError}; +use crate::index::db_name; +use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; +use crate::{ExternalDocumentsIds, FieldDistribution, FieldId, FieldsIdsMap, Index, Result, BEU32}; const DEFAULT_PRIMARY_KEY_NAME: &str = "id"; @@ -46,7 +45,6 @@ pub struct Transform<'t, 'i> { pub log_every_n: Option, pub chunk_compression_type: CompressionType, pub chunk_compression_level: Option, - pub chunk_fusing_shrink_size: Option, pub max_nb_chunks: Option, pub max_memory: Option, pub index_documents_method: IndexDocumentsMethod, @@ -149,7 +147,6 @@ impl Transform<'_, '_> { merge_function, self.chunk_compression_type, self.chunk_compression_level, - self.chunk_fusing_shrink_size, self.max_nb_chunks, self.max_memory, ); @@ -169,7 +166,7 @@ impl Transform<'_, '_> { } obkv_buffer.clear(); - let mut writer = obkv::KvWriter::new(&mut obkv_buffer); + let mut writer = obkv::KvWriter::<_, FieldId>::new(&mut obkv_buffer); // We prepare the fields ids map with the documents keys. for (key, _value) in &document { @@ -209,7 +206,6 @@ impl Transform<'_, '_> { .map_err(InternalError::SerdeJson)?; writer.insert(field_id, &json_buffer)?; } - // We validate the document id [a-zA-Z0-9\-_]. if field_id == primary_key_id && validate_document_id(&external_id).is_none() { return Err(UserError::InvalidDocumentId { @@ -291,7 +287,6 @@ impl Transform<'_, '_> { keep_latest_obkv, self.chunk_compression_type, self.chunk_compression_level, - self.chunk_fusing_shrink_size, self.max_nb_chunks, self.max_memory, ); @@ -306,7 +301,7 @@ impl Transform<'_, '_> { let mut record = csv::StringRecord::new(); while csv.read_record(&mut record).map_err(UserError::Csv)? { obkv_buffer.clear(); - let mut writer = obkv::KvWriter::new(&mut obkv_buffer); + let mut writer = obkv::KvWriter::<_, FieldId>::new(&mut obkv_buffer); if self.log_every_n.map_or(false, |len| documents_count % len == 0) { progress_callback(UpdateIndexingStep::TransformFromUserIntoGenericFormat { @@ -372,9 +367,9 @@ impl Transform<'_, '_> { /// Generate the `TransformOutput` based on the given sorter that can be generated from any /// format like CSV, JSON or JSON stream. This sorter must contain a key that is the document /// id for the user side and the value must be an obkv where keys are valid fields ids. - fn output_from_sorter( + fn output_from_sorter( self, - sorter: grenad::Sorter>, + sorter: grenad::Sorter, primary_key: String, fields_ids_map: FieldsIdsMap, approximate_number_of_documents: usize, @@ -383,7 +378,6 @@ impl Transform<'_, '_> { ) -> Result where F: Fn(UpdateIndexingStep) + Sync, - Error: From, { let documents_ids = self.index.documents_ids(self.rtxn)?; let mut field_distribution = self.index.field_distribution(self.rtxn)?; @@ -391,10 +385,15 @@ impl Transform<'_, '_> { // Once we have sort and deduplicated the documents we write them into a final file. 
let mut final_sorter = create_sorter( - |_id, _obkvs| Err(InternalError::IndexingMergingKeys { process: "documents" }), + |_id, obkvs| { + if obkvs.len() == 1 { + Ok(obkvs[0].clone()) + } else { + Err(InternalError::IndexingMergingKeys { process: "documents" }.into()) + } + }, self.chunk_compression_type, self.chunk_compression_level, - self.chunk_fusing_shrink_size, self.max_nb_chunks, self.max_memory, ); @@ -405,7 +404,7 @@ impl Transform<'_, '_> { // While we write into final file we get or generate the internal documents ids. let mut documents_count = 0; - let mut iter = sorter.into_iter()?; + let mut iter = sorter.into_merger_iter()?; while let Some((external_id, update_obkv)) = iter.next()? { if self.log_every_n.map_or(false, |len| documents_count % len == 0) { progress_callback(UpdateIndexingStep::ComputeIdsAndMergeDocuments { @@ -534,7 +533,7 @@ impl Transform<'_, '_> { let docid = docid.get(); obkv_buffer.clear(); - let mut obkv_writer = obkv::KvWriter::new(&mut obkv_buffer); + let mut obkv_writer = obkv::KvWriter::<_, FieldId>::new(&mut obkv_buffer); // We iterate over the new `FieldsIdsMap` ids in order and construct the new obkv. for (id, name) in new_fields_ids_map.iter() { diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs new file mode 100644 index 000000000..e7617bdab --- /dev/null +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -0,0 +1,272 @@ +use std::fs::File; + +use heed::types::ByteSlice; +use heed::{BytesDecode, RwTxn}; +use roaring::RoaringBitmap; + +use super::helpers::{ + roaring_bitmap_from_u32s_array, serialize_roaring_bitmap, valid_lmdb_key, CursorClonableMmap, +}; +use crate::heed_codec::facet::{decode_prefix_string, encode_prefix_string}; +use crate::update::index_documents::helpers::into_clonable_grenad; +use crate::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, Index, Result}; + +pub(crate) enum TypedChunk { + DocidWordPositions(grenad::Reader), + FieldIdDocidFacetStrings(grenad::Reader), + FieldIdDocidFacetNumbers(grenad::Reader), + Documents(grenad::Reader), + FieldIdWordcountDocids(grenad::Reader), + NewDocumentsIds(RoaringBitmap), + WordDocids(grenad::Reader), + WordLevelPositionDocids(grenad::Reader), + WordPairProximityDocids(grenad::Reader), + FieldIdFacetStringDocids(grenad::Reader), + FieldIdFacetNumberDocids(grenad::Reader), +} + +/// Write typed chunk in the corresponding LMDB database of the provided index. +/// Return new documents seen. +pub(crate) fn write_typed_chunk_into_index( + typed_chunk: TypedChunk, + index: &Index, + wtxn: &mut RwTxn, + index_is_empty: bool, +) -> Result { + match typed_chunk { + TypedChunk::DocidWordPositions(docid_word_positions_iter) => { + write_entries_into_database( + docid_word_positions_iter, + &index.docid_word_positions, + wtxn, + index_is_empty, + |value, buffer| { + // ensure that values are unique and ordered + let positions = roaring_bitmap_from_u32s_array(value); + BoRoaringBitmapCodec::serialize_into(&positions, buffer); + Ok(buffer) + }, + |new_values, db_values, buffer| { + let new_values = roaring_bitmap_from_u32s_array(new_values); + let positions = match BoRoaringBitmapCodec::bytes_decode(db_values) { + Some(db_values) => new_values | db_values, + None => new_values, // should not happen + }; + BoRoaringBitmapCodec::serialize_into(&positions, buffer); + Ok(()) + }, + )?; + } + TypedChunk::Documents(mut obkv_documents_iter) => { + while let Some((key, value)) = obkv_documents_iter.next()? 
{ + index.documents.remap_types::().put(wtxn, key, value)?; + } + } + TypedChunk::FieldIdWordcountDocids(fid_word_count_docids_iter) => { + append_entries_into_database( + fid_word_count_docids_iter, + &index.field_id_word_count_docids, + wtxn, + index_is_empty, + |value, _buffer| Ok(value), + merge_cbo_roaring_bitmaps, + )?; + } + TypedChunk::NewDocumentsIds(documents_ids) => return Ok(documents_ids), + TypedChunk::WordDocids(word_docids_iter) => { + let mut word_docids_iter = unsafe { into_clonable_grenad(word_docids_iter) }?; + append_entries_into_database( + word_docids_iter.clone(), + &index.word_docids, + wtxn, + index_is_empty, + |value, _buffer| Ok(value), + merge_roaring_bitmaps, + )?; + + // create fst from word docids + let mut builder = fst::SetBuilder::memory(); + while let Some((word, _value)) = word_docids_iter.next()? { + // This is a lexicographically ordered word position + // we use the key to construct the words fst. + builder.insert(word)?; + } + let fst = builder.into_set().map_data(std::borrow::Cow::Owned).unwrap(); + let db_fst = index.words_fst(wtxn)?; + + // merge new fst with database fst + let union_stream = fst.op().add(db_fst.stream()).union(); + let mut builder = fst::SetBuilder::memory(); + builder.extend_stream(union_stream)?; + let fst = builder.into_set(); + index.put_words_fst(wtxn, &fst)?; + } + TypedChunk::WordLevelPositionDocids(word_level_position_docids_iter) => { + append_entries_into_database( + word_level_position_docids_iter, + &index.word_level_position_docids, + wtxn, + index_is_empty, + |value, _buffer| Ok(value), + merge_cbo_roaring_bitmaps, + )?; + } + TypedChunk::FieldIdFacetNumberDocids(facet_id_f64_docids_iter) => { + append_entries_into_database( + facet_id_f64_docids_iter, + &index.facet_id_f64_docids, + wtxn, + index_is_empty, + |value, _buffer| Ok(value), + merge_cbo_roaring_bitmaps, + )?; + } + TypedChunk::WordPairProximityDocids(word_pair_proximity_docids_iter) => { + append_entries_into_database( + word_pair_proximity_docids_iter, + &index.word_pair_proximity_docids, + wtxn, + index_is_empty, + |value, _buffer| Ok(value), + merge_cbo_roaring_bitmaps, + )?; + } + TypedChunk::FieldIdDocidFacetNumbers(mut fid_docid_facet_number) => { + let index_fid_docid_facet_numbers = + index.field_id_docid_facet_f64s.remap_types::(); + while let Some((key, value)) = fid_docid_facet_number.next()? { + if valid_lmdb_key(key) { + index_fid_docid_facet_numbers.put(wtxn, key, &value)?; + } + } + } + TypedChunk::FieldIdDocidFacetStrings(mut fid_docid_facet_string) => { + let index_fid_docid_facet_strings = + index.field_id_docid_facet_strings.remap_types::(); + while let Some((key, value)) = fid_docid_facet_string.next()? { + if valid_lmdb_key(key) { + index_fid_docid_facet_strings.put(wtxn, key, &value)?; + } + } + } + TypedChunk::FieldIdFacetStringDocids(facet_id_string_docids) => { + append_entries_into_database( + facet_id_string_docids, + &index.facet_id_string_docids, + wtxn, + index_is_empty, + |value, _buffer| Ok(value), + |new_values, db_values, buffer| { + let (_, new_values) = decode_prefix_string(new_values).unwrap(); + let new_values = RoaringBitmap::deserialize_from(new_values)?; + let (db_original, db_values) = decode_prefix_string(db_values).unwrap(); + let db_values = RoaringBitmap::deserialize_from(db_values)?; + let values = new_values | db_values; + encode_prefix_string(db_original, buffer)?; + Ok(values.serialize_into(buffer)?) 
+ }, + )?; + } + } + + Ok(RoaringBitmap::new()) +} + +fn merge_roaring_bitmaps(new_value: &[u8], db_value: &[u8], buffer: &mut Vec) -> Result<()> { + let new_value = RoaringBitmap::deserialize_from(new_value)?; + let db_value = RoaringBitmap::deserialize_from(db_value)?; + let value = new_value | db_value; + Ok(serialize_roaring_bitmap(&value, buffer)?) +} + +fn merge_cbo_roaring_bitmaps( + new_value: &[u8], + db_value: &[u8], + buffer: &mut Vec, +) -> Result<()> { + let new_value = CboRoaringBitmapCodec::deserialize_from(new_value)?; + let db_value = CboRoaringBitmapCodec::deserialize_from(db_value)?; + let value = new_value | db_value; + Ok(CboRoaringBitmapCodec::serialize_into(&value, buffer)) +} + +/// Write provided entries in database using serialize_value function. +/// merge_values function is used if an entry already exist in the database. +fn write_entries_into_database( + mut data: grenad::Reader, + database: &heed::Database, + wtxn: &mut RwTxn, + index_is_empty: bool, + serialize_value: FS, + merge_values: FM, +) -> Result<()> +where + R: std::io::Read, + FS: for<'a> Fn(&'a [u8], &'a mut Vec) -> Result<&'a [u8]>, + FM: Fn(&[u8], &[u8], &mut Vec) -> Result<()>, +{ + let mut buffer = Vec::new(); + let database = database.remap_types::(); + + while let Some((key, value)) = data.next()? { + if valid_lmdb_key(key) { + buffer.clear(); + let value = if index_is_empty { + serialize_value(value, &mut buffer)? + } else { + match database.get(wtxn, key)? { + Some(prev_value) => { + merge_values(value, prev_value, &mut buffer)?; + &buffer[..] + } + None => serialize_value(value, &mut buffer)?, + } + }; + database.put(wtxn, key, value)?; + } + } + + Ok(()) +} + +/// Write provided entries in database using serialize_value function. +/// merge_values function is used if an entry already exist in the database. +/// All provided entries must be ordered. +/// If the index is not empty, write_entries_into_database is called instead. +fn append_entries_into_database( + mut data: grenad::Reader, + database: &heed::Database, + wtxn: &mut RwTxn, + index_is_empty: bool, + serialize_value: FS, + merge_values: FM, +) -> Result<()> +where + R: std::io::Read, + FS: for<'a> Fn(&'a [u8], &'a mut Vec) -> Result<&'a [u8]>, + FM: Fn(&[u8], &[u8], &mut Vec) -> Result<()>, +{ + if !index_is_empty { + return write_entries_into_database( + data, + database, + wtxn, + false, + serialize_value, + merge_values, + ); + } + + let mut buffer = Vec::new(); + let mut database = database.iter_mut(wtxn)?.remap_types::(); + + while let Some((key, value)) = data.next()? { + if valid_lmdb_key(key) { + buffer.clear(); + let value = serialize_value(value, &mut buffer)?; + unsafe { database.append(key, value)? 
}; + } + } + + Ok(()) +} diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 1d0e15cff..ef23286ae 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -65,10 +65,8 @@ pub struct Settings<'a, 't, 'u, 'i> { pub(crate) log_every_n: Option, pub(crate) max_nb_chunks: Option, pub(crate) max_memory: Option, - pub(crate) linked_hash_map_size: Option, pub(crate) chunk_compression_type: CompressionType, pub(crate) chunk_compression_level: Option, - pub(crate) chunk_fusing_shrink_size: Option, pub(crate) thread_pool: Option<&'a ThreadPool>, update_id: u64, @@ -95,10 +93,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { log_every_n: None, max_nb_chunks: None, max_memory: None, - linked_hash_map_size: None, chunk_compression_type: CompressionType::None, chunk_compression_level: None, - chunk_fusing_shrink_size: None, thread_pool: None, searchable_fields: Setting::NotSet, displayed_fields: Setting::NotSet, @@ -205,7 +201,6 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { log_every_n: self.log_every_n, chunk_compression_type: self.chunk_compression_type, chunk_compression_level: self.chunk_compression_level, - chunk_fusing_shrink_size: self.chunk_fusing_shrink_size, max_nb_chunks: self.max_nb_chunks, max_memory: self.max_memory, index_documents_method: IndexDocumentsMethod::ReplaceDocuments, @@ -232,10 +227,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { indexing_builder.log_every_n = self.log_every_n; indexing_builder.max_nb_chunks = self.max_nb_chunks; indexing_builder.max_memory = self.max_memory; - indexing_builder.linked_hash_map_size = self.linked_hash_map_size; indexing_builder.chunk_compression_type = self.chunk_compression_type; indexing_builder.chunk_compression_level = self.chunk_compression_level; - indexing_builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; indexing_builder.thread_pool = self.thread_pool; indexing_builder.execute_raw(output, &cb)?; diff --git a/milli/src/update/update_builder.rs b/milli/src/update/update_builder.rs index 2816ebca0..6035499b3 100644 --- a/milli/src/update/update_builder.rs +++ b/milli/src/update/update_builder.rs @@ -8,10 +8,8 @@ pub struct UpdateBuilder<'a> { pub(crate) log_every_n: Option, pub(crate) max_nb_chunks: Option, pub(crate) max_memory: Option, - pub(crate) linked_hash_map_size: Option, pub(crate) chunk_compression_type: CompressionType, pub(crate) chunk_compression_level: Option, - pub(crate) chunk_fusing_shrink_size: Option, pub(crate) thread_pool: Option<&'a ThreadPool>, pub(crate) update_id: u64, } @@ -22,10 +20,8 @@ impl<'a> UpdateBuilder<'a> { log_every_n: None, max_nb_chunks: None, max_memory: None, - linked_hash_map_size: None, chunk_compression_type: CompressionType::None, chunk_compression_level: None, - chunk_fusing_shrink_size: None, thread_pool: None, update_id, } @@ -43,10 +39,6 @@ impl<'a> UpdateBuilder<'a> { self.max_memory = Some(max_memory); } - pub fn linked_hash_map_size(&mut self, linked_hash_map_size: usize) { - self.linked_hash_map_size = Some(linked_hash_map_size); - } - pub fn chunk_compression_type(&mut self, chunk_compression_type: CompressionType) { self.chunk_compression_type = chunk_compression_type; } @@ -55,10 +47,6 @@ impl<'a> UpdateBuilder<'a> { self.chunk_compression_level = Some(chunk_compression_level); } - pub fn chunk_fusing_shrink_size(&mut self, chunk_fusing_shrink_size: u64) { - self.chunk_fusing_shrink_size = Some(chunk_fusing_shrink_size); - } - pub fn thread_pool(&mut self, thread_pool: &'a ThreadPool) { self.thread_pool = 
Some(thread_pool); } @@ -89,10 +77,8 @@ impl<'a> UpdateBuilder<'a> { builder.log_every_n = self.log_every_n; builder.max_nb_chunks = self.max_nb_chunks; builder.max_memory = self.max_memory; - builder.linked_hash_map_size = self.linked_hash_map_size; builder.chunk_compression_type = self.chunk_compression_type; builder.chunk_compression_level = self.chunk_compression_level; - builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; builder.thread_pool = self.thread_pool; builder @@ -108,10 +94,8 @@ impl<'a> UpdateBuilder<'a> { builder.log_every_n = self.log_every_n; builder.max_nb_chunks = self.max_nb_chunks; builder.max_memory = self.max_memory; - builder.linked_hash_map_size = self.linked_hash_map_size; builder.chunk_compression_type = self.chunk_compression_type; builder.chunk_compression_level = self.chunk_compression_level; - builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; builder.thread_pool = self.thread_pool; builder @@ -126,7 +110,6 @@ impl<'a> UpdateBuilder<'a> { builder.chunk_compression_type = self.chunk_compression_type; builder.chunk_compression_level = self.chunk_compression_level; - builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; builder } diff --git a/milli/src/update/word_prefix_docids.rs b/milli/src/update/word_prefix_docids.rs index ffc359719..b8a80938c 100644 --- a/milli/src/update/word_prefix_docids.rs +++ b/milli/src/update/word_prefix_docids.rs @@ -5,7 +5,7 @@ use grenad::CompressionType; use heed::types::ByteSlice; use crate::update::index_documents::{ - create_sorter, roaring_bitmap_merge, sorter_into_lmdb_database, WriteMethod, + create_sorter, merge_roaring_bitmaps, sorter_into_lmdb_database, WriteMethod, }; use crate::{Index, Result}; @@ -14,7 +14,6 @@ pub struct WordPrefixDocids<'t, 'u, 'i> { index: &'i Index, pub(crate) chunk_compression_type: CompressionType, pub(crate) chunk_compression_level: Option, - pub(crate) chunk_fusing_shrink_size: Option, pub(crate) max_nb_chunks: Option, pub(crate) max_memory: Option, } @@ -29,7 +28,6 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { index, chunk_compression_type: CompressionType::None, chunk_compression_level: None, - chunk_fusing_shrink_size: None, max_nb_chunks: None, max_memory: None, } @@ -44,10 +42,9 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { // It is forbidden to keep a mutable reference into the database // and write into it at the same time, therefore we write into another file. 
let mut prefix_docids_sorter = create_sorter( - roaring_bitmap_merge, + merge_roaring_bitmaps, self.chunk_compression_type, self.chunk_compression_level, - self.chunk_fusing_shrink_size, self.max_nb_chunks, self.max_memory, ); @@ -70,7 +67,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { self.wtxn, *self.index.word_prefix_docids.as_polymorph(), prefix_docids_sorter, - roaring_bitmap_merge, + merge_roaring_bitmaps, WriteMethod::Append, )?; diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs index 9b876321e..8f04c23cf 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -9,7 +9,7 @@ use log::debug; use crate::heed_codec::StrStrU8Codec; use crate::update::index_documents::{ - cbo_roaring_bitmap_merge, create_sorter, sorter_into_lmdb_database, WriteMethod, + create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, WriteMethod, }; use crate::{Index, Result}; @@ -18,7 +18,6 @@ pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> { index: &'i Index, pub(crate) chunk_compression_type: CompressionType, pub(crate) chunk_compression_level: Option, - pub(crate) chunk_fusing_shrink_size: Option, pub(crate) max_nb_chunks: Option, pub(crate) max_memory: Option, } @@ -33,7 +32,6 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { index, chunk_compression_type: CompressionType::None, chunk_compression_level: None, - chunk_fusing_shrink_size: None, max_nb_chunks: None, max_memory: None, } @@ -48,10 +46,9 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { // Here we create a sorter akin to the previous one. let mut word_prefix_pair_proximity_docids_sorter = create_sorter( - cbo_roaring_bitmap_merge, + merge_cbo_roaring_bitmaps, self.chunk_compression_type, self.chunk_compression_level, - self.chunk_fusing_shrink_size, self.max_nb_chunks, self.max_memory, ); @@ -78,7 +75,7 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { self.wtxn, *self.index.word_prefix_pair_proximity_docids.as_polymorph(), word_prefix_pair_proximity_docids_sorter, - cbo_roaring_bitmap_merge, + merge_cbo_roaring_bitmaps, WriteMethod::Append, )?; diff --git a/milli/src/update/words_level_positions.rs b/milli/src/update/words_level_positions.rs index 2f0995c18..afd7d7736 100644 --- a/milli/src/update/words_level_positions.rs +++ b/milli/src/update/words_level_positions.rs @@ -4,7 +4,7 @@ use std::num::NonZeroU32; use std::{cmp, str}; use fst::Streamer; -use grenad::{CompressionType, FileFuse, Reader, Writer}; +use grenad::{CompressionType, Reader, Writer}; use heed::types::{ByteSlice, DecodeIgnore, Str}; use heed::{BytesEncode, Error}; use log::debug; @@ -14,7 +14,7 @@ use crate::error::{InternalError, SerializationError}; use crate::heed_codec::{CboRoaringBitmapCodec, StrLevelPositionCodec}; use crate::index::main_key::WORDS_PREFIXES_FST_KEY; use crate::update::index_documents::{ - cbo_roaring_bitmap_merge, create_sorter, create_writer, sorter_into_lmdb_database, + create_sorter, create_writer, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, write_into_lmdb_database, writer_into_reader, WriteMethod, }; use crate::{Index, Result, TreeLevel}; @@ -24,7 +24,6 @@ pub struct WordsLevelPositions<'t, 'u, 'i> { index: &'i Index, pub(crate) chunk_compression_type: CompressionType, pub(crate) chunk_compression_level: Option, - pub(crate) chunk_fusing_shrink_size: Option, pub(crate) max_nb_chunks: Option, pub(crate) max_memory: Option, 
level_group_size: NonZeroU32, @@ -41,7 +40,6 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { index, chunk_compression_type: CompressionType::None, chunk_compression_level: None, - chunk_fusing_shrink_size: None, max_nb_chunks: None, max_memory: None, level_group_size: NonZeroU32::new(4).unwrap(), @@ -68,7 +66,6 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { self.index.word_level_position_docids, self.chunk_compression_type, self.chunk_compression_level, - self.chunk_fusing_shrink_size, self.level_group_size, self.min_level_size, )?; @@ -81,7 +78,7 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { self.wtxn, *self.index.word_level_position_docids.as_polymorph(), entries, - |_, _| Err(InternalError::IndexingMergingKeys { process: "word level position" }), + |_, _| Err(InternalError::IndexingMergingKeys { process: "word level position" })?, WriteMethod::Append, )?; @@ -89,10 +86,9 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { self.index.word_prefix_level_position_docids.clear(self.wtxn)?; let mut word_prefix_level_positions_docids_sorter = create_sorter( - cbo_roaring_bitmap_merge, + merge_cbo_roaring_bitmaps, self.chunk_compression_type, self.chunk_compression_level, - self.chunk_fusing_shrink_size, self.max_nb_chunks, self.max_memory, ); @@ -131,7 +127,7 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { self.wtxn, *self.index.word_prefix_level_position_docids.as_polymorph(), word_prefix_level_positions_docids_sorter, - cbo_roaring_bitmap_merge, + merge_cbo_roaring_bitmaps, WriteMethod::Append, )?; @@ -141,7 +137,6 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { self.index.word_prefix_level_position_docids, self.chunk_compression_type, self.chunk_compression_level, - self.chunk_fusing_shrink_size, self.level_group_size, self.min_level_size, )?; @@ -155,7 +150,7 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { *self.index.word_prefix_level_position_docids.as_polymorph(), entries, |_, _| { - Err(InternalError::IndexingMergingKeys { process: "word prefix level position" }) + Err(InternalError::IndexingMergingKeys { process: "word prefix level position" })? }, WriteMethod::Append, )?; @@ -185,10 +180,9 @@ fn compute_positions_levels( words_positions_db: heed::Database, compression_type: CompressionType, compression_level: Option, - shrink_size: Option, level_group_size: NonZeroU32, min_level_size: NonZeroU32, -) -> Result> { +) -> Result> { // It is forbidden to keep a cursor and write in a database at the same time with LMDB // therefore we write the facet levels entries into a grenad file before transfering them. 
let mut writer = tempfile::tempfile() @@ -254,7 +248,7 @@ fn compute_positions_levels( } } - writer_into_reader(writer, shrink_size) + writer_into_reader(writer) } fn write_level_entry( diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index 7d4043ff1..a533a4cbe 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -5,7 +5,7 @@ use big_s::S; use either::{Either, Left, Right}; use heed::EnvOpenOptions; use maplit::{hashmap, hashset}; -use milli::update::{IndexDocuments, Settings, UpdateFormat}; +use milli::update::{IndexDocuments, Settings, UpdateBuilder, UpdateFormat}; use milli::{AscDesc, Criterion, DocumentId, Index}; use serde::Deserialize; use slice_group_by::GroupBy; @@ -50,7 +50,9 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { builder.execute(|_, _| ()).unwrap(); // index documents - let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); + let mut builder = UpdateBuilder::new(0); + builder.max_memory(10 * 1024 * 1024); // 10MiB + let mut builder = builder.index_documents(&mut wtxn, &index); builder.update_format(UpdateFormat::JsonStream); builder.enable_autogenerate_docids(); builder.execute(CONTENT.as_bytes(), |_, _| ()).unwrap(); From 823da19745d4c4ada50873d7eddc4e0332c506bc Mon Sep 17 00:00:00 2001 From: many Date: Tue, 17 Aug 2021 10:56:06 +0200 Subject: [PATCH 03/15] Fix test and use progress callback --- .../src/update/index_documents/extract/mod.rs | 2 + milli/src/update/index_documents/mod.rs | 78 +++++++++++++++---- .../src/update/index_documents/typed_chunk.rs | 15 +++- 3 files changed, 79 insertions(+), 16 deletions(-) diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index b24c80da4..a389f36cf 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -11,6 +11,7 @@ use std::collections::HashSet; use std::fs::File; use crossbeam_channel::Sender; +use log::debug; use rayon::prelude::*; use self::extract_docid_word_positions::extract_docid_word_positions; @@ -192,6 +193,7 @@ fn spawn_extraction_task( .map(|chunk| extract_fn(chunk, indexer.clone()).unwrap()) .collect(); rayon::spawn(move || { + debug!("merge {} database", name); let reader = merge_readers(chunks, merge_fn, indexer).unwrap(); lmdb_writer_sx.send(serialize_fn(reader)).unwrap(); }); diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 4f488337c..51b0a6613 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -31,6 +31,10 @@ use crate::update::{ }; use crate::{Index, Result}; +static MERGED_DATABASE_COUNT: usize = 7; +static PREFIX_DATABASE_COUNT: usize = 5; +static TOTAL_POSTING_DATABASE_COUNT: usize = MERGED_DATABASE_COUNT + PREFIX_DATABASE_COUNT; + #[derive(Debug, Serialize, Deserialize, Clone)] pub struct DocumentAdditionResult { pub nb_documents: usize, @@ -278,15 +282,34 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { let index_is_empty = index_documents_ids.len() == 0; let mut final_documents_ids = RoaringBitmap::new(); + let mut databases_seen = 0; + progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { + databases_seen, + total_databases: TOTAL_POSTING_DATABASE_COUNT, + }); + for typed_chunk in lmdb_writer_rx { - let docids = + let (docids, is_merged_database) = write_typed_chunk_into_index(typed_chunk, &self.index, self.wtxn, index_is_empty)?; - final_documents_ids |= docids; - debug!( - "We have 
seen {} documents on {} total document so far", - final_documents_ids.len(), - documents_count - ); + if !docids.is_empty() { + final_documents_ids |= docids; + let documents_seen_count = final_documents_ids.len(); + progress_callback(UpdateIndexingStep::IndexDocuments { + documents_seen: documents_seen_count as usize, + total_documents: documents_count, + }); + debug!( + "We have seen {} documents out of {} total documents so far", + documents_seen_count, documents_count + ); + } + if is_merged_database { + databases_seen += 1; + progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { + databases_seen: databases_seen, + total_databases: TOTAL_POSTING_DATABASE_COUNT, + }); + } } // We write the field distribution into the main database @@ -298,20 +321,19 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { // We write the external documents ids into the main database. self.index.put_external_documents_ids(self.wtxn, &external_documents_ids)?; - let all_documents_ids = index_documents_ids | new_documents_ids; + let all_documents_ids = index_documents_ids | new_documents_ids | replaced_documents_ids; self.index.put_documents_ids(self.wtxn, &all_documents_ids)?; self.execute_prefix_databases(progress_callback) } - pub fn execute_prefix_databases( - self, - // output: TransformOutput, - progress_callback: F, - ) -> Result<()> + pub fn execute_prefix_databases(self, progress_callback: F) -> Result<()> where F: Fn(UpdateIndexingStep) + Sync, { + // Merged databases have already been indexed; we start from this count. + let mut databases_seen = MERGED_DATABASE_COUNT; + // Run the facets update operation. let mut builder = Facets::new(self.wtxn, self.index, self.update_id); builder.chunk_compression_type = self.chunk_compression_type; @@ -324,6 +346,12 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { } builder.execute()?; + databases_seen += 1; + progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { + databases_seen: databases_seen, + total_databases: TOTAL_POSTING_DATABASE_COUNT, + }); + // Run the words prefixes update operation. let mut builder = WordsPrefixesFst::new(self.wtxn, self.index, self.update_id); if let Some(value) = self.words_prefix_threshold { @@ -334,6 +362,12 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { } builder.execute()?; + databases_seen += 1; + progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { + databases_seen: databases_seen, + total_databases: TOTAL_POSTING_DATABASE_COUNT, + }); + // Run the word prefix docids update operation. let mut builder = WordPrefixDocids::new(self.wtxn, self.index); builder.chunk_compression_type = self.chunk_compression_type; @@ -342,6 +376,12 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { builder.max_memory = self.max_memory; builder.execute()?; + databases_seen += 1; + progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { + databases_seen: databases_seen, + total_databases: TOTAL_POSTING_DATABASE_COUNT, + }); + // Run the word prefix pair proximity docids update operation.
let mut builder = WordPrefixPairProximityDocids::new(self.wtxn, self.index); builder.chunk_compression_type = self.chunk_compression_type; @@ -350,6 +390,12 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { builder.max_memory = self.max_memory; builder.execute()?; + databases_seen += 1; + progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { + databases_seen: databases_seen, + total_databases: TOTAL_POSTING_DATABASE_COUNT, + }); + // Run the words level positions update operation. let mut builder = WordsLevelPositions::new(self.wtxn, self.index); builder.chunk_compression_type = self.chunk_compression_type; @@ -362,6 +408,12 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { } builder.execute()?; + databases_seen += 1; + progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { + databases_seen: databases_seen, + total_databases: TOTAL_POSTING_DATABASE_COUNT, + }); + Ok(()) } } diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index e7617bdab..e8790af16 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -32,7 +32,8 @@ pub(crate) fn write_typed_chunk_into_index( index: &Index, wtxn: &mut RwTxn, index_is_empty: bool, -) -> Result { +) -> Result<(RoaringBitmap, bool)> { + let mut is_merged_database = false; match typed_chunk { TypedChunk::DocidWordPositions(docid_word_positions_iter) => { write_entries_into_database( @@ -71,8 +72,11 @@ pub(crate) fn write_typed_chunk_into_index( |value, _buffer| Ok(value), merge_cbo_roaring_bitmaps, )?; + is_merged_database = true; + } + TypedChunk::NewDocumentsIds(documents_ids) => { + return Ok((documents_ids, is_merged_database)) } - TypedChunk::NewDocumentsIds(documents_ids) => return Ok(documents_ids), TypedChunk::WordDocids(word_docids_iter) => { let mut word_docids_iter = unsafe { into_clonable_grenad(word_docids_iter) }?; append_entries_into_database( @@ -100,6 +104,7 @@ pub(crate) fn write_typed_chunk_into_index( builder.extend_stream(union_stream)?; let fst = builder.into_set(); index.put_words_fst(wtxn, &fst)?; + is_merged_database = true; } TypedChunk::WordLevelPositionDocids(word_level_position_docids_iter) => { append_entries_into_database( @@ -110,6 +115,7 @@ pub(crate) fn write_typed_chunk_into_index( |value, _buffer| Ok(value), merge_cbo_roaring_bitmaps, )?; + is_merged_database = true; } TypedChunk::FieldIdFacetNumberDocids(facet_id_f64_docids_iter) => { append_entries_into_database( @@ -120,6 +126,7 @@ pub(crate) fn write_typed_chunk_into_index( |value, _buffer| Ok(value), merge_cbo_roaring_bitmaps, )?; + is_merged_database = true; } TypedChunk::WordPairProximityDocids(word_pair_proximity_docids_iter) => { append_entries_into_database( @@ -130,6 +137,7 @@ pub(crate) fn write_typed_chunk_into_index( |value, _buffer| Ok(value), merge_cbo_roaring_bitmaps, )?; + is_merged_database = true; } TypedChunk::FieldIdDocidFacetNumbers(mut fid_docid_facet_number) => { let index_fid_docid_facet_numbers = @@ -166,10 +174,11 @@ pub(crate) fn write_typed_chunk_into_index( Ok(values.serialize_into(buffer)?) 
}, )?; + is_merged_database = true; } } - Ok(RoaringBitmap::new()) + Ok((RoaringBitmap::new(), is_merged_database)) } fn merge_roaring_bitmaps(new_value: &[u8], db_value: &[u8], buffer: &mut Vec) -> Result<()> { From 2d1727697dbc802822a5b427f11cdfd1aada768c Mon Sep 17 00:00:00 2001 From: many Date: Tue, 17 Aug 2021 12:25:07 +0200 Subject: [PATCH 04/15] Take stop word in account --- .../index_documents/extract/extract_docid_word_positions.rs | 5 +++++ milli/src/update/index_documents/extract/mod.rs | 2 ++ milli/src/update/index_documents/mod.rs | 4 ++++ 3 files changed, 11 insertions(+) diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index 9a9d7cb85..3ee7ee3b3 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -21,6 +21,7 @@ pub fn extract_docid_word_positions( mut obkv_documents: grenad::Reader, indexer: GrenadParameters, searchable_fields: &Option>, + stop_words: Option<&fst::Set<&[u8]>>, ) -> Result<(RoaringBitmap, grenad::Reader)> { let max_memory = indexer.max_memory_by_thread(); @@ -35,6 +36,10 @@ pub fn extract_docid_word_positions( let mut key_buffer = Vec::new(); let mut field_buffer = String::new(); + let mut config = AnalyzerConfig::default(); + if let Some(stop_words) = stop_words { + config.stop_words(stop_words); + } let analyzer = Analyzer::>::new(AnalyzerConfig::default()); while let Some((key, value)) = obkv_documents.next()? { diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index a389f36cf..00c0a4a5f 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -37,6 +37,7 @@ pub(crate) fn data_from_obkv_documents( lmdb_writer_sx: Sender, searchable_fields: Option>, faceted_fields: HashSet, + stop_words: Option>, ) -> Result<()> { let result: Result<(Vec<_>, (Vec<_>, Vec<_>))> = obkv_chunks .par_bridge() @@ -54,6 +55,7 @@ pub(crate) fn data_from_obkv_documents( documents_chunk.clone(), indexer.clone(), &searchable_fields, + stop_words.as_ref(), )?; // send documents_ids to DB writer diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 51b0a6613..c9f5da0c1 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -231,6 +231,9 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { // get filterable fields for facet databases let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?; + let stop_words = self.index.stop_words(self.wtxn)?; + // let stop_words = stop_words.as_ref(); + // Run extraction pipeline in parallel. 
pool.install(|| { let params = GrenadParameters { @@ -255,6 +258,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { lmdb_writer_sx, searchable_fields, faceted_fields, + stop_words, ) .unwrap(); }); From 5c962c03dd5ba2027f408292e64ea31403eb5e38 Mon Sep 17 00:00:00 2001 From: many Date: Wed, 18 Aug 2021 18:04:24 +0200 Subject: [PATCH 05/15] Fix and optimize word_prefix_pair_proximity_docids database --- infos/src/main.rs | 64 +++++++++++ milli/src/search/criteria/exactness.rs | 4 - milli/src/search/criteria/mod.rs | 43 ++++--- milli/src/update/index_documents/mod.rs | 4 +- .../word_prefix_pair_proximity_docids.rs | 108 ++++++++++++++---- milli/src/update/words_prefixes_fst.rs | 18 ++- 6 files changed, 187 insertions(+), 54 deletions(-) diff --git a/infos/src/main.rs b/infos/src/main.rs index da15251b0..bb09d7234 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -207,6 +207,24 @@ enum Command { word2: String, }, + /// Outputs a CSV with the proximities for the two specified words and + /// the documents ids where these relations appears. + /// + /// `word1`, `prefix` defines the word pair specified *in this specific order*. + /// `proximity` defines the proximity between the two specified words. + /// `documents_ids` defines the documents ids where the relation appears. + WordPrefixPairProximitiesDocids { + /// Display the whole documents ids in details. + #[structopt(long)] + full_display: bool, + + /// First word of the word pair. + word1: String, + + /// Second word of the word pair. + prefix: String, + }, + /// Outputs the words FST to standard output. /// /// One can use the FST binary helper to dissect and analyze it, @@ -282,6 +300,9 @@ fn main() -> anyhow::Result<()> { WordPairProximitiesDocids { full_display, word1, word2 } => { word_pair_proximities_docids(&index, &rtxn, !full_display, word1, word2) } + WordPrefixPairProximitiesDocids { full_display, word1, prefix } => { + word_prefix_pair_proximities_docids(&index, &rtxn, !full_display, word1, prefix) + } ExportWordsFst => export_words_fst(&index, &rtxn), ExportWordsPrefixFst => export_words_prefix_fst(&index, &rtxn), ExportDocuments { internal_documents_ids } => { @@ -1131,3 +1152,46 @@ fn word_pair_proximities_docids( Ok(wtr.flush()?) } + +fn word_prefix_pair_proximities_docids( + index: &Index, + rtxn: &heed::RoTxn, + debug: bool, + word1: String, + word_prefix: String, +) -> anyhow::Result<()> { + use heed::types::ByteSlice; + use milli::RoaringBitmapCodec; + + let stdout = io::stdout(); + let mut wtr = csv::Writer::from_writer(stdout.lock()); + wtr.write_record(&["word1", "word_prefix", "proximity", "documents_ids"])?; + + // Create the prefix key with only the pair of words. + let mut prefix = Vec::with_capacity(word1.len() + word_prefix.len() + 1); + prefix.extend_from_slice(word1.as_bytes()); + prefix.push(0); + prefix.extend_from_slice(word_prefix.as_bytes()); + + let db = index.word_prefix_pair_proximity_docids.as_polymorph(); + let iter = db.prefix_iter::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &prefix)?; + for result in iter { + let (key, docids) = result?; + + // Skip keys that are longer than the requested one, + // a longer key means that the second word is a prefix of the request word. + if key.len() != prefix.len() + 1 { + continue; + } + + let proximity = key.last().unwrap(); + let docids = if debug { + format!("{:?}", docids) + } else { + format!("{:?}", docids.iter().collect::>()) + }; + wtr.write_record(&[&word1, &word_prefix, &proximity.to_string(), &docids])?; + } + + Ok(wtr.flush()?) 
+} diff --git a/milli/src/search/criteria/exactness.rs b/milli/src/search/criteria/exactness.rs index 22dcb9782..1e4d4e7a2 100644 --- a/milli/src/search/criteria/exactness.rs +++ b/milli/src/search/criteria/exactness.rs @@ -180,10 +180,6 @@ fn resolve_state( if let Some(attribute_allowed_docids) = ctx.field_id_word_count_docids(id, query_len)? { - println!( - "found candidates that have the good count: {:?}", - attribute_allowed_docids - ); let mut attribute_candidates_array = attribute_start_with_docids(ctx, id as u32, query)?; attribute_candidates_array.push(attribute_allowed_docids); diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 61b0fe049..2a883de67 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -461,13 +461,18 @@ fn query_pair_proximity_docids( let prefix = right.prefix; match (&left.kind, &right.kind) { (QueryKind::Exact { word: left, .. }, QueryKind::Exact { word: right, .. }) => { - if prefix && ctx.in_prefix_cache(&right) { - Ok(ctx - .word_prefix_pair_proximity_docids(left.as_str(), right.as_str(), proximity)? - .unwrap_or_default()) - } else if prefix { - let r_words = word_derivations(&right, true, 0, ctx.words_fst(), wdcache)?; - all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity) + if prefix { + match ctx.word_prefix_pair_proximity_docids( + left.as_str(), + right.as_str(), + proximity, + )? { + Some(docids) => Ok(docids), + None => { + let r_words = word_derivations(&right, true, 0, ctx.words_fst(), wdcache)?; + all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity) + } + } } else { Ok(ctx .word_pair_proximity_docids(left.as_str(), right.as_str(), proximity)? @@ -477,22 +482,24 @@ fn query_pair_proximity_docids( (QueryKind::Tolerant { typo, word: left }, QueryKind::Exact { word: right, .. }) => { let l_words = word_derivations(&left, false, *typo, ctx.words_fst(), wdcache)?.to_owned(); - if prefix && ctx.in_prefix_cache(&right) { + if prefix { let mut docids = RoaringBitmap::new(); for (left, _) in l_words { - let current_docids = ctx - .word_prefix_pair_proximity_docids( - left.as_ref(), - right.as_ref(), - proximity, - )? - .unwrap_or_default(); + let current_docids = match ctx.word_prefix_pair_proximity_docids( + left.as_str(), + right.as_str(), + proximity, + )? 
{ + Some(docids) => Ok(docids), + None => { + let r_words = + word_derivations(&right, true, 0, ctx.words_fst(), wdcache)?; + all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity) + } + }?; docids |= current_docids; } Ok(docids) - } else if prefix { - let r_words = word_derivations(&right, true, 0, ctx.words_fst(), wdcache)?; - all_word_pair_proximity_docids(ctx, &l_words, &r_words, proximity) } else { all_word_pair_proximity_docids(ctx, &l_words, &[(right, 0)], proximity) } diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index c9f5da0c1..b7fa1492c 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -21,7 +21,7 @@ use typed_chunk::{write_typed_chunk_into_index, TypedChunk}; pub use self::helpers::{ create_sorter, create_writer, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, - sorter_into_lmdb_database, write_into_lmdb_database, writer_into_reader, + sorter_into_lmdb_database, write_into_lmdb_database, writer_into_reader, MergeFn, }; use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; pub use self::transform::{Transform, TransformOutput}; @@ -81,7 +81,7 @@ pub struct IndexDocuments<'t, 'u, 'i, 'a> { pub(crate) thread_pool: Option<&'a ThreadPool>, facet_level_group_size: Option, facet_min_level_size: Option, - words_prefix_threshold: Option, + words_prefix_threshold: Option, max_prefix_length: Option, words_positions_level_group_size: Option, words_positions_min_level_size: Option, diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs index 8f04c23cf..cabe1053b 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -1,15 +1,13 @@ -use std::str; +use std::collections::HashMap; -use fst::automaton::{Automaton, Str}; -use fst::{IntoStreamer, Streamer}; +use fst::IntoStreamer; use grenad::CompressionType; use heed::types::ByteSlice; -use heed::BytesEncode; use log::debug; +use slice_group_by::GroupBy; -use crate::heed_codec::StrStrU8Codec; use crate::update::index_documents::{ - create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, WriteMethod, + create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, MergeFn, WriteMethod, }; use crate::{Index, Result}; @@ -20,6 +18,7 @@ pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> { pub(crate) chunk_compression_level: Option, pub(crate) max_nb_chunks: Option, pub(crate) max_memory: Option, + threshold: u32, } impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { @@ -34,16 +33,26 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { chunk_compression_level: None, max_nb_chunks: None, max_memory: None, + threshold: 100, } } + /// Set the number of words required to make a prefix be part of the words prefixes + /// database. If a word prefix is supposed to match more than this number of words in the + /// dictionary, then this prefix is added to the words prefixes data structures. + /// + /// Default value is 100. This value must be at least 50 and will be clamped + /// to this bound otherwise.
+ pub fn threshold(&mut self, value: u32) -> &mut Self { + self.threshold = value.max(50); + self + } + pub fn execute(self) -> Result<()> { debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); self.index.word_prefix_pair_proximity_docids.clear(self.wtxn)?; - let prefix_fst = self.index.words_prefixes_fst(self.wtxn)?; - // Here we create a sorter akin to the previous one. let mut word_prefix_pair_proximity_docids_sorter = create_sorter( merge_cbo_roaring_bitmaps, @@ -53,22 +62,59 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { self.max_memory, ); - // We insert all the word pairs corresponding to the word-prefix pairs - // where the prefixes appears in the prefix FST previously constructed. - let db = self.index.word_pair_proximity_docids.remap_data_type::(); - for result in db.iter(self.wtxn)? { - let ((word1, word2, prox), data) = result?; - let automaton = Str::new(word2).starts_with(); - let mut matching_prefixes = prefix_fst.search(automaton).into_stream(); - while let Some(prefix) = matching_prefixes.next() { - let prefix = str::from_utf8(prefix)?; - let pair = (word1, prefix, prox); - let bytes = StrStrU8Codec::bytes_encode(&pair).unwrap(); - word_prefix_pair_proximity_docids_sorter.insert(bytes, data)?; + let prefix_fst = self.index.words_prefixes_fst(self.wtxn)?; + let prefix_fst_keys = prefix_fst.into_stream().into_bytes(); + let prefix_fst_keys: Vec<_> = prefix_fst_keys + .as_slice() + .linear_group_by_key(|x| std::str::from_utf8(&x).unwrap().chars().nth(0).unwrap()) + .collect(); + + let mut db = + self.index.word_pair_proximity_docids.remap_data_type::().iter(self.wtxn)?; + + let mut buffer = Vec::new(); + let mut current_prefixes: Option<&&[Vec]> = None; + let mut prefixes_cache = HashMap::new(); + while let Some(((w1, w2, prox), data)) = db.next().transpose()? { + current_prefixes = match current_prefixes.take() { + Some(prefixes) if w2.as_bytes().starts_with(&prefixes[0]) => Some(prefixes), + _otherwise => { + write_prefixes_in_sorter( + &mut prefixes_cache, + &mut word_prefix_pair_proximity_docids_sorter, + self.threshold, + )?; + prefix_fst_keys.iter().find(|prefixes| w2.as_bytes().starts_with(&prefixes[0])) + } + }; + + if let Some(prefixes) = current_prefixes { + buffer.clear(); + buffer.extend_from_slice(w1.as_bytes()); + buffer.push(0); + for prefix in prefixes.iter().filter(|prefix| w2.as_bytes().starts_with(prefix)) { + buffer.truncate(w1.len() + 1); + buffer.extend_from_slice(prefix); + buffer.push(prox); + + match prefixes_cache.get_mut(&buffer) { + Some(value) => value.push(data), + None => { + prefixes_cache.insert(buffer.clone(), vec![data]); + } + } + } } } + write_prefixes_in_sorter( + &mut prefixes_cache, + &mut word_prefix_pair_proximity_docids_sorter, + self.threshold, + )?; + drop(prefix_fst); + drop(db); // We finally write the word prefix pair proximity docids into the LMDB database. sorter_into_lmdb_database( @@ -82,3 +128,25 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { Ok(()) } } + +fn write_prefixes_in_sorter( + prefixes: &mut HashMap, Vec<&[u8]>>, + sorter: &mut grenad::Sorter, + min_word_per_prefix: u32, +) -> Result<()> { + for (i, (key, data_slices)) in prefixes.drain().enumerate() { + // if the number of words prefixed by the prefix is higher than the threshold, + // we insert it in the sorter. 
+ if data_slices.len() > min_word_per_prefix as usize { + for data in data_slices { + sorter.insert(&key, data)?; + } + // if the first prefix isn't elligible for insertion, + // then the other prefixes can't be elligible. + } else if i == 0 { + break; + } + } + + Ok(()) +} diff --git a/milli/src/update/words_prefixes_fst.rs b/milli/src/update/words_prefixes_fst.rs index f35dea10d..be33c156b 100644 --- a/milli/src/update/words_prefixes_fst.rs +++ b/milli/src/update/words_prefixes_fst.rs @@ -8,7 +8,7 @@ use crate::{Index, Result, SmallString32}; pub struct WordsPrefixesFst<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, - threshold: f64, + threshold: u32, max_prefix_length: usize, _update_id: u64, } @@ -22,20 +22,20 @@ impl<'t, 'u, 'i> WordsPrefixesFst<'t, 'u, 'i> { WordsPrefixesFst { wtxn, index, - threshold: 0.1 / 100.0, // .01% + threshold: 100, max_prefix_length: 4, _update_id: update_id, } } - /// Set the ratio of concerned words required to make a prefix be part of the words prefixes + /// Set the number of words required to make a prefix be part of the words prefixes /// database. If a word prefix is supposed to match more than this number of words in the /// dictionnary, therefore this prefix is added to the words prefixes datastructures. /// - /// Default value is `0.01` or `1%`. This value must be between 0 and 1 and will be clamped - /// to these bounds otherwise. - pub fn threshold(&mut self, value: f64) -> &mut Self { - self.threshold = value.min(1.0).max(0.0); // clamp [0, 1] + /// Default value is 100. This value must be higher than 50 and will be clamped + /// to this bound otherwise. + pub fn threshold(&mut self, value: u32) -> &mut Self { + self.threshold = value.max(50); self } @@ -50,8 +50,6 @@ impl<'t, 'u, 'i> WordsPrefixesFst<'t, 'u, 'i> { pub fn execute(self) -> Result<()> { let words_fst = self.index.words_fst(&self.wtxn)?; - let number_of_words = words_fst.len(); - let min_number_of_words = (number_of_words as f64 * self.threshold) as usize; let mut prefix_fsts = Vec::with_capacity(self.max_prefix_length); for n in 1..=self.max_prefix_length { @@ -80,7 +78,7 @@ impl<'t, 'u, 'i> WordsPrefixesFst<'t, 'u, 'i> { current_prefix_count += 1; // There is enough words corresponding to this prefix to add it to the cache. 
- if current_prefix_count == min_number_of_words { + if current_prefix_count >= self.threshold { builder.insert(prefix)?; } } From a2f59a28f7685882df7b96ffbb1527596f8a0823 Mon Sep 17 00:00:00 2001 From: many Date: Tue, 24 Aug 2021 13:01:31 +0200 Subject: [PATCH 06/15] Remove unwrap sending errors in channel --- .../src/update/index_documents/extract/mod.rs | 162 ++++++++++-------- milli/src/update/index_documents/mod.rs | 42 +++-- 2 files changed, 120 insertions(+), 84 deletions(-) diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 00c0a4a5f..591c8d4cd 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -34,7 +34,7 @@ use crate::{FieldId, Result}; pub(crate) fn data_from_obkv_documents( obkv_chunks: impl Iterator>> + Send, indexer: GrenadParameters, - lmdb_writer_sx: Sender, + lmdb_writer_sx: Sender>, searchable_fields: Option>, faceted_fields: HashSet, stop_words: Option>, @@ -42,63 +42,14 @@ pub(crate) fn data_from_obkv_documents( let result: Result<(Vec<_>, (Vec<_>, Vec<_>))> = obkv_chunks .par_bridge() .map(|result| { - let documents_chunk = result.and_then(|c| unsafe { into_clonable_grenad(c) }).unwrap(); - - lmdb_writer_sx.send(TypedChunk::Documents(documents_chunk.clone())).unwrap(); - - let (docid_word_positions_chunk, docid_fid_facet_values_chunks): ( - Result<_>, - Result<_>, - ) = rayon::join( - || { - let (documents_ids, docid_word_positions_chunk) = extract_docid_word_positions( - documents_chunk.clone(), - indexer.clone(), - &searchable_fields, - stop_words.as_ref(), - )?; - - // send documents_ids to DB writer - lmdb_writer_sx.send(TypedChunk::NewDocumentsIds(documents_ids)).unwrap(); - - // send docid_word_positions_chunk to DB writer - let docid_word_positions_chunk = - unsafe { into_clonable_grenad(docid_word_positions_chunk)? }; - lmdb_writer_sx - .send(TypedChunk::DocidWordPositions(docid_word_positions_chunk.clone())) - .unwrap(); - Ok(docid_word_positions_chunk) - }, - || { - let (docid_fid_facet_numbers_chunk, docid_fid_facet_strings_chunk) = - extract_fid_docid_facet_values( - documents_chunk.clone(), - indexer.clone(), - &faceted_fields, - )?; - - // send docid_fid_facet_numbers_chunk to DB writer - let docid_fid_facet_numbers_chunk = - unsafe { into_clonable_grenad(docid_fid_facet_numbers_chunk)? }; - lmdb_writer_sx - .send(TypedChunk::FieldIdDocidFacetNumbers( - docid_fid_facet_numbers_chunk.clone(), - )) - .unwrap(); - - // send docid_fid_facet_strings_chunk to DB writer - let docid_fid_facet_strings_chunk = - unsafe { into_clonable_grenad(docid_fid_facet_strings_chunk)? 
}; - lmdb_writer_sx - .send(TypedChunk::FieldIdDocidFacetStrings( - docid_fid_facet_strings_chunk.clone(), - )) - .unwrap(); - - Ok((docid_fid_facet_numbers_chunk, docid_fid_facet_strings_chunk)) - }, - ); - Ok((docid_word_positions_chunk?, docid_fid_facet_values_chunks?)) + extract_documents_data( + result, + indexer, + lmdb_writer_sx.clone(), + &searchable_fields, + &faceted_fields, + &stop_words, + ) }) .collect(); @@ -177,7 +128,7 @@ pub(crate) fn data_from_obkv_documents( fn spawn_extraction_task( chunks: Vec>, indexer: GrenadParameters, - lmdb_writer_sx: Sender, + lmdb_writer_sx: Sender>, extract_fn: FE, merge_fn: MergeFn, serialize_fn: FS, @@ -190,14 +141,89 @@ fn spawn_extraction_task( FS: Fn(grenad::Reader) -> TypedChunk + Sync + Send + 'static, { rayon::spawn(move || { - let chunks: Vec<_> = chunks - .into_par_iter() - .map(|chunk| extract_fn(chunk, indexer.clone()).unwrap()) - .collect(); - rayon::spawn(move || { - debug!("merge {} database", name); - let reader = merge_readers(chunks, merge_fn, indexer).unwrap(); - lmdb_writer_sx.send(serialize_fn(reader)).unwrap(); - }); + let chunks: Result> = + chunks.into_par_iter().map(|chunk| extract_fn(chunk, indexer.clone())).collect(); + rayon::spawn(move || match chunks { + Ok(chunks) => { + debug!("merge {} database", name); + let reader = merge_readers(chunks, merge_fn, indexer); + lmdb_writer_sx.send(reader.map(|r| serialize_fn(r))).unwrap(); + } + Err(e) => lmdb_writer_sx.send(Err(e)).unwrap(), + }) }); } + +/// Extract chuncked data and send it into lmdb_writer_sx sender: +/// - documents +/// - documents_ids +/// - docid_word_positions +/// - docid_fid_facet_numbers +/// - docid_fid_facet_strings +fn extract_documents_data( + documents_chunk: Result>, + indexer: GrenadParameters, + lmdb_writer_sx: Sender>, + searchable_fields: &Option>, + faceted_fields: &HashSet, + stop_words: &Option>, +) -> Result<( + grenad::Reader, + (grenad::Reader, grenad::Reader), +)> { + let documents_chunk = documents_chunk.and_then(|c| unsafe { into_clonable_grenad(c) })?; + + lmdb_writer_sx.send(Ok(TypedChunk::Documents(documents_chunk.clone()))).unwrap(); + + let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) = + rayon::join( + || { + let (documents_ids, docid_word_positions_chunk) = extract_docid_word_positions( + documents_chunk.clone(), + indexer.clone(), + searchable_fields, + stop_words.as_ref(), + )?; + + // send documents_ids to DB writer + lmdb_writer_sx.send(Ok(TypedChunk::NewDocumentsIds(documents_ids))).unwrap(); + + // send docid_word_positions_chunk to DB writer + let docid_word_positions_chunk = + unsafe { into_clonable_grenad(docid_word_positions_chunk)? }; + lmdb_writer_sx + .send(Ok(TypedChunk::DocidWordPositions(docid_word_positions_chunk.clone()))) + .unwrap(); + Ok(docid_word_positions_chunk) + }, + || { + let (docid_fid_facet_numbers_chunk, docid_fid_facet_strings_chunk) = + extract_fid_docid_facet_values( + documents_chunk.clone(), + indexer.clone(), + faceted_fields, + )?; + + // send docid_fid_facet_numbers_chunk to DB writer + let docid_fid_facet_numbers_chunk = + unsafe { into_clonable_grenad(docid_fid_facet_numbers_chunk)? }; + lmdb_writer_sx + .send(Ok(TypedChunk::FieldIdDocidFacetNumbers( + docid_fid_facet_numbers_chunk.clone(), + ))) + .unwrap(); + + // send docid_fid_facet_strings_chunk to DB writer + let docid_fid_facet_strings_chunk = + unsafe { into_clonable_grenad(docid_fid_facet_strings_chunk)? 
}; + lmdb_writer_sx + .send(Ok(TypedChunk::FieldIdDocidFacetStrings( + docid_fid_facet_strings_chunk.clone(), + ))) + .unwrap(); + + Ok((docid_fid_facet_numbers_chunk, docid_fid_facet_strings_chunk)) + }, + ); + Ok((docid_word_positions_chunk?, docid_fid_facet_values_chunks?)) +} diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index b7fa1492c..4cf7c83f1 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -222,8 +222,10 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { let documents_file = grenad::Reader::new(documents_file)?; // create LMDB writer channel - let (lmdb_writer_sx, lmdb_writer_rx): (Sender, Receiver) = - crossbeam_channel::unbounded(); + let (lmdb_writer_sx, lmdb_writer_rx): ( + Sender>, + Receiver>, + ) = crossbeam_channel::unbounded(); // get searchable fields for word databases let searchable_fields = @@ -244,23 +246,31 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { }; // split obkv file into several chuncks - let mut chunk_iter = grenad_obkv_into_chunks( + let chunk_iter = grenad_obkv_into_chunks( documents_file, params.clone(), self.log_every_n, Byte::from_bytes(self.documents_chunk_size.unwrap_or(1024 * 1024 * 128) as u64), // 128MiB - ) - .unwrap(); - // extract all databases from the chunked obkv douments - extract::data_from_obkv_documents( - &mut chunk_iter, - params, - lmdb_writer_sx, - searchable_fields, - faceted_fields, - stop_words, - ) - .unwrap(); + ); + + let result = chunk_iter.map(|chunk_iter| { + // extract all databases from the chunked obkv douments + extract::data_from_obkv_documents( + chunk_iter, + params, + lmdb_writer_sx.clone(), + searchable_fields, + faceted_fields, + stop_words, + ) + }); + + if let Err(e) = result { + lmdb_writer_sx.send(Err(e)).unwrap(); + } + + // needs to be droped to avoid channel waiting lock. + drop(lmdb_writer_sx) }); // We delete the documents that this document addition replaces. 
This way we are @@ -294,7 +304,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { for typed_chunk in lmdb_writer_rx { let (docids, is_merged_database) = - write_typed_chunk_into_index(typed_chunk, &self.index, self.wtxn, index_is_empty)?; + write_typed_chunk_into_index(typed_chunk?, &self.index, self.wtxn, index_is_empty)?; if !docids.is_empty() { final_documents_ids |= docids; let documents_seen_count = final_documents_ids.len(); From fc7cc770d408c223e3c1efc058c21ea122851522 Mon Sep 17 00:00:00 2001 From: many Date: Tue, 24 Aug 2021 13:55:53 +0200 Subject: [PATCH 07/15] Add logging timers --- milli/src/update/facets.rs | 1 + .../index_documents/extract/extract_docid_word_positions.rs | 1 + .../index_documents/extract/extract_facet_number_docids.rs | 1 + .../index_documents/extract/extract_facet_string_docids.rs | 1 + .../index_documents/extract/extract_fid_docid_facet_values.rs | 1 + .../index_documents/extract/extract_fid_word_count_docids.rs | 1 + .../src/update/index_documents/extract/extract_word_docids.rs | 1 + .../extract/extract_word_level_position_docids.rs | 1 + .../extract/extract_word_pair_proximity_docids.rs | 1 + milli/src/update/index_documents/mod.rs | 3 +++ milli/src/update/word_prefix_docids.rs | 1 + milli/src/update/word_prefix_pair_proximity_docids.rs | 1 + milli/src/update/words_level_positions.rs | 1 + milli/src/update/words_prefixes_fst.rs | 1 + 14 files changed, 16 insertions(+) diff --git a/milli/src/update/facets.rs b/milli/src/update/facets.rs index 3ae63f282..9b7d6d42c 100644 --- a/milli/src/update/facets.rs +++ b/milli/src/update/facets.rs @@ -57,6 +57,7 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { self } + #[logging_timer::time("Facets::{}")] pub fn execute(self) -> Result<()> { self.index.set_updated_at(self.wtxn, &Utc::now())?; // We get the faceted fields to be able to create the facet levels. diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index 3ee7ee3b3..fb3372660 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -17,6 +17,7 @@ use crate::{FieldId, Result}; /// /// Returns the generated internal documents ids and a grenad reader /// with the list of extracted words from the given chunk of documents. +#[logging_timer::time] pub fn extract_docid_word_positions( mut obkv_documents: grenad::Reader, indexer: GrenadParameters, diff --git a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs index 1734ef028..5480bd605 100644 --- a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs @@ -13,6 +13,7 @@ use crate::Result; /// /// Returns a grenad reader with the list of extracted facet numbers and /// documents ids from the given chunk of docid facet number positions. 
+#[logging_timer::time] pub fn extract_facet_number_docids( mut docid_fid_facet_number: grenad::Reader, indexer: GrenadParameters, diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs index 66ede5f42..e08d062cf 100644 --- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs @@ -15,6 +15,7 @@ use crate::{FieldId, Result}; /// /// Returns a grenad reader with the list of extracted facet strings and /// documents ids from the given chunk of docid facet string positions. +#[logging_timer::time] pub fn extract_facet_string_docids( mut docid_fid_facet_string: grenad::Reader, indexer: GrenadParameters, diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index e7e56a3c8..08f2cadf0 100644 --- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -15,6 +15,7 @@ use crate::{DocumentId, FieldId, Result}; /// /// Returns the generated grenad reader containing the docid the fid and the orginal value as key /// and the normalized value as value extracted from the given chunk of documents. +#[logging_timer::time] pub fn extract_fid_docid_facet_values( mut obkv_documents: grenad::Reader, indexer: GrenadParameters, diff --git a/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs index 66b179663..cf698507d 100644 --- a/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs +++ b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs @@ -16,6 +16,7 @@ use crate::{DocumentId, FieldId, Result}; /// /// Returns a grenad reader with the list of extracted field id word counts /// and documents ids from the given chunk of docid word positions. +#[logging_timer::time] pub fn extract_fid_word_count_docids( mut docid_word_positions: grenad::Reader, indexer: GrenadParameters, diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs index 85453e173..8ca8e39eb 100644 --- a/milli/src/update/index_documents/extract/extract_word_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_docids.rs @@ -14,6 +14,7 @@ use crate::Result; /// /// Returns a grenad reader with the list of extracted words and /// documents ids from the given chunk of docid word positions. +#[logging_timer::time] pub fn extract_word_docids( mut docid_word_positions: grenad::Reader, indexer: GrenadParameters, diff --git a/milli/src/update/index_documents/extract/extract_word_level_position_docids.rs b/milli/src/update/index_documents/extract/extract_word_level_position_docids.rs index c7138b32a..e099b0b49 100644 --- a/milli/src/update/index_documents/extract/extract_word_level_position_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_level_position_docids.rs @@ -10,6 +10,7 @@ use crate::{DocumentId, Result}; /// /// Returns a grenad reader with the list of extracted words at positions and /// documents ids from the given chunk of docid word positions. 
+#[logging_timer::time] pub fn extract_word_level_position_docids( mut docid_word_positions: grenad::Reader, indexer: GrenadParameters, diff --git a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs index 2bc79aac5..96bd965d8 100644 --- a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs @@ -17,6 +17,7 @@ use crate::{DocumentId, Result}; /// /// Returns a grenad reader with the list of extracted word pairs proximities and /// documents ids from the given chunk of docid word positions. +#[logging_timer::time] pub fn extract_word_pair_proximity_docids( mut docid_word_positions: grenad::Reader, indexer: GrenadParameters, diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 4cf7c83f1..d6fbd3e93 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -136,6 +136,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { self.autogenerate_docids = false; } + #[logging_timer::time("IndexDocuments::{}")] pub fn execute(self, reader: R, progress_callback: F) -> Result where R: io::Read, @@ -181,6 +182,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { Ok(DocumentAdditionResult { nb_documents }) } + #[logging_timer::time("IndexDocuments::{}")] pub fn execute_raw(self, output: TransformOutput, progress_callback: F) -> Result<()> where F: Fn(UpdateIndexingStep) + Sync, @@ -341,6 +343,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { self.execute_prefix_databases(progress_callback) } + #[logging_timer::time("IndexDocuments::{}")] pub fn execute_prefix_databases(self, progress_callback: F) -> Result<()> where F: Fn(UpdateIndexingStep) + Sync, diff --git a/milli/src/update/word_prefix_docids.rs b/milli/src/update/word_prefix_docids.rs index b8a80938c..30dabf1ae 100644 --- a/milli/src/update/word_prefix_docids.rs +++ b/milli/src/update/word_prefix_docids.rs @@ -33,6 +33,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { } } + #[logging_timer::time("WordPrefixDocids::{}")] pub fn execute(self) -> Result<()> { // Clear the word prefix docids database. 
self.index.word_prefix_docids.clear(self.wtxn)?; diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs index cabe1053b..eb098a91f 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -48,6 +48,7 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { self } + #[logging_timer::time("WordPrefixPairProximityDocids::{}")] pub fn execute(self) -> Result<()> { debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); diff --git a/milli/src/update/words_level_positions.rs b/milli/src/update/words_level_positions.rs index afd7d7736..0af51fbb2 100644 --- a/milli/src/update/words_level_positions.rs +++ b/milli/src/update/words_level_positions.rs @@ -57,6 +57,7 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { self } + #[logging_timer::time("WordsLevelPositions::{}")] pub fn execute(self) -> Result<()> { debug!("Computing and writing the word levels positions docids into LMDB on disk..."); diff --git a/milli/src/update/words_prefixes_fst.rs b/milli/src/update/words_prefixes_fst.rs index be33c156b..eaaacc26f 100644 --- a/milli/src/update/words_prefixes_fst.rs +++ b/milli/src/update/words_prefixes_fst.rs @@ -48,6 +48,7 @@ impl<'t, 'u, 'i> WordsPrefixesFst<'t, 'u, 'i> { self } + #[logging_timer::time("WordsPrefixesFst::{}")] pub fn execute(self) -> Result<()> { let words_fst = self.index.words_fst(&self.wtxn)?; From e09eec37bc4fc6529129f24fd45c7c6d28ec2297 Mon Sep 17 00:00:00 2001 From: many Date: Wed, 25 Aug 2021 15:09:46 +0200 Subject: [PATCH 08/15] Handle distance addition with hard separators --- .../extract/extract_docid_word_positions.rs | 43 ++++++++++++++++--- 1 file changed, 37 insertions(+), 6 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index fb3372660..894a193bf 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -3,7 +3,8 @@ use std::convert::TryInto; use std::fs::File; use std::{io, mem, str}; -use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token}; +use meilisearch_tokenizer::token::SeparatorKind; +use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token, TokenKind}; use roaring::RoaringBitmap; use serde_json::Value; @@ -61,11 +62,8 @@ pub fn extract_docid_word_positions( field_buffer.clear(); if let Some(field) = json_to_string(&value, &mut field_buffer) { let analyzed = analyzer.analyze(field); - let tokens = analyzed - .tokens() - .filter(Token::is_word) - .enumerate() - .take_while(|(i, _)| (*i as u32) < ONE_ATTRIBUTE); + let tokens = process_tokens(analyzed.tokens()) + .take_while(|(p, _)| (*p as u32) < ONE_ATTRIBUTE); for (index, token) in tokens { let token = token.text().trim(); @@ -134,3 +132,36 @@ fn json_to_string<'a>(value: &'a Value, buffer: &'a mut String) -> Option<&'a st None } } + +/// take an iterator on tokens and compute their relative position depending on separator kinds +/// if it's an `Hard` separator we add an additional relative proximity of 8 between words, +/// else we keep the standart proximity of 1 between words. 
+fn process_tokens<'a>( + tokens: impl Iterator>, +) -> impl Iterator)> { + tokens + .skip_while(|token| token.is_separator().is_some()) + .scan((0, None), |(offset, prev_kind), token| { + match token.kind { + TokenKind::Word | TokenKind::StopWord | TokenKind::Unknown => { + *offset += match *prev_kind { + Some(TokenKind::Separator(SeparatorKind::Hard)) => 8, + Some(_) => 1, + None => 0, + }; + *prev_kind = Some(token.kind) + } + TokenKind::Separator(SeparatorKind::Hard) => { + *prev_kind = Some(token.kind); + } + TokenKind::Separator(SeparatorKind::Soft) + if *prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) => + { + *prev_kind = Some(token.kind); + } + _ => (), + } + Some((*offset, token)) + }) + .filter(|(_, t)| t.is_word()) +} From 8f702828ca23bed257def6d452705e707e5c3e1c Mon Sep 17 00:00:00 2001 From: many Date: Thu, 26 Aug 2021 11:01:30 +0200 Subject: [PATCH 09/15] Ignore errors comming from crossbeam channel senders --- .../src/update/index_documents/extract/mod.rs | 34 +++++++++---------- milli/src/update/index_documents/mod.rs | 2 +- .../src/update/index_documents/typed_chunk.rs | 2 +- 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 591c8d4cd..04c57b0fa 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -147,9 +147,11 @@ fn spawn_extraction_task( Ok(chunks) => { debug!("merge {} database", name); let reader = merge_readers(chunks, merge_fn, indexer); - lmdb_writer_sx.send(reader.map(|r| serialize_fn(r))).unwrap(); + let _ = lmdb_writer_sx.send(reader.map(|r| serialize_fn(r))); + } + Err(e) => { + let _ = lmdb_writer_sx.send(Err(e)); } - Err(e) => lmdb_writer_sx.send(Err(e)).unwrap(), }) }); } @@ -173,7 +175,7 @@ fn extract_documents_data( )> { let documents_chunk = documents_chunk.and_then(|c| unsafe { into_clonable_grenad(c) })?; - lmdb_writer_sx.send(Ok(TypedChunk::Documents(documents_chunk.clone()))).unwrap(); + let _ = lmdb_writer_sx.send(Ok(TypedChunk::Documents(documents_chunk.clone()))); let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) = rayon::join( @@ -186,14 +188,14 @@ fn extract_documents_data( )?; // send documents_ids to DB writer - lmdb_writer_sx.send(Ok(TypedChunk::NewDocumentsIds(documents_ids))).unwrap(); + let _ = lmdb_writer_sx.send(Ok(TypedChunk::NewDocumentsIds(documents_ids))); // send docid_word_positions_chunk to DB writer let docid_word_positions_chunk = unsafe { into_clonable_grenad(docid_word_positions_chunk)? }; - lmdb_writer_sx - .send(Ok(TypedChunk::DocidWordPositions(docid_word_positions_chunk.clone()))) - .unwrap(); + let _ = lmdb_writer_sx + .send(Ok(TypedChunk::DocidWordPositions(docid_word_positions_chunk.clone()))); + Ok(docid_word_positions_chunk) }, || { @@ -207,20 +209,18 @@ fn extract_documents_data( // send docid_fid_facet_numbers_chunk to DB writer let docid_fid_facet_numbers_chunk = unsafe { into_clonable_grenad(docid_fid_facet_numbers_chunk)? }; - lmdb_writer_sx - .send(Ok(TypedChunk::FieldIdDocidFacetNumbers( - docid_fid_facet_numbers_chunk.clone(), - ))) - .unwrap(); + + let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdDocidFacetNumbers( + docid_fid_facet_numbers_chunk.clone(), + ))); // send docid_fid_facet_strings_chunk to DB writer let docid_fid_facet_strings_chunk = unsafe { into_clonable_grenad(docid_fid_facet_strings_chunk)? 
}; - lmdb_writer_sx - .send(Ok(TypedChunk::FieldIdDocidFacetStrings( - docid_fid_facet_strings_chunk.clone(), - ))) - .unwrap(); + + let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdDocidFacetStrings( + docid_fid_facet_strings_chunk.clone(), + ))); Ok((docid_fid_facet_numbers_chunk, docid_fid_facet_strings_chunk)) }, diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index d6fbd3e93..98b0aa80e 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -268,7 +268,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { }); if let Err(e) = result { - lmdb_writer_sx.send(Err(e)).unwrap(); + let _ = lmdb_writer_sx.send(Err(e)); } // needs to be droped to avoid channel waiting lock. diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index e8790af16..84333addb 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -95,7 +95,7 @@ pub(crate) fn write_typed_chunk_into_index( // we use the key to construct the words fst. builder.insert(word)?; } - let fst = builder.into_set().map_data(std::borrow::Cow::Owned).unwrap(); + let fst = builder.into_set().map_data(std::borrow::Cow::Owned)?; let db_fst = index.words_fst(wtxn)?; // merge new fst with database fst From 9452fabfb2ed590db1a7bde089c87e9b41f5a561 Mon Sep 17 00:00:00 2001 From: many Date: Thu, 26 Aug 2021 15:56:24 +0200 Subject: [PATCH 10/15] Optimize cbo roaring bitmaps merge --- .../cbo_roaring_bitmap_codec.rs | 76 +++++++++++++++++++ .../helpers/merge_functions.rs | 53 ++----------- .../src/update/index_documents/typed_chunk.rs | 15 +++- 3 files changed, 93 insertions(+), 51 deletions(-) diff --git a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs index 53f64d648..c0e984d44 100644 --- a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs +++ b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs @@ -52,6 +52,46 @@ impl CboRoaringBitmapCodec { RoaringBitmap::deserialize_from(bytes) } } + + /// Merge serialized CboRoaringBitmaps in a buffer. + /// + /// if the merged values len is under the threshold, + /// values are directly serialized in the buffer; + /// else a RoaringBitmap is created from the values and is serialized in the buffer. 
+ pub fn merge_into(slices: &[Cow<[u8]>], buffer: &mut Vec) -> io::Result<()> { + let mut roaring = RoaringBitmap::new(); + let mut vec = Vec::new(); + + for bytes in slices { + if bytes.len() <= THRESHOLD * size_of::() { + let mut reader = bytes.as_ref(); + while let Ok(integer) = reader.read_u32::() { + vec.push(integer); + } + } else { + roaring |= RoaringBitmap::deserialize_from(bytes.as_ref())?; + } + } + + if roaring.is_empty() { + vec.sort_unstable(); + vec.dedup(); + + if vec.len() <= THRESHOLD { + for integer in vec { + buffer.extend_from_slice(&integer.to_ne_bytes()); + } + } else { + let roaring = RoaringBitmap::from_sorted_iter(vec.into_iter()); + roaring.serialize_into(buffer)?; + } + } else { + roaring.extend(vec); + roaring.serialize_into(buffer)?; + } + + Ok(()) + } } impl heed::BytesDecode<'_> for CboRoaringBitmapCodec { @@ -106,4 +146,40 @@ mod tests { assert!(roaring_size > bo_size); } + + #[test] + fn merge_cbo_roaring_bitmaps() { + let mut buffer = Vec::new(); + + let small_data = vec![ + RoaringBitmap::from_sorted_iter(1..4), + RoaringBitmap::from_sorted_iter(2..5), + RoaringBitmap::from_sorted_iter(4..6), + RoaringBitmap::from_sorted_iter(1..3), + ]; + + let small_data: Vec<_> = + small_data.iter().map(|b| CboRoaringBitmapCodec::bytes_encode(b).unwrap()).collect(); + CboRoaringBitmapCodec::merge_into(small_data.as_slice(), &mut buffer).unwrap(); + let bitmap = CboRoaringBitmapCodec::deserialize_from(&buffer).unwrap(); + let expected = RoaringBitmap::from_sorted_iter(1..6); + assert_eq!(bitmap, expected); + + let medium_data = vec![ + RoaringBitmap::from_sorted_iter(1..4), + RoaringBitmap::from_sorted_iter(2..5), + RoaringBitmap::from_sorted_iter(4..8), + RoaringBitmap::from_sorted_iter(0..3), + RoaringBitmap::from_sorted_iter(7..23), + ]; + + let medium_data: Vec<_> = + medium_data.iter().map(|b| CboRoaringBitmapCodec::bytes_encode(b).unwrap()).collect(); + buffer.clear(); + CboRoaringBitmapCodec::merge_into(medium_data.as_slice(), &mut buffer).unwrap(); + + let bitmap = CboRoaringBitmapCodec::deserialize_from(&buffer).unwrap(); + let expected = RoaringBitmap::from_sorted_iter(0..23); + assert_eq!(bitmap, expected); + } } diff --git a/milli/src/update/index_documents/helpers/merge_functions.rs b/milli/src/update/index_documents/helpers/merge_functions.rs index 6a592e54d..c5385e347 100644 --- a/milli/src/update/index_documents/helpers/merge_functions.rs +++ b/milli/src/update/index_documents/helpers/merge_functions.rs @@ -120,52 +120,11 @@ pub fn merge_cbo_roaring_bitmaps<'a>( _key: &[u8], values: &[Cow<'a, [u8]>], ) -> Result> { - match values.split_first().unwrap() { - (head, []) => Ok(head.clone()), - (head, tail) => { - let mut head = CboRoaringBitmapCodec::deserialize_from(&head[..])?; - - for value in tail { - head |= CboRoaringBitmapCodec::deserialize_from(&value[..])?; - } - - let mut vec = Vec::new(); - CboRoaringBitmapCodec::serialize_into(&head, &mut vec); - Ok(Cow::from(vec)) - } + if values.len() == 1 { + Ok(values[0].clone()) + } else { + let mut vec = Vec::new(); + CboRoaringBitmapCodec::merge_into(values, &mut vec)?; + Ok(Cow::from(vec)) } } - -// /// Uses the FacetStringLevelZeroValueCodec to merge the values. 
-// pub fn tuple_string_cbo_roaring_bitmap_merge<'a>( -// _key: &[u8], -// values: &[Cow<[u8]>], -// ) -> Result> { -// let (head, tail) = values.split_first().unwrap(); -// let (head_string, mut head_rb) = FacetStringLevelZeroValueCodec::bytes_decode(&head[..]) -// .ok_or(SerializationError::Decoding { db_name: None })?; - -// for value in tail { -// let (_string, rb) = FacetStringLevelZeroValueCodec::bytes_decode(&value[..]) -// .ok_or(SerializationError::Decoding { db_name: None })?; -// head_rb |= rb; -// } - -// FacetStringLevelZeroValueCodec::bytes_encode(&(head_string, head_rb)) -// .map(|cow| cow.into_owned()) -// .ok_or(SerializationError::Encoding { db_name: None }) -// .map_err(Into::into) -// } - -// pub fn cbo_roaring_bitmap_merge<'a>(_key: &[u8], values: &[Cow<[u8]>]) -> Result> { -// let (head, tail) = values.split_first().unwrap(); -// let mut head = CboRoaringBitmapCodec::deserialize_from(&head[..])?; - -// for value in tail { -// head |= CboRoaringBitmapCodec::deserialize_from(&value[..])?; -// } - -// let mut vec = Vec::new(); -// CboRoaringBitmapCodec::serialize_into(&head, &mut vec); -// Ok(vec) -// } diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 84333addb..c3c71bbf4 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -188,15 +188,22 @@ fn merge_roaring_bitmaps(new_value: &[u8], db_value: &[u8], buffer: &mut Vec Ok(serialize_roaring_bitmap(&value, buffer)?) } +use std::borrow::Cow; + fn merge_cbo_roaring_bitmaps( new_value: &[u8], db_value: &[u8], buffer: &mut Vec, ) -> Result<()> { - let new_value = CboRoaringBitmapCodec::deserialize_from(new_value)?; - let db_value = CboRoaringBitmapCodec::deserialize_from(db_value)?; - let value = new_value | db_value; - Ok(CboRoaringBitmapCodec::serialize_into(&value, buffer)) + Ok(CboRoaringBitmapCodec::merge_into( + &[Cow::Borrowed(db_value), Cow::Borrowed(new_value)], + buffer, + )?) + + // let new_value = CboRoaringBitmapCodec::deserialize_from(new_value)?; + // let db_value = CboRoaringBitmapCodec::deserialize_from(db_value)?; + // let value = new_value | db_value; + // Ok(CboRoaringBitmapCodec::serialize_into(&value, buffer)) } /// Write provided entries in database using serialize_value function. 
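
The WordPrefixPairProximityDocids rewrite earlier in this series walks the sorted word pair proximity entries once, accumulates data per (word1, prefix, proximity) key for every prefix that word2 starts with, and only flushes keys backed by more than `threshold` pairs. The stand-alone Rust sketch below illustrates that grouping rule with plain strings in place of LMDB keys and a HashMap in place of the grenad sorter; the function name and signature are illustrative, not milli's API.

    use std::collections::HashMap;

    // Simplified model of write_prefixes_in_sorter: entries are (word1, word2, proximity)
    // keys with an opaque payload, prefixes is the list of candidate word prefixes, and
    // only prefixes aggregating strictly more than `threshold` pairs are kept.
    fn group_pairs_under_prefixes<'a>(
        entries: &[(&'a str, &'a str, u8, &'a [u8])],
        prefixes: &[&'a str],
        threshold: usize,
    ) -> HashMap<(&'a str, &'a str, u8), Vec<&'a [u8]>> {
        let mut cache: HashMap<(&'a str, &'a str, u8), Vec<&'a [u8]>> = HashMap::new();
        for &(w1, w2, prox, data) in entries {
            // Every prefix of word2 found in the candidate list receives the payload.
            for prefix in prefixes.iter().copied().filter(|p| w2.starts_with(p)) {
                cache.entry((w1, prefix, prox)).or_default().push(data);
            }
        }
        // Drop the prefixes that are not backed by enough distinct word pairs.
        cache.retain(|_, payloads| payloads.len() > threshold);
        cache
    }

Because the real entries come out of LMDB already sorted, the actual implementation only keeps the cache for the group of prefixes currently being matched and drains it into the sorter whenever that group changes, which bounds memory usage.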
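
The two channel-related patches in this series ("Remove unwrap sending errors in channel" and the follow-up that ignores crossbeam send errors) converge on one pattern: extraction workers send Result values through the channel, and a failed send is deliberately discarded because it can only happen once the receiving side has already stopped, typically after hitting an earlier error. A minimal stand-alone sketch of that pattern, with placeholder chunk and error types:

    use std::thread;

    use crossbeam_channel::unbounded;

    type Chunk = Vec<u8>;
    type Error = String;

    fn run() -> Result<(), Error> {
        let (tx, rx) = unbounded::<Result<Chunk, Error>>();

        // Workers forward successful chunks as well as errors. A failed `send` is ignored:
        // it only means the consumer below already bailed out and dropped the receiver.
        for worker_id in 0..4u8 {
            let tx = tx.clone();
            thread::spawn(move || {
                let produced: Result<Chunk, Error> = if worker_id == 2 {
                    Err(format!("worker {} failed", worker_id))
                } else {
                    Ok(vec![worker_id])
                };
                let _ = tx.send(produced);
            });
        }
        // Dropping the original sender lets the receiving loop end once every worker is done.
        drop(tx);

        for chunk in rx {
            let chunk = chunk?; // the first forwarded error aborts the writing loop
            println!("writing a chunk of {} bytes", chunk.len());
        }

        Ok(())
    }

    fn main() {
        if let Err(e) = run() {
            eprintln!("indexing failed: {}", e);
        }
    }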
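
The "Handle distance addition with hard separators" patch changes how positions are assigned during extraction: a word that follows a hard separator (a dot, for instance) is placed 8 positions after the previous word instead of 1, so pairs that cross a sentence boundary end up with a much larger proximity. Here is a stand-alone sketch of that offset rule over a deliberately simplified token model; the enum below is a stand-in, not the meilisearch-tokenizer API.

    #[derive(Clone, Copy, PartialEq)]
    enum Tok<'a> {
        Word(&'a str),
        Soft, // soft separator, e.g. a space or a comma
        Hard, // hard separator, e.g. a dot
    }

    // Mirror of the position rule: +8 after a hard separator, +1 otherwise, 0 for the first
    // word. Leading separators are ignored, and a soft separator never downgrades a pending
    // hard one.
    fn positions<'a>(tokens: &[Tok<'a>]) -> Vec<(u32, &'a str)> {
        let mut result = Vec::new();
        let mut offset = 0u32;
        let mut prev = None;
        for &token in tokens {
            match token {
                Tok::Word(word) => {
                    offset += match prev {
                        Some(Tok::Hard) => 8,
                        Some(_) => 1,
                        None => 0,
                    };
                    prev = Some(token);
                    result.push((offset, word));
                }
                Tok::Hard if !result.is_empty() => prev = Some(token),
                Tok::Soft if !result.is_empty() && prev != Some(Tok::Hard) => prev = Some(token),
                _ => (),
            }
        }
        result
    }

For `[Word("the"), Soft, Word("dog"), Hard, Word("barks")]` this yields positions 0, 1 and 9, so "dog" and "barks" are treated as far apart even though only one separator stands between them.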
From b3a22f31f6d8b87110a7ef330223b31c7eadcb20 Mon Sep 17 00:00:00 2001 From: many Date: Mon, 30 Aug 2021 13:43:41 +0200 Subject: [PATCH 11/15] Fix memory consuption in word pair proximity extractor --- .../extract/extract_word_pair_proximity_docids.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs index 96bd965d8..ce75c319e 100644 --- a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs @@ -29,7 +29,7 @@ pub fn extract_word_pair_proximity_docids( indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, - max_memory, + max_memory.map(|m| m / 2), ); let mut number_of_documents = 0; From 4860fd452965d234a11cc8430309bd9782a21bfd Mon Sep 17 00:00:00 2001 From: many Date: Wed, 1 Sep 2021 16:24:58 +0200 Subject: [PATCH 12/15] Ignore empty facet values --- .../index_documents/extract/extract_fid_docid_facet_values.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index 08f2cadf0..c46329f61 100644 --- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -66,7 +66,7 @@ pub fn extract_fid_docid_facet_values( } // insert normalized and original facet string in sorter - for (normalized, original) in strings { + for (normalized, original) in strings.into_iter().filter(|(n, _)| !n.is_empty()) { key_buffer.truncate(size_of::() + size_of::()); key_buffer.extend_from_slice(normalized.as_bytes()); fid_docid_facet_strings_sorter.insert(&key_buffer, original.as_bytes())?; From db0c681baef3f0f0e90c5c5feb56cc3ec1509248 Mon Sep 17 00:00:00 2001 From: many Date: Thu, 2 Sep 2021 15:17:52 +0200 Subject: [PATCH 13/15] Fix Pr comments --- milli/Cargo.toml | 1 - .../facet_string_level_zero_value_codec.rs | 3 +- .../cbo_roaring_bitmap_codec.rs | 6 ++-- .../extract/extract_fid_docid_facet_values.rs | 9 +++--- .../extract/extract_fid_word_count_docids.rs | 5 ++- .../extract/extract_word_docids.rs | 5 ++- .../extract_word_level_position_docids.rs | 6 +++- .../extract_word_pair_proximity_docids.rs | 31 +++---------------- .../src/update/index_documents/extract/mod.rs | 1 + .../index_documents/helpers/clonable_mmap.rs | 2 ++ .../index_documents/helpers/grenad_helpers.rs | 12 ++++--- .../src/update/index_documents/helpers/mod.rs | 4 --- milli/src/update/index_documents/mod.rs | 3 +- .../src/update/index_documents/typed_chunk.rs | 8 +---- 14 files changed, 38 insertions(+), 58 deletions(-) diff --git a/milli/Cargo.toml b/milli/Cargo.toml index edcec4d5b..8616dcf4a 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -6,7 +6,6 @@ edition = "2018" [dependencies] bstr = "0.2.15" -byte-unit = { version = "4.0.9", default-features = false, features = ["std"] } byteorder = "1.4.2" chrono = { version = "0.4.19", features = ["serde"] } concat-arrays = "0.1.2" diff --git a/milli/src/heed_codec/facet/facet_string_level_zero_value_codec.rs b/milli/src/heed_codec/facet/facet_string_level_zero_value_codec.rs index 914d7c3cd..22031c474 100644 --- a/milli/src/heed_codec/facet/facet_string_level_zero_value_codec.rs +++ 
b/milli/src/heed_codec/facet/facet_string_level_zero_value_codec.rs @@ -5,6 +5,7 @@ use std::{marker, str}; use crate::error::SerializationError; use crate::heed_codec::RoaringBitmapCodec; use crate::{try_split_array_at, try_split_at, Result}; + pub type FacetStringLevelZeroValueCodec = StringValueCodec; /// A codec that encodes a string in front of a value. @@ -22,7 +23,6 @@ where fn bytes_decode(bytes: &'a [u8]) -> Option { let (string, bytes) = decode_prefix_string(bytes)?; - C::bytes_decode(bytes).map(|item| (string, item)) } } @@ -49,7 +49,6 @@ pub fn decode_prefix_string(value: &[u8]) -> Option<(&str, &[u8])> { let original_length = u16::from_be_bytes(original_length_bytes) as usize; let (string, bytes) = try_split_at(bytes, original_length)?; let string = str::from_utf8(string).ok()?; - Some((string, bytes)) } diff --git a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs index c0e984d44..519997274 100644 --- a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs +++ b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs @@ -55,9 +55,9 @@ impl CboRoaringBitmapCodec { /// Merge serialized CboRoaringBitmaps in a buffer. /// - /// if the merged values len is under the threshold, - /// values are directly serialized in the buffer; - /// else a RoaringBitmap is created from the values and is serialized in the buffer. + /// if the merged values length is under the threshold, values are directly + /// serialized in the buffer else a RoaringBitmap is created from the + /// values and is serialized in the buffer. pub fn merge_into(slices: &[Cow<[u8]>], buffer: &mut Vec) -> io::Result<()> { let mut roaring = RoaringBitmap::new(); let mut vec = Vec::new(); diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index c46329f61..a1bf0b1e3 100644 --- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -58,11 +58,12 @@ pub fn extract_fid_docid_facet_values( // insert facet numbers in sorter for number in numbers { key_buffer.truncate(size_of::() + size_of::()); - let value_bytes = f64_into_bytes(number).unwrap(); // invalid float - key_buffer.extend_from_slice(&value_bytes); - key_buffer.extend_from_slice(&number.to_be_bytes()); + if let Some(value_bytes) = f64_into_bytes(number) { + key_buffer.extend_from_slice(&value_bytes); + key_buffer.extend_from_slice(&number.to_be_bytes()); - fid_docid_facet_numbers_sorter.insert(&key_buffer, ().as_bytes())?; + fid_docid_facet_numbers_sorter.insert(&key_buffer, ().as_bytes())?; + } } // insert normalized and original facet string in sorter diff --git a/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs index cf698507d..1fbc55714 100644 --- a/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs +++ b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs @@ -8,6 +8,8 @@ use super::helpers::{ create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader, try_split_array_at, GrenadParameters, MergeFn, }; +use crate::error::SerializationError; +use crate::index::db_name::DOCID_WORD_POSITIONS; use crate::proximity::extract_position; use crate::{DocumentId, FieldId, Result}; @@ -36,7 +38,8 @@ 
pub fn extract_fid_word_count_docids( let mut current_document_id = None; while let Some((key, value)) = docid_word_positions.next()? { - let (document_id_bytes, _word_bytes) = try_split_array_at(key).unwrap(); + let (document_id_bytes, _word_bytes) = try_split_array_at(key) + .ok_or_else(|| SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; let document_id = u32::from_be_bytes(document_id_bytes); let curr_document_id = *current_document_id.get_or_insert(document_id); diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs index 8ca8e39eb..6d99fda44 100644 --- a/milli/src/update/index_documents/extract/extract_word_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_docids.rs @@ -8,6 +8,8 @@ use super::helpers::{ create_sorter, merge_roaring_bitmaps, serialize_roaring_bitmap, sorter_into_reader, try_split_array_at, GrenadParameters, }; +use crate::error::SerializationError; +use crate::index::db_name::DOCID_WORD_POSITIONS; use crate::Result; /// Extracts the word and the documents ids where this word appear. @@ -31,7 +33,8 @@ pub fn extract_word_docids( let mut value_buffer = Vec::new(); while let Some((key, _value)) = docid_word_positions.next()? { - let (document_id_bytes, word_bytes) = try_split_array_at(key).unwrap(); + let (document_id_bytes, word_bytes) = try_split_array_at(key) + .ok_or_else(|| SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; let document_id = u32::from_be_bytes(document_id_bytes); let bitmap = RoaringBitmap::from_iter(Some(document_id)); diff --git a/milli/src/update/index_documents/extract/extract_word_level_position_docids.rs b/milli/src/update/index_documents/extract/extract_word_level_position_docids.rs index e099b0b49..04cedf5c7 100644 --- a/milli/src/update/index_documents/extract/extract_word_level_position_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_level_position_docids.rs @@ -5,7 +5,10 @@ use super::helpers::{ create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader, try_split_array_at, GrenadParameters, }; +use crate::error::SerializationError; +use crate::index::db_name::DOCID_WORD_POSITIONS; use crate::{DocumentId, Result}; + /// Extracts the word positions and the documents ids where this word appear. /// /// Returns a grenad reader with the list of extracted words at positions and @@ -27,7 +30,8 @@ pub fn extract_word_level_position_docids( let mut key_buffer = Vec::new(); while let Some((key, value)) = docid_word_positions.next()? 
{ - let (document_id_bytes, word_bytes) = try_split_array_at(key).unwrap(); + let (document_id_bytes, word_bytes) = try_split_array_at(key) + .ok_or_else(|| SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; let document_id = DocumentId::from_be_bytes(document_id_bytes); for position in read_u32_ne_bytes(value) { diff --git a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs index ce75c319e..982799a65 100644 --- a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs @@ -1,15 +1,14 @@ use std::cmp::Ordering; use std::collections::{BinaryHeap, HashMap}; use std::fs::File; -use std::time::{Duration, Instant}; use std::{cmp, io, mem, str, vec}; -use log::debug; - use super::helpers::{ create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader, try_split_array_at, GrenadParameters, MergeFn, }; +use crate::error::SerializationError; +use crate::index::db_name::DOCID_WORD_POSITIONS; use crate::proximity::{positions_proximity, MAX_DISTANCE}; use crate::{DocumentId, Result}; @@ -32,16 +31,13 @@ pub fn extract_word_pair_proximity_docids( max_memory.map(|m| m / 2), ); - let mut number_of_documents = 0; - let mut total_time_aggregation = Duration::default(); - let mut total_time_grenad_insert = Duration::default(); - // This map is assumed to not consume a lot of memory. let mut document_word_positions_heap = BinaryHeap::new(); let mut current_document_id = None; while let Some((key, value)) = docid_word_positions.next()? { - let (document_id_bytes, word_bytes) = try_split_array_at(key).unwrap(); + let (document_id_bytes, word_bytes) = try_split_array_at(key) + .ok_or_else(|| SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; let document_id = u32::from_be_bytes(document_id_bytes); let word = str::from_utf8(word_bytes)?; @@ -52,10 +48,7 @@ pub fn extract_word_pair_proximity_docids( curr_document_id, document_word_positions_heap, &mut word_pair_proximity_docids_sorter, - &mut total_time_aggregation, - &mut total_time_grenad_insert, )?; - number_of_documents += 1; current_document_id = Some(document_id); } @@ -74,18 +67,9 @@ pub fn extract_word_pair_proximity_docids( document_id, document_word_positions_heap, &mut word_pair_proximity_docids_sorter, - &mut total_time_aggregation, - &mut total_time_grenad_insert, )?; } - debug!( - "Number of documents {} - - we took {:02?} to aggregate proximities - - we took {:02?} to grenad insert those proximities", - number_of_documents, total_time_aggregation, total_time_grenad_insert, - ); - sorter_into_reader(word_pair_proximity_docids_sorter, indexer) } @@ -97,10 +81,7 @@ fn document_word_positions_into_sorter<'b>( document_id: DocumentId, mut word_positions_heap: BinaryHeap>>, word_pair_proximity_docids_sorter: &mut grenad::Sorter, - total_time_aggregation: &mut Duration, - total_time_grenad_insert: &mut Duration, ) -> Result<()> { - let before_aggregating = Instant::now(); let mut word_pair_proximity = HashMap::new(); let mut ordered_peeked_word_positions = Vec::new(); while !word_positions_heap.is_empty() { @@ -152,8 +133,6 @@ fn document_word_positions_into_sorter<'b>( } } - *total_time_aggregation += before_aggregating.elapsed(); - let mut key_buffer = Vec::new(); for ((w1, w2), prox) in word_pair_proximity { key_buffer.clear(); @@ -162,9 +141,7 @@ fn 
document_word_positions_into_sorter<'b>( key_buffer.extend_from_slice(w2.as_bytes()); key_buffer.push(prox as u8); - let before_grenad_insert = Instant::now(); word_pair_proximity_docids_sorter.insert(&key_buffer, &document_id.to_ne_bytes())?; - *total_time_grenad_insert += before_grenad_insert.elapsed(); } Ok(()) diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 04c57b0fa..bb49e3e51 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -225,5 +225,6 @@ fn extract_documents_data( Ok((docid_fid_facet_numbers_chunk, docid_fid_facet_strings_chunk)) }, ); + Ok((docid_word_positions_chunk?, docid_fid_facet_values_chunks?)) } diff --git a/milli/src/update/index_documents/helpers/clonable_mmap.rs b/milli/src/update/index_documents/helpers/clonable_mmap.rs index b16c080ff..691d10593 100644 --- a/milli/src/update/index_documents/helpers/clonable_mmap.rs +++ b/milli/src/update/index_documents/helpers/clonable_mmap.rs @@ -2,6 +2,8 @@ use std::sync::Arc; use memmap::Mmap; +/// Wrapper around Mmap allowing to virtualy clone grenad-chunks +/// in a parallel process like the indexing. #[derive(Debug, Clone)] pub struct ClonableMmap { inner: Arc, diff --git a/milli/src/update/index_documents/helpers/grenad_helpers.rs b/milli/src/update/index_documents/helpers/grenad_helpers.rs index 9dd261f73..1dfaaf945 100644 --- a/milli/src/update/index_documents/helpers/grenad_helpers.rs +++ b/milli/src/update/index_documents/helpers/grenad_helpers.rs @@ -3,7 +3,6 @@ use std::fs::File; use std::io::{self, Seek, SeekFrom}; use std::time::Instant; -use byte_unit::Byte; use grenad::{CompressionType, MergerIter, Reader, Sorter}; use heed::types::ByteSlice; use log::debug; @@ -113,6 +112,9 @@ impl Default for GrenadParameters { } impl GrenadParameters { + /// This function use the number of threads in the current threadpool to compute the value. + /// This should be called inside of a rayon thread pool, + /// Otherwise, it will take the global number of threads. 
pub fn max_memory_by_thread(&self) -> Option { self.max_memory.map(|max_memory| max_memory / rayon::current_num_threads()) } @@ -128,7 +130,7 @@ pub fn grenad_obkv_into_chunks( mut reader: grenad::Reader, indexer: GrenadParameters, log_frequency: Option, - documents_chunk_size: Byte, + documents_chunk_size: usize, ) -> Result>>> { let mut document_count = 0; let mut continue_reading = true; @@ -157,7 +159,7 @@ pub fn grenad_obkv_into_chunks( debug!("reached {} chunked documents", document_count); } - if current_chunk_size >= documents_chunk_size.get_bytes() { + if current_chunk_size >= documents_chunk_size as u64 { return writer_into_reader(obkv_documents).map(Some); } } @@ -170,8 +172,8 @@ pub fn grenad_obkv_into_chunks( let result = transposer().transpose(); if result.as_ref().map_or(false, |r| r.is_ok()) { debug!( - "A new chunk of approximately {} has been generated", - documents_chunk_size.get_appropriate_unit(true), + "A new chunk of approximately {:.2} MiB has been generated", + documents_chunk_size as f64 / 1024.0 / 1024.0, ); } result diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs index baacb0a1b..3f38d4f25 100644 --- a/milli/src/update/index_documents/helpers/mod.rs +++ b/milli/src/update/index_documents/helpers/mod.rs @@ -40,10 +40,6 @@ where Some((head, tail)) } -// pub fn pretty_thousands, T: fmt::Display>(number: A) -> String { -// thousands::Separable::separate_with_spaces(number.borrow()) -// } - pub fn read_u32_ne_bytes(bytes: &[u8]) -> impl Iterator + '_ { bytes.chunks_exact(4).flat_map(TryInto::try_into).map(u32::from_ne_bytes) } diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 98b0aa80e..b27f2042f 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -9,7 +9,6 @@ use std::iter::FromIterator; use std::num::{NonZeroU32, NonZeroUsize}; use std::time::Instant; -use byte_unit::Byte; use chrono::Utc; use crossbeam_channel::{Receiver, Sender}; use grenad::{self, CompressionType}; @@ -252,7 +251,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { documents_file, params.clone(), self.log_every_n, - Byte::from_bytes(self.documents_chunk_size.unwrap_or(1024 * 1024 * 128) as u64), // 128MiB + self.documents_chunk_size.unwrap_or(1024 * 1024 * 128), // 128MiB ); let result = chunk_iter.map(|chunk_iter| { diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index c3c71bbf4..5f28034fe 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -1,3 +1,4 @@ +use std::borrow::Cow; use std::fs::File; use heed::types::ByteSlice; @@ -188,8 +189,6 @@ fn merge_roaring_bitmaps(new_value: &[u8], db_value: &[u8], buffer: &mut Vec Ok(serialize_roaring_bitmap(&value, buffer)?) } -use std::borrow::Cow; - fn merge_cbo_roaring_bitmaps( new_value: &[u8], db_value: &[u8], @@ -199,11 +198,6 @@ fn merge_cbo_roaring_bitmaps( &[Cow::Borrowed(db_value), Cow::Borrowed(new_value)], buffer, )?) - - // let new_value = CboRoaringBitmapCodec::deserialize_from(new_value)?; - // let db_value = CboRoaringBitmapCodec::deserialize_from(db_value)?; - // let value = new_value | db_value; - // Ok(CboRoaringBitmapCodec::serialize_into(&value, buffer)) } /// Write provided entries in database using serialize_value function. 
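
Among the review fixes in "Fix Pr comments", the extractors stop unwrapping the split of the docid-word-positions key and return a SerializationError naming the database instead. The sketch below shows the key shape behind that split as it appears in these extractors, a 4-byte big-endian document id followed by the UTF-8 word, using a simplified stand-in for the helper and a plain string error rather than milli's error type.

    use std::convert::TryInto;

    // Simplified stand-in for the try_split_array_at helper used by the extractors.
    fn try_split_array_at<const N: usize>(slice: &[u8]) -> Option<([u8; N], &[u8])> {
        if slice.len() < N {
            return None;
        }
        let (head, tail) = slice.split_at(N);
        Some((head.try_into().ok()?, tail))
    }

    // A docid-word-positions key is read as `document_id as u32 big-endian ++ word bytes`.
    fn decode_docid_word_key(key: &[u8]) -> Result<(u32, &str), String> {
        let (document_id_bytes, word_bytes) = try_split_array_at::<4>(key)
            .ok_or_else(|| "invalid docid word positions key".to_string())?;
        let document_id = u32::from_be_bytes(document_id_bytes);
        let word = std::str::from_utf8(word_bytes).map_err(|e| e.to_string())?;
        Ok((document_id, word))
    }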
From 7f7fafb8579d52aa3ae954d44395e115c093ffb0 Mon Sep 17 00:00:00 2001 From: many Date: Thu, 2 Sep 2021 15:25:39 +0200 Subject: [PATCH 14/15] Make document_chunk_size settable from update builder --- milli/src/update/index_documents/mod.rs | 1 + milli/src/update/settings.rs | 3 +++ milli/src/update/update_builder.rs | 8 ++++++++ 3 files changed, 12 insertions(+) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index b27f2042f..e4c798163 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -281,6 +281,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { log_every_n: self.log_every_n, max_nb_chunks: self.max_nb_chunks, max_memory: self.max_memory, + documents_chunk_size: self.documents_chunk_size, chunk_compression_type: self.chunk_compression_type, chunk_compression_level: self.chunk_compression_level, thread_pool: self.thread_pool, diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index ef23286ae..f1b3e2628 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -65,6 +65,7 @@ pub struct Settings<'a, 't, 'u, 'i> { pub(crate) log_every_n: Option, pub(crate) max_nb_chunks: Option, pub(crate) max_memory: Option, + pub(crate) documents_chunk_size: Option, pub(crate) chunk_compression_type: CompressionType, pub(crate) chunk_compression_level: Option, pub(crate) thread_pool: Option<&'a ThreadPool>, @@ -93,6 +94,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { log_every_n: None, max_nb_chunks: None, max_memory: None, + documents_chunk_size: None, chunk_compression_type: CompressionType::None, chunk_compression_level: None, thread_pool: None, @@ -227,6 +229,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { indexing_builder.log_every_n = self.log_every_n; indexing_builder.max_nb_chunks = self.max_nb_chunks; indexing_builder.max_memory = self.max_memory; + indexing_builder.documents_chunk_size = self.documents_chunk_size; indexing_builder.chunk_compression_type = self.chunk_compression_type; indexing_builder.chunk_compression_level = self.chunk_compression_level; indexing_builder.thread_pool = self.thread_pool; diff --git a/milli/src/update/update_builder.rs b/milli/src/update/update_builder.rs index 6035499b3..561c4bc50 100644 --- a/milli/src/update/update_builder.rs +++ b/milli/src/update/update_builder.rs @@ -7,6 +7,7 @@ use crate::{Index, Result}; pub struct UpdateBuilder<'a> { pub(crate) log_every_n: Option, pub(crate) max_nb_chunks: Option, + pub(crate) documents_chunk_size: Option, pub(crate) max_memory: Option, pub(crate) chunk_compression_type: CompressionType, pub(crate) chunk_compression_level: Option, @@ -19,6 +20,7 @@ impl<'a> UpdateBuilder<'a> { UpdateBuilder { log_every_n: None, max_nb_chunks: None, + documents_chunk_size: None, max_memory: None, chunk_compression_type: CompressionType::None, chunk_compression_level: None, @@ -39,6 +41,10 @@ impl<'a> UpdateBuilder<'a> { self.max_memory = Some(max_memory); } + pub fn documents_chunk_size(&mut self, documents_chunk_size: usize) { + self.documents_chunk_size = Some(documents_chunk_size); + } + pub fn chunk_compression_type(&mut self, chunk_compression_type: CompressionType) { self.chunk_compression_type = chunk_compression_type; } @@ -77,6 +83,7 @@ impl<'a> UpdateBuilder<'a> { builder.log_every_n = self.log_every_n; builder.max_nb_chunks = self.max_nb_chunks; builder.max_memory = self.max_memory; + builder.documents_chunk_size = self.documents_chunk_size; builder.chunk_compression_type 
= self.chunk_compression_type; builder.chunk_compression_level = self.chunk_compression_level; builder.thread_pool = self.thread_pool; @@ -94,6 +101,7 @@ impl<'a> UpdateBuilder<'a> { builder.log_every_n = self.log_every_n; builder.max_nb_chunks = self.max_nb_chunks; builder.max_memory = self.max_memory; + builder.documents_chunk_size = self.documents_chunk_size; builder.chunk_compression_type = self.chunk_compression_type; builder.chunk_compression_level = self.chunk_compression_level; builder.thread_pool = self.thread_pool; From 741a4444a9b8eb115db6585fcc4b2ecefc9ba52c Mon Sep 17 00:00:00 2001 From: many Date: Thu, 2 Sep 2021 16:57:46 +0200 Subject: [PATCH 15/15] Remove log in chunk generator --- .../index_documents/helpers/grenad_helpers.rs | 18 +----------------- milli/src/update/index_documents/mod.rs | 1 - milli/tests/search/mod.rs | 2 +- 3 files changed, 2 insertions(+), 19 deletions(-) diff --git a/milli/src/update/index_documents/helpers/grenad_helpers.rs b/milli/src/update/index_documents/helpers/grenad_helpers.rs index 1dfaaf945..fbdf2b42e 100644 --- a/milli/src/update/index_documents/helpers/grenad_helpers.rs +++ b/milli/src/update/index_documents/helpers/grenad_helpers.rs @@ -129,10 +129,8 @@ impl GrenadParameters { pub fn grenad_obkv_into_chunks( mut reader: grenad::Reader, indexer: GrenadParameters, - log_frequency: Option, documents_chunk_size: usize, ) -> Result>>> { - let mut document_count = 0; let mut continue_reading = true; let indexer_clone = indexer.clone(); @@ -154,11 +152,6 @@ pub fn grenad_obkv_into_chunks( obkv_documents.insert(document_id, obkv)?; current_chunk_size += document_id.len() as u64 + obkv.len() as u64; - document_count += 1; - if log_frequency.map_or(false, |log_frequency| document_count % log_frequency == 0) { - debug!("reached {} chunked documents", document_count); - } - if current_chunk_size >= documents_chunk_size as u64 { return writer_into_reader(obkv_documents).map(Some); } @@ -168,16 +161,7 @@ pub fn grenad_obkv_into_chunks( writer_into_reader(obkv_documents).map(Some) }; - Ok(std::iter::from_fn(move || { - let result = transposer().transpose(); - if result.as_ref().map_or(false, |r| r.is_ok()) { - debug!( - "A new chunk of approximately {:.2} MiB has been generated", - documents_chunk_size as f64 / 1024.0 / 1024.0, - ); - } - result - })) + Ok(std::iter::from_fn(move || transposer().transpose())) } pub fn write_into_lmdb_database( diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index e4c798163..7800ae55a 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -250,7 +250,6 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { let chunk_iter = grenad_obkv_into_chunks( documents_file, params.clone(), - self.log_every_n, self.documents_chunk_size.unwrap_or(1024 * 1024 * 128), // 128MiB ); diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index a533a4cbe..0fbc0e1b6 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -5,7 +5,7 @@ use big_s::S; use either::{Either, Left, Right}; use heed::EnvOpenOptions; use maplit::{hashmap, hashset}; -use milli::update::{IndexDocuments, Settings, UpdateBuilder, UpdateFormat}; +use milli::update::{Settings, UpdateBuilder, UpdateFormat}; use milli::{AscDesc, Criterion, DocumentId, Index}; use serde::Deserialize; use slice_group_by::GroupBy;
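
As a usage note for "Make document_chunk_size settable from update builder": the new setter is threaded from UpdateBuilder through Settings and IndexDocuments, so a caller can lower the 128 MiB default when indexing under a tight memory budget. A minimal sketch relying only on the setters visible in this series; the 64 MiB figure is an arbitrary example, and constructing or executing the update is left to the caller.

    use grenad::CompressionType;
    use milli::update::UpdateBuilder;

    // Tune an already constructed UpdateBuilder; both setters appear in this patch series.
    fn tune_indexing(builder: &mut UpdateBuilder<'_>) {
        // Cap each intermediate documents chunk at roughly 64 MiB instead of the 128 MiB default.
        builder.documents_chunk_size(64 * 1024 * 1024);
        builder.chunk_compression_type(CompressionType::None);
    }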
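
Finally, after "Remove log in chunk generator", the chunking helper boils down to a closure returning Result<Option<Chunk>> driven through std::iter::from_fn and transpose, so the caller sees an iterator of Result<Chunk> that stops once the reader is exhausted. A stand-alone sketch of that shape over an in-memory iterator of key/value pairs; the Chunk alias and the signature are illustrative, not milli's types.

    type Chunk = Vec<(Vec<u8>, Vec<u8>)>;

    // Group (key, value) entries into chunks of roughly `chunk_size` bytes, propagating
    // any read error and ending the iteration once the source is exhausted.
    fn into_chunks(
        mut entries: impl Iterator<Item = std::io::Result<(Vec<u8>, Vec<u8>)>>,
        chunk_size: usize,
    ) -> impl Iterator<Item = std::io::Result<Chunk>> {
        let mut continue_reading = true;
        let mut transposer = move || -> std::io::Result<Option<Chunk>> {
            if !continue_reading {
                return Ok(None);
            }
            let mut chunk = Chunk::new();
            let mut current_chunk_size = 0usize;
            for entry in entries.by_ref() {
                let (key, value) = entry?;
                current_chunk_size += key.len() + value.len();
                chunk.push((key, value));
                if current_chunk_size >= chunk_size {
                    return Ok(Some(chunk));
                }
            }
            continue_reading = false;
            Ok(Some(chunk))
        };
        // `transpose` turns Result<Option<Chunk>> into Option<Result<Chunk>>, which is
        // exactly what std::iter::from_fn expects in order to know when to stop.
        std::iter::from_fn(move || transposer().transpose())
    }

Calling `into_chunks(entries, 128 * 1024 * 1024)` mirrors the 128 MiB default the indexer falls back to when no documents_chunk_size is set.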