From c683fa98e6bccad193cd9affc3bdffa0f8f99087 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 28 Aug 2024 18:45:16 +0200 Subject: [PATCH 001/247] WIP Co-authored-by: Kerollmops Co-authored-by: ManyTheFish --- Cargo.lock | 33 +- milli/Cargo.toml | 5 + milli/src/index.rs | 8 + milli/src/update/mod.rs | 1 + milli/src/update/new/document_change.rs | 78 ++++ milli/src/update/new/extract/cache.rs | 248 +++++++++++ .../update/new/extract/extract_word_docids.rs | 84 ++++ milli/src/update/new/extract/mod.rs | 2 + .../update/new/extract/tokenize_document.rs | 195 +++++++++ milli/src/update/new/global_fields_ids_map.rs | 65 +++ milli/src/update/new/items_pool.rs | 54 +++ milli/src/update/new/mod.rs | 414 ++++++++++++++++++ 12 files changed, 1184 insertions(+), 3 deletions(-) create mode 100644 milli/src/update/new/document_change.rs create mode 100644 milli/src/update/new/extract/cache.rs create mode 100644 milli/src/update/new/extract/extract_word_docids.rs create mode 100644 milli/src/update/new/extract/mod.rs create mode 100644 milli/src/update/new/extract/tokenize_document.rs create mode 100644 milli/src/update/new/global_fields_ids_map.rs create mode 100644 milli/src/update/new/items_pool.rs create mode 100644 milli/src/update/new/mod.rs diff --git a/Cargo.lock b/Cargo.lock index dd67520ea..c3e9532e2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2230,6 +2230,16 @@ dependencies = [ "tempfile", ] +[[package]] +name = "grenad" +version = "0.4.7" +source = "git+https://github.com/meilisearch/grenad?branch=various-improvements#d7512aedb854c247acc7cd18d0bfa148d3779923" +dependencies = [ + "bytemuck", + "byteorder", + "tempfile", +] + [[package]] name = "h2" version = "0.3.26" @@ -3313,6 +3323,15 @@ version = "0.4.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c" +[[package]] +name = "lru" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37ee39891760e7d94734f6f63fedc29a2e4a152f836120753a72503f09fcf904" +dependencies = [ + "hashbrown 0.14.3", +] + [[package]] name = "lzma-rs" version = "0.3.0" @@ -3415,7 +3434,7 @@ dependencies = [ "mimalloc", "mime", "num_cpus", - "obkv", + "obkv 0.2.2", "once_cell", "ordered-float", "parking_lot", @@ -3565,7 +3584,8 @@ dependencies = [ "fst", "fxhash", "geoutils", - "grenad", + "grenad 0.4.7 (registry+https://github.com/rust-lang/crates.io-index)", + "grenad 0.4.7 (git+https://github.com/meilisearch/grenad?branch=various-improvements)", "heed", "hf-hub", "indexmap", @@ -3574,13 +3594,15 @@ dependencies = [ "json-depth-checker", "levenshtein_automata", "liquid", + "lru", "maplit", "md5", "meili-snap", "memchr", "memmap2", "mimalloc", - "obkv", + "obkv 0.2.2", + "obkv 0.3.0", "once_cell", "ordered-float", "rand", @@ -3833,6 +3855,11 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a2e27bcfe835a379d32352112f6b8dbae2d99d16a5fff42abe6e5ba5386c1e5a" +[[package]] +name = "obkv" +version = "0.3.0" +source = "git+https://github.com/kerollmops/obkv?branch=unsized-kvreader#d248eb7edd3453ff758afc2883f6ae25684eb69e" + [[package]] name = "once_cell" version = "1.19.0" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 79b61b4f1..9fa270d46 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -30,6 +30,9 @@ grenad = { version = "0.4.7", default-features = false, features = [ "rayon", "tempfile", ] } +grenad2 = { package = "grenad", version = "0.4.7", default-features = false, 
features = [ + "tempfile" +], git = "https://github.com/meilisearch/grenad", branch = "various-improvements" } heed = { version = "0.20.3", default-features = false, features = [ "serde-json", "serde-bincode", @@ -38,9 +41,11 @@ heed = { version = "0.20.3", default-features = false, features = [ indexmap = { version = "2.2.6", features = ["serde"] } json-depth-checker = { path = "../json-depth-checker" } levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] } +lru = "0.12.3" memchr = "2.5.0" memmap2 = "0.9.4" obkv = "0.2.2" +obkv2 = { package = "obkv", git = "https://github.com/kerollmops/obkv", branch = "unsized-kvreader" } once_cell = "1.19.0" ordered-float = "4.2.1" rayon = "1.10.0" diff --git a/milli/src/index.rs b/milli/src/index.rs index 512e911aa..5d651e144 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1251,6 +1251,14 @@ impl Index { /* documents */ + /// Returns a document by using the document id. + pub fn document<'t>(&self, rtxn: &'t RoTxn, id: DocumentId) -> Result> { + self.documents + .get(rtxn, &id)? + .ok_or(UserError::UnknownInternalDocumentId { document_id: id }) + .map_err(Into::into) + } + /// Returns an iterator over the requested documents. The next item will be an error if a document is missing. pub fn iter_documents<'a, 't: 'a>( &'a self, diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs index 195b95d1e..adfc85174 100644 --- a/milli/src/update/mod.rs +++ b/milli/src/update/mod.rs @@ -19,6 +19,7 @@ pub(crate) mod del_add; pub(crate) mod facet; mod index_documents; mod indexer_config; +mod new; mod settings; mod update_step; mod word_prefix_docids; diff --git a/milli/src/update/new/document_change.rs b/milli/src/update/new/document_change.rs new file mode 100644 index 000000000..e7c8bf012 --- /dev/null +++ b/milli/src/update/new/document_change.rs @@ -0,0 +1,78 @@ +use heed::RoTxn; +use obkv2::KvReader; + +use super::indexer::KvReaderFieldId; +use crate::{DocumentId, FieldId}; + +pub enum DocumentChange { + Deletion(Deletion), + Update(Update), + Insertion(Insertion), +} + +pub struct Deletion { + docid: DocumentId, + external_docid: String, // ? + current: Box, +} + +pub struct Update { + docid: DocumentId, + external_docid: String, // ? + current: Box, + new: Box, +} + +pub struct Insertion { + docid: DocumentId, + external_docid: String, // ? 
+ new: Box, +} + +impl DocumentChange { + fn docid(&self) -> DocumentId { + match &self { + Self::Deletion(inner) => inner.docid(), + Self::Update(inner) => inner.docid(), + Self::Insertion(inner) => inner.docid(), + } + } +} + +impl Deletion { + pub fn new(docid: DocumentId, external_docid: String, current: Box) -> Self { + Self { docid, external_docid, current } + } + + fn docid(&self) -> DocumentId { + self.docid + } + + fn current(&self, rtxn: &RoTxn) -> &KvReader { + unimplemented!() + } +} + +impl Insertion { + fn docid(&self) -> DocumentId { + self.docid + } + + fn new(&self) -> &KvReader { + unimplemented!() + } +} + +impl Update { + fn docid(&self) -> DocumentId { + self.docid + } + + fn current(&self, rtxn: &RoTxn) -> &KvReader { + unimplemented!() + } + + fn new(&self) -> &KvReader { + unimplemented!() + } +} diff --git a/milli/src/update/new/extract/cache.rs b/milli/src/update/new/extract/cache.rs new file mode 100644 index 000000000..0d72a5a8d --- /dev/null +++ b/milli/src/update/new/extract/cache.rs @@ -0,0 +1,248 @@ +use std::borrow::Cow; +use std::num::NonZeroUsize; +use std::{io, mem}; + +use grenad2::{MergeFunction, Sorter}; +use lru::LruCache; +use roaring::RoaringBitmap; +use smallvec::SmallVec; + +use crate::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; + +#[derive(Debug)] +pub struct CachedSorter { + cache: lru::LruCache, DelAddRoaringBitmap>, + sorter: Sorter, + deladd_buffer: Vec, + cbo_buffer: Vec, +} + +impl CachedSorter { + pub fn new(cap: NonZeroUsize, sorter: Sorter) -> Self { + CachedSorter { + cache: lru::LruCache::new(cap), + sorter, + deladd_buffer: Vec::new(), + cbo_buffer: Vec::new(), + } + } +} + +impl CachedSorter { + pub fn insert_del_u32(&mut self, key: &[u8], n: u32) -> grenad::Result<(), MF::Error> { + match self.cache.get_mut(key) { + Some(DelAddRoaringBitmap { del, add: _ }) => { + del.get_or_insert_with(RoaringBitmap::new).insert(n); + } + None => { + let value = DelAddRoaringBitmap::new_del_u32(n); + if let Some((key, deladd)) = self.cache.push(key.into(), value) { + self.write_entry(key, deladd)?; + } + } + } + + Ok(()) + } + + pub fn insert_del( + &mut self, + key: &[u8], + bitmap: RoaringBitmap, + ) -> grenad::Result<(), MF::Error> { + match self.cache.get_mut(key) { + Some(DelAddRoaringBitmap { del, add: _ }) => { + *del.get_or_insert_with(RoaringBitmap::new) |= bitmap; + } + None => { + let value = DelAddRoaringBitmap::new_del(bitmap); + if let Some((key, deladd)) = self.cache.push(key.into(), value) { + self.write_entry(key, deladd)?; + } + } + } + + Ok(()) + } + + pub fn insert_add_u32(&mut self, key: &[u8], n: u32) -> grenad::Result<(), MF::Error> { + match self.cache.get_mut(key) { + Some(DelAddRoaringBitmap { del: _, add }) => { + add.get_or_insert_with(RoaringBitmap::new).insert(n); + } + None => { + let value = DelAddRoaringBitmap::new_add_u32(n); + if let Some((key, deladd)) = self.cache.push(key.into(), value) { + self.write_entry(key, deladd)?; + } + } + } + + Ok(()) + } + + pub fn insert_add( + &mut self, + key: &[u8], + bitmap: RoaringBitmap, + ) -> grenad::Result<(), MF::Error> { + match self.cache.get_mut(key) { + Some(DelAddRoaringBitmap { del: _, add }) => { + *add.get_or_insert_with(RoaringBitmap::new) |= bitmap; + } + None => { + let value = DelAddRoaringBitmap::new_add(bitmap); + if let Some((key, deladd)) = self.cache.push(key.into(), value) { + self.write_entry(key, deladd)?; + } + } + } + + Ok(()) + } + + pub fn insert_del_add_u32(&mut self, key: &[u8], n: u32) -> grenad::Result<(), MF::Error> { + match 
self.cache.get_mut(key) { + Some(DelAddRoaringBitmap { del, add }) => { + del.get_or_insert_with(RoaringBitmap::new).insert(n); + add.get_or_insert_with(RoaringBitmap::new).insert(n); + } + None => { + let value = DelAddRoaringBitmap::new_del_add_u32(n); + if let Some((key, deladd)) = self.cache.push(key.into(), value) { + self.write_entry(key, deladd)?; + } + } + } + + Ok(()) + } + + fn write_entry>( + &mut self, + key: A, + deladd: DelAddRoaringBitmap, + ) -> grenad::Result<(), MF::Error> { + self.deladd_buffer.clear(); + let mut value_writer = KvWriterDelAdd::new(&mut self.deladd_buffer); + match deladd { + DelAddRoaringBitmap { del: Some(del), add: None } => { + self.cbo_buffer.clear(); + RoaringBitmap::serialize_into(&del, &mut self.cbo_buffer)?; + value_writer.insert(DelAdd::Deletion, &self.cbo_buffer)?; + } + DelAddRoaringBitmap { del: None, add: Some(add) } => { + self.cbo_buffer.clear(); + RoaringBitmap::serialize_into(&add, &mut self.cbo_buffer)?; + value_writer.insert(DelAdd::Addition, &self.cbo_buffer)?; + } + DelAddRoaringBitmap { del: Some(del), add: Some(add) } => { + self.cbo_buffer.clear(); + RoaringBitmap::serialize_into(&del, &mut self.cbo_buffer)?; + value_writer.insert(DelAdd::Deletion, &self.cbo_buffer)?; + + self.cbo_buffer.clear(); + RoaringBitmap::serialize_into(&add, &mut self.cbo_buffer)?; + value_writer.insert(DelAdd::Addition, &self.cbo_buffer)?; + } + DelAddRoaringBitmap { del: None, add: None } => return Ok(()), + } + let bytes = value_writer.into_inner().unwrap(); + self.sorter.insert(key, bytes) + } + + pub fn direct_insert(&mut self, key: &[u8], val: &[u8]) -> grenad::Result<(), MF::Error> { + self.sorter.insert(key, val) + } + + pub fn into_sorter(mut self) -> grenad::Result, MF::Error> { + let default_arc = LruCache::new(NonZeroUsize::MIN); + for (key, deladd) in mem::replace(&mut self.cache, default_arc) { + self.write_entry(key, deladd)?; + } + Ok(self.sorter) + } +} + +#[derive(Debug, Clone)] +pub struct DelAddRoaringBitmap { + pub del: Option, + pub add: Option, +} + +impl DelAddRoaringBitmap { + fn new_del_add_u32(n: u32) -> Self { + DelAddRoaringBitmap { + del: Some(RoaringBitmap::from([n])), + add: Some(RoaringBitmap::from([n])), + } + } + + fn new_del(bitmap: RoaringBitmap) -> Self { + DelAddRoaringBitmap { del: Some(bitmap), add: None } + } + + fn new_del_u32(n: u32) -> Self { + DelAddRoaringBitmap { del: Some(RoaringBitmap::from([n])), add: None } + } + + fn new_add(bitmap: RoaringBitmap) -> Self { + DelAddRoaringBitmap { del: None, add: Some(bitmap) } + } + + fn new_add_u32(n: u32) -> Self { + DelAddRoaringBitmap { del: None, add: Some(RoaringBitmap::from([n])) } + } +} + +/// Do a union of CboRoaringBitmaps on both sides of a DelAdd obkv +/// separately and outputs a new DelAdd with both unions. 
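// A minimal usage sketch of the cache above, assuming the caller already built
// a grenad `sorter` whose merge function is the `DelAddRoaringBitmapMerger`
// defined below; the function name and the capacity of 10_000 entries are
// illustrative only. Hot keys accumulate in the LRU cache, evicted entries are
// spilled into the sorter, and the rest is flushed at the end.
fn cache_word_docids_sketch(
    sorter: Sorter<DelAddRoaringBitmapMerger>,
    docid: u32,
    deleted_words: &[&str],
    added_words: &[&str],
) -> grenad::Result<Sorter<DelAddRoaringBitmapMerger>, io::Error> {
    let capacity = NonZeroUsize::new(10_000).unwrap();
    let mut cache = CachedSorter::new(capacity, sorter);
    for word in deleted_words {
        // record that `docid` no longer contains this word
        cache.insert_del_u32(word.as_bytes(), docid)?;
    }
    for word in added_words {
        // record that `docid` now contains this word
        cache.insert_add_u32(word.as_bytes(), docid)?;
    }
    // flush whatever is still cached before handing the sorter back
    cache.into_sorter()
}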
+pub struct DelAddRoaringBitmapMerger; + +impl MergeFunction for DelAddRoaringBitmapMerger { + type Error = io::Error; + + fn merge<'a>( + &self, + _key: &[u8], + values: &[Cow<'a, [u8]>], + ) -> std::result::Result, Self::Error> { + if values.len() == 1 { + Ok(values[0].clone()) + } else { + // Retrieve the bitmaps from both sides + let mut del_bitmaps_bytes = Vec::new(); + let mut add_bitmaps_bytes = Vec::new(); + for value in values { + let obkv: &KvReaderDelAdd = value.as_ref().into(); + if let Some(bitmap_bytes) = obkv.get(DelAdd::Deletion) { + del_bitmaps_bytes.push(bitmap_bytes); + } + if let Some(bitmap_bytes) = obkv.get(DelAdd::Addition) { + add_bitmaps_bytes.push(bitmap_bytes); + } + } + + let mut output_deladd_obkv = KvWriterDelAdd::memory(); + + // Deletion + let mut buffer = Vec::new(); + let mut merged = RoaringBitmap::new(); + for bytes in del_bitmaps_bytes { + merged |= RoaringBitmap::deserialize_unchecked_from(bytes)?; + } + merged.serialize_into(&mut buffer)?; + output_deladd_obkv.insert(DelAdd::Deletion, &buffer)?; + + // Addition + buffer.clear(); + merged.clear(); + for bytes in add_bitmaps_bytes { + merged |= RoaringBitmap::deserialize_unchecked_from(bytes)?; + } + output_deladd_obkv.insert(DelAdd::Addition, &buffer)?; + + output_deladd_obkv.into_inner().map(Cow::from).map_err(Into::into) + } + } +} diff --git a/milli/src/update/new/extract/extract_word_docids.rs b/milli/src/update/new/extract/extract_word_docids.rs new file mode 100644 index 000000000..e2e1520bc --- /dev/null +++ b/milli/src/update/new/extract/extract_word_docids.rs @@ -0,0 +1,84 @@ +pub fn extract_word_docids( + document_change: DocumentChange, + _tokenizer: &Tokenizer, + output: &mut CachedSorter, +) -> grenad::Result<(), io::Error> { + match document_change { + DocumentChange::Deletion(inner) => { + unimplemented!() + } + DocumentChange::Update(inner) => { + unimplemented!() + } + DocumentChange::Insertion(inner) => { + unimplemented!() + } + } + + let normalizer_options = NormalizerOption::default(); + + if let Some(previous_doc) = previous_doc { + for (_, v) in previous_doc.iter() { + // Only manage the direct JSON strings + // TODO manage the JSON strings correctly (escaped chars) + if v.first().zip(v.last()) == Some((&b'"', &b'"')) { + let s = std::str::from_utf8(&v[1..v.len() - 1]).unwrap(); + // for token in tokenizer.tokenize(s).filter(|t| t.is_word()) { + // let key = token.lemma().normalize(&normalizer_options); + for token in s.split_whitespace() { + let key = token.normalize(&normalizer_options); + output.insert_del_u32(key.as_bytes(), docid)?; + } + } + } + } + + for (_, v) in new_doc.iter() { + // Only manage the direct JSON strings + // TODO manage the JSON strings correctly (escaped chars) + if v.first().zip(v.last()) == Some((&b'"', &b'"')) { + let s = std::str::from_utf8(&v[1..v.len() - 1]).unwrap(); + // for token in tokenizer.tokenize(s).filter(|t| t.is_word()) { + // let key = token.lemma().normalize(&normalizer_options); + for token in s.split_whitespace() { + let key = token.normalize(&normalizer_options); + output.insert_add_u32(key.as_bytes(), docid)?; + } + } + } + + Ok(()) +} + +/// take an iterator on tokens and compute their relative position depending on separator kinds +/// if it's an `Hard` separator we add an additional relative proximity of 8 between words, +/// else we keep the standard proximity of 1 between words. 
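// A rough, self-contained illustration of the position rule described above,
// applied to plain strings instead of charabia tokens: each word is paired
// with a flag telling whether the separator that follows it is hard (a dot)
// or soft (a space). With [("hello", false), ("world", true), ("rust", false)]
// the relative positions are 0, 1 and 9: crossing a hard separator costs 8
// instead of the usual 1.
fn relative_positions_sketch(words: &[(&str, bool)]) -> Vec<(u32, String)> {
    let mut output = Vec::new();
    let mut offset = 0u32;
    let mut previous_separator_was_hard = None;
    for (word, followed_by_hard_separator) in words {
        offset += match previous_separator_was_hard {
            Some(true) => 8,
            Some(false) => 1,
            None => 0,
        };
        output.push((offset, word.to_string()));
        previous_separator_was_hard = Some(*followed_by_hard_separator);
    }
    output
}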
+fn process_tokens<'a>( + tokens: impl Iterator>, +) -> impl Iterator)> { + tokens + .skip_while(|token| token.is_separator()) + .scan((0, None), |(offset, prev_kind), mut token| { + match token.kind { + TokenKind::Word | TokenKind::StopWord if !token.lemma().is_empty() => { + *offset += match *prev_kind { + Some(TokenKind::Separator(SeparatorKind::Hard)) => 8, + Some(_) => 1, + None => 0, + }; + *prev_kind = Some(token.kind) + } + TokenKind::Separator(SeparatorKind::Hard) => { + *prev_kind = Some(token.kind); + } + TokenKind::Separator(SeparatorKind::Soft) + if *prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) => + { + *prev_kind = Some(token.kind); + } + _ => token.kind = TokenKind::Unknown, + } + Some((*offset, token)) + }) + .filter(|(_, t)| t.is_word()) +} diff --git a/milli/src/update/new/extract/mod.rs b/milli/src/update/new/extract/mod.rs new file mode 100644 index 000000000..26732d4c8 --- /dev/null +++ b/milli/src/update/new/extract/mod.rs @@ -0,0 +1,2 @@ +mod cache; +mod extract_word_docids; diff --git a/milli/src/update/new/extract/tokenize_document.rs b/milli/src/update/new/extract/tokenize_document.rs new file mode 100644 index 000000000..8793063b0 --- /dev/null +++ b/milli/src/update/new/extract/tokenize_document.rs @@ -0,0 +1,195 @@ +pub struct DocumentTokenizer { + tokenizer: &Tokenizer, + searchable_attributes: Option<&[String]>, + localized_attributes_rules: &[LocalizedAttributesRule], + max_positions_per_attributes: u32, +} + +impl DocumentTokenizer { + // pub fn new(tokenizer: &Tokenizer, settings: &InnerIndexSettings) -> Self { + // Self { tokenizer, settings } + // } + + pub fn tokenize_document<'a>( + obkv: &KvReader<'a, FieldId>, + field_id_map: &FieldsIdsMap, + token_fn: impl Fn(FieldId, u16, &str), + ) { + let mut field_position = Hashmap::new(); + for (field_id, field_bytes) in obkv { + let field_name = field_id_map.name(field_id); + + let tokenize_field = |name, value| { + let field_id = field_id_map.id(name); + match value { + Number(n) => { + let token = n.to_string(); + let position = field_position + .entry(field_id) + .and_modify(|counter| *counter += 8) + .or_insert(0); + token_fn(field_id, position, token.as_str()); + } + String(text) => { + // create an iterator of token with their positions. + let locales = self + .localized_attributes_rules + .iter() + .first(|rule| rule.match_str(field_name)) + .map(|rule| rule.locales(field_id)); + let tokens = + process_tokens(tokenizer.tokenize_with_allow_list(field, locales)) + .take_while(|(p, _)| { + (*p as u32) < self.max_positions_per_attributes + }); + + for (index, token) in tokens { + // keep a word only if it is not empty and fit in a LMDB key. + let token = token.lemma().trim(); + if !token.is_empty() && token.len() <= MAX_WORD_LENGTH { + let position: u16 = index + .try_into() + .map_err(|_| SerializationError::InvalidNumberSerialization)?; + writer.insert(position, token.as_bytes())?; + } + } + } + _ => (), + } + }; + + // if the current field is searchable or contains a searchable attribute + if searchable_attributes.map_or(true, |attributes| { + attributes.iter().any(|name| contained_in(name, field_name)) + }) { + // parse json. + match serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)? 
{ + Value::Object(object) => { + seek_leaf_values_in_object(object, selectors, &field_name, tokenize_field) + } + Value::Array(array) => { + seek_leaf_values_in_array(array, selectors, &field_name, tokenize_field) + } + value => tokenize_field(&base_key, value), + } + } + } + } +} + +/// take an iterator on tokens and compute their relative position depending on separator kinds +/// if it's an `Hard` separator we add an additional relative proximity of 8 between words, +/// else we keep the standard proximity of 1 between words. +fn process_tokens<'a>( + tokens: impl Iterator>, +) -> impl Iterator)> { + tokens + .skip_while(|token| token.is_separator()) + .scan((0, None), |(offset, prev_kind), mut token| { + match token.kind { + TokenKind::Word | TokenKind::StopWord if !token.lemma().is_empty() => { + *offset += match *prev_kind { + Some(TokenKind::Separator(SeparatorKind::Hard)) => 8, + Some(_) => 1, + None => 0, + }; + *prev_kind = Some(token.kind) + } + TokenKind::Separator(SeparatorKind::Hard) => { + *prev_kind = Some(token.kind); + } + TokenKind::Separator(SeparatorKind::Soft) + if *prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) => + { + *prev_kind = Some(token.kind); + } + _ => token.kind = TokenKind::Unknown, + } + Some((*offset, token)) + }) + .filter(|(_, t)| t.is_word()) +} + +/// Returns `true` if the `selector` match the `key`. +/// +/// ```text +/// Example: +/// `animaux` match `animaux` +/// `animaux.chien` match `animaux` +/// `animaux.chien` match `animaux` +/// `animaux.chien.nom` match `animaux` +/// `animaux.chien.nom` match `animaux.chien` +/// ----------------------------------------- +/// `animaux` doesn't match `animaux.chien` +/// `animaux.` doesn't match `animaux` +/// `animaux.ch` doesn't match `animaux.chien` +/// `animau` doesn't match `animaux` +/// ``` +fn contained_in(selector: &str, key: &str) -> bool { + selector.starts_with(key) + && selector[key.len()..].chars().next().map(|c| c == SPLIT_SYMBOL).unwrap_or(true) +} + +/// TODO move in permissive json pointer +mod perm_json_p { + pub fn seek_leaf_values<'a>( + value: &Map, + selectors: impl IntoIterator, + seeker: impl Fn(&str, &Value), + ) { + let selectors: Vec<_> = selectors.into_iter().collect(); + seek_leaf_values_in_object(value, &selectors, "", &seeker); + } + + pub fn seek_leaf_values_in_object( + value: &Map, + selectors: &[&str], + base_key: &str, + seeker: &impl Fn(&str, &Value), + ) { + for (key, value) in value.iter() { + let base_key = if base_key.is_empty() { + key.to_string() + } else { + format!("{}{}{}", base_key, SPLIT_SYMBOL, key) + }; + + // here if the user only specified `doggo` we need to iterate in all the fields of `doggo` + // so we check the contained_in on both side + let should_continue = selectors.iter().any(|selector| { + contained_in(selector, &base_key) || contained_in(&base_key, selector) + }); + + if should_continue { + match value { + Value::Object(object) => { + seek_leaf_values_in_object(object, selectors, &base_key, seeker) + } + Value::Array(array) => { + seek_leaf_values_in_array(array, selectors, &base_key, seeker) + } + value => seeker(&base_key, value), + } + } + } + } + + pub fn seek_leaf_values_in_array( + values: &mut [Value], + selectors: &[&str], + base_key: &str, + seeker: &impl Fn(&str, &Value), + ) { + for value in values.iter_mut() { + match value { + Value::Object(object) => { + seek_leaf_values_in_object(object, selectors, base_key, seeker) + } + Value::Array(array) => { + seek_leaf_values_in_array(array, selectors, base_key, seeker) + } 
+ value => seeker(base_key, value), + } + } + } +} diff --git a/milli/src/update/new/global_fields_ids_map.rs b/milli/src/update/new/global_fields_ids_map.rs new file mode 100644 index 000000000..4bd7b27d9 --- /dev/null +++ b/milli/src/update/new/global_fields_ids_map.rs @@ -0,0 +1,65 @@ +use std::sync::{Arc, RwLock}; + +use crate::{FieldId, FieldsIdsMap}; + +/// A fields ids map that can be globally updated to add fields +pub struct GlobalFieldsIdsMap { + global: Arc>, + local: FieldsIdsMap, +} + +impl GlobalFieldsIdsMap { + pub fn new(global: FieldsIdsMap) -> Self { + Self { local: global.clone(), global: Arc::new(RwLock::new(global)) } + } + + /// Returns the number of fields ids in the map. + pub fn global_len(&self) -> usize { + todo!() + } + + /// Returns `true` if the map is empty. + pub fn global_is_empty(&self) -> bool { + todo!() + } + + /// Returns the field id related to a field name, it will create a new field id if the + /// name is not already known. Returns `None` if the maximum field id as been reached. + pub fn insert(&mut self, name: &str) -> Option { + match self.names_ids.get(name) { + Some(id) => Some(*id), + None => { + let id = self.next_id?; + self.next_id = id.checked_add(1); + self.names_ids.insert(name.to_owned(), id); + self.ids_names.insert(id, name.to_owned()); + Some(id) + } + } + } + + /// Get the id of a field based on its name. + pub fn id(&self, name: &str) -> Option { + self.names_ids.get(name).copied() + } + + /// Get the name of a field based on its id. + pub fn name(&self, id: FieldId) -> Option<&str> { + self.ids_names.get(&id).map(String::as_str) + } + + /// Iterate over the ids and names in the ids order. + pub fn iter(&self) -> impl Iterator { + self.ids_names.iter().map(|(id, name)| (*id, name.as_str())) + } + + /// Iterate over the ids in the order of the ids. + pub fn ids(&'_ self) -> impl Iterator + '_ { + self.ids_names.keys().copied() + } + + /// Iterate over the names in the order of the ids. + pub fn names(&self) -> impl Iterator { + self.ids_names.values().map(AsRef::as_ref) + } +} diff --git a/milli/src/update/new/items_pool.rs b/milli/src/update/new/items_pool.rs new file mode 100644 index 000000000..e90ce97db --- /dev/null +++ b/milli/src/update/new/items_pool.rs @@ -0,0 +1,54 @@ +use crossbeam_channel::{Receiver, Sender, TryRecvError}; + +/// A pool of items that can be pull and generated on demand. +pub struct ItemsPool +where + F: Fn() -> Result, +{ + init: F, + sender: Sender, + receiver: Receiver, +} + +impl ItemsPool +where + F: Fn() -> Result, +{ + /// Create a new unbounded items pool with the specified function + /// to generate items when needed. + /// + /// The `init` function will be invoked whenever a call to `with` requires new items. + pub fn new(init: F) -> Self { + let (sender, receiver) = crossbeam_channel::unbounded(); + ItemsPool { init, sender, receiver } + } + + /// Consumes the pool to retrieve all remaining items. + /// + /// This method is useful for cleaning up and managing the items once they are no longer needed. + pub fn into_items(self) -> crossbeam_channel::IntoIter { + self.receiver.into_iter() + } + + /// Allows running a function on an item from the pool, + /// potentially generating a new item if the pool is empty. 
+ pub fn with(&self, f: G) -> Result + where + G: FnOnce(&mut T) -> Result, + { + let mut item = match self.receiver.try_recv() { + Ok(item) => item, + Err(TryRecvError::Empty) => (self.init)()?, + Err(TryRecvError::Disconnected) => unreachable!(), + }; + + // Run the user's closure with the retrieved item + let result = f(&mut item); + + if let Err(e) = self.sender.send(item) { + unreachable!("error when sending into channel {e}"); + } + + result + } +} diff --git a/milli/src/update/new/mod.rs b/milli/src/update/new/mod.rs new file mode 100644 index 000000000..41b04219f --- /dev/null +++ b/milli/src/update/new/mod.rs @@ -0,0 +1,414 @@ +mod document_change; +// mod extract; +mod items_pool; + +mod global_fields_ids_map; + +mod indexer { + use std::collections::{BTreeMap, HashMap}; + use std::fs::File; + use std::io::Cursor; + use std::os::unix::fs::MetadataExt; + use std::sync::Arc; + + use heed::RoTxn; + use memmap2::Mmap; + use rayon::iter::{IntoParallelIterator, ParallelBridge, ParallelIterator}; + use roaring::RoaringBitmap; + use serde_json::Value; + + use super::document_change::{self, DocumentChange}; + use super::items_pool::ItemsPool; + use crate::documents::{ + obkv_to_object, DocumentIdExtractionError, DocumentsBatchReader, PrimaryKey, + }; + use crate::update::{AvailableDocumentsIds, IndexDocumentsMethod}; + use crate::{ + DocumentId, Error, FieldId, FieldsIdsMap, Index, InternalError, Result, UserError, + }; + + pub type KvReaderFieldId = obkv2::KvReader; + pub type KvWriterFieldId = obkv2::KvWriter; + + pub struct DocumentOperationIndexer { + operations: Vec, + method: IndexDocumentsMethod, + } + + enum Payload { + Addition(File), + Deletion(Vec), + } + + pub struct PayloadStats { + pub document_count: usize, + pub bytes: u64, + } + + enum DocumentOperation { + Addition(DocumentOffset), + Deletion, + } + + /// Represents an offset where a document lives + /// in an mmapped grenad reader file. + struct DocumentOffset { + /// The mmapped grenad reader file. + pub content: Arc, // grenad::Reader + /// The offset of the document in the file. + pub offset: u32, + } + + impl DocumentOperationIndexer { + pub fn new(method: IndexDocumentsMethod) -> Self { + Self { operations: Default::default(), method } + } + + /// TODO please give me a type + /// The payload is expected to be in the grenad format + pub fn add_documents(&mut self, payload: File) -> Result { + let reader = DocumentsBatchReader::from_reader(&payload)?; + let bytes = payload.metadata()?.size(); + let document_count = reader.documents_count() as usize; + + self.operations.push(Payload::Addition(payload)); + + Ok(PayloadStats { bytes, document_count }) + } + + pub fn delete_documents(&mut self, to_delete: Vec) { + self.operations.push(Payload::Deletion(to_delete)) + } + + pub fn document_changes<'a>( + self, + index: &'a Index, + rtxn: &'a RoTxn, + mut fields_ids_map: FieldsIdsMap, + primary_key: &'a PrimaryKey<'a>, + ) -> Result + 'a> { + let documents_ids = index.documents_ids(rtxn)?; + let mut available_docids = AvailableDocumentsIds::from_documents_ids(&documents_ids); + let mut docids_version_offsets = HashMap::::new(); + + for operation in self.operations { + match operation { + Payload::Addition(payload) => { + let content = unsafe { Mmap::map(&payload).map(Arc::new)? 
}; + let cursor = Cursor::new(content.as_ref()); + let reader = DocumentsBatchReader::from_reader(cursor)?; + + let (mut batch_cursor, batch_index) = reader.into_cursor_and_fields_index(); + // TODO Fetch all document fields to fill the fields ids map + batch_index.iter().for_each(|(_, name)| { + fields_ids_map.insert(name); + }); + + let mut offset: u32 = 0; + while let Some(document) = batch_cursor.next_document()? { + let external_document_id = + match primary_key.document_id(&document, &batch_index)? { + Ok(document_id) => Ok(document_id), + Err(DocumentIdExtractionError::InvalidDocumentId( + user_error, + )) => Err(user_error), + Err(DocumentIdExtractionError::MissingDocumentId) => { + Err(UserError::MissingDocumentId { + primary_key: primary_key.name().to_string(), + document: obkv_to_object(&document, &batch_index)?, + }) + } + Err(DocumentIdExtractionError::TooManyDocumentIds(_)) => { + Err(UserError::TooManyDocumentIds { + primary_key: primary_key.name().to_string(), + document: obkv_to_object(&document, &batch_index)?, + }) + } + }?; + + let content = content.clone(); + let document_offset = DocumentOffset { content, offset }; + let document_operation = DocumentOperation::Addition(document_offset); + + match docids_version_offsets.get_mut(&external_document_id) { + None => { + let docid = match index + .external_documents_ids() + .get(rtxn, &external_document_id)? + { + Some(docid) => docid, + None => available_docids.next().ok_or(Error::UserError( + UserError::DocumentLimitReached, + ))?, + }; + + docids_version_offsets.insert( + external_document_id.into(), + (docid, vec![document_operation]), + ); + } + Some((_, offsets)) => offsets.push(document_operation), + } + offset += 1; + } + } + Payload::Deletion(to_delete) => { + for external_document_id in to_delete { + match docids_version_offsets.get_mut(&external_document_id) { + None => { + let docid = match index + .external_documents_ids() + .get(rtxn, &external_document_id)? + { + Some(docid) => docid, + None => available_docids.next().ok_or(Error::UserError( + UserError::DocumentLimitReached, + ))?, + }; + + docids_version_offsets.insert( + external_document_id, + (docid, vec![DocumentOperation::Deletion]), + ); + } + Some((_, offsets)) => offsets.push(DocumentOperation::Deletion), + } + } + } + } + } + + let items = Arc::new(ItemsPool::new(|| index.read_txn().map_err(crate::Error::from))); + docids_version_offsets.into_par_iter().map_with( + items, + |context_pool, (external_docid, (internal_docid, operations))| { + context_pool.with(|rtxn| match self.method { + IndexDocumentsMethod::ReplaceDocuments => todo!(), + // TODO Remap the documents to match the db fields_ids_map + IndexDocumentsMethod::UpdateDocuments => merge_document_obkv_for_updates( + rtxn, + index, + &fields_ids_map, + internal_docid, + external_docid, + &operations, + ), + }) + }, + ); + + Ok(vec![].into_par_iter()) + + // let mut file_count: usize = 0; + // for result in WalkDir::new(update_files_path) + // // TODO handle errors + // .sort_by_key(|entry| entry.metadata().unwrap().created().unwrap()) + // { + // let entry = result?; + // if !entry.file_type().is_file() { + // continue; + // } + + // let file = File::open(entry.path()) + // .with_context(|| format!("While opening {}", entry.path().display()))?; + // let content = unsafe { + // Mmap::map(&file) + // .map(Arc::new) + // .with_context(|| format!("While memory mapping {}", entry.path().display()))? 
+ // }; + + // let reader = + // crate::documents::DocumentsBatchReader::from_reader(Cursor::new(content.as_ref()))?; + // let (mut batch_cursor, batch_index) = reader.into_cursor_and_fields_index(); + // batch_index.iter().for_each(|(_, name)| { + // fields_ids_map.insert(name); + // }); + // let mut offset: u32 = 0; + // while let Some(document) = batch_cursor.next_document()? { + // let primary_key = batch_index.id(primary_key).unwrap(); + // let document_id = document.get(primary_key).unwrap(); + // let document_id = std::str::from_utf8(document_id).unwrap(); + + // let document_offset = DocumentOffset { content: content.clone(), offset }; + // match docids_version_offsets.get_mut(document_id) { + // None => { + // let docid = match maindb.external_documents_ids.get(rtxn, document_id)? { + // Some(docid) => docid, + // None => sequential_docids.next().context("no more available docids")?, + // }; + // docids_version_offsets + // .insert(document_id.into(), (docid, smallvec![document_offset])); + // } + // Some((_, offsets)) => offsets.push(document_offset), + // } + // offset += 1; + // p.inc(1); + // } + + // file_count += 1; + // } + } + } + + pub struct DeleteDocumentIndexer { + to_delete: RoaringBitmap, + } + + impl DeleteDocumentIndexer { + pub fn new() -> Self { + Self { to_delete: Default::default() } + } + + pub fn delete_documents_by_docids(&mut self, docids: RoaringBitmap) { + self.to_delete |= docids; + } + + // let fields = index.fields_ids_map(rtxn)?; + // let primary_key = + // index.primary_key(rtxn)?.ok_or(InternalError::DatabaseMissingEntry { + // db_name: db_name::MAIN, + // key: Some(main_key::PRIMARY_KEY_KEY), + // })?; + // let primary_key = PrimaryKey::new(primary_key, &fields).ok_or_else(|| { + // InternalError::FieldIdMapMissingEntry(crate::FieldIdMapMissingEntry::FieldName { + // field_name: primary_key.to_owned(), + // process: "external_id_of", + // }) + // })?; + pub fn document_changes<'a, F>( + self, + index: &'a Index, + fields: &'a FieldsIdsMap, + primary_key: &'a PrimaryKey<'a>, + ) -> Result> + 'a> + { + let items = Arc::new(ItemsPool::new(|| index.read_txn().map_err(crate::Error::from))); + Ok(self.to_delete.into_iter().par_bridge().map_with(items, |items, docid| { + items.with(|rtxn| { + let document = index.document(rtxn, docid)?; + let external_docid = match primary_key.document_id(&document, fields)? 
{ + Ok(document_id) => Ok(document_id) as Result<_>, + Err(_) => Err(InternalError::DocumentsError( + crate::documents::Error::InvalidDocumentFormat, + ) + .into()), + }?; + Ok(DocumentChange::Deletion(document_change::Deletion::new( + docid, + external_docid, + ))) + }) + })) + } + } + + pub struct DumpIndexer; + + impl DumpIndexer { + pub fn new() -> Self { + todo!() + } + + pub fn document_changes_from_json_iter( + self, + iter: I, + index: &Index, + ) -> impl ParallelIterator + where + I: IntoIterator, + { + // let items = Arc::new(ItemsPool::new(|| { + // let rtxn = index.read_txn()?; + // let fields = index.fields_ids_map(&rtxn)?; + // let primary_key = + // index.primary_key(&rtxn)?.ok_or(InternalError::DatabaseMissingEntry { + // db_name: db_name::MAIN, + // key: Some(main_key::PRIMARY_KEY_KEY), + // })?; + // let primary_key = PrimaryKey::new(primary_key, &fields).ok_or_else(|| { + // InternalError::FieldIdMapMissingEntry( + // crate::FieldIdMapMissingEntry::FieldName { + // field_name: primary_key.to_owned(), + // process: "external_id_of", + // }, + // ) + // })?; + // Ok(DeleteDocumentExternalDocumentIdGetter { rtxn, fields, primary_key }) + // as crate::Result<_> + // })); + + todo!(); + vec![].into_par_iter() + } + } + + pub struct UpdateByFunctionIndexer; + // DocumentsBatchReader::from_reader(Cursor::new(content.as_ref()))? + + /// Reads the previous version of a document from the database, the new versions + /// in the grenad update files and merges them to generate a new boxed obkv. + /// + /// This function is only meant to be used when doing an update and not a replacement. + pub fn merge_document_obkv_for_updates( + rtxn: &RoTxn, + // Let's construct the new obkv in memory + index: &Index, + fields_ids_map: &FieldsIdsMap, + docid: DocumentId, + external_docid: String, + operations: &[DocumentOperation], + ) -> Result> { + let mut document = BTreeMap::new(); + let original_obkv = + index.documents.remap_data_type::().get(rtxn, &docid)?; + let original_obkv: Option<&KvReaderFieldId> = original_obkv.map(Into::into); + + if let Some(original_obkv) = original_obkv { + original_obkv.into_iter().for_each(|(k, v)| { + document.insert(k, v.to_vec()); + }); + } + + let last_deletion = operations + .iter() + .rposition(|operation| matches!(operation, DocumentOperation::Deletion)); + + let operations = &operations[last_deletion.map_or(0, |i| i + 1)..]; + + if operations.is_empty() { + match original_obkv { + Some(original_obkv) => { + let current = original_obkv.as_bytes().to_vec().into_boxed_slice().into(); + return Ok(Some(DocumentChange::Deletion(document_change::Deletion::new( + docid, + external_docid, + current, + )))); + } + None => return Ok(None), + } + } + + for operation in operations { + let DocumentOffset { content, offset } = ; + + let reader = DocumentsBatchReader::from_reader(Cursor::new(content.as_ref()))?; + let (mut cursor, batch_index) = reader.into_cursor_and_fields_index(); + let obkv = cursor.get(*offset)?.expect("must exists"); + + obkv.into_iter().for_each(|(k, v)| { + let field_name = batch_index.name(k).unwrap(); + let id = fields_ids_map.id(field_name).unwrap(); + document.insert(id, v.to_vec()); + }); + } + + let mut writer = KvWriterFieldId::memory(); + document.into_iter().for_each(|(id, value)| writer.insert(id, value).unwrap()); + let boxed = writer.into_inner().unwrap().into_boxed_slice(); + + // Box + + Ok(boxed.into()) + } +} From 637a9c8bdd1c8cf5325998f900feaf3993ead9fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: 
Thu, 29 Aug 2024 12:06:44 +0200 Subject: [PATCH 002/247] Implement the document merge function for the update method --- milli/src/documents/reader.rs | 13 ++++++ milli/src/update/new/document_change.rs | 19 +++++++- milli/src/update/new/mod.rs | 58 ++++++++++++++++--------- 3 files changed, 69 insertions(+), 21 deletions(-) diff --git a/milli/src/documents/reader.rs b/milli/src/documents/reader.rs index c7c125c80..ebdc514fd 100644 --- a/milli/src/documents/reader.rs +++ b/milli/src/documents/reader.rs @@ -72,6 +72,19 @@ impl DocumentsBatchCursor { } impl DocumentsBatchCursor { + /// Returns a single document from the database. + pub fn get( + &mut self, + offset: u32, + ) -> Result>, DocumentsBatchCursorError> { + match self.cursor.move_on_key_equal_to(offset.to_be_bytes())? { + Some((key, value)) if key != DOCUMENTS_BATCH_INDEX_KEY => { + Ok(Some(KvReader::new(value))) + } + _otherwise => Ok(None), + } + } + /// Returns the next document, starting from the first one. Subsequent calls to /// `next_document` advance the document reader until all the documents have been read. pub fn next_document( diff --git a/milli/src/update/new/document_change.rs b/milli/src/update/new/document_change.rs index e7c8bf012..311e22404 100644 --- a/milli/src/update/new/document_change.rs +++ b/milli/src/update/new/document_change.rs @@ -40,7 +40,11 @@ impl DocumentChange { } impl Deletion { - pub fn new(docid: DocumentId, external_docid: String, current: Box) -> Self { + pub fn create( + docid: DocumentId, + external_docid: String, + current: Box, + ) -> Self { Self { docid, external_docid, current } } @@ -54,6 +58,10 @@ impl Deletion { } impl Insertion { + pub fn create(docid: DocumentId, external_docid: String, new: Box) -> Self { + Insertion { docid, external_docid, new } + } + fn docid(&self) -> DocumentId { self.docid } @@ -64,6 +72,15 @@ impl Insertion { } impl Update { + pub fn create( + docid: DocumentId, + external_docid: String, + current: Box, + new: Box, + ) -> Self { + Update { docid, external_docid, current, new } + } + fn docid(&self) -> DocumentId { self.docid } diff --git a/milli/src/update/new/mod.rs b/milli/src/update/new/mod.rs index 41b04219f..e5d376534 100644 --- a/milli/src/update/new/mod.rs +++ b/milli/src/update/new/mod.rs @@ -5,19 +5,21 @@ mod items_pool; mod global_fields_ids_map; mod indexer { + use std::borrow::Cow; use std::collections::{BTreeMap, HashMap}; use std::fs::File; use std::io::Cursor; use std::os::unix::fs::MetadataExt; use std::sync::Arc; + use heed::types::Bytes; use heed::RoTxn; use memmap2::Mmap; use rayon::iter::{IntoParallelIterator, ParallelBridge, ParallelIterator}; use roaring::RoaringBitmap; use serde_json::Value; - use super::document_change::{self, DocumentChange}; + use super::document_change::{self, DocumentChange, Insertion, Update}; use super::items_pool::ItemsPool; use crate::documents::{ obkv_to_object, DocumentIdExtractionError, DocumentsBatchReader, PrimaryKey, @@ -28,7 +30,7 @@ mod indexer { }; pub type KvReaderFieldId = obkv2::KvReader; - pub type KvWriterFieldId = obkv2::KvWriter; + pub type KvWriterFieldId = obkv2::KvWriter; pub struct DocumentOperationIndexer { operations: Vec, @@ -293,9 +295,13 @@ mod indexer { ) .into()), }?; - Ok(DocumentChange::Deletion(document_change::Deletion::new( + + /// TODO create a function for this + let document = document.as_bytes().to_vec().into_boxed_slice().into(); + Ok(DocumentChange::Deletion(document_change::Deletion::create( docid, external_docid, + document, ))) }) })) @@ -358,14 +364,13 @@ mod indexer { 
external_docid: String, operations: &[DocumentOperation], ) -> Result> { - let mut document = BTreeMap::new(); - let original_obkv = - index.documents.remap_data_type::().get(rtxn, &docid)?; - let original_obkv: Option<&KvReaderFieldId> = original_obkv.map(Into::into); + let mut document = BTreeMap::<_, Cow<_>>::new(); + let original = index.documents.remap_data_type::().get(rtxn, &docid)?; + let original: Option<&KvReaderFieldId> = original.map(Into::into); - if let Some(original_obkv) = original_obkv { - original_obkv.into_iter().for_each(|(k, v)| { - document.insert(k, v.to_vec()); + if let Some(original) = original { + original.into_iter().for_each(|(k, v)| { + document.insert(k, v.into()); }); } @@ -376,10 +381,10 @@ mod indexer { let operations = &operations[last_deletion.map_or(0, |i| i + 1)..]; if operations.is_empty() { - match original_obkv { + match original { Some(original_obkv) => { let current = original_obkv.as_bytes().to_vec().into_boxed_slice().into(); - return Ok(Some(DocumentChange::Deletion(document_change::Deletion::new( + return Ok(Some(DocumentChange::Deletion(document_change::Deletion::create( docid, external_docid, current, @@ -390,25 +395,38 @@ mod indexer { } for operation in operations { - let DocumentOffset { content, offset } = ; + let DocumentOffset { content, offset } = match operation { + DocumentOperation::Addition(offset) => offset, + DocumentOperation::Deletion => unreachable!("Deletion in document operations"), + }; let reader = DocumentsBatchReader::from_reader(Cursor::new(content.as_ref()))?; let (mut cursor, batch_index) = reader.into_cursor_and_fields_index(); - let obkv = cursor.get(*offset)?.expect("must exists"); + let update = cursor.get(*offset)?.expect("must exists"); - obkv.into_iter().for_each(|(k, v)| { + update.into_iter().for_each(|(k, v)| { let field_name = batch_index.name(k).unwrap(); let id = fields_ids_map.id(field_name).unwrap(); - document.insert(id, v.to_vec()); + document.insert(id, v.to_vec().into()); }); } let mut writer = KvWriterFieldId::memory(); document.into_iter().for_each(|(id, value)| writer.insert(id, value).unwrap()); - let boxed = writer.into_inner().unwrap().into_boxed_slice(); + /// TODO create a function for this conversion + let new = writer.into_inner().unwrap().into_boxed_slice().into(); - // Box - - Ok(boxed.into()) + match original { + Some(original) => { + /// TODO create a function for this conversion + let current = original.as_bytes().to_vec().into_boxed_slice().into(); + let update = Update::create(docid, external_docid, current, new); + Ok(Some(DocumentChange::Update(update))) + } + None => { + let insertion = Insertion::create(docid, external_docid, new); + Ok(Some(DocumentChange::Insertion(insertion))) + } + } } } From e6ffa4d45447145dac78b560ca7fbef1562c951a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 29 Aug 2024 14:08:31 +0200 Subject: [PATCH 003/247] Implement the document merge function for the replace method --- milli/src/update/new/mod.rs | 128 ++++++++++++++++++++++++++---------- 1 file changed, 95 insertions(+), 33 deletions(-) diff --git a/milli/src/update/new/mod.rs b/milli/src/update/new/mod.rs index e5d376534..20266267d 100644 --- a/milli/src/update/new/mod.rs +++ b/milli/src/update/new/mod.rs @@ -19,7 +19,7 @@ mod indexer { use roaring::RoaringBitmap; use serde_json::Value; - use super::document_change::{self, DocumentChange, Insertion, Update}; + use super::document_change::{Deletion, DocumentChange, Insertion, Update}; use super::items_pool::ItemsPool; 
use crate::documents::{ obkv_to_object, DocumentIdExtractionError, DocumentsBatchReader, PrimaryKey, @@ -88,7 +88,7 @@ mod indexer { rtxn: &'a RoTxn, mut fields_ids_map: FieldsIdsMap, primary_key: &'a PrimaryKey<'a>, - ) -> Result + 'a> { + ) -> Result + 'a> { let documents_ids = index.documents_ids(rtxn)?; let mut available_docids = AvailableDocumentsIds::from_documents_ids(&documents_ids); let mut docids_version_offsets = HashMap::::new(); @@ -185,9 +185,16 @@ mod indexer { items, |context_pool, (external_docid, (internal_docid, operations))| { context_pool.with(|rtxn| match self.method { - IndexDocumentsMethod::ReplaceDocuments => todo!(), + IndexDocumentsMethod::ReplaceDocuments => merge_document_for_replacements( + rtxn, + index, + &fields_ids_map, + internal_docid, + external_docid, + &operations, + ), // TODO Remap the documents to match the db fields_ids_map - IndexDocumentsMethod::UpdateDocuments => merge_document_obkv_for_updates( + IndexDocumentsMethod::UpdateDocuments => merge_document_for_updates( rtxn, index, &fields_ids_map, @@ -282,13 +289,12 @@ mod indexer { index: &'a Index, fields: &'a FieldsIdsMap, primary_key: &'a PrimaryKey<'a>, - ) -> Result> + 'a> - { + ) -> Result> + 'a> { let items = Arc::new(ItemsPool::new(|| index.read_txn().map_err(crate::Error::from))); Ok(self.to_delete.into_iter().par_bridge().map_with(items, |items, docid| { items.with(|rtxn| { - let document = index.document(rtxn, docid)?; - let external_docid = match primary_key.document_id(&document, fields)? { + let current = index.document(rtxn, docid)?; + let external_docid = match primary_key.document_id(¤t, fields)? { Ok(document_id) => Ok(document_id) as Result<_>, Err(_) => Err(InternalError::DocumentsError( crate::documents::Error::InvalidDocumentFormat, @@ -297,12 +303,8 @@ mod indexer { }?; /// TODO create a function for this - let document = document.as_bytes().to_vec().into_boxed_slice().into(); - Ok(DocumentChange::Deletion(document_change::Deletion::create( - docid, - external_docid, - document, - ))) + let current = current.as_bytes().to_vec().into_boxed_slice().into(); + Ok(DocumentChange::Deletion(Deletion::create(docid, external_docid, current))) }) })) } @@ -319,7 +321,7 @@ mod indexer { self, iter: I, index: &Index, - ) -> impl ParallelIterator + ) -> impl ParallelIterator where I: IntoIterator, { @@ -349,15 +351,13 @@ mod indexer { } pub struct UpdateByFunctionIndexer; - // DocumentsBatchReader::from_reader(Cursor::new(content.as_ref()))? /// Reads the previous version of a document from the database, the new versions /// in the grenad update files and merges them to generate a new boxed obkv. /// /// This function is only meant to be used when doing an update and not a replacement. 
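// A toy model of the update strategy implemented below, using the module's
// BTreeMap/Value imports instead of the real obkv documents: everything before
// the last deletion is ignored and the remaining additions are applied field
// by field on top of the stored version. `None` stands for a Deletion and
// `Some(map)` for an Addition; the function name is illustrative only.
fn merge_for_updates_sketch(
    current: BTreeMap<String, Value>,
    operations: &[Option<BTreeMap<String, Value>>],
) -> Option<BTreeMap<String, Value>> {
    let last_deletion = operations.iter().rposition(|operation| operation.is_none());
    let operations = &operations[last_deletion.map_or(0, |i| i + 1)..];
    if operations.is_empty() {
        // only deletions remain: the document must be removed
        return None;
    }
    let mut document = current;
    for addition in operations.iter().flatten() {
        // an update only overwrites the fields present in the payload
        document.extend(addition.clone());
    }
    Some(document)
}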
- pub fn merge_document_obkv_for_updates( + pub fn merge_document_for_updates( rtxn: &RoTxn, - // Let's construct the new obkv in memory index: &Index, fields_ids_map: &FieldsIdsMap, docid: DocumentId, @@ -365,11 +365,11 @@ mod indexer { operations: &[DocumentOperation], ) -> Result> { let mut document = BTreeMap::<_, Cow<_>>::new(); - let original = index.documents.remap_data_type::().get(rtxn, &docid)?; - let original: Option<&KvReaderFieldId> = original.map(Into::into); + let current = index.documents.remap_data_type::().get(rtxn, &docid)?; + let current: Option<&KvReaderFieldId> = current.map(Into::into); - if let Some(original) = original { - original.into_iter().for_each(|(k, v)| { + if let Some(current) = current { + current.into_iter().for_each(|(k, v)| { document.insert(k, v.into()); }); } @@ -381,14 +381,12 @@ mod indexer { let operations = &operations[last_deletion.map_or(0, |i| i + 1)..]; if operations.is_empty() { - match original { - Some(original_obkv) => { - let current = original_obkv.as_bytes().to_vec().into_boxed_slice().into(); - return Ok(Some(DocumentChange::Deletion(document_change::Deletion::create( - docid, - external_docid, - current, - )))); + match current { + Some(current) => { + /// TODO create a function for this + let current = current.as_bytes().to_vec().into_boxed_slice().into(); + let deletion = Deletion::create(docid, external_docid, current); + return Ok(Some(DocumentChange::Deletion(deletion))); } None => return Ok(None), } @@ -416,10 +414,10 @@ mod indexer { /// TODO create a function for this conversion let new = writer.into_inner().unwrap().into_boxed_slice().into(); - match original { - Some(original) => { + match current { + Some(current) => { /// TODO create a function for this conversion - let current = original.as_bytes().to_vec().into_boxed_slice().into(); + let current = current.as_bytes().to_vec().into_boxed_slice().into(); let update = Update::create(docid, external_docid, current, new); Ok(Some(DocumentChange::Update(update))) } @@ -429,4 +427,68 @@ mod indexer { } } } + + /// Returns only the most recent version of a document based on the updates from the payloads. + /// + /// This function is only meant to be used when doing a replacement and not an update. 
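// The same kind of toy model for the replacement strategy added below: only
// the most recent operation matters, an addition replaces the whole document
// and a deletion removes it.
fn merge_for_replacements_sketch(
    current: Option<BTreeMap<String, Value>>,
    operations: &[Option<BTreeMap<String, Value>>],
) -> Option<BTreeMap<String, Value>> {
    match operations.last() {
        // the last addition wins and replaces every field of the document
        Some(Some(addition)) => Some(addition.clone()),
        // the last operation was a deletion
        Some(None) => None,
        // no operation at all: keep the stored version
        None => current,
    }
}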
+ pub fn merge_document_for_replacements( + rtxn: &RoTxn, + index: &Index, + fields_ids_map: &FieldsIdsMap, + docid: DocumentId, + external_docid: String, + operations: &[DocumentOperation], + ) -> Result> { + let current = index.documents.remap_data_type::().get(rtxn, &docid)?; + let current: Option<&KvReaderFieldId> = current.map(Into::into); + + match operations.last() { + Some(DocumentOperation::Addition(DocumentOffset { content, offset })) => { + let reader = DocumentsBatchReader::from_reader(Cursor::new(content.as_ref()))?; + let (mut cursor, batch_index) = reader.into_cursor_and_fields_index(); + let update = cursor.get(*offset)?.expect("must exists"); + + let mut document_entries = Vec::new(); + update.into_iter().for_each(|(k, v)| { + let field_name = batch_index.name(k).unwrap(); + let id = fields_ids_map.id(field_name).unwrap(); + document_entries.push((id, v)); + }); + + document_entries.sort_unstable_by_key(|(id, _)| *id); + + let mut writer = KvWriterFieldId::memory(); + document_entries + .into_iter() + .for_each(|(id, value)| writer.insert(id, value).unwrap()); + /// TODO create a function for this conversion + let new = writer.into_inner().unwrap().into_boxed_slice().into(); + + match current { + Some(current) => { + /// TODO create a function for this conversion + let current = current.as_bytes().to_vec().into_boxed_slice().into(); + let update = Update::create(docid, external_docid, current, new); + Ok(Some(DocumentChange::Update(update))) + } + None => { + let insertion = Insertion::create(docid, external_docid, new); + Ok(Some(DocumentChange::Insertion(insertion))) + } + } + } + Some(DocumentOperation::Deletion) => { + match current { + Some(current) => { + /// TODO create a function for this conversion + let current = current.as_bytes().to_vec().into_boxed_slice().into(); + let deletion = Deletion::create(docid, external_docid, current); + Ok(Some(DocumentChange::Deletion(deletion))) + } + None => Ok(None), + } + } + None => Ok(None), + } + } } From 874c1ac538eea2406db3005eecf4a49249282b03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 29 Aug 2024 15:07:59 +0200 Subject: [PATCH 004/247] First channels types --- Cargo.lock | 2 +- milli/src/update/new/channel.rs | 93 +++++++++++++++++++++++++++++++++ milli/src/update/new/mod.rs | 9 +++- 3 files changed, 101 insertions(+), 3 deletions(-) create mode 100644 milli/src/update/new/channel.rs diff --git a/Cargo.lock b/Cargo.lock index c3e9532e2..a21cbc007 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3858,7 +3858,7 @@ checksum = "a2e27bcfe835a379d32352112f6b8dbae2d99d16a5fff42abe6e5ba5386c1e5a" [[package]] name = "obkv" version = "0.3.0" -source = "git+https://github.com/kerollmops/obkv?branch=unsized-kvreader#d248eb7edd3453ff758afc2883f6ae25684eb69e" +source = "git+https://github.com/kerollmops/obkv?branch=unsized-kvreader#5289a6658cd471f4212c1edc1a40b2a3c3d11fe0" [[package]] name = "once_cell" diff --git a/milli/src/update/new/channel.rs b/milli/src/update/new/channel.rs new file mode 100644 index 000000000..0dd2d9935 --- /dev/null +++ b/milli/src/update/new/channel.rs @@ -0,0 +1,93 @@ +use crossbeam_channel::{Receiver, RecvError, SendError, Sender}; +use heed::types::Bytes; + +use super::indexer::KvReaderFieldId; +use super::StdResult; +use crate::{DocumentId, Index}; + +/// The capacity of the channel is currently in number of messages. 
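// A hedged sketch of the kind of single LMDB writer loop the types below are
// designed to feed: one thread owns the write transaction and drains the
// WriterReceiver while the merger and the extractors only push operations.
// The function name is illustrative and error handling is kept minimal; the
// loop naturally stops once every sender has been dropped.
fn writer_loop_sketch(
    index: &Index,
    wtxn: &mut heed::RwTxn,
    receiver: WriterReceiver,
) -> heed::Result<()> {
    while let Ok(operation) = receiver.recv() {
        // every operation knows which database it targets
        let database = operation.database(index);
        match operation {
            WriterOperation::WordDocIds(entry) => {
                let (key, value) = entry.entry();
                database.put(wtxn, key, value)?;
            }
            WriterOperation::Document(entry) => {
                let (key, value) = entry.entry();
                database.put(wtxn, &key[..], value)?;
            }
        }
    }
    Ok(())
}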
+pub fn merge_writer_channel(cap: usize) -> WriterChannels { + let (sender, receiver) = crossbeam_channel::bounded(cap); + + WriterChannels { + writer_receiver: WriterReceiver(receiver), + merger_sender: MergerSender(sender.clone()), + document_sender: DocumentSender(sender), + } +} + +pub struct WriterChannels { + pub writer_receiver: WriterReceiver, + pub merger_sender: MergerSender, + pub document_sender: DocumentSender, +} + +pub struct KeyValueEntry { + pub key_length: u16, + pub data: Box<[u8]>, +} + +impl KeyValueEntry { + pub fn entry(&self) -> (&[u8], &[u8]) { + self.data.split_at(self.key_length as usize) + } +} + +pub struct DocumentEntry { + docid: DocumentId, + content: Box<[u8]>, +} + +impl DocumentEntry { + pub fn new_uncompressed(docid: DocumentId, content: Box) -> Self { + DocumentEntry { docid, content: content.into() } + } + + pub fn new_compressed(docid: DocumentId, content: Box<[u8]>) -> Self { + DocumentEntry { docid, content } + } + + pub fn entry(&self) -> ([u8; 4], &[u8]) { + let docid = self.docid.to_be_bytes(); + (docid, &self.content) + } +} + +pub enum WriterOperation { + WordDocIds(KeyValueEntry), + Document(DocumentEntry), +} + +impl WriterOperation { + pub fn database(&self, index: &Index) -> heed::Database { + match self { + WriterOperation::WordDocIds(_) => index.word_docids.remap_types(), + WriterOperation::Document(_) => index.documents.remap_types(), + } + } +} + +pub struct WriterReceiver(Receiver); + +impl WriterReceiver { + pub fn recv(&self) -> StdResult { + self.0.recv() + } +} + +pub struct MergerSender(Sender); + +#[derive(Clone)] +pub struct DocumentSender(Sender); + +impl DocumentSender { + pub fn send(&self, document: DocumentEntry) -> StdResult<(), SendError> { + match self.0.send(WriterOperation::Document(document)) { + Ok(()) => Ok(()), + Err(SendError(wop)) => match wop { + WriterOperation::Document(entry) => Err(SendError(entry)), + _ => unreachable!(), + }, + } + } +} diff --git a/milli/src/update/new/mod.rs b/milli/src/update/new/mod.rs index 20266267d..726153c53 100644 --- a/milli/src/update/new/mod.rs +++ b/milli/src/update/new/mod.rs @@ -1,9 +1,12 @@ mod document_change; // mod extract; +mod channel; mod items_pool; mod global_fields_ids_map; +pub type StdResult = std::result::Result; + mod indexer { use std::borrow::Cow; use std::collections::{BTreeMap, HashMap}; @@ -352,11 +355,13 @@ mod indexer { pub struct UpdateByFunctionIndexer; + // fn + /// Reads the previous version of a document from the database, the new versions /// in the grenad update files and merges them to generate a new boxed obkv. /// /// This function is only meant to be used when doing an update and not a replacement. - pub fn merge_document_for_updates( + fn merge_document_for_updates( rtxn: &RoTxn, index: &Index, fields_ids_map: &FieldsIdsMap, @@ -431,7 +436,7 @@ mod indexer { /// Returns only the most recent version of a document based on the updates from the payloads. /// /// This function is only meant to be used when doing a replacement and not an update. 
- pub fn merge_document_for_replacements( + fn merge_document_for_replacements( rtxn: &RoTxn, index: &Index, fields_ids_map: &FieldsIdsMap, From 45c060831ed2680fedd08bc50467292d3eb2776d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 29 Aug 2024 17:51:42 +0200 Subject: [PATCH 005/247] Introduce typed channels and the merger loop --- milli/src/update/del_add.rs | 18 +++ milli/src/update/new/channel.rs | 113 +++++++++++++++--- .../merge/del_add_roaring_bitmap_merger.rs | 61 ++++++++++ milli/src/update/new/merge/mod.rs | 3 + milli/src/update/new/mod.rs | 106 +++++++++++++++- 5 files changed, 285 insertions(+), 16 deletions(-) create mode 100644 milli/src/update/new/merge/del_add_roaring_bitmap_merger.rs create mode 100644 milli/src/update/new/merge/mod.rs diff --git a/milli/src/update/del_add.rs b/milli/src/update/del_add.rs index 570d292ef..790cdd028 100644 --- a/milli/src/update/del_add.rs +++ b/milli/src/update/del_add.rs @@ -30,6 +30,24 @@ impl Key for DelAdd { } } +// TODO remove this implementation +impl obkv2::Key for DelAdd { + const BYTES_SIZE: usize = std::mem::size_of::(); + type BYTES = [u8; ::BYTES_SIZE]; + + fn to_be_bytes(&self) -> Self::BYTES { + u8::to_be_bytes(*self as u8) + } + + fn from_be_bytes(array: Self::BYTES) -> Self { + match u8::from_be_bytes(array) { + 0 => Self::Deletion, + 1 => Self::Addition, + otherwise => unreachable!("DelAdd has only 2 variants, unknown variant: {}", otherwise), + } + } +} + /// Creates a Kv> from Kv /// /// Deletion: put all the values under DelAdd::Deletion diff --git a/milli/src/update/new/channel.rs b/milli/src/update/new/channel.rs index 0dd2d9935..4123e568c 100644 --- a/milli/src/update/new/channel.rs +++ b/milli/src/update/new/channel.rs @@ -1,3 +1,5 @@ +use std::fs::File; + use crossbeam_channel::{Receiver, RecvError, SendError, Sender}; use heed::types::Bytes; @@ -6,31 +8,73 @@ use super::StdResult; use crate::{DocumentId, Index}; /// The capacity of the channel is currently in number of messages. -pub fn merge_writer_channel(cap: usize) -> WriterChannels { +pub fn merger_writer_channels(cap: usize) -> MergerWriterChannels { let (sender, receiver) = crossbeam_channel::bounded(cap); - WriterChannels { + MergerWriterChannels { writer_receiver: WriterReceiver(receiver), merger_sender: MergerSender(sender.clone()), document_sender: DocumentSender(sender), } } -pub struct WriterChannels { +pub struct MergerWriterChannels { pub writer_receiver: WriterReceiver, pub merger_sender: MergerSender, pub document_sender: DocumentSender, } +/// The capacity of the channel is currently in number of messages. 
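+/// Creates the extractors-to-merger channel and returns its two halves: the receiver
+/// drained by the merger and the sender the extractors use to pass DelAdd
+/// CboRoaringBitmap data along.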
+pub fn extractors_merger_channels(cap: usize) -> ExtractorsMergerChannels { + let (sender, receiver) = crossbeam_channel::bounded(cap); + + ExtractorsMergerChannels { + merger_receiver: MergerReceiver(receiver), + deladd_cbo_roaring_bitmap_sender: DeladdCboRoaringBitmapSender(sender.clone()), + } +} + +pub struct ExtractorsMergerChannels { + pub merger_receiver: MergerReceiver, + pub deladd_cbo_roaring_bitmap_sender: DeladdCboRoaringBitmapSender, +} + pub struct KeyValueEntry { - pub key_length: u16, - pub data: Box<[u8]>, + key_length: usize, + data: Box<[u8]>, } impl KeyValueEntry { - pub fn entry(&self) -> (&[u8], &[u8]) { - self.data.split_at(self.key_length as usize) + pub fn from_key_value(key: &[u8], value: &[u8]) -> Self { + let mut data = Vec::with_capacity(key.len() + value.len()); + data.extend_from_slice(key); + data.extend_from_slice(value); + + KeyValueEntry { key_length: key.len(), data: data.into_boxed_slice() } } + + pub fn entry(&self) -> (&[u8], &[u8]) { + self.data.split_at(self.key_length) + } +} + +pub struct KeyEntry { + data: Box<[u8]>, +} + +impl KeyEntry { + pub fn from_key(key: &[u8]) -> Self { + KeyEntry { data: key.to_vec().into_boxed_slice() } + } + + pub fn entry(&self) -> &[u8] { + self.data.as_ref() + } +} + +enum EntryOperation { + Delete(KeyEntry), + Write(KeyValueEntry), } pub struct DocumentEntry { @@ -54,14 +98,14 @@ impl DocumentEntry { } pub enum WriterOperation { - WordDocIds(KeyValueEntry), + WordDocids(EntryOperation), Document(DocumentEntry), } impl WriterOperation { pub fn database(&self, index: &Index) -> heed::Database { match self { - WriterOperation::WordDocIds(_) => index.word_docids.remap_types(), + WriterOperation::WordDocids(_) => index.word_docids.remap_types(), WriterOperation::Document(_) => index.documents.remap_types(), } } @@ -77,17 +121,58 @@ impl WriterReceiver { pub struct MergerSender(Sender); +impl MergerSender { + pub fn word_docids(&self) -> WordDocidsSender<'_> { + WordDocidsSender(&self.0) + } +} + +pub struct WordDocidsSender<'a>(&'a Sender); + +impl WordDocidsSender<'_> { + pub fn write(&self, key: &[u8], value: &[u8]) -> StdResult<(), SendError<()>> { + let operation = EntryOperation::Write(KeyValueEntry::from_key_value(key, value)); + match self.0.send(WriterOperation::WordDocids(operation)) { + Ok(()) => Ok(()), + Err(SendError(_)) => Err(SendError(())), + } + } + + pub fn delete(&self, key: &[u8]) -> StdResult<(), SendError<()>> { + let operation = EntryOperation::Delete(KeyEntry::from_key(key)); + match self.0.send(WriterOperation::WordDocids(operation)) { + Ok(()) => Ok(()), + Err(SendError(_)) => Err(SendError(())), + } + } +} + #[derive(Clone)] pub struct DocumentSender(Sender); impl DocumentSender { - pub fn send(&self, document: DocumentEntry) -> StdResult<(), SendError> { + pub fn send(&self, document: DocumentEntry) -> StdResult<(), SendError<()>> { match self.0.send(WriterOperation::Document(document)) { Ok(()) => Ok(()), - Err(SendError(wop)) => match wop { - WriterOperation::Document(entry) => Err(SendError(entry)), - _ => unreachable!(), - }, + Err(SendError(_)) => Err(SendError(())), } } } + +pub enum MergerOperation { + WordDocidsCursors(Vec>), +} + +pub struct MergerReceiver(Receiver); + +impl IntoIterator for MergerReceiver { + type Item = MergerOperation; + type IntoIter = crossbeam_channel::IntoIter; + + fn into_iter(self) -> Self::IntoIter { + self.0.into_iter() + } +} + +#[derive(Clone)] +pub struct DeladdCboRoaringBitmapSender(Sender); diff --git 
a/milli/src/update/new/merge/del_add_roaring_bitmap_merger.rs b/milli/src/update/new/merge/del_add_roaring_bitmap_merger.rs new file mode 100644 index 000000000..5e6310170 --- /dev/null +++ b/milli/src/update/new/merge/del_add_roaring_bitmap_merger.rs @@ -0,0 +1,61 @@ +use std::borrow::Cow; +use std::io; + +use grenad2::MergeFunction; +use roaring::RoaringBitmap; + +use crate::update::del_add::DelAdd; +use crate::update::new::indexer::{KvReaderDelAdd, KvWriterDelAdd}; + +/// Do a union of CboRoaringBitmaps on both sides of a DelAdd obkv +/// separately and outputs a new DelAdd with both unions. +pub struct DelAddRoaringBitmapMerger; + +impl MergeFunction for DelAddRoaringBitmapMerger { + type Error = io::Error; + + fn merge<'a>( + &self, + _key: &[u8], + values: &[Cow<'a, [u8]>], + ) -> std::result::Result, Self::Error> { + if values.len() == 1 { + Ok(values[0].clone()) + } else { + // Retrieve the bitmaps from both sides + let mut del_bitmaps_bytes = Vec::new(); + let mut add_bitmaps_bytes = Vec::new(); + for value in values { + let obkv: &KvReaderDelAdd = value.as_ref().into(); + if let Some(bitmap_bytes) = obkv.get(DelAdd::Deletion) { + del_bitmaps_bytes.push(bitmap_bytes); + } + if let Some(bitmap_bytes) = obkv.get(DelAdd::Addition) { + add_bitmaps_bytes.push(bitmap_bytes); + } + } + + let mut output_deladd_obkv = KvWriterDelAdd::memory(); + + // Deletion + let mut buffer = Vec::new(); + let mut merged = RoaringBitmap::new(); + for bytes in del_bitmaps_bytes { + merged |= RoaringBitmap::deserialize_unchecked_from(bytes)?; + } + merged.serialize_into(&mut buffer)?; + output_deladd_obkv.insert(DelAdd::Deletion, &buffer)?; + + // Addition + buffer.clear(); + merged.clear(); + for bytes in add_bitmaps_bytes { + merged |= RoaringBitmap::deserialize_unchecked_from(bytes)?; + } + merged.serialize_into(&mut buffer)?; + output_deladd_obkv.insert(DelAdd::Addition, &buffer)?; + + output_deladd_obkv.into_inner().map(Cow::from).map_err(Into::into) + } + } +} diff --git a/milli/src/update/new/merge/mod.rs b/milli/src/update/new/merge/mod.rs new file mode 100644 index 000000000..6057b8d89 --- /dev/null +++ b/milli/src/update/new/merge/mod.rs @@ -0,0 +1,3 @@ +mod del_add_roaring_bitmap_merger; + +pub use del_add_roaring_bitmap_merger::DelAddRoaringBitmapMerger; diff --git a/milli/src/update/new/mod.rs b/milli/src/update/new/mod.rs index 726153c53..24e9c95db 100644 --- a/milli/src/update/new/mod.rs +++ b/milli/src/update/new/mod.rs @@ -2,6 +2,7 @@ mod document_change; // mod extract; mod channel; mod items_pool; +mod merge; mod global_fields_ids_map; @@ -22,18 +23,25 @@ mod indexer { use roaring::RoaringBitmap; use serde_json::Value; + use super::channel::{MergerReceiver, MergerSender}; use super::document_change::{Deletion, DocumentChange, Insertion, Update}; use super::items_pool::ItemsPool; + use super::merge; use crate::documents::{ obkv_to_object, DocumentIdExtractionError, DocumentsBatchReader, PrimaryKey, }; + use crate::update::del_add::DelAdd; + use crate::update::new::channel::MergerOperation; use crate::update::{AvailableDocumentsIds, IndexDocumentsMethod}; use crate::{ - DocumentId, Error, FieldId, FieldsIdsMap, Index, InternalError, Result, UserError, + CboRoaringBitmapCodec, DocumentId, Error, FieldId, FieldsIdsMap, Index, InternalError, + Result, UserError, }; pub type KvReaderFieldId = obkv2::KvReader; + pub type KvReaderDelAdd = obkv2::KvReader; pub type KvWriterFieldId = obkv2::KvWriter; + pub type KvWriterDelAdd = obkv2::KvWriter; pub struct DocumentOperationIndexer { operations: 
Vec, @@ -355,7 +363,101 @@ mod indexer { pub struct UpdateByFunctionIndexer; - // fn + enum Operation { + Write(RoaringBitmap), + Delete, + Ignore, + } + + /// A function that merges the DelAdd CboRoaringBitmaps with the current bitmap. + fn merge_cbo_bitmaps( + current: Option<&[u8]>, + del: Option<&[u8]>, + add: Option<&[u8]>, + ) -> Result { + let bitmap = match current { + Some(current_bitmap_bytes) => { + let bitmap_without_del = match del { + Some(del_bytes) => { + let del_bitmap = CboRoaringBitmapCodec::deserialize_from(del_bytes)?; + CboRoaringBitmapCodec::intersection_with_serialized( + current_bitmap_bytes, + &del_bitmap, + )? + } + None => CboRoaringBitmapCodec::deserialize_from(current_bitmap_bytes)?, + }; + + match add { + Some(add_bytes) => { + let add = CboRoaringBitmapCodec::deserialize_from(add_bytes)?; + bitmap_without_del | add + } + None => bitmap_without_del, + } + } + None => match add { + Some(add_bytes) => CboRoaringBitmapCodec::deserialize_from(add_bytes)?, + None => return Ok(Operation::Ignore), + }, + }; + + if bitmap.is_empty() { + Ok(Operation::Delete) + } else { + Ok(Operation::Write(bitmap)) + } + } + + /// Return the slice directly from the serialize_into method + fn cbo_serialize_into_vec<'b>(bitmap: &RoaringBitmap, buffer: &'b mut Vec) -> &'b [u8] { + buffer.clear(); + CboRoaringBitmapCodec::serialize_into(bitmap, buffer); + buffer.as_slice() + } + + /// TODO We must return some infos/stats + fn merge_grenad_entries( + receiver: MergerReceiver, + sender: MergerSender, + rtxn: &RoTxn, + index: &Index, + ) -> Result<()> { + let mut buffer = Vec::new(); + + for merger_operation in receiver { + match merger_operation { + MergerOperation::WordDocidsCursors(cursors) => { + let sender = sender.word_docids(); + let database = index.word_docids.remap_types::(); + + let mut builder = grenad2::MergerBuilder::new(merge::DelAddRoaringBitmapMerger); + builder.extend(cursors); + /// TODO manage the error correctly + let mut merger_iter = builder.build().into_stream_merger_iter().unwrap(); + + // TODO manage the error correctly + while let Some((key, deladd)) = merger_iter.next().unwrap() { + let current = database.get(rtxn, key)?; + let deladd: &KvReaderDelAdd = deladd.into(); + let del = deladd.get(DelAdd::Deletion); + let add = deladd.get(DelAdd::Addition); + + match merge_cbo_bitmaps(current, del, add)? { + Operation::Write(bitmap) => { + let value = cbo_serialize_into_vec(&bitmap, &mut buffer); + sender.write(key, value).unwrap(); + } + Operation::Delete => sender.delete(key).unwrap(), + Operation::Ignore => (), + } + } + } + } + } + + Ok(()) + } /// Reads the previous version of a document from the database, the new versions /// in the grenad update files and merges them to generate a new boxed obkv. 
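
Note: `merge_grenad_entries` folds DelAdd deltas into whatever bitmap is currently
stored for a key and tells the writer whether to write, delete, or skip the entry.
A minimal sketch of that combination, written on already-deserialized `RoaringBitmap`s
rather than the CBO-encoded bytes the real code handles (illustrative only, not part
of the patch):

    use roaring::RoaringBitmap;

    // Illustrative sketch of the DelAdd merge semantics; not part of the patch.
    enum Op {
        Write(RoaringBitmap),
        Delete,
        Ignore,
    }

    // Remove the deletions from the current bitmap, union the additions,
    // then decide what the writer should do with the result.
    fn merge_bitmaps(
        current: Option<RoaringBitmap>,
        del: Option<RoaringBitmap>,
        add: Option<RoaringBitmap>,
    ) -> Op {
        let merged = match current {
            Some(current) => {
                let without_del = match del {
                    Some(del) => current - del,
                    None => current,
                };
                match add {
                    Some(add) => without_del | add,
                    None => without_del,
                }
            }
            // Nothing stored yet: only an addition can produce a value to write.
            None => match add {
                Some(add) => add,
                None => return Op::Ignore,
            },
        };

        if merged.is_empty() {
            Op::Delete
        } else {
            Op::Write(merged)
        }
    }

An empty result maps to a delete so the database never keeps empty bitmaps around.
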
From 27df9e6c731ce00007fe3bed7cc75ffb9ef4ffde Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 29 Aug 2024 18:27:02 +0200 Subject: [PATCH 006/247] Introduce the indexer::index function that runs the indexation --- milli/src/update/new/channel.rs | 48 ++++++++++++++++----------------- milli/src/update/new/mod.rs | 48 +++++++++++++++++++++++++++++++-- 2 files changed, 70 insertions(+), 26 deletions(-) diff --git a/milli/src/update/new/channel.rs b/milli/src/update/new/channel.rs index 4123e568c..6780be72e 100644 --- a/milli/src/update/new/channel.rs +++ b/milli/src/update/new/channel.rs @@ -1,6 +1,7 @@ +use core::slice::SlicePattern; use std::fs::File; -use crossbeam_channel::{Receiver, RecvError, SendError, Sender}; +use crossbeam_channel::{IntoIter, Receiver, SendError, Sender}; use heed::types::Bytes; use super::indexer::KvReaderFieldId; @@ -8,20 +9,9 @@ use super::StdResult; use crate::{DocumentId, Index}; /// The capacity of the channel is currently in number of messages. -pub fn merger_writer_channels(cap: usize) -> MergerWriterChannels { +pub fn merger_writer_channels(cap: usize) -> (MergerSender, WriterReceiver) { let (sender, receiver) = crossbeam_channel::bounded(cap); - - MergerWriterChannels { - writer_receiver: WriterReceiver(receiver), - merger_sender: MergerSender(sender.clone()), - document_sender: DocumentSender(sender), - } -} - -pub struct MergerWriterChannels { - pub writer_receiver: WriterReceiver, - pub merger_sender: MergerSender, - pub document_sender: DocumentSender, + (MergerSender(sender), WriterReceiver(receiver)) } /// The capacity of the channel is currently in number of messages. @@ -53,8 +43,12 @@ impl KeyValueEntry { KeyValueEntry { key_length: key.len(), data: data.into_boxed_slice() } } - pub fn entry(&self) -> (&[u8], &[u8]) { - self.data.split_at(self.key_length) + pub fn key(&self) -> &[u8] { + &self.data.as_slice()[..self.key_length] + } + + pub fn value(&self) -> &[u8] { + &self.data.as_slice()[self.key_length..] 
} } @@ -72,7 +66,7 @@ impl KeyEntry { } } -enum EntryOperation { +pub enum EntryOperation { Delete(KeyEntry), Write(KeyValueEntry), } @@ -91,9 +85,12 @@ impl DocumentEntry { DocumentEntry { docid, content } } - pub fn entry(&self) -> ([u8; 4], &[u8]) { - let docid = self.docid.to_be_bytes(); - (docid, &self.content) + pub fn key(&self) -> [u8; 4] { + self.docid.to_be_bytes() + } + + pub fn content(&self) -> &[u8] { + &self.content } } @@ -113,9 +110,12 @@ impl WriterOperation { pub struct WriterReceiver(Receiver); -impl WriterReceiver { - pub fn recv(&self) -> StdResult { - self.0.recv() +impl IntoIterator for WriterReceiver { + type Item = WriterOperation; + type IntoIter = IntoIter; + + fn into_iter(self) -> Self::IntoIter { + self.0.into_iter() } } @@ -167,7 +167,7 @@ pub struct MergerReceiver(Receiver); impl IntoIterator for MergerReceiver { type Item = MergerOperation; - type IntoIter = crossbeam_channel::IntoIter; + type IntoIter = IntoIter; fn into_iter(self) -> Self::IntoIter { self.0.into_iter() diff --git a/milli/src/update/new/mod.rs b/milli/src/update/new/mod.rs index 24e9c95db..da76bdfee 100644 --- a/milli/src/update/new/mod.rs +++ b/milli/src/update/new/mod.rs @@ -15,15 +15,21 @@ mod indexer { use std::io::Cursor; use std::os::unix::fs::MetadataExt; use std::sync::Arc; + use std::thread; + use big_s::S; use heed::types::Bytes; - use heed::RoTxn; + use heed::{RoTxn, RwTxn}; use memmap2::Mmap; use rayon::iter::{IntoParallelIterator, ParallelBridge, ParallelIterator}; use roaring::RoaringBitmap; use serde_json::Value; - use super::channel::{MergerReceiver, MergerSender}; + use super::channel::{ + extractors_merger_channels, merger_writer_channels, EntryOperation, + ExtractorsMergerChannels, MergerReceiver, MergerSender, MergerWriterChannels, + WriterOperation, + }; use super::document_change::{Deletion, DocumentChange, Insertion, Update}; use super::items_pool::ItemsPool; use super::merge; @@ -363,6 +369,44 @@ mod indexer { pub struct UpdateByFunctionIndexer; + /// TODO return stats + /// TODO take the rayon ThreadPool + pub fn index(wtxn: &mut RwTxn, index: &Index, document_changes: PI) -> Result<()> + where + PI: IntoParallelIterator> + Send, + PI::Iter: Clone, + { + let (merger_sender, writer_receiver) = merger_writer_channels(100); + let ExtractorsMergerChannels { merger_receiver, deladd_cbo_roaring_bitmap_sender } = + extractors_merger_channels(100); + + thread::scope(|s| { + thread::Builder::new().name(S("indexer-extractors")).spawn_scoped(s, || { + document_changes.into_par_iter().for_each(|_dc| ()); + }); + + // TODO manage the errors correctly + thread::Builder::new().name(S("indexer-merger")).spawn_scoped(s, || { + let rtxn = index.read_txn().unwrap(); + merge_grenad_entries(merger_receiver, merger_sender, &rtxn, index).unwrap() + }); + + // TODO Split this code into another function + for operation in writer_receiver { + let database = operation.database(index); + match operation { + WriterOperation::WordDocids(operation) => match operation { + EntryOperation::Delete(e) => database.delete(wtxn, e.entry()).map(drop)?, + EntryOperation::Write(e) => database.put(wtxn, e.key(), e.value())?, + }, + WriterOperation::Document(e) => database.put(wtxn, &e.key(), e.content())?, + } + } + + Ok(()) + }) + } + enum Operation { Write(RoaringBitmap), Delete, From 0c57cf7565c836e559cce63f03b77450207eb26f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 29 Aug 2024 19:20:10 +0200 Subject: [PATCH 007/247] Replace obkv with the temporary new version of it --- 
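Switching to the `unsized-kvreader` obkv branch mostly means readers are now borrowed,
dynamically sized values: call sites build them with `from_slice` (or `.into()` on a
byte slice) and pass `&KvReader<K>` around instead of constructing an owned reader with
`new`. A rough sketch of the before/after, assuming the `memory()`/`insert`/`into_inner`
writer API used elsewhere in this series (function name is illustrative):

    use obkv::{KvReaderU16, KvWriterU16};

    // Illustrative sketch, not part of the patch.
    fn roundtrip() -> std::io::Result<()> {
        // Write a tiny obkv buffer (keys must be inserted in increasing order).
        let mut writer = KvWriterU16::memory();
        writer.insert(0u16, b"hello")?;
        writer.insert(1u16, b"world")?;
        let bytes = writer.into_inner()?;

        // Before: let reader = KvReaderU16::new(&bytes);
        // After: from_slice returns a borrowed, unsized &KvReaderU16.
        let reader: &KvReaderU16 = KvReaderU16::from_slice(&bytes);
        assert_eq!(reader.get(0), Some(&b"hello"[..]));
        for (field_id, value) in reader.iter() {
            let _ = (field_id, value);
        }
        Ok(())
    }

The same change is why `Index::document` and friends now return `&obkv::KvReaderU16`
tied to the transaction lifetime instead of a by-value reader.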
Cargo.lock | 13 +--- meilisearch/Cargo.toml | 2 +- meilisearch/src/search/mod.rs | 2 +- milli/Cargo.toml | 5 +- milli/src/documents/enriched.rs | 2 +- milli/src/documents/mod.rs | 4 +- milli/src/documents/primary_key.rs | 2 +- milli/src/documents/reader.rs | 12 ++-- milli/src/heed_codec/obkv_codec.rs | 4 +- .../cbo_roaring_bitmap_codec.rs | 2 +- milli/src/index.rs | 8 +-- milli/src/lib.rs | 7 +- milli/src/prompt/document.rs | 4 +- milli/src/prompt/mod.rs | 2 +- milli/src/update/del_add.rs | 32 ++------- milli/src/update/facet/bulk.rs | 4 +- milli/src/update/facet/incremental.rs | 2 +- milli/src/update/facet/mod.rs | 2 +- milli/src/update/index_documents/enrich.rs | 2 +- .../extract/extract_docid_word_positions.rs | 16 ++--- .../extract/extract_facet_number_docids.rs | 2 +- .../extract/extract_facet_string_docids.rs | 4 +- .../extract/extract_fid_docid_facet_values.rs | 4 +- .../extract/extract_fid_word_count_docids.rs | 10 ++- .../extract/extract_geo_points.rs | 12 ++-- .../extract/extract_vector_points.rs | 14 ++-- .../extract/extract_word_docids.rs | 10 +-- .../extract_word_pair_proximity_docids.rs | 8 +-- .../extract/extract_word_position_docids.rs | 6 +- .../helpers/merge_functions.rs | 20 +++--- milli/src/update/index_documents/parallel.rs | 4 +- milli/src/update/index_documents/transform.rs | 42 +++++------ .../src/update/index_documents/typed_chunk.rs | 14 ++-- milli/src/update/new/channel.rs | 5 +- milli/src/update/new/document_change.rs | 2 +- milli/src/update/new/mod.rs | 71 +++---------------- milli/src/vector/parsed_vectors.rs | 10 ++- 37 files changed, 142 insertions(+), 223 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a21cbc007..e0effa54d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3434,7 +3434,7 @@ dependencies = [ "mimalloc", "mime", "num_cpus", - "obkv 0.2.2", + "obkv", "once_cell", "ordered-float", "parking_lot", @@ -3601,8 +3601,7 @@ dependencies = [ "memchr", "memmap2", "mimalloc", - "obkv 0.2.2", - "obkv 0.3.0", + "obkv", "once_cell", "ordered-float", "rand", @@ -3849,16 +3848,10 @@ dependencies = [ "memchr", ] -[[package]] -name = "obkv" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2e27bcfe835a379d32352112f6b8dbae2d99d16a5fff42abe6e5ba5386c1e5a" - [[package]] name = "obkv" version = "0.3.0" -source = "git+https://github.com/kerollmops/obkv?branch=unsized-kvreader#5289a6658cd471f4212c1edc1a40b2a3c3d11fe0" +source = "git+https://github.com/kerollmops/obkv?branch=unsized-kvreader#9c2900d106fa84e7079b288e7f7c366ec7cae948" [[package]] name = "once_cell" diff --git a/meilisearch/Cargo.toml b/meilisearch/Cargo.toml index e614ecc6a..041d5d871 100644 --- a/meilisearch/Cargo.toml +++ b/meilisearch/Cargo.toml @@ -57,7 +57,7 @@ meilisearch-types = { path = "../meilisearch-types" } mimalloc = { version = "0.1.43", default-features = false } mime = "0.3.17" num_cpus = "1.16.0" -obkv = "0.2.2" +obkv = { git = "https://github.com/kerollmops/obkv", branch = "unsized-kvreader" } once_cell = "1.19.0" ordered-float = "4.2.1" parking_lot = "0.12.3" diff --git a/meilisearch/src/search/mod.rs b/meilisearch/src/search/mod.rs index 915505be0..4ada47ff1 100644 --- a/meilisearch/src/search/mod.rs +++ b/meilisearch/src/search/mod.rs @@ -1247,7 +1247,7 @@ impl<'a> HitMaker<'a> { self.index.iter_documents(self.rtxn, std::iter::once(id))?.next().unwrap()?; // First generate a document with all the displayed fields - let displayed_document = make_document(&self.displayed_ids, &self.fields_ids_map, obkv)?; + let displayed_document = 
make_document(&self.displayed_ids, &self.fields_ids_map, &obkv)?; let add_vectors_fid = self.vectors_fid.filter(|_fid| self.retrieve_vectors == RetrieveVectors::Retrieve); diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 9fa270d46..b15f72f15 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -12,6 +12,7 @@ readme.workspace = true license.workspace = true [dependencies] +big_s = "1.0.2" bimap = { version = "0.6.3", features = ["serde"] } bincode = "1.3.3" bstr = "1.9.1" @@ -44,8 +45,7 @@ levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] } lru = "0.12.3" memchr = "2.5.0" memmap2 = "0.9.4" -obkv = "0.2.2" -obkv2 = { package = "obkv", git = "https://github.com/kerollmops/obkv", branch = "unsized-kvreader" } +obkv = { git = "https://github.com/kerollmops/obkv", branch = "unsized-kvreader" } once_cell = "1.19.0" ordered-float = "4.2.1" rayon = "1.10.0" @@ -94,7 +94,6 @@ rayon-par-bridge = "0.1.0" [dev-dependencies] mimalloc = { version = "0.1.43", default-features = false } -big_s = "1.0.2" insta = "1.39.0" maplit = "1.0.2" md5 = "0.7.0" diff --git a/milli/src/documents/enriched.rs b/milli/src/documents/enriched.rs index 609765068..cede4d2f0 100644 --- a/milli/src/documents/enriched.rs +++ b/milli/src/documents/enriched.rs @@ -69,7 +69,7 @@ impl EnrichedDocumentsBatchReader { #[derive(Debug, Clone)] pub struct EnrichedDocument<'a> { - pub document: KvReader<'a, FieldId>, + pub document: &'a KvReader, pub document_id: DocumentId, } diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs index f4509256d..036981b65 100644 --- a/milli/src/documents/mod.rs +++ b/milli/src/documents/mod.rs @@ -27,7 +27,7 @@ use crate::{FieldId, Object, Result}; const DOCUMENTS_BATCH_INDEX_KEY: [u8; 8] = u64::MAX.to_be_bytes(); /// Helper function to convert an obkv reader into a JSON object. -pub fn obkv_to_object(obkv: &KvReader<'_, FieldId>, index: &DocumentsBatchIndex) -> Result { +pub fn obkv_to_object(obkv: &KvReader, index: &DocumentsBatchIndex) -> Result { obkv.iter() .map(|(field_id, value)| { let field_name = index @@ -76,7 +76,7 @@ impl DocumentsBatchIndex { self.0.get_by_right(name).cloned() } - pub fn recreate_json(&self, document: &obkv::KvReaderU16<'_>) -> Result { + pub fn recreate_json(&self, document: &obkv::KvReaderU16) -> Result { let mut map = Object::new(); for (k, v) in document.iter() { diff --git a/milli/src/documents/primary_key.rs b/milli/src/documents/primary_key.rs index 64131af40..22918f8fc 100644 --- a/milli/src/documents/primary_key.rs +++ b/milli/src/documents/primary_key.rs @@ -52,7 +52,7 @@ impl<'a> PrimaryKey<'a> { pub fn document_id( &self, - document: &obkv::KvReader<'_, FieldId>, + document: &obkv::KvReader, fields: &impl FieldIdMapper, ) -> Result> { match self { diff --git a/milli/src/documents/reader.rs b/milli/src/documents/reader.rs index ebdc514fd..20e932805 100644 --- a/milli/src/documents/reader.rs +++ b/milli/src/documents/reader.rs @@ -76,11 +76,9 @@ impl DocumentsBatchCursor { pub fn get( &mut self, offset: u32, - ) -> Result>, DocumentsBatchCursorError> { + ) -> Result>, DocumentsBatchCursorError> { match self.cursor.move_on_key_equal_to(offset.to_be_bytes())? { - Some((key, value)) if key != DOCUMENTS_BATCH_INDEX_KEY => { - Ok(Some(KvReader::new(value))) - } + Some((key, value)) if key != DOCUMENTS_BATCH_INDEX_KEY => Ok(Some(value.into())), _otherwise => Ok(None), } } @@ -89,11 +87,9 @@ impl DocumentsBatchCursor { /// `next_document` advance the document reader until all the documents have been read. 
pub fn next_document( &mut self, - ) -> Result>, DocumentsBatchCursorError> { + ) -> Result>, DocumentsBatchCursorError> { match self.cursor.move_on_next()? { - Some((key, value)) if key != DOCUMENTS_BATCH_INDEX_KEY => { - Ok(Some(KvReader::new(value))) - } + Some((key, value)) if key != DOCUMENTS_BATCH_INDEX_KEY => Ok(Some(value.into())), _otherwise => Ok(None), } } diff --git a/milli/src/heed_codec/obkv_codec.rs b/milli/src/heed_codec/obkv_codec.rs index 390a57af3..447323571 100644 --- a/milli/src/heed_codec/obkv_codec.rs +++ b/milli/src/heed_codec/obkv_codec.rs @@ -6,10 +6,10 @@ use obkv::{KvReaderU16, KvWriterU16}; pub struct ObkvCodec; impl<'a> heed::BytesDecode<'a> for ObkvCodec { - type DItem = KvReaderU16<'a>; + type DItem = &'a KvReaderU16; fn bytes_decode(bytes: &'a [u8]) -> Result { - Ok(KvReaderU16::new(bytes)) + Ok(KvReaderU16::from_slice(bytes)) } } diff --git a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs index fa65d5217..257d5bd0a 100644 --- a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs +++ b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs @@ -122,7 +122,7 @@ impl CboRoaringBitmapCodec { /// Merges a DelAdd delta into a CboRoaringBitmap. pub fn merge_deladd_into<'a>( - deladd: KvReaderDelAdd<'_>, + deladd: &KvReaderDelAdd, previous: &[u8], buffer: &'a mut Vec, ) -> io::Result> { diff --git a/milli/src/index.rs b/milli/src/index.rs index 5d651e144..9c582b97a 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1252,7 +1252,7 @@ impl Index { /* documents */ /// Returns a document by using the document id. - pub fn document<'t>(&self, rtxn: &'t RoTxn, id: DocumentId) -> Result> { + pub fn document<'t>(&self, rtxn: &'t RoTxn, id: DocumentId) -> Result<&'t obkv::KvReaderU16> { self.documents .get(rtxn, &id)? .ok_or(UserError::UnknownInternalDocumentId { document_id: id }) @@ -1264,7 +1264,7 @@ impl Index { &'a self, rtxn: &'t RoTxn<'t>, ids: impl IntoIterator + 'a, - ) -> Result)>> + 'a> { + ) -> Result> + 'a> { Ok(ids.into_iter().map(move |id| { let kv = self .documents @@ -1279,7 +1279,7 @@ impl Index { &self, rtxn: &'t RoTxn<'t>, ids: impl IntoIterator, - ) -> Result)>> { + ) -> Result> { self.iter_documents(rtxn, ids)?.collect() } @@ -1287,7 +1287,7 @@ impl Index { pub fn all_documents<'a, 't: 'a>( &'a self, rtxn: &'t RoTxn<'t>, - ) -> Result)>> + 'a> { + ) -> Result> + 'a> { self.iter_documents(rtxn, self.documents_ids(rtxn)?) } diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 8008b7bd1..bb8325791 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -214,7 +214,7 @@ pub fn bucketed_position(relative: u16) -> u16 { pub fn obkv_to_json( displayed_fields: &[FieldId], fields_ids_map: &FieldsIdsMap, - obkv: obkv::KvReaderU16<'_>, + obkv: &obkv::KvReaderU16, ) -> Result { displayed_fields .iter() @@ -232,10 +232,7 @@ pub fn obkv_to_json( } /// Transform every field of a raw obkv store into a JSON Object. 
-pub fn all_obkv_to_json( - obkv: obkv::KvReaderU16<'_>, - fields_ids_map: &FieldsIdsMap, -) -> Result { +pub fn all_obkv_to_json(obkv: &obkv::KvReaderU16, fields_ids_map: &FieldsIdsMap) -> Result { let all_keys = obkv.iter().map(|(k, _v)| k).collect::>(); obkv_to_json(all_keys.as_slice(), fields_ids_map, obkv) } diff --git a/milli/src/prompt/document.rs b/milli/src/prompt/document.rs index b5d43b5be..a809f58ce 100644 --- a/milli/src/prompt/document.rs +++ b/milli/src/prompt/document.rs @@ -30,13 +30,13 @@ impl ParsedValue { impl<'a> Document<'a> { pub fn new( - data: obkv::KvReaderU16<'a>, + data: &'a obkv::KvReaderU16, side: DelAdd, inverted_field_map: &'a FieldsIdsMap, ) -> Self { let mut out_data = BTreeMap::new(); for (fid, raw) in data { - let obkv = KvReaderDelAdd::new(raw); + let obkv = KvReaderDelAdd::from_slice(raw); let Some(raw) = obkv.get(side) else { continue; }; diff --git a/milli/src/prompt/mod.rs b/milli/src/prompt/mod.rs index 97ccbfb61..79e4eabbb 100644 --- a/milli/src/prompt/mod.rs +++ b/milli/src/prompt/mod.rs @@ -91,7 +91,7 @@ impl Prompt { pub fn render( &self, - document: obkv::KvReaderU16<'_>, + document: &obkv::KvReaderU16, side: DelAdd, field_id_map: &FieldsIdsMap, ) -> Result { diff --git a/milli/src/update/del_add.rs b/milli/src/update/del_add.rs index 790cdd028..97ff86f2a 100644 --- a/milli/src/update/del_add.rs +++ b/milli/src/update/del_add.rs @@ -1,7 +1,7 @@ use obkv::Key; pub type KvWriterDelAdd = obkv::KvWriter; -pub type KvReaderDelAdd<'a> = obkv::KvReader<'a, DelAdd>; +pub type KvReaderDelAdd = obkv::KvReader; /// DelAdd defines the new value to add in the database and old value to delete from the database. /// @@ -30,31 +30,13 @@ impl Key for DelAdd { } } -// TODO remove this implementation -impl obkv2::Key for DelAdd { - const BYTES_SIZE: usize = std::mem::size_of::(); - type BYTES = [u8; ::BYTES_SIZE]; - - fn to_be_bytes(&self) -> Self::BYTES { - u8::to_be_bytes(*self as u8) - } - - fn from_be_bytes(array: Self::BYTES) -> Self { - match u8::from_be_bytes(array) { - 0 => Self::Deletion, - 1 => Self::Addition, - otherwise => unreachable!("DelAdd has only 2 variants, unknown variant: {}", otherwise), - } - } -} - /// Creates a Kv> from Kv /// /// Deletion: put all the values under DelAdd::Deletion /// Addition: put all the values under DelAdd::Addition, /// DeletionAndAddition: put all the values under DelAdd::Deletion and DelAdd::Addition, pub fn into_del_add_obkv( - reader: obkv::KvReader<'_, K>, + reader: &obkv::KvReader, operation: DelAddOperation, buffer: &mut Vec, ) -> Result<(), std::io::Error> { @@ -64,7 +46,7 @@ pub fn into_del_add_obkv( /// Akin to the [into_del_add_obkv] function but lets you /// conditionally define the `DelAdd` variant based on the obkv key. 
pub fn into_del_add_obkv_conditional_operation( - reader: obkv::KvReader<'_, K>, + reader: &obkv::KvReader, buffer: &mut Vec, operation: F, ) -> std::io::Result<()> @@ -104,8 +86,8 @@ pub enum DelAddOperation { /// putting each deletion obkv's keys under an DelAdd::Deletion /// and putting each addition obkv's keys under an DelAdd::Addition pub fn del_add_from_two_obkvs( - deletion: &obkv::KvReader<'_, K>, - addition: &obkv::KvReader<'_, K>, + deletion: &obkv::KvReader, + addition: &obkv::KvReader, buffer: &mut Vec, ) -> Result<(), std::io::Error> { use itertools::merge_join_by; @@ -139,7 +121,7 @@ pub fn del_add_from_two_obkvs( writer.finish() } -pub fn is_noop_del_add_obkv(del_add: KvReaderDelAdd<'_>) -> bool { +pub fn is_noop_del_add_obkv(del_add: &KvReaderDelAdd) -> bool { del_add.get(DelAdd::Deletion) == del_add.get(DelAdd::Addition) } @@ -154,5 +136,5 @@ pub fn deladd_serialize_add_side<'a>( obkv: &'a [u8], _buffer: &mut Vec, ) -> crate::Result<&'a [u8]> { - Ok(KvReaderDelAdd::new(obkv).get(DelAdd::Addition).unwrap_or_default()) + Ok(KvReaderDelAdd::from_slice(obkv).get(DelAdd::Addition).unwrap_or_default()) } diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs index a63d59693..27de6e777 100644 --- a/milli/src/update/facet/bulk.rs +++ b/milli/src/update/facet/bulk.rs @@ -135,7 +135,7 @@ impl FacetsUpdateBulkInner { if !valid_lmdb_key(key) { continue; } - let value = KvReaderDelAdd::new(value); + let value = KvReaderDelAdd::from_slice(value); // DB is empty, it is safe to ignore Del operations let Some(value) = value.get(DelAdd::Addition) else { @@ -161,7 +161,7 @@ impl FacetsUpdateBulkInner { continue; } - let value = KvReaderDelAdd::new(value); + let value = KvReaderDelAdd::from_slice(value); // the value is a CboRoaringBitmap, but I still need to prepend the // group size for level 0 (= 1) to it diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index 0f0937855..637f84986 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -109,7 +109,7 @@ impl FacetsUpdateIncremental { } current_field_id = Some(key.field_id); - let value = KvReader::new(value); + let value = KvReader::from_slice(value); let docids_to_delete = value .get(DelAdd::Deletion) .map(CboRoaringBitmapCodec::bytes_decode) diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index ad3ddc38f..bccfdff12 100644 --- a/milli/src/update/facet/mod.rs +++ b/milli/src/update/facet/mod.rs @@ -187,7 +187,7 @@ fn index_facet_search( ) -> Result<()> { let mut iter = normalized_delta_data.into_stream_merger_iter()?; while let Some((key_bytes, delta_bytes)) = iter.next()? 
{ - let deladd_reader = KvReaderDelAdd::new(delta_bytes); + let deladd_reader = KvReaderDelAdd::from_slice(delta_bytes); let database_set = index .facet_id_normalized_string_strings diff --git a/milli/src/update/index_documents/enrich.rs b/milli/src/update/index_documents/enrich.rs index 691b2b9d1..a93d6f9f1 100644 --- a/milli/src/update/index_documents/enrich.rs +++ b/milli/src/update/index_documents/enrich.rs @@ -145,7 +145,7 @@ pub fn enrich_documents_batch( #[tracing::instrument(level = "trace", skip(uuid_buffer, documents_batch_index, document) target = "indexing::documents")] fn fetch_or_generate_document_id( - document: &obkv::KvReader<'_, FieldId>, + document: &obkv::KvReader, documents_batch_index: &DocumentsBatchIndex, primary_key: PrimaryKey<'_>, autogenerate_docids: bool, diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index ba11ceeb3..a939827d5 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -80,7 +80,7 @@ pub fn extract_docid_word_positions( .try_into() .map(u32::from_be_bytes) .map_err(|_| SerializationError::InvalidNumberSerialization)?; - let obkv = KvReader::::new(value); + let obkv = KvReader::::from_slice(value); // if the searchable fields didn't change, skip the searchable indexing for this document. if !force_reindexing && !searchable_fields_changed(&obkv, settings_diff) { @@ -126,13 +126,13 @@ pub fn extract_docid_word_positions( // transforming two KV> into one KV>> value_buffer.clear(); del_add_from_two_obkvs( - &KvReader::::new(del_obkv), - &KvReader::::new(add_obkv), + &KvReader::::from_slice(del_obkv), + &KvReader::::from_slice(add_obkv), &mut value_buffer, )?; // write each KV> into the sorter, field by field. - let obkv = KvReader::::new(&value_buffer); + let obkv = KvReader::::from_slice(&value_buffer); for (field_id, value) in obkv.iter() { key_buffer.truncate(mem::size_of::()); key_buffer.extend_from_slice(&field_id.to_be_bytes()); @@ -146,13 +146,13 @@ pub fn extract_docid_word_positions( /// Check if any searchable fields of a document changed. fn searchable_fields_changed( - obkv: &KvReader<'_, FieldId>, + obkv: &KvReader, settings_diff: &InnerIndexSettingsDiff, ) -> bool { let searchable_fields = &settings_diff.new.searchable_fields_ids; for (field_id, field_bytes) in obkv.iter() { if searchable_fields.contains(&field_id) { - let del_add = KvReaderDelAdd::new(field_bytes); + let del_add = KvReaderDelAdd::from_slice(field_bytes); match (del_add.get(DelAdd::Deletion), del_add.get(DelAdd::Addition)) { // if both fields are None, check the next field. (None, None) => (), @@ -189,7 +189,7 @@ fn tokenizer_builder<'a>( /// Extract words mapped with their positions of a document. fn tokens_from_document<'a>( - obkv: &KvReader<'a, FieldId>, + obkv: &'a KvReader, settings: &InnerIndexSettings, tokenizer: &Tokenizer<'_>, max_positions_per_attributes: u32, @@ -202,7 +202,7 @@ fn tokens_from_document<'a>( // if field is searchable. if settings.searchable_fields_ids.contains(&field_id) { // extract deletion or addition only. - if let Some(field_bytes) = KvReaderDelAdd::new(field_bytes).get(del_add) { + if let Some(field_bytes) = KvReaderDelAdd::from_slice(field_bytes).get(del_add) { // parse json. 
let value = serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?; diff --git a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs index bfd769604..478631dea 100644 --- a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs @@ -45,7 +45,7 @@ pub fn extract_facet_number_docids( buffer.clear(); let mut obkv = KvWriterDelAdd::new(&mut buffer); - for (deladd_key, _) in KvReaderDelAdd::new(deladd_obkv_bytes).iter() { + for (deladd_key, _) in KvReaderDelAdd::from_slice(deladd_obkv_bytes).iter() { obkv.insert(deladd_key, document_id.to_ne_bytes())?; } obkv.finish()?; diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs index 36dd20b15..7565b1ad1 100644 --- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs @@ -75,7 +75,7 @@ fn extract_facet_string_docids_document_update( let mut buffer = Vec::new(); let mut cursor = docid_fid_facet_string.into_cursor()?; while let Some((key, deladd_original_value_bytes)) = cursor.move_on_next()? { - let deladd_reader = KvReaderDelAdd::new(deladd_original_value_bytes); + let deladd_reader = KvReaderDelAdd::from_slice(deladd_original_value_bytes); let is_same_value = deladd_reader.get(DelAdd::Deletion).is_some() && deladd_reader.get(DelAdd::Addition).is_some(); @@ -163,7 +163,7 @@ fn extract_facet_string_docids_settings( let mut buffer = Vec::new(); let mut cursor = docid_fid_facet_string.into_cursor()?; while let Some((key, deladd_original_value_bytes)) = cursor.move_on_next()? { - let deladd_reader = KvReaderDelAdd::new(deladd_original_value_bytes); + let deladd_reader = KvReaderDelAdd::from_slice(deladd_original_value_bytes); let is_same_value = deladd_reader.get(DelAdd::Deletion).is_some() && deladd_reader.get(DelAdd::Addition).is_some(); diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index 93c6ab408..7678e1edf 100644 --- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -83,10 +83,10 @@ pub fn extract_fid_docid_facet_values( if !settings_diff.settings_update_only || old_faceted_fids != new_faceted_fids { let mut cursor = obkv_documents.into_cursor()?; while let Some((docid_bytes, value)) = cursor.move_on_next()? 
{ - let obkv = obkv::KvReader::new(value); + let obkv = obkv::KvReader::from_slice(value); let get_document_json_value = move |field_id, side| { obkv.get(field_id) - .map(KvReaderDelAdd::new) + .map(KvReaderDelAdd::from_slice) .and_then(|kv| kv.get(side)) .map(from_slice) .transpose() diff --git a/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs index f252df1cd..291dcc014 100644 --- a/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs +++ b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs @@ -45,19 +45,23 @@ pub fn extract_fid_word_count_docids( .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; let document_id = u32::from_be_bytes(document_id_bytes); - let del_add_reader = KvReaderDelAdd::new(value); + let del_add_reader = KvReaderDelAdd::from_slice(value); let deletion = del_add_reader // get deleted words .get(DelAdd::Deletion) // count deleted words - .map(|deletion| KvReaderU16::new(deletion).iter().take(MAX_COUNTED_WORDS + 1).count()) + .map(|deletion| { + KvReaderU16::from_slice(deletion).iter().take(MAX_COUNTED_WORDS + 1).count() + }) // keep the count if under or equal to MAX_COUNTED_WORDS .filter(|&word_count| word_count <= MAX_COUNTED_WORDS); let addition = del_add_reader // get added words .get(DelAdd::Addition) // count added words - .map(|addition| KvReaderU16::new(addition).iter().take(MAX_COUNTED_WORDS + 1).count()) + .map(|addition| { + KvReaderU16::from_slice(addition).iter().take(MAX_COUNTED_WORDS + 1).count() + }) // keep the count if under or equal to MAX_COUNTED_WORDS .filter(|&word_count| word_count <= MAX_COUNTED_WORDS); diff --git a/milli/src/update/index_documents/extract/extract_geo_points.rs b/milli/src/update/index_documents/extract/extract_geo_points.rs index ac8b7abee..fcf102eeb 100644 --- a/milli/src/update/index_documents/extract/extract_geo_points.rs +++ b/milli/src/update/index_documents/extract/extract_geo_points.rs @@ -29,11 +29,11 @@ pub fn extract_geo_points( let mut cursor = obkv_documents.into_cursor()?; while let Some((docid_bytes, value)) = cursor.move_on_next()? { - let obkv = obkv::KvReader::new(value); + let obkv = obkv::KvReader::from_slice(value); // since we only need the primary key when we throw an error // we create this getter to lazily get it when needed let document_id = || -> Value { - let reader = KvReaderDelAdd::new(obkv.get(primary_key_id).unwrap()); + let reader = KvReaderDelAdd::from_slice(obkv.get(primary_key_id).unwrap()); let document_id = reader.get(DelAdd::Deletion).or(reader.get(DelAdd::Addition)).unwrap(); serde_json::from_slice(document_id).unwrap() @@ -68,15 +68,17 @@ pub fn extract_geo_points( /// Extract the finite floats lat and lng from two bytes slices. 
fn extract_lat_lng( - document: &obkv::KvReader<'_, FieldId>, + document: &obkv::KvReader, settings: &InnerIndexSettings, deladd: DelAdd, document_id: impl Fn() -> Value, ) -> Result> { match settings.geo_fields_ids { Some((lat_fid, lng_fid)) => { - let lat = document.get(lat_fid).map(KvReaderDelAdd::new).and_then(|r| r.get(deladd)); - let lng = document.get(lng_fid).map(KvReaderDelAdd::new).and_then(|r| r.get(deladd)); + let lat = + document.get(lat_fid).map(KvReaderDelAdd::from_slice).and_then(|r| r.get(deladd)); + let lng = + document.get(lng_fid).map(KvReaderDelAdd::from_slice).and_then(|r| r.get(deladd)); let (lat, lng) = match (lat, lng) { (Some(lat), Some(lng)) => (lat, lng), (Some(_), None) => { diff --git a/milli/src/update/index_documents/extract/extract_vector_points.rs b/milli/src/update/index_documents/extract/extract_vector_points.rs index f66c3fd46..6de555e4a 100644 --- a/milli/src/update/index_documents/extract/extract_vector_points.rs +++ b/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -307,7 +307,7 @@ pub fn extract_vector_points( debug_assert!(from_utf8(external_id_bytes).is_ok()); let docid = DocumentId::from_be_bytes(docid_bytes); - let obkv = obkv::KvReader::new(value); + let obkv = obkv::KvReader::from_slice(value); key_buffer.clear(); key_buffer.extend_from_slice(docid_bytes.as_slice()); @@ -475,7 +475,7 @@ pub fn extract_vector_points( #[allow(clippy::too_many_arguments)] // feel free to find efficient way to factor arguments fn extract_vector_document_diff( docid: DocumentId, - obkv: obkv::KvReader<'_, FieldId>, + obkv: &obkv::KvReader, prompt: &Prompt, (add_to_user_provided, remove_from_user_provided): (&mut RoaringBitmap, &mut RoaringBitmap), (old, new): (VectorState, VectorState), @@ -517,7 +517,7 @@ fn extract_vector_document_diff( // Do we keep this document? let document_is_kept = obkv .iter() - .map(|(_, deladd)| KvReaderDelAdd::new(deladd)) + .map(|(_, deladd)| KvReaderDelAdd::from_slice(deladd)) .any(|deladd| deladd.get(DelAdd::Addition).is_some()); if document_is_kept { @@ -553,7 +553,7 @@ fn extract_vector_document_diff( // Do we keep this document? let document_is_kept = obkv .iter() - .map(|(_, deladd)| KvReaderDelAdd::new(deladd)) + .map(|(_, deladd)| KvReaderDelAdd::from_slice(deladd)) .any(|deladd| deladd.get(DelAdd::Addition).is_some()); if document_is_kept { if embedder_is_manual { @@ -579,7 +579,7 @@ fn extract_vector_document_diff( // Do we keep this document? 
let document_is_kept = obkv .iter() - .map(|(_, deladd)| KvReaderDelAdd::new(deladd)) + .map(|(_, deladd)| KvReaderDelAdd::from_slice(deladd)) .any(|deladd| deladd.get(DelAdd::Addition).is_some()); if document_is_kept { // if the new version of documents has the vectors in the DB, @@ -597,7 +597,7 @@ fn extract_vector_document_diff( } fn regenerate_if_prompt_changed( - obkv: obkv::KvReader<'_, FieldId>, + obkv: &obkv::KvReader, (old_prompt, new_prompt): (&Prompt, &Prompt), (old_fields_ids_map, new_fields_ids_map): (&FieldsIdsMap, &FieldsIdsMap), ) -> Result { @@ -612,7 +612,7 @@ fn regenerate_if_prompt_changed( } fn regenerate_prompt( - obkv: obkv::KvReader<'_, FieldId>, + obkv: &obkv::KvReader, prompt: &Prompt, new_fields_ids_map: &FieldsIdsMap, ) -> Result { diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs index 457d2359e..a14f39e01 100644 --- a/milli/src/update/index_documents/extract/extract_word_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_docids.rs @@ -58,17 +58,17 @@ pub fn extract_word_docids( let document_id = u32::from_be_bytes(document_id_bytes); let fid = u16::from_be_bytes(fid_bytes); - let del_add_reader = KvReaderDelAdd::new(value); + let del_add_reader = KvReaderDelAdd::from_slice(value); // extract all unique words to remove. if let Some(deletion) = del_add_reader.get(DelAdd::Deletion) { - for (_pos, word) in KvReaderU16::new(deletion).iter() { + for (_pos, word) in KvReaderU16::from_slice(deletion).iter() { del_words.insert(word.to_vec()); } } // extract all unique additional words. if let Some(addition) = del_add_reader.get(DelAdd::Addition) { - for (_pos, word) in KvReaderU16::new(addition).iter() { + for (_pos, word) in KvReaderU16::from_slice(addition).iter() { add_words.insert(word.to_vec()); } } @@ -115,7 +115,7 @@ pub fn extract_word_docids( // NOTE: replacing sorters by bitmap merging is less efficient, so, use sorters. while let Some((key, value)) = iter.next()? { // only keep the value if their is a change to apply in the DB. - if !is_noop_del_add_obkv(KvReaderDelAdd::new(value)) { + if !is_noop_del_add_obkv(KvReaderDelAdd::from_slice(value)) { word_fid_docids_writer.insert(key, value)?; } @@ -123,7 +123,7 @@ pub fn extract_word_docids( .map_err(|_| SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; // merge all deletions - let obkv = KvReaderDelAdd::new(value); + let obkv = KvReaderDelAdd::from_slice(value); if let Some(value) = obkv.get(DelAdd::Deletion) { let delete_from_exact = settings_diff.old.exact_attributes.contains(&fid); buffer.clear(); diff --git a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs index 5a9363942..01344563f 100644 --- a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs @@ -92,8 +92,8 @@ pub fn extract_word_pair_proximity_docids( } // deletions - if let Some(deletion) = KvReaderDelAdd::new(value).get(DelAdd::Deletion) { - for (position, word) in KvReaderU16::new(deletion).iter() { + if let Some(deletion) = KvReaderDelAdd::from_slice(value).get(DelAdd::Deletion) { + for (position, word) in KvReaderU16::from_slice(deletion).iter() { // drain the proximity window until the head word is considered close to the word we are inserting. 
while del_word_positions.front().map_or(false, |(_w, p)| { index_proximity(*p as u32, position as u32) >= MAX_DISTANCE @@ -125,8 +125,8 @@ pub fn extract_word_pair_proximity_docids( } // additions - if let Some(addition) = KvReaderDelAdd::new(value).get(DelAdd::Addition) { - for (position, word) in KvReaderU16::new(addition).iter() { + if let Some(addition) = KvReaderDelAdd::from_slice(value).get(DelAdd::Addition) { + for (position, word) in KvReaderU16::from_slice(addition).iter() { // drain the proximity window until the head word is considered close to the word we are inserting. while add_word_positions.front().map_or(false, |(_w, p)| { index_proximity(*p as u32, position as u32) >= MAX_DISTANCE diff --git a/milli/src/update/index_documents/extract/extract_word_position_docids.rs b/milli/src/update/index_documents/extract/extract_word_position_docids.rs index 50b1617f9..7f14d6075 100644 --- a/milli/src/update/index_documents/extract/extract_word_position_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_position_docids.rs @@ -60,10 +60,10 @@ pub fn extract_word_position_docids( current_document_id = Some(document_id); - let del_add_reader = KvReaderDelAdd::new(value); + let del_add_reader = KvReaderDelAdd::from_slice(value); // extract all unique words to remove. if let Some(deletion) = del_add_reader.get(DelAdd::Deletion) { - for (position, word_bytes) in KvReaderU16::new(deletion).iter() { + for (position, word_bytes) in KvReaderU16::from_slice(deletion).iter() { let position = bucketed_position(position); del_word_positions.insert((position, word_bytes.to_vec())); } @@ -71,7 +71,7 @@ pub fn extract_word_position_docids( // extract all unique additional words. if let Some(addition) = del_add_reader.get(DelAdd::Addition) { - for (position, word_bytes) in KvReaderU16::new(addition).iter() { + for (position, word_bytes) in KvReaderU16::from_slice(addition).iter() { let position = bucketed_position(position); add_word_positions.insert((position, word_bytes.to_vec())); } diff --git a/milli/src/update/index_documents/helpers/merge_functions.rs b/milli/src/update/index_documents/helpers/merge_functions.rs index 42784048a..51fa4e086 100644 --- a/milli/src/update/index_documents/helpers/merge_functions.rs +++ b/milli/src/update/index_documents/helpers/merge_functions.rs @@ -45,8 +45,8 @@ pub fn keep_latest_obkv<'a>(_key: &[u8], obkvs: &[Cow<'a, [u8]>]) -> Result, - update: obkv::KvReaderU16<'_>, + base: &obkv::KvReaderU16, + update: &obkv::KvReaderU16, merge_additions: bool, buffer: &mut Vec, ) { @@ -66,7 +66,7 @@ pub fn merge_two_del_add_obkvs( // If merge_additions is false, recreate an obkv keeping the deletions only. value_buffer.clear(); let mut value_writer = KvWriterDelAdd::new(&mut value_buffer); - let base_reader = KvReaderDelAdd::new(v); + let base_reader = KvReaderDelAdd::from_slice(v); if let Some(deletion) = base_reader.get(DelAdd::Deletion) { value_writer.insert(DelAdd::Deletion, deletion).unwrap(); @@ -80,8 +80,8 @@ pub fn merge_two_del_add_obkvs( // merge deletions and additions. value_buffer.clear(); let mut value_writer = KvWriterDelAdd::new(&mut value_buffer); - let base_reader = KvReaderDelAdd::new(base); - let update_reader = KvReaderDelAdd::new(update); + let base_reader = KvReaderDelAdd::from_slice(base); + let update_reader = KvReaderDelAdd::from_slice(update); // keep newest deletion. 
if let Some(deletion) = update_reader @@ -131,8 +131,8 @@ fn inner_merge_del_add_obkvs<'a>( break; } - let newest = obkv::KvReader::new(&acc); - let oldest = obkv::KvReader::new(¤t[1..]); + let newest = obkv::KvReader::from_slice(&acc); + let oldest = obkv::KvReader::from_slice(¤t[1..]); merge_two_del_add_obkvs(oldest, newest, merge_additions, &mut buffer); // we want the result of the merge into our accumulator. @@ -187,7 +187,7 @@ pub fn merge_deladd_cbo_roaring_bitmaps<'a>( let mut del_bitmaps_bytes = Vec::new(); let mut add_bitmaps_bytes = Vec::new(); for value in values { - let obkv = KvReaderDelAdd::new(value); + let obkv = KvReaderDelAdd::from_slice(value); if let Some(bitmap_bytes) = obkv.get(DelAdd::Deletion) { del_bitmaps_bytes.push(bitmap_bytes); } @@ -217,7 +217,7 @@ pub fn merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap<'a>( buffer: &'a mut Vec, ) -> Result> { Ok(CboRoaringBitmapCodec::merge_deladd_into( - KvReaderDelAdd::new(deladd_obkv), + KvReaderDelAdd::from_slice(deladd_obkv), previous, buffer, )?) @@ -236,7 +236,7 @@ pub fn merge_deladd_btreeset_string<'a>( let mut del_set = BTreeSet::new(); let mut add_set = BTreeSet::new(); for value in values { - let obkv = KvReaderDelAdd::new(value); + let obkv = KvReaderDelAdd::from_slice(value); if let Some(bytes) = obkv.get(DelAdd::Deletion) { let set = serde_json::from_slice::>(bytes).unwrap(); for value in set { diff --git a/milli/src/update/index_documents/parallel.rs b/milli/src/update/index_documents/parallel.rs index 52e72a378..2f6bf9caf 100644 --- a/milli/src/update/index_documents/parallel.rs +++ b/milli/src/update/index_documents/parallel.rs @@ -31,14 +31,14 @@ impl<'t> ImmutableObkvs<'t> { } /// Returns the OBKVs identified by the given ID. - pub fn obkv(&self, docid: DocumentId) -> heed::Result>> { + pub fn obkv(&self, docid: DocumentId) -> heed::Result> { match self .ids .rank(docid) .checked_sub(1) .and_then(|offset| self.slices.get(offset as usize)) { - Some(bytes) => Ok(Some(KvReaderU16::new(bytes))), + Some(&bytes) => Ok(Some(bytes.into())), None => Ok(None), } } diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 73fa3ca7b..b9541e649 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -278,13 +278,13 @@ impl<'a, 'i> Transform<'a, 'i> { document_sorter_value_buffer.clear(); document_sorter_value_buffer.push(Operation::Addition as u8); into_del_add_obkv( - KvReaderU16::new(base_obkv), + KvReaderU16::from_slice(base_obkv), deladd_operation, &mut document_sorter_value_buffer, )?; self.original_sorter .insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?; - let base_obkv = KvReader::new(base_obkv); + let base_obkv = KvReader::from_slice(base_obkv); if let Some(flattened_obkv) = Self::flatten_from_fields_ids_map(&base_obkv, &mut self.fields_ids_map)? 
{ @@ -292,7 +292,7 @@ impl<'a, 'i> Transform<'a, 'i> { document_sorter_value_buffer.clear(); document_sorter_value_buffer.push(Operation::Addition as u8); into_del_add_obkv( - KvReaderU16::new(&flattened_obkv), + KvReaderU16::from_slice(&flattened_obkv), deladd_operation, &mut document_sorter_value_buffer, )?; @@ -311,7 +311,7 @@ impl<'a, 'i> Transform<'a, 'i> { document_sorter_value_buffer.clear(); document_sorter_value_buffer.push(Operation::Addition as u8); into_del_add_obkv( - KvReaderU16::new(&obkv_buffer), + KvReaderU16::from_slice(&obkv_buffer), DelAddOperation::Addition, &mut document_sorter_value_buffer, )?; @@ -319,14 +319,14 @@ impl<'a, 'i> Transform<'a, 'i> { self.original_sorter .insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?; - let flattened_obkv = KvReader::new(&obkv_buffer); + let flattened_obkv = KvReader::from_slice(&obkv_buffer); if let Some(obkv) = Self::flatten_from_fields_ids_map(&flattened_obkv, &mut self.fields_ids_map)? { document_sorter_value_buffer.clear(); document_sorter_value_buffer.push(Operation::Addition as u8); into_del_add_obkv( - KvReaderU16::new(&obkv), + KvReaderU16::from_slice(&obkv), DelAddOperation::Addition, &mut document_sorter_value_buffer, )? @@ -519,14 +519,14 @@ impl<'a, 'i> Transform<'a, 'i> { document_sorter_value_buffer.clear(); document_sorter_value_buffer.push(Operation::Deletion as u8); into_del_add_obkv( - KvReaderU16::new(base_obkv), + KvReaderU16::from_slice(base_obkv), DelAddOperation::Deletion, document_sorter_value_buffer, )?; self.original_sorter.insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?; // flatten it and push it as to delete in the flattened_sorter - let flattened_obkv = KvReader::new(base_obkv); + let flattened_obkv = KvReader::from_slice(base_obkv); if let Some(obkv) = Self::flatten_from_fields_ids_map(&flattened_obkv, &mut self.fields_ids_map)? { @@ -534,7 +534,7 @@ impl<'a, 'i> Transform<'a, 'i> { document_sorter_value_buffer.clear(); document_sorter_value_buffer.push(Operation::Deletion as u8); into_del_add_obkv( - KvReaderU16::new(&obkv), + KvReaderU16::from_slice(&obkv), DelAddOperation::Deletion, document_sorter_value_buffer, )?; @@ -552,7 +552,7 @@ impl<'a, 'i> Transform<'a, 'i> { target = "indexing::transform" )] fn flatten_from_fields_ids_map( - obkv: &KvReader<'_, FieldId>, + obkv: &KvReader, fields_ids_map: &mut FieldsIdsMap, ) -> Result>> { if obkv @@ -720,10 +720,10 @@ impl<'a, 'i> Transform<'a, 'i> { total_documents: self.documents_count, }); - for (key, value) in KvReader::new(val) { - let reader = KvReaderDelAdd::new(value); + for (key, value) in KvReader::from_slice(val) { + let reader = KvReaderDelAdd::from_slice(value); match (reader.get(DelAdd::Deletion), reader.get(DelAdd::Addition)) { - (None, None) => {} + (None, None) => (), (None, Some(_)) => { // New field let name = self.fields_ids_map.name(key).ok_or( @@ -837,7 +837,7 @@ impl<'a, 'i> Transform<'a, 'i> { /// then fill the provided buffers with delta documents using KvWritterDelAdd. 
#[allow(clippy::too_many_arguments)] // need the vectors + fid, feel free to create a struct xo xo fn rebind_existing_document( - old_obkv: KvReader<'_, FieldId>, + old_obkv: &KvReader, settings_diff: &InnerIndexSettingsDiff, modified_faceted_fields: &HashSet, mut injected_vectors: serde_json::Map, @@ -925,7 +925,7 @@ impl<'a, 'i> Transform<'a, 'i> { } let data = obkv_writer.into_inner()?; - let obkv = KvReader::::new(&data); + let obkv = KvReader::::from_slice(&data); if let Some(original_obkv_buffer) = original_obkv_buffer { original_obkv_buffer.clear(); @@ -936,7 +936,7 @@ impl<'a, 'i> Transform<'a, 'i> { // take the non-flattened version if flatten_from_fields_ids_map returns None. let mut fields_ids_map = settings_diff.new.fields_ids_map.clone(); let flattened = Self::flatten_from_fields_ids_map(&obkv, &mut fields_ids_map)?; - let flattened = flattened.as_deref().map_or(obkv, KvReader::new); + let flattened = flattened.as_deref().map_or(obkv, KvReader::from_slice); flattened_obkv_buffer.clear(); into_del_add_obkv_conditional_operation(flattened, flattened_obkv_buffer, |id| { @@ -1173,21 +1173,21 @@ mod test { kv_writer.insert(0_u8, [0]).unwrap(); let buffer = kv_writer.into_inner().unwrap(); into_del_add_obkv( - KvReaderU16::new(&buffer), + KvReaderU16::from_slice(&buffer), DelAddOperation::Addition, &mut additive_doc_0, ) .unwrap(); additive_doc_0.insert(0, Operation::Addition as u8); into_del_add_obkv( - KvReaderU16::new(&buffer), + KvReaderU16::from_slice(&buffer), DelAddOperation::Deletion, &mut deletive_doc_0, ) .unwrap(); deletive_doc_0.insert(0, Operation::Deletion as u8); into_del_add_obkv( - KvReaderU16::new(&buffer), + KvReaderU16::from_slice(&buffer), DelAddOperation::DeletionAndAddition, &mut del_add_doc_0, ) @@ -1199,7 +1199,7 @@ mod test { kv_writer.insert(1_u8, [1]).unwrap(); let buffer = kv_writer.into_inner().unwrap(); into_del_add_obkv( - KvReaderU16::new(&buffer), + KvReaderU16::from_slice(&buffer), DelAddOperation::Addition, &mut additive_doc_1, ) @@ -1212,7 +1212,7 @@ mod test { kv_writer.insert(1_u8, [1]).unwrap(); let buffer = kv_writer.into_inner().unwrap(); into_del_add_obkv( - KvReaderU16::new(&buffer), + KvReaderU16::from_slice(&buffer), DelAddOperation::Addition, &mut additive_doc_0_1, ) diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 9de95778b..9fe152348 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -162,7 +162,7 @@ pub(crate) fn write_typed_chunk_into_index( let mut vectors_buffer = Vec::new(); while let Some((key, reader)) = iter.next()? { let mut writer: KvWriter<_, FieldId> = KvWriter::memory(); - let reader: KvReader<'_, FieldId> = KvReader::new(reader); + let reader: &KvReader = reader.into(); let (document_id_bytes, external_id_bytes) = try_split_array_at(key) .ok_or(SerializationError::Decoding { db_name: Some(DOCUMENTS) })?; @@ -170,7 +170,7 @@ pub(crate) fn write_typed_chunk_into_index( let external_id = std::str::from_utf8(external_id_bytes)?; for (field_id, value) in reader.iter() { - let del_add_reader = KvReaderDelAdd::new(value); + let del_add_reader = KvReaderDelAdd::from_slice(value); if let Some(addition) = del_add_reader.get(DelAdd::Addition) { let addition = if vectors_fid == Some(field_id) { @@ -529,7 +529,7 @@ pub(crate) fn write_typed_chunk_into_index( index.field_id_docid_facet_f64s.remap_types::(); let mut iter = merger.into_stream_merger_iter()?; while let Some((key, value)) = iter.next()? 
{ - let reader = KvReaderDelAdd::new(value); + let reader = KvReaderDelAdd::from_slice(value); if valid_lmdb_key(key) { match (reader.get(DelAdd::Deletion), reader.get(DelAdd::Addition)) { (None, None) => {} @@ -563,7 +563,7 @@ pub(crate) fn write_typed_chunk_into_index( index.field_id_docid_facet_strings.remap_types::(); let mut iter = merger.into_stream_merger_iter()?; while let Some((key, value)) = iter.next()? { - let reader = KvReaderDelAdd::new(value); + let reader = KvReaderDelAdd::from_slice(value); if valid_lmdb_key(key) { match (reader.get(DelAdd::Deletion), reader.get(DelAdd::Addition)) { (None, None) => {} @@ -600,7 +600,7 @@ pub(crate) fn write_typed_chunk_into_index( // convert the key back to a u32 (4 bytes) let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap(); - let deladd_obkv = KvReaderDelAdd::new(value); + let deladd_obkv = KvReaderDelAdd::from_slice(value); if let Some(value) = deladd_obkv.get(DelAdd::Deletion) { let geopoint = extract_geo_point(value, docid); rtree.remove(&geopoint); @@ -723,7 +723,7 @@ pub(crate) fn write_typed_chunk_into_index( let (left, _index) = try_split_array_at(key).unwrap(); let docid = DocumentId::from_be_bytes(left); - let vector_deladd_obkv = KvReaderDelAdd::new(value); + let vector_deladd_obkv = KvReaderDelAdd::from_slice(value); if let Some(value) = vector_deladd_obkv.get(DelAdd::Deletion) { let vector: Vec = pod_collect_to_vec(value); @@ -852,7 +852,7 @@ where if valid_lmdb_key(key) { let (proximity_to_insert, word1, word2) = U8StrStrCodec::bytes_decode(key).map_err(heed::Error::Decoding)?; - let data_to_insert = match KvReaderDelAdd::new(value).get(DelAdd::Addition) { + let data_to_insert = match KvReaderDelAdd::from_slice(value).get(DelAdd::Addition) { Some(value) => { CboRoaringBitmapCodec::bytes_decode(value).map_err(heed::Error::Decoding)? } diff --git a/milli/src/update/new/channel.rs b/milli/src/update/new/channel.rs index 6780be72e..15239aa3e 100644 --- a/milli/src/update/new/channel.rs +++ b/milli/src/update/new/channel.rs @@ -1,4 +1,3 @@ -use core::slice::SlicePattern; use std::fs::File; use crossbeam_channel::{IntoIter, Receiver, SendError, Sender}; @@ -44,11 +43,11 @@ impl KeyValueEntry { } pub fn key(&self) -> &[u8] { - &self.data.as_slice()[..self.key_length] + &self.data.as_ref()[..self.key_length] } pub fn value(&self) -> &[u8] { - &self.data.as_slice()[self.key_length..] + &self.data.as_ref()[self.key_length..] 
} } diff --git a/milli/src/update/new/document_change.rs b/milli/src/update/new/document_change.rs index 311e22404..1764b6ee7 100644 --- a/milli/src/update/new/document_change.rs +++ b/milli/src/update/new/document_change.rs @@ -1,5 +1,5 @@ use heed::RoTxn; -use obkv2::KvReader; +use obkv::KvReader; use super::indexer::KvReaderFieldId; use crate::{DocumentId, FieldId}; diff --git a/milli/src/update/new/mod.rs b/milli/src/update/new/mod.rs index da76bdfee..92fcd6b0c 100644 --- a/milli/src/update/new/mod.rs +++ b/milli/src/update/new/mod.rs @@ -4,7 +4,8 @@ mod channel; mod items_pool; mod merge; -mod global_fields_ids_map; +/// TODO remove this +// mod global_fields_ids_map; pub type StdResult = std::result::Result; @@ -27,8 +28,7 @@ mod indexer { use super::channel::{ extractors_merger_channels, merger_writer_channels, EntryOperation, - ExtractorsMergerChannels, MergerReceiver, MergerSender, MergerWriterChannels, - WriterOperation, + ExtractorsMergerChannels, MergerReceiver, MergerSender, WriterOperation, }; use super::document_change::{Deletion, DocumentChange, Insertion, Update}; use super::items_pool::ItemsPool; @@ -44,10 +44,10 @@ mod indexer { Result, UserError, }; - pub type KvReaderFieldId = obkv2::KvReader; - pub type KvReaderDelAdd = obkv2::KvReader; - pub type KvWriterFieldId = obkv2::KvWriter; - pub type KvWriterDelAdd = obkv2::KvWriter; + pub type KvReaderFieldId = obkv::KvReader; + pub type KvReaderDelAdd = obkv::KvReader; + pub type KvWriterFieldId = obkv::KvWriter; + pub type KvWriterDelAdd = obkv::KvWriter; pub struct DocumentOperationIndexer { operations: Vec, @@ -105,7 +105,7 @@ mod indexer { rtxn: &'a RoTxn, mut fields_ids_map: FieldsIdsMap, primary_key: &'a PrimaryKey<'a>, - ) -> Result + 'a> { + ) -> Result>> + 'a> { let documents_ids = index.documents_ids(rtxn)?; let mut available_docids = AvailableDocumentsIds::from_documents_ids(&documents_ids); let mut docids_version_offsets = HashMap::::new(); @@ -198,7 +198,7 @@ mod indexer { } let items = Arc::new(ItemsPool::new(|| index.read_txn().map_err(crate::Error::from))); - docids_version_offsets.into_par_iter().map_with( + Ok(docids_version_offsets.into_par_iter().map_with( items, |context_pool, (external_docid, (internal_docid, operations))| { context_pool.with(|rtxn| match self.method { @@ -221,58 +221,7 @@ mod indexer { ), }) }, - ); - - Ok(vec![].into_par_iter()) - - // let mut file_count: usize = 0; - // for result in WalkDir::new(update_files_path) - // // TODO handle errors - // .sort_by_key(|entry| entry.metadata().unwrap().created().unwrap()) - // { - // let entry = result?; - // if !entry.file_type().is_file() { - // continue; - // } - - // let file = File::open(entry.path()) - // .with_context(|| format!("While opening {}", entry.path().display()))?; - // let content = unsafe { - // Mmap::map(&file) - // .map(Arc::new) - // .with_context(|| format!("While memory mapping {}", entry.path().display()))? - // }; - - // let reader = - // crate::documents::DocumentsBatchReader::from_reader(Cursor::new(content.as_ref()))?; - // let (mut batch_cursor, batch_index) = reader.into_cursor_and_fields_index(); - // batch_index.iter().for_each(|(_, name)| { - // fields_ids_map.insert(name); - // }); - // let mut offset: u32 = 0; - // while let Some(document) = batch_cursor.next_document()? 
{ - // let primary_key = batch_index.id(primary_key).unwrap(); - // let document_id = document.get(primary_key).unwrap(); - // let document_id = std::str::from_utf8(document_id).unwrap(); - - // let document_offset = DocumentOffset { content: content.clone(), offset }; - // match docids_version_offsets.get_mut(document_id) { - // None => { - // let docid = match maindb.external_documents_ids.get(rtxn, document_id)? { - // Some(docid) => docid, - // None => sequential_docids.next().context("no more available docids")?, - // }; - // docids_version_offsets - // .insert(document_id.into(), (docid, smallvec![document_offset])); - // } - // Some((_, offsets)) => offsets.push(document_offset), - // } - // offset += 1; - // p.inc(1); - // } - - // file_count += 1; - // } + )) } } diff --git a/milli/src/vector/parsed_vectors.rs b/milli/src/vector/parsed_vectors.rs index 9dbf025e6..8e5ccf690 100644 --- a/milli/src/vector/parsed_vectors.rs +++ b/milli/src/vector/parsed_vectors.rs @@ -109,14 +109,13 @@ impl ParsedVectorsDiff { pub fn new( docid: DocumentId, embedders_configs: &[IndexEmbeddingConfig], - documents_diff: KvReader<'_, FieldId>, + documents_diff: &KvReader, old_vectors_fid: Option, new_vectors_fid: Option, ) -> Result { let mut old = match old_vectors_fid .and_then(|vectors_fid| documents_diff.get(vectors_fid)) - .map(KvReaderDelAdd::new) - .map(|obkv| to_vector_map(obkv, DelAdd::Deletion)) + .map(|bytes| to_vector_map(bytes.into(), DelAdd::Deletion)) .transpose() { Ok(del) => del, @@ -143,8 +142,7 @@ impl ParsedVectorsDiff { let Some(bytes) = documents_diff.get(new_vectors_fid) else { break 'new VectorsState::NoVectorsFieldInDocument; }; - let obkv = KvReaderDelAdd::new(bytes); - match to_vector_map(obkv, DelAdd::Addition)? { + match to_vector_map(bytes.into(), DelAdd::Addition)? 
{ Some(new) => VectorsState::Vectors(new), None => VectorsState::NoVectorsFieldInDocument, } @@ -239,7 +237,7 @@ impl Error { } fn to_vector_map( - obkv: KvReaderDelAdd<'_>, + obkv: &KvReaderDelAdd, side: DelAdd, ) -> Result>, Error> { Ok(if let Some(value) = obkv.get(side) { From b7c77c7a39c3455fbb418fc2f1f6e75143721aae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 30 Aug 2024 10:03:54 +0200 Subject: [PATCH 008/247] Use the latest version of the obkv crate --- Cargo.lock | 2 +- milli/src/update/new/mod.rs | 47 ++++++++++++++++--------------------- 2 files changed, 21 insertions(+), 28 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e0effa54d..18f6838ed 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3851,7 +3851,7 @@ dependencies = [ [[package]] name = "obkv" version = "0.3.0" -source = "git+https://github.com/kerollmops/obkv?branch=unsized-kvreader#9c2900d106fa84e7079b288e7f7c366ec7cae948" +source = "git+https://github.com/kerollmops/obkv?branch=unsized-kvreader#ce535874008ecac554f02e0c670e6caf62134d6b" [[package]] name = "once_cell" diff --git a/milli/src/update/new/mod.rs b/milli/src/update/new/mod.rs index 92fcd6b0c..e32290c2b 100644 --- a/milli/src/update/new/mod.rs +++ b/milli/src/update/new/mod.rs @@ -268,9 +268,11 @@ mod indexer { .into()), }?; - /// TODO create a function for this - let current = current.as_bytes().to_vec().into_boxed_slice().into(); - Ok(DocumentChange::Deletion(Deletion::create(docid, external_docid, current))) + Ok(DocumentChange::Deletion(Deletion::create( + docid, + external_docid, + current.boxed(), + ))) }) })) } @@ -483,10 +485,11 @@ mod indexer { if operations.is_empty() { match current { Some(current) => { - /// TODO create a function for this - let current = current.as_bytes().to_vec().into_boxed_slice().into(); - let deletion = Deletion::create(docid, external_docid, current); - return Ok(Some(DocumentChange::Deletion(deletion))); + return Ok(Some(DocumentChange::Deletion(Deletion::create( + docid, + external_docid, + current.boxed(), + )))); } None => return Ok(None), } @@ -511,14 +514,11 @@ mod indexer { let mut writer = KvWriterFieldId::memory(); document.into_iter().for_each(|(id, value)| writer.insert(id, value).unwrap()); - /// TODO create a function for this conversion - let new = writer.into_inner().unwrap().into_boxed_slice().into(); + let new = writer.into_boxed(); match current { Some(current) => { - /// TODO create a function for this conversion - let current = current.as_bytes().to_vec().into_boxed_slice().into(); - let update = Update::create(docid, external_docid, current, new); + let update = Update::create(docid, external_docid, current.boxed(), new); Ok(Some(DocumentChange::Update(update))) } None => { @@ -561,14 +561,11 @@ mod indexer { document_entries .into_iter() .for_each(|(id, value)| writer.insert(id, value).unwrap()); - /// TODO create a function for this conversion - let new = writer.into_inner().unwrap().into_boxed_slice().into(); + let new = writer.into_boxed(); match current { Some(current) => { - /// TODO create a function for this conversion - let current = current.as_bytes().to_vec().into_boxed_slice().into(); - let update = Update::create(docid, external_docid, current, new); + let update = Update::create(docid, external_docid, current.boxed(), new); Ok(Some(DocumentChange::Update(update))) } None => { @@ -577,17 +574,13 @@ mod indexer { } } } - Some(DocumentOperation::Deletion) => { - match current { - Some(current) => { - /// TODO create a function for this conversion - let 
current = current.as_bytes().to_vec().into_boxed_slice().into(); - let deletion = Deletion::create(docid, external_docid, current); - Ok(Some(DocumentChange::Deletion(deletion))) - } - None => Ok(None), + Some(DocumentOperation::Deletion) => match current { + Some(current) => { + let deletion = Deletion::create(docid, external_docid, current.boxed()); + Ok(Some(DocumentChange::Deletion(deletion))) } - } + None => Ok(None), + }, None => Ok(None), } } From 794ebcd5826008f42867986e7589777af5fff83b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 30 Aug 2024 11:49:47 +0200 Subject: [PATCH 009/247] Replace grenad with the new grenad various-improvement branch --- Cargo.lock | 19 +- index-scheduler/src/batch.rs | 6 +- meilisearch/src/search/mod.rs | 2 +- meilitool/src/main.rs | 2 +- milli/Cargo.toml | 5 +- milli/src/index.rs | 2 +- milli/src/lib.rs | 2 +- milli/src/search/new/db_cache.rs | 45 ++-- milli/src/update/facet/bulk.rs | 8 +- milli/src/update/facet/incremental.rs | 6 +- milli/src/update/facet/mod.rs | 19 +- milli/src/update/index_documents/enrich.rs | 4 +- .../extract/extract_docid_word_positions.rs | 14 +- .../extract/extract_facet_number_docids.rs | 4 +- .../extract/extract_facet_string_docids.rs | 10 +- .../extract/extract_fid_docid_facet_values.rs | 28 +- .../extract/extract_fid_word_count_docids.rs | 6 +- .../extract/extract_geo_points.rs | 6 +- .../extract/extract_word_docids.rs | 13 +- .../extract_word_pair_proximity_docids.rs | 8 +- .../extract/extract_word_position_docids.rs | 9 +- .../index_documents/helpers/grenad_helpers.rs | 32 +-- .../helpers/merge_functions.rs | 247 +++++++++++------- .../src/update/index_documents/helpers/mod.rs | 13 +- milli/src/update/index_documents/mod.rs | 26 +- milli/src/update/index_documents/transform.rs | 51 ++-- .../src/update/index_documents/typed_chunk.rs | 77 +++--- milli/src/update/mod.rs | 5 +- milli/src/update/new/channel.rs | 2 +- .../merge/del_add_roaring_bitmap_merger.rs | 2 +- milli/src/update/new/mod.rs | 12 +- milli/src/update/word_prefix_docids.rs | 11 +- .../src/update/words_prefix_integer_docids.rs | 11 +- 33 files changed, 367 insertions(+), 340 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 18f6838ed..281c0ab9d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2221,25 +2221,15 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" [[package]] name = "grenad" version = "0.4.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "350d89047298d3b1b40050acd11ab76e487b854a104b760ebc5a7f375093de77" +source = "git+https://github.com/meilisearch/grenad?branch=various-improvements#58ac87d852413571102f44c5e55ca13509a3f1a0" dependencies = [ "bytemuck", "byteorder", + "either", "rayon", "tempfile", ] -[[package]] -name = "grenad" -version = "0.4.7" -source = "git+https://github.com/meilisearch/grenad?branch=various-improvements#d7512aedb854c247acc7cd18d0bfa148d3779923" -dependencies = [ - "bytemuck", - "byteorder", - "tempfile", -] - [[package]] name = "h2" version = "0.3.26" @@ -2848,7 +2838,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e310b3a6b5907f99202fcdb4960ff45b93735d7c7d96b760fcff8db2dc0e103d" dependencies = [ "cfg-if", - "windows-targets 0.52.4", + "windows-targets 0.48.1", ] [[package]] @@ -3584,8 +3574,7 @@ dependencies = [ "fst", "fxhash", "geoutils", - "grenad 0.4.7 (registry+https://github.com/rust-lang/crates.io-index)", - "grenad 0.4.7 
(git+https://github.com/meilisearch/grenad?branch=various-improvements)", + "grenad", "heed", "hf-hub", "indexmap", diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs index 3e6e78614..1a056dde9 100644 --- a/index-scheduler/src/batch.rs +++ b/index-scheduler/src/batch.rs @@ -909,10 +909,8 @@ impl IndexScheduler { while let Some(doc) = cursor.next_document().map_err(milli::Error::from)? { - dump_content_file.push_document(&obkv_to_object( - &doc, - &documents_batch_index, - )?)?; + dump_content_file + .push_document(&obkv_to_object(doc, &documents_batch_index)?)?; } dump_content_file.flush()?; } diff --git a/meilisearch/src/search/mod.rs b/meilisearch/src/search/mod.rs index 4ada47ff1..54d0c4823 100644 --- a/meilisearch/src/search/mod.rs +++ b/meilisearch/src/search/mod.rs @@ -1642,7 +1642,7 @@ fn add_non_formatted_ids_to_formatted_options( fn make_document( displayed_attributes: &BTreeSet, field_ids_map: &FieldsIdsMap, - obkv: obkv::KvReaderU16, + obkv: &obkv::KvReaderU16, ) -> Result { let mut document = serde_json::Map::new(); diff --git a/meilitool/src/main.rs b/meilitool/src/main.rs index 06c4890a5..f908dc4b0 100644 --- a/meilitool/src/main.rs +++ b/meilitool/src/main.rs @@ -244,7 +244,7 @@ fn export_a_dump( format!("While iterating on content file {:?}", content_file_uuid) })? { dump_content_file - .push_document(&obkv_to_object(&doc, &documents_batch_index)?)?; + .push_document(&obkv_to_object(doc, &documents_batch_index)?)?; } dump_content_file.flush()?; count += 1; diff --git a/milli/Cargo.toml b/milli/Cargo.toml index b15f72f15..7059ed7f5 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -28,10 +28,7 @@ fst = "0.4.7" fxhash = "0.2.1" geoutils = "0.5.1" grenad = { version = "0.4.7", default-features = false, features = [ - "rayon", - "tempfile", -] } -grenad2 = { package = "grenad", version = "0.4.7", default-features = false, features = [ + "rayon", # TODO Should we keep this feature "tempfile" ], git = "https://github.com/meilisearch/grenad", branch = "various-improvements" } heed = { version = "0.20.3", default-features = false, features = [ diff --git a/milli/src/index.rs b/milli/src/index.rs index 9c582b97a..58b3a6bf4 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1311,7 +1311,7 @@ impl Index { })?; Ok(self.iter_documents(rtxn, ids)?.map(move |entry| -> Result<_> { let (_docid, obkv) = entry?; - match primary_key.document_id(&obkv, &fields)? { + match primary_key.document_id(obkv, &fields)? 
{ Ok(document_id) => Ok(document_id), Err(_) => Err(InternalError::DocumentsError( crate::documents::Error::InvalidDocumentFormat, diff --git a/milli/src/lib.rs b/milli/src/lib.rs index bb8325791..8b2468bea 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -431,7 +431,7 @@ mod tests { writer.insert(id1, b"1234").unwrap(); writer.insert(id2, b"4321").unwrap(); let contents = writer.into_inner().unwrap(); - let obkv = obkv::KvReaderU16::new(&contents); + let obkv = obkv::KvReaderU16::from_slice(&contents); let expected = json!({ "field1": 1234, diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs index d1d9d6d9a..243303ba2 100644 --- a/milli/src/search/new/db_cache.rs +++ b/milli/src/search/new/db_cache.rs @@ -3,6 +3,7 @@ use std::collections::hash_map::Entry; use std::hash::Hash; use fxhash::FxHashMap; +use grenad::MergeFunction; use heed::types::Bytes; use heed::{BytesEncode, Database, RoTxn}; use roaring::RoaringBitmap; @@ -11,7 +12,7 @@ use super::interner::Interned; use super::Word; use crate::heed_codec::{BytesDecodeOwned, StrBEU16Codec}; use crate::proximity::ProximityPrecision; -use crate::update::{merge_cbo_roaring_bitmaps, MergeFn}; +use crate::update::MergeCboRoaringBitmaps; use crate::{ CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, SearchContext, U8StrStrCodec, }; @@ -110,19 +111,21 @@ impl<'ctx> DatabaseCache<'ctx> { .map_err(Into::into) } - fn get_value_from_keys<'v, K1, KC>( + fn get_value_from_keys<'v, K1, KC, MF>( txn: &'ctx RoTxn<'_>, cache_key: K1, db_keys: &'v [KC::EItem], cache: &mut FxHashMap>>, db: Database, universe: Option<&RoaringBitmap>, - merger: MergeFn, + merger: MF, ) -> Result> where K1: Copy + Eq + Hash, KC: BytesEncode<'v>, KC::EItem: Sized, + MF: MergeFunction, + crate::Error: From, { if let Entry::Vacant(entry) = cache.entry(cache_key) { let bitmap_ptr: Option> = match db_keys { @@ -138,7 +141,7 @@ impl<'ctx> DatabaseCache<'ctx> { if bitmaps.is_empty() { None } else { - Some(merger(&[], &bitmaps[..])?) + Some(merger.merge(&[], &bitmaps[..])?) 
} } }; @@ -213,17 +216,17 @@ impl<'ctx> SearchContext<'ctx> { let keys: Vec<_> = restricted_fids.tolerant.iter().map(|(fid, _)| (interned, *fid)).collect(); - DatabaseCache::get_value_from_keys::<_, _>( + DatabaseCache::get_value_from_keys( self.txn, word, &keys[..], &mut self.db_cache.word_docids, self.index.word_fid_docids.remap_data_type::(), universe, - merge_cbo_roaring_bitmaps, + MergeCboRoaringBitmaps, ) } - None => DatabaseCache::get_value::<_, _>( + None => DatabaseCache::get_value( self.txn, word, self.word_interner.get(word).as_str(), @@ -245,17 +248,17 @@ impl<'ctx> SearchContext<'ctx> { let keys: Vec<_> = restricted_fids.exact.iter().map(|(fid, _)| (interned, *fid)).collect(); - DatabaseCache::get_value_from_keys::<_, _>( + DatabaseCache::get_value_from_keys( self.txn, word, &keys[..], &mut self.db_cache.exact_word_docids, self.index.word_fid_docids.remap_data_type::(), universe, - merge_cbo_roaring_bitmaps, + MergeCboRoaringBitmaps, ) } - None => DatabaseCache::get_value::<_, _>( + None => DatabaseCache::get_value( self.txn, word, self.word_interner.get(word).as_str(), @@ -302,17 +305,17 @@ impl<'ctx> SearchContext<'ctx> { let keys: Vec<_> = restricted_fids.tolerant.iter().map(|(fid, _)| (interned, *fid)).collect(); - DatabaseCache::get_value_from_keys::<_, _>( + DatabaseCache::get_value_from_keys( self.txn, prefix, &keys[..], &mut self.db_cache.word_prefix_docids, self.index.word_prefix_fid_docids.remap_data_type::(), universe, - merge_cbo_roaring_bitmaps, + MergeCboRoaringBitmaps, ) } - None => DatabaseCache::get_value::<_, _>( + None => DatabaseCache::get_value( self.txn, prefix, self.word_interner.get(prefix).as_str(), @@ -334,17 +337,17 @@ impl<'ctx> SearchContext<'ctx> { let keys: Vec<_> = restricted_fids.exact.iter().map(|(fid, _)| (interned, *fid)).collect(); - DatabaseCache::get_value_from_keys::<_, _>( + DatabaseCache::get_value_from_keys( self.txn, prefix, &keys[..], &mut self.db_cache.exact_word_prefix_docids, self.index.word_prefix_fid_docids.remap_data_type::(), universe, - merge_cbo_roaring_bitmaps, + MergeCboRoaringBitmaps, ) } - None => DatabaseCache::get_value::<_, _>( + None => DatabaseCache::get_value( self.txn, prefix, self.word_interner.get(prefix).as_str(), @@ -405,7 +408,7 @@ impl<'ctx> SearchContext<'ctx> { Ok(docids) } - ProximityPrecision::ByWord => DatabaseCache::get_value::<_, _>( + ProximityPrecision::ByWord => DatabaseCache::get_value( self.txn, (proximity, word1, word2), &( @@ -538,7 +541,7 @@ impl<'ctx> SearchContext<'ctx> { return Ok(None); } - DatabaseCache::get_value::<_, _>( + DatabaseCache::get_value( self.txn, (word, fid), &(self.word_interner.get(word).as_str(), fid), @@ -559,7 +562,7 @@ impl<'ctx> SearchContext<'ctx> { return Ok(None); } - DatabaseCache::get_value::<_, _>( + DatabaseCache::get_value( self.txn, (word_prefix, fid), &(self.word_interner.get(word_prefix).as_str(), fid), @@ -629,7 +632,7 @@ impl<'ctx> SearchContext<'ctx> { word: Interned, position: u16, ) -> Result> { - DatabaseCache::get_value::<_, _>( + DatabaseCache::get_value( self.txn, (word, position), &(self.word_interner.get(word).as_str(), position), @@ -645,7 +648,7 @@ impl<'ctx> SearchContext<'ctx> { word_prefix: Interned, position: u16, ) -> Result> { - DatabaseCache::get_value::<_, _>( + DatabaseCache::get_value( self.txn, (word_prefix, position), &(self.word_interner.get(word_prefix).as_str(), position), diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs index 27de6e777..19dfc310b 100644 --- a/milli/src/update/facet/bulk.rs +++ 
b/milli/src/update/facet/bulk.rs @@ -14,7 +14,7 @@ use crate::heed_codec::facet::{ use crate::heed_codec::BytesRefCodec; use crate::update::del_add::{DelAdd, KvReaderDelAdd}; use crate::update::index_documents::{create_writer, valid_lmdb_key, writer_into_reader}; -use crate::update::MergeFn; +use crate::update::MergeDeladdCboRoaringBitmaps; use crate::{CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, FieldId, Index, Result}; /// Algorithm to insert elememts into the `facet_id_(string/f64)_docids` databases @@ -29,7 +29,7 @@ pub struct FacetsUpdateBulk<'i> { facet_type: FacetType, field_ids: Vec, // None if level 0 does not need to be updated - delta_data: Option, MergeFn>>, + delta_data: Option, MergeDeladdCboRoaringBitmaps>>, } impl<'i> FacetsUpdateBulk<'i> { @@ -37,7 +37,7 @@ impl<'i> FacetsUpdateBulk<'i> { index: &'i Index, field_ids: Vec, facet_type: FacetType, - delta_data: Merger, MergeFn>, + delta_data: Merger, MergeDeladdCboRoaringBitmaps>, group_size: u8, min_level_size: u8, ) -> FacetsUpdateBulk<'i> { @@ -90,7 +90,7 @@ impl<'i> FacetsUpdateBulk<'i> { /// Implementation of `FacetsUpdateBulk` that is independent of milli's `Index` type pub(crate) struct FacetsUpdateBulkInner { pub db: heed::Database, FacetGroupValueCodec>, - pub delta_data: Option>, + pub delta_data: Option>, pub group_size: u8, pub min_level_size: u8, } diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index 637f84986..a1fa07fe3 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -15,7 +15,7 @@ use crate::heed_codec::BytesRefCodec; use crate::search::facet::get_highest_level; use crate::update::del_add::DelAdd; use crate::update::index_documents::valid_lmdb_key; -use crate::update::MergeFn; +use crate::update::MergeDeladdCboRoaringBitmaps; use crate::{CboRoaringBitmapCodec, Index, Result}; /// Enum used as a return value for the facet incremental indexing. @@ -57,14 +57,14 @@ enum ModificationResult { /// `facet_id_(string/f64)_docids` databases. 
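// NOTE: illustrative sketch, not part of this patch. The series replaces the
// `MergeFn` function-pointer alias with zero-sized structs implementing grenad's
// `MergeFunction` trait, so every sorter and merger is typed by its merge
// strategy. `KeepLargest` below is a hypothetical strategy written in the same
// shape as the `KeepFirst`/`MergeCboRoaringBitmaps` impls added in
// merge_functions.rs.
use std::borrow::Cow;

use grenad::MergeFunction;

pub struct KeepLargest;

impl MergeFunction for KeepLargest {
    type Error = crate::Error;

    // Called by grenad whenever several values share the same key; keep the
    // largest payload (the existing impls index `values` directly, so it is
    // non-empty in practice).
    fn merge<'a>(&self, _key: &[u8], values: &[Cow<'a, [u8]>]) -> crate::Result<Cow<'a, [u8]>> {
        Ok(values.iter().max_by_key(|v| v.len()).unwrap().clone())
    }
}

// Mirroring the test helpers touched by this patch, a strategy value is handed
// to the builder instead of a `MergeFn` pointer:
//
//     let mut builder = grenad::MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
//     builder.push(reader.into_cursor()?);
//     let merger = builder.build();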
pub struct FacetsUpdateIncremental { inner: FacetsUpdateIncrementalInner, - delta_data: Merger, MergeFn>, + delta_data: Merger, MergeDeladdCboRoaringBitmaps>, } impl FacetsUpdateIncremental { pub fn new( index: &Index, facet_type: FacetType, - delta_data: Merger, MergeFn>, + delta_data: Merger, MergeDeladdCboRoaringBitmaps>, group_size: u8, min_level_size: u8, max_group_size: u8, diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index bccfdff12..2e592519b 100644 --- a/milli/src/update/facet/mod.rs +++ b/milli/src/update/facet/mod.rs @@ -86,12 +86,11 @@ use time::OffsetDateTime; use tracing::debug; use self::incremental::FacetsUpdateIncremental; -use super::FacetsUpdateBulk; +use super::{FacetsUpdateBulk, MergeDeladdBtreesetString, MergeDeladdCboRoaringBitmaps}; use crate::facet::FacetType; use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec}; use crate::heed_codec::BytesRefCodec; use crate::update::del_add::{DelAdd, KvReaderDelAdd}; -use crate::update::MergeFn; use crate::{try_split_array_at, FieldId, Index, Result}; pub mod bulk; @@ -105,8 +104,8 @@ pub struct FacetsUpdate<'i> { index: &'i Index, database: heed::Database, FacetGroupValueCodec>, facet_type: FacetType, - delta_data: Merger, MergeFn>, - normalized_delta_data: Option, MergeFn>>, + delta_data: Merger, MergeDeladdCboRoaringBitmaps>, + normalized_delta_data: Option, MergeDeladdBtreesetString>>, group_size: u8, max_group_size: u8, min_level_size: u8, @@ -116,8 +115,8 @@ impl<'i> FacetsUpdate<'i> { pub fn new( index: &'i Index, facet_type: FacetType, - delta_data: Merger, MergeFn>, - normalized_delta_data: Option, MergeFn>>, + delta_data: Merger, MergeDeladdCboRoaringBitmaps>, + normalized_delta_data: Option, MergeDeladdBtreesetString>>, data_size: u64, ) -> Self { let database = match facet_type { @@ -182,7 +181,7 @@ impl<'i> FacetsUpdate<'i> { fn index_facet_search( wtxn: &mut heed::RwTxn<'_>, - normalized_delta_data: Merger, MergeFn>, + normalized_delta_data: Merger, MergeDeladdBtreesetString>, index: &Index, ) -> Result<()> { let mut iter = normalized_delta_data.into_stream_merger_iter()?; @@ -298,8 +297,8 @@ pub(crate) mod test_helpers { use crate::search::facet::get_highest_level; use crate::snapshot_tests::display_bitmap; use crate::update::del_add::{DelAdd, KvWriterDelAdd}; - use crate::update::index_documents::merge_deladd_cbo_roaring_bitmaps; - use crate::update::{FacetsUpdateIncrementalInner, MergeFn}; + use crate::update::index_documents::MergeDeladdCboRoaringBitmaps; + use crate::update::FacetsUpdateIncrementalInner; use crate::CboRoaringBitmapCodec; /// Utility function to generate a string whose position in a lexicographically @@ -484,7 +483,7 @@ pub(crate) mod test_helpers { } writer.finish().unwrap(); let reader = grenad::Reader::new(std::io::Cursor::new(new_data)).unwrap(); - let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn); + let mut builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps); builder.push(reader.into_cursor().unwrap()); let merger = builder.build(); diff --git a/milli/src/update/index_documents/enrich.rs b/milli/src/update/index_documents/enrich.rs index a93d6f9f1..85f871830 100644 --- a/milli/src/update/index_documents/enrich.rs +++ b/milli/src/update/index_documents/enrich.rs @@ -47,7 +47,7 @@ pub fn enrich_documents_batch( return match cursor.next_document()? 
{ Some(first_document) => Ok(Err(UserError::MissingDocumentId { primary_key: primary_key.to_string(), - document: obkv_to_object(&first_document, &documents_batch_index)?, + document: obkv_to_object(first_document, &documents_batch_index)?, })), None => unreachable!("Called with reader.is_empty()"), }; @@ -106,7 +106,7 @@ pub fn enrich_documents_batch( let mut count = 0; while let Some(document) = cursor.next_document()? { let document_id = match fetch_or_generate_document_id( - &document, + document, &documents_batch_index, primary_key, autogenerate_docids, diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index a939827d5..716e4dd6b 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -8,7 +8,7 @@ use obkv::{KvReader, KvWriterU16}; use roaring::RoaringBitmap; use serde_json::Value; -use super::helpers::{create_sorter, keep_latest_obkv, sorter_into_reader, GrenadParameters}; +use super::helpers::{create_sorter, sorter_into_reader, GrenadParameters, KeepLatestObkv}; use crate::error::{InternalError, SerializationError}; use crate::update::del_add::{del_add_from_two_obkvs, DelAdd, KvReaderDelAdd}; use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff}; @@ -35,7 +35,7 @@ pub fn extract_docid_word_positions( let mut documents_ids = RoaringBitmap::new(); let mut docid_word_positions_sorter = create_sorter( grenad::SortAlgorithm::Stable, - keep_latest_obkv, + KeepLatestObkv, indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, @@ -83,7 +83,7 @@ pub fn extract_docid_word_positions( let obkv = KvReader::::from_slice(value); // if the searchable fields didn't change, skip the searchable indexing for this document. 
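// NOTE: illustrative sketch, not part of this patch. With the strategies as unit
// structs, the extractor helpers become generic over `MF: MergeFunction` instead
// of taking a `MergeFn` pointer, so a call site now reads roughly:
//
//     let mut sorter = create_sorter(
//         grenad::SortAlgorithm::Stable,
//         KeepLatestObkv,                 // a strategy value, not a fn pointer
//         indexer.chunk_compression_type,
//         indexer.chunk_compression_level,
//         indexer.max_nb_chunks,
//         max_memory,
//     );
//     sorter.insert(key, value)?;        // duplicates are later resolved by `KeepLatestObkv`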
- if !force_reindexing && !searchable_fields_changed(&obkv, settings_diff) { + if !force_reindexing && !searchable_fields_changed(obkv, settings_diff) { continue; } @@ -98,7 +98,7 @@ pub fn extract_docid_word_positions( || { // deletions tokens_from_document( - &obkv, + obkv, &settings_diff.old, &del_tokenizer, max_positions_per_attributes, @@ -109,7 +109,7 @@ pub fn extract_docid_word_positions( || { // additions tokens_from_document( - &obkv, + obkv, &settings_diff.new, &add_tokenizer, max_positions_per_attributes, @@ -126,8 +126,8 @@ pub fn extract_docid_word_positions( // transforming two KV> into one KV>> value_buffer.clear(); del_add_from_two_obkvs( - &KvReader::::from_slice(del_obkv), - &KvReader::::from_slice(add_obkv), + KvReader::::from_slice(del_obkv), + KvReader::::from_slice(add_obkv), &mut value_buffer, )?; diff --git a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs index 478631dea..8a5a93270 100644 --- a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs @@ -4,7 +4,7 @@ use std::io::{self, BufReader}; use heed::{BytesDecode, BytesEncode}; use super::helpers::{ - create_sorter, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader, GrenadParameters, + create_sorter, sorter_into_reader, GrenadParameters, MergeDeladdCboRoaringBitmaps, }; use crate::heed_codec::facet::{ FacetGroupKey, FacetGroupKeyCodec, FieldDocIdFacetF64Codec, OrderedF64Codec, @@ -27,7 +27,7 @@ pub fn extract_facet_number_docids( let mut facet_number_docids_sorter = create_sorter( grenad::SortAlgorithm::Unstable, - merge_deladd_cbo_roaring_bitmaps, + MergeDeladdCboRoaringBitmaps, indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs index 7565b1ad1..f7bdcbb56 100644 --- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs @@ -15,7 +15,7 @@ use crate::heed_codec::{BEU16StrCodec, StrRefCodec}; use crate::localized_attributes_rules::LocalizedFieldIds; use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; use crate::update::index_documents::helpers::{ - merge_deladd_btreeset_string, merge_deladd_cbo_roaring_bitmaps, + MergeDeladdBtreesetString, MergeDeladdCboRoaringBitmaps, }; use crate::update::settings::InnerIndexSettingsDiff; use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH}; @@ -56,7 +56,7 @@ fn extract_facet_string_docids_document_update( let mut facet_string_docids_sorter = create_sorter( grenad::SortAlgorithm::Stable, - merge_deladd_cbo_roaring_bitmaps, + MergeDeladdCboRoaringBitmaps, indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, @@ -65,7 +65,7 @@ fn extract_facet_string_docids_document_update( let mut normalized_facet_string_docids_sorter = create_sorter( grenad::SortAlgorithm::Stable, - merge_deladd_btreeset_string, + MergeDeladdBtreesetString, indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, @@ -144,7 +144,7 @@ fn extract_facet_string_docids_settings( let mut facet_string_docids_sorter = create_sorter( grenad::SortAlgorithm::Stable, - merge_deladd_cbo_roaring_bitmaps, + MergeDeladdCboRoaringBitmaps, 
indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, @@ -153,7 +153,7 @@ fn extract_facet_string_docids_settings( let mut normalized_facet_string_docids_sorter = create_sorter( grenad::SortAlgorithm::Stable, - merge_deladd_btreeset_string, + MergeDeladdBtreesetString, indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index 7678e1edf..f7f447ca9 100644 --- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -1,10 +1,8 @@ -use std::borrow::Cow; use std::collections::{BTreeMap, BTreeSet}; use std::convert::TryInto; use std::fs::File; use std::io::{self, BufReader}; use std::mem::size_of; -use std::result::Result as StdResult; use bytemuck::bytes_of; use grenad::Sorter; @@ -15,13 +13,13 @@ use roaring::RoaringBitmap; use serde_json::{from_slice, Value}; use FilterableValues::{Empty, Null, Values}; -use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters}; +use super::helpers::{create_sorter, sorter_into_reader, GrenadParameters, KeepFirst}; use crate::error::InternalError; use crate::facet::value_encoding::f64_into_bytes; use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; use crate::update::index_documents::{create_writer, writer_into_reader}; use crate::update::settings::InnerIndexSettingsDiff; -use crate::{CboRoaringBitmapCodec, DocumentId, Error, FieldId, Result, MAX_FACET_VALUE_LENGTH}; +use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result, MAX_FACET_VALUE_LENGTH}; /// The length of the elements that are always in the buffer when inserting new values. const TRUNCATE_SIZE: usize = size_of::() + size_of::(); @@ -50,7 +48,7 @@ pub fn extract_fid_docid_facet_values( let mut fid_docid_facet_numbers_sorter = create_sorter( grenad::SortAlgorithm::Stable, - keep_first, + KeepFirst, indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, @@ -59,7 +57,7 @@ pub fn extract_fid_docid_facet_values( let mut fid_docid_facet_strings_sorter = create_sorter( grenad::SortAlgorithm::Stable, - keep_first, + KeepFirst, indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, @@ -330,15 +328,12 @@ fn truncate_str(s: &str) -> &str { /// Computes the diff between both Del and Add numbers and /// only inserts the parts that differ in the sorter. -fn insert_numbers_diff( - fid_docid_facet_numbers_sorter: &mut Sorter, +fn insert_numbers_diff( + fid_docid_facet_numbers_sorter: &mut Sorter, key_buffer: &mut Vec, mut del_numbers: Vec, mut add_numbers: Vec, -) -> Result<()> -where - MF: for<'a> Fn(&[u8], &[Cow<'a, [u8]>]) -> StdResult, Error>, -{ +) -> Result<()> { // We sort and dedup the float numbers del_numbers.sort_unstable_by_key(|f| OrderedFloat(*f)); add_numbers.sort_unstable_by_key(|f| OrderedFloat(*f)); @@ -390,15 +385,12 @@ where /// Computes the diff between both Del and Add strings and /// only inserts the parts that differ in the sorter. 
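// NOTE: illustrative sketch, not part of this patch. `insert_numbers_diff` and
// `insert_strings_diff` now take a concrete `Sorter<KeepFirst>` instead of being
// generic over a merge closure; `KeepFirst` resolves duplicate keys by keeping
// the first value pushed, roughly:
//
//     use std::borrow::Cow;
//     let values = [Cow::Borrowed(&b"first"[..]), Cow::Borrowed(&b"second"[..])];
//     let kept = KeepFirst.merge(b"some-facet-key", &values)?; // yields b"first"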
-fn insert_strings_diff( - fid_docid_facet_strings_sorter: &mut Sorter, +fn insert_strings_diff( + fid_docid_facet_strings_sorter: &mut Sorter, key_buffer: &mut Vec, mut del_strings: Vec<(String, String)>, mut add_strings: Vec<(String, String)>, -) -> Result<()> -where - MF: for<'a> Fn(&[u8], &[Cow<'a, [u8]>]) -> StdResult, Error>, -{ +) -> Result<()> { // We sort and dedup the normalized and original strings del_strings.sort_unstable(); add_strings.sort_unstable(); diff --git a/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs index 291dcc014..784de5d94 100644 --- a/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs +++ b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs @@ -4,8 +4,8 @@ use std::io::{self, BufReader}; use obkv::KvReaderU16; use super::helpers::{ - create_sorter, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader, try_split_array_at, - GrenadParameters, + create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters, + MergeDeladdCboRoaringBitmaps, }; use crate::error::SerializationError; use crate::index::db_name::DOCID_WORD_POSITIONS; @@ -30,7 +30,7 @@ pub fn extract_fid_word_count_docids( let mut fid_word_count_docids_sorter = create_sorter( grenad::SortAlgorithm::Unstable, - merge_deladd_cbo_roaring_bitmaps, + MergeDeladdCboRoaringBitmaps, indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, diff --git a/milli/src/update/index_documents/extract/extract_geo_points.rs b/milli/src/update/index_documents/extract/extract_geo_points.rs index fcf102eeb..84f5e556b 100644 --- a/milli/src/update/index_documents/extract/extract_geo_points.rs +++ b/milli/src/update/index_documents/extract/extract_geo_points.rs @@ -40,11 +40,9 @@ pub fn extract_geo_points( }; // extract old version - let del_lat_lng = - extract_lat_lng(&obkv, &settings_diff.old, DelAdd::Deletion, document_id)?; + let del_lat_lng = extract_lat_lng(obkv, &settings_diff.old, DelAdd::Deletion, document_id)?; // extract new version - let add_lat_lng = - extract_lat_lng(&obkv, &settings_diff.new, DelAdd::Addition, document_id)?; + let add_lat_lng = extract_lat_lng(obkv, &settings_diff.new, DelAdd::Addition, document_id)?; if del_lat_lng != add_lat_lng { let mut obkv = KvWriterDelAdd::memory(); diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs index a14f39e01..70db9d759 100644 --- a/milli/src/update/index_documents/extract/extract_word_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_docids.rs @@ -7,8 +7,8 @@ use obkv::KvReaderU16; use roaring::RoaringBitmap; use super::helpers::{ - create_sorter, create_writer, merge_deladd_cbo_roaring_bitmaps, try_split_array_at, - writer_into_reader, GrenadParameters, + create_sorter, create_writer, try_split_array_at, writer_into_reader, GrenadParameters, + MergeDeladdCboRoaringBitmaps, }; use crate::error::SerializationError; use crate::heed_codec::StrBEU16Codec; @@ -16,7 +16,6 @@ use crate::index::db_name::DOCID_WORD_POSITIONS; use crate::update::del_add::{is_noop_del_add_obkv, DelAdd, KvReaderDelAdd, KvWriterDelAdd}; use crate::update::index_documents::helpers::sorter_into_reader; use crate::update::settings::InnerIndexSettingsDiff; -use crate::update::MergeFn; use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result}; /// Extracts the word and the documents ids where this 
word appear. @@ -40,7 +39,7 @@ pub fn extract_word_docids( let mut word_fid_docids_sorter = create_sorter( grenad::SortAlgorithm::Unstable, - merge_deladd_cbo_roaring_bitmaps, + MergeDeladdCboRoaringBitmaps, indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, @@ -94,7 +93,7 @@ pub fn extract_word_docids( let mut word_docids_sorter = create_sorter( grenad::SortAlgorithm::Unstable, - merge_deladd_cbo_roaring_bitmaps, + MergeDeladdCboRoaringBitmaps, indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, @@ -103,7 +102,7 @@ pub fn extract_word_docids( let mut exact_word_docids_sorter = create_sorter( grenad::SortAlgorithm::Unstable, - merge_deladd_cbo_roaring_bitmaps, + MergeDeladdCboRoaringBitmaps, indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, @@ -163,7 +162,7 @@ fn words_into_sorter( key_buffer: &mut Vec, del_words: &BTreeSet>, add_words: &BTreeSet>, - word_fid_docids_sorter: &mut grenad::Sorter, + word_fid_docids_sorter: &mut grenad::Sorter, ) -> Result<()> { use itertools::merge_join_by; use itertools::EitherOrBoth::{Both, Left, Right}; diff --git a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs index 01344563f..705a5c96f 100644 --- a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs @@ -6,8 +6,8 @@ use std::{cmp, io}; use obkv::KvReaderU16; use super::helpers::{ - create_sorter, create_writer, merge_deladd_cbo_roaring_bitmaps, try_split_array_at, - writer_into_reader, GrenadParameters, MergeFn, + create_sorter, create_writer, try_split_array_at, writer_into_reader, GrenadParameters, + MergeDeladdCboRoaringBitmaps, }; use crate::error::SerializationError; use crate::index::db_name::DOCID_WORD_POSITIONS; @@ -44,7 +44,7 @@ pub fn extract_word_pair_proximity_docids( .map(|_| { create_sorter( grenad::SortAlgorithm::Unstable, - merge_deladd_cbo_roaring_bitmaps, + MergeDeladdCboRoaringBitmaps, indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, @@ -197,7 +197,7 @@ fn document_word_positions_into_sorter( document_id: DocumentId, del_word_pair_proximity: &BTreeMap<(String, String), u8>, add_word_pair_proximity: &BTreeMap<(String, String), u8>, - word_pair_proximity_docids_sorters: &mut [grenad::Sorter], + word_pair_proximity_docids_sorters: &mut [grenad::Sorter], ) -> Result<()> { use itertools::merge_join_by; use itertools::EitherOrBoth::{Both, Left, Right}; diff --git a/milli/src/update/index_documents/extract/extract_word_position_docids.rs b/milli/src/update/index_documents/extract/extract_word_position_docids.rs index 7f14d6075..bee510bfb 100644 --- a/milli/src/update/index_documents/extract/extract_word_position_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_position_docids.rs @@ -5,14 +5,13 @@ use std::io::{self, BufReader}; use obkv::KvReaderU16; use super::helpers::{ - create_sorter, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader, try_split_array_at, - GrenadParameters, + create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters, + MergeDeladdCboRoaringBitmaps, }; use crate::error::SerializationError; use crate::index::db_name::DOCID_WORD_POSITIONS; use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; use crate::update::settings::InnerIndexSettingsDiff; -use 
crate::update::MergeFn; use crate::{bucketed_position, DocumentId, Result}; /// Extracts the word positions and the documents ids where this word appear. @@ -29,7 +28,7 @@ pub fn extract_word_position_docids( let mut word_position_docids_sorter = create_sorter( grenad::SortAlgorithm::Unstable, - merge_deladd_cbo_roaring_bitmaps, + MergeDeladdCboRoaringBitmaps, indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, @@ -100,7 +99,7 @@ fn words_position_into_sorter( key_buffer: &mut Vec, del_word_positions: &BTreeSet<(u16, Vec)>, add_word_positions: &BTreeSet<(u16, Vec)>, - word_position_docids_sorter: &mut grenad::Sorter, + word_position_docids_sorter: &mut grenad::Sorter, ) -> Result<()> { use itertools::merge_join_by; use itertools::EitherOrBoth::{Both, Left, Right}; diff --git a/milli/src/update/index_documents/helpers/grenad_helpers.rs b/milli/src/update/index_documents/helpers/grenad_helpers.rs index 44009f2fa..1f8f7eddf 100644 --- a/milli/src/update/index_documents/helpers/grenad_helpers.rs +++ b/milli/src/update/index_documents/helpers/grenad_helpers.rs @@ -1,11 +1,10 @@ -use std::borrow::Cow; use std::fs::File; use std::io::{self, BufReader, BufWriter, Seek}; -use grenad::{CompressionType, Sorter}; +use grenad::{CompressionType, MergeFunction, Sorter}; use heed::types::Bytes; -use super::{ClonableMmap, MergeFn}; +use super::ClonableMmap; use crate::update::index_documents::valid_lmdb_key; use crate::Result; @@ -31,14 +30,14 @@ pub fn create_writer( /// A helper function that creates a grenad sorter /// with the given parameters. The max memory is /// clamped to something reasonable. -pub fn create_sorter( +pub fn create_sorter( sort_algorithm: grenad::SortAlgorithm, - merge: MergeFn, + merge: MF, chunk_compression_type: grenad::CompressionType, chunk_compression_level: Option, max_nb_chunks: Option, max_memory: Option, -) -> grenad::Sorter { +) -> grenad::Sorter { let mut builder = grenad::Sorter::builder(merge); builder.chunk_compression_type(chunk_compression_type); if let Some(level) = chunk_compression_level { @@ -57,10 +56,14 @@ pub fn create_sorter( } #[tracing::instrument(level = "trace", skip_all, target = "indexing::grenad")] -pub fn sorter_into_reader( - sorter: grenad::Sorter, +pub fn sorter_into_reader( + sorter: grenad::Sorter, indexer: GrenadParameters, -) -> Result>> { +) -> Result>> +where + MF: MergeFunction, + crate::Error: From, +{ let mut writer = create_writer( indexer.chunk_compression_type, indexer.chunk_compression_level, @@ -169,8 +172,8 @@ pub fn grenad_obkv_into_chunks( /// Write provided sorter in database using serialize_value function. /// merge_values function is used if an entry already exist in the database. #[tracing::instrument(level = "trace", skip_all, target = "indexing::grenad")] -pub fn write_sorter_into_database( - sorter: Sorter, +pub fn write_sorter_into_database( + sorter: Sorter, database: &heed::Database, wtxn: &mut heed::RwTxn<'_>, index_is_empty: bool, @@ -180,6 +183,8 @@ pub fn write_sorter_into_database( where FS: for<'a> Fn(&'a [u8], &'a mut Vec) -> Result<&'a [u8]>, FM: for<'a> Fn(&[u8], &[u8], &'a mut Vec) -> Result>, + MF: MergeFunction, + crate::Error: From, { let mut buffer = Vec::new(); let database = database.remap_types::(); @@ -207,8 +212,3 @@ where Ok(()) } - -/// Used when trying to merge readers, but you don't actually care about the values. 
-pub fn merge_ignore_values<'a>(_key: &[u8], _values: &[Cow<'a, [u8]>]) -> Result> { - Ok(Cow::Owned(Vec::new())) -} diff --git a/milli/src/update/index_documents/helpers/merge_functions.rs b/milli/src/update/index_documents/helpers/merge_functions.rs index 51fa4e086..ab8a09a60 100644 --- a/milli/src/update/index_documents/helpers/merge_functions.rs +++ b/milli/src/update/index_documents/helpers/merge_functions.rs @@ -3,6 +3,8 @@ use std::collections::BTreeSet; use std::io; use std::result::Result as StdResult; +use either::Either; +use grenad::MergeFunction; use roaring::RoaringBitmap; use crate::heed_codec::CboRoaringBitmapCodec; @@ -10,7 +12,8 @@ use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; use crate::update::index_documents::transform::Operation; use crate::Result; -pub type MergeFn = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> Result>; +pub type EitherObkvMerge = + Either; pub fn serialize_roaring_bitmap(bitmap: &RoaringBitmap, buffer: &mut Vec) -> io::Result<()> { buffer.clear(); @@ -18,30 +21,48 @@ pub fn serialize_roaring_bitmap(bitmap: &RoaringBitmap, buffer: &mut Vec) -> bitmap.serialize_into(buffer) } -pub fn merge_roaring_bitmaps<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result> { - if values.len() == 1 { - Ok(values[0].clone()) - } else { - let merged = values - .iter() - .map(AsRef::as_ref) - .map(RoaringBitmap::deserialize_from) - .map(StdResult::unwrap) - .reduce(|a, b| a | b) - .unwrap(); - let mut buffer = Vec::new(); - serialize_roaring_bitmap(&merged, &mut buffer)?; - Ok(Cow::Owned(buffer)) +pub struct MergeRoaringBitmaps; + +impl MergeFunction for MergeRoaringBitmaps { + type Error = crate::Error; + + fn merge<'a>(&self, _key: &[u8], values: &[Cow<'a, [u8]>]) -> Result> { + if values.len() == 1 { + Ok(values[0].clone()) + } else { + let merged = values + .iter() + .map(AsRef::as_ref) + .map(RoaringBitmap::deserialize_from) + .map(StdResult::unwrap) + .reduce(|a, b| a | b) + .unwrap(); + let mut buffer = Vec::new(); + serialize_roaring_bitmap(&merged, &mut buffer)?; + Ok(Cow::Owned(buffer)) + } } } -pub fn keep_first<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result> { - Ok(values[0].clone()) +pub struct KeepFirst; + +impl MergeFunction for KeepFirst { + type Error = crate::Error; + + fn merge<'a>(&self, _key: &[u8], values: &[Cow<'a, [u8]>]) -> Result> { + Ok(values[0].clone()) + } } /// Only the last value associated with an id is kept. -pub fn keep_latest_obkv<'a>(_key: &[u8], obkvs: &[Cow<'a, [u8]>]) -> Result> { - Ok(obkvs.last().unwrap().clone()) +pub struct KeepLatestObkv; + +impl MergeFunction for KeepLatestObkv { + type Error = crate::Error; + + fn merge<'a>(&self, _key: &[u8], obkvs: &[Cow<'a, [u8]>]) -> Result> { + Ok(obkvs.last().unwrap().clone()) + } } pub fn merge_two_del_add_obkvs( @@ -145,65 +166,79 @@ fn inner_merge_del_add_obkvs<'a>( } /// Merge all the obkvs from the newest to the oldest. -pub fn obkvs_merge_additions_and_deletions<'a>( - _key: &[u8], - obkvs: &[Cow<'a, [u8]>], -) -> Result> { - inner_merge_del_add_obkvs(obkvs, true) +#[derive(Copy, Clone)] +pub struct ObkvsMergeAdditionsAndDeletions; + +impl MergeFunction for ObkvsMergeAdditionsAndDeletions { + type Error = crate::Error; + + fn merge<'a>(&self, _key: &[u8], obkvs: &[Cow<'a, [u8]>]) -> Result> { + inner_merge_del_add_obkvs(obkvs, true) + } } /// Merge all the obkvs deletions from the newest to the oldest and keep only the newest additions. 
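// NOTE: illustrative sketch, not part of this patch. Because the merge strategy
// is now part of the sorter's type, choosing between the two obkv strategies at
// runtime goes through the `EitherObkvMerge` alias (`either::Either` over the two
// structs), assuming grenad forwards `MergeFunction` through `Either`:
//
//     use either::Either::{Left, Right};
//     // `keep_full_history` is a hypothetical caller-chosen condition.
//     let merge: EitherObkvMerge = if keep_full_history {
//         Left(ObkvsMergeAdditionsAndDeletions)
//     } else {
//         Right(ObkvsKeepLastAdditionMergeDeletions)
//     };
//     let sorter = create_sorter(grenad::SortAlgorithm::Stable, merge, /* … */);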
-pub fn obkvs_keep_last_addition_merge_deletions<'a>( - _key: &[u8], - obkvs: &[Cow<'a, [u8]>], -) -> Result> { - inner_merge_del_add_obkvs(obkvs, false) +#[derive(Copy, Clone)] +pub struct ObkvsKeepLastAdditionMergeDeletions; + +impl MergeFunction for ObkvsKeepLastAdditionMergeDeletions { + type Error = crate::Error; + + fn merge<'a>(&self, _key: &[u8], obkvs: &[Cow<'a, [u8]>]) -> Result> { + inner_merge_del_add_obkvs(obkvs, false) + } } /// Do a union of all the CboRoaringBitmaps in the values. -pub fn merge_cbo_roaring_bitmaps<'a>( - _key: &[u8], - values: &[Cow<'a, [u8]>], -) -> Result> { - if values.len() == 1 { - Ok(values[0].clone()) - } else { - let mut vec = Vec::new(); - CboRoaringBitmapCodec::merge_into(values, &mut vec)?; - Ok(Cow::from(vec)) +pub struct MergeCboRoaringBitmaps; + +impl MergeFunction for MergeCboRoaringBitmaps { + type Error = crate::Error; + + fn merge<'a>(&self, _key: &[u8], values: &[Cow<'a, [u8]>]) -> Result> { + if values.len() == 1 { + Ok(values[0].clone()) + } else { + let mut vec = Vec::new(); + CboRoaringBitmapCodec::merge_into(values, &mut vec)?; + Ok(Cow::from(vec)) + } } } /// Do a union of CboRoaringBitmaps on both sides of a DelAdd obkv /// separately and outputs a new DelAdd with both unions. -pub fn merge_deladd_cbo_roaring_bitmaps<'a>( - _key: &[u8], - values: &[Cow<'a, [u8]>], -) -> Result> { - if values.len() == 1 { - Ok(values[0].clone()) - } else { - // Retrieve the bitmaps from both sides - let mut del_bitmaps_bytes = Vec::new(); - let mut add_bitmaps_bytes = Vec::new(); - for value in values { - let obkv = KvReaderDelAdd::from_slice(value); - if let Some(bitmap_bytes) = obkv.get(DelAdd::Deletion) { - del_bitmaps_bytes.push(bitmap_bytes); - } - if let Some(bitmap_bytes) = obkv.get(DelAdd::Addition) { - add_bitmaps_bytes.push(bitmap_bytes); - } - } +pub struct MergeDeladdCboRoaringBitmaps; - let mut output_deladd_obkv = KvWriterDelAdd::memory(); - let mut buffer = Vec::new(); - CboRoaringBitmapCodec::merge_into(del_bitmaps_bytes, &mut buffer)?; - output_deladd_obkv.insert(DelAdd::Deletion, &buffer)?; - buffer.clear(); - CboRoaringBitmapCodec::merge_into(add_bitmaps_bytes, &mut buffer)?; - output_deladd_obkv.insert(DelAdd::Addition, &buffer)?; - output_deladd_obkv.into_inner().map(Cow::from).map_err(Into::into) +impl MergeFunction for MergeDeladdCboRoaringBitmaps { + type Error = crate::Error; + + fn merge<'a>(&self, _key: &[u8], values: &[Cow<'a, [u8]>]) -> Result> { + if values.len() == 1 { + Ok(values[0].clone()) + } else { + // Retrieve the bitmaps from both sides + let mut del_bitmaps_bytes = Vec::new(); + let mut add_bitmaps_bytes = Vec::new(); + for value in values { + let obkv = KvReaderDelAdd::from_slice(value); + if let Some(bitmap_bytes) = obkv.get(DelAdd::Deletion) { + del_bitmaps_bytes.push(bitmap_bytes); + } + if let Some(bitmap_bytes) = obkv.get(DelAdd::Addition) { + add_bitmaps_bytes.push(bitmap_bytes); + } + } + + let mut output_deladd_obkv = KvWriterDelAdd::memory(); + let mut buffer = Vec::new(); + CboRoaringBitmapCodec::merge_into(del_bitmaps_bytes, &mut buffer)?; + output_deladd_obkv.insert(DelAdd::Deletion, &buffer)?; + buffer.clear(); + CboRoaringBitmapCodec::merge_into(add_bitmaps_bytes, &mut buffer)?; + output_deladd_obkv.insert(DelAdd::Addition, &buffer)?; + output_deladd_obkv.into_inner().map(Cow::from).map_err(Into::into) + } } } @@ -225,37 +260,55 @@ pub fn merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap<'a>( /// Do a union of BtreeSet on both sides of a DelAdd obkv /// separately and outputs a new DelAdd 
with both unions. -pub fn merge_deladd_btreeset_string<'a>( - _key: &[u8], - values: &[Cow<'a, [u8]>], -) -> Result> { - if values.len() == 1 { - Ok(values[0].clone()) - } else { - // Retrieve the bitmaps from both sides - let mut del_set = BTreeSet::new(); - let mut add_set = BTreeSet::new(); - for value in values { - let obkv = KvReaderDelAdd::from_slice(value); - if let Some(bytes) = obkv.get(DelAdd::Deletion) { - let set = serde_json::from_slice::>(bytes).unwrap(); - for value in set { - del_set.insert(value); - } - } - if let Some(bytes) = obkv.get(DelAdd::Addition) { - let set = serde_json::from_slice::>(bytes).unwrap(); - for value in set { - add_set.insert(value); - } - } - } +pub struct MergeDeladdBtreesetString; - let mut output_deladd_obkv = KvWriterDelAdd::memory(); - let del = serde_json::to_vec(&del_set).unwrap(); - output_deladd_obkv.insert(DelAdd::Deletion, &del)?; - let add = serde_json::to_vec(&add_set).unwrap(); - output_deladd_obkv.insert(DelAdd::Addition, &add)?; - output_deladd_obkv.into_inner().map(Cow::from).map_err(Into::into) +impl MergeFunction for MergeDeladdBtreesetString { + type Error = crate::Error; + + fn merge<'a>(&self, _key: &[u8], values: &[Cow<'a, [u8]>]) -> Result> { + if values.len() == 1 { + Ok(values[0].clone()) + } else { + // Retrieve the bitmaps from both sides + let mut del_set = BTreeSet::new(); + let mut add_set = BTreeSet::new(); + for value in values { + let obkv = KvReaderDelAdd::from_slice(value); + if let Some(bytes) = obkv.get(DelAdd::Deletion) { + let set = serde_json::from_slice::>(bytes).unwrap(); + for value in set { + del_set.insert(value); + } + } + if let Some(bytes) = obkv.get(DelAdd::Addition) { + let set = serde_json::from_slice::>(bytes).unwrap(); + for value in set { + add_set.insert(value); + } + } + } + + let mut output_deladd_obkv = KvWriterDelAdd::memory(); + let del = serde_json::to_vec(&del_set).unwrap(); + output_deladd_obkv.insert(DelAdd::Deletion, &del)?; + let add = serde_json::to_vec(&add_set).unwrap(); + output_deladd_obkv.insert(DelAdd::Addition, &add)?; + output_deladd_obkv.into_inner().map(Cow::from).map_err(Into::into) + } + } +} + +/// Used when trying to merge readers, but you don't actually care about the values. 
+pub struct MergeIgnoreValues; + +impl MergeFunction for MergeIgnoreValues { + type Error = crate::Error; + + fn merge<'a>( + &self, + _key: &[u8], + _values: &[Cow<'a, [u8]>], + ) -> std::result::Result, Self::Error> { + Ok(Cow::Owned(Vec::new())) } } diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs index 5d8f16fae..c188e324d 100644 --- a/milli/src/update/index_documents/helpers/mod.rs +++ b/milli/src/update/index_documents/helpers/mod.rs @@ -7,17 +7,8 @@ use std::convert::{TryFrom, TryInto}; pub use clonable_mmap::{ClonableMmap, CursorClonableMmap}; use fst::{IntoStreamer, Streamer}; -pub use grenad_helpers::{ - as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks, - merge_ignore_values, sorter_into_reader, write_sorter_into_database, writer_into_reader, - GrenadParameters, -}; -pub use merge_functions::{ - keep_first, keep_latest_obkv, merge_cbo_roaring_bitmaps, merge_deladd_btreeset_string, - merge_deladd_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, - merge_roaring_bitmaps, obkvs_keep_last_addition_merge_deletions, - obkvs_merge_additions_and_deletions, MergeFn, -}; +pub use grenad_helpers::*; +pub use merge_functions::*; use crate::MAX_WORD_LENGTH; diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 87c6bc6db..0cee93bdc 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -27,13 +27,7 @@ use typed_chunk::{write_typed_chunk_into_index, ChunkAccumulator, TypedChunk}; use self::enrich::enrich_documents_batch; pub use self::enrich::{extract_finite_float_from_value, DocumentId}; -pub use self::helpers::{ - as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset, - fst_stream_into_vec, merge_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps, - merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, merge_roaring_bitmaps, - valid_lmdb_key, write_sorter_into_database, writer_into_reader, MergeFn, -}; -use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; +pub use self::helpers::*; pub use self::transform::{Transform, TransformOutput}; use crate::documents::{obkv_to_object, DocumentsBatchBuilder, DocumentsBatchReader}; use crate::error::{Error, InternalError, UserError}; @@ -605,7 +599,7 @@ where let cloneable_chunk = unsafe { as_cloneable_grenad(&word_docids_reader)? }; let word_docids = word_docids.get_or_insert_with(|| { - MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn) + MergerBuilder::new(MergeDeladdCboRoaringBitmaps) }); word_docids.push(cloneable_chunk.into_cursor()?); let cloneable_chunk = @@ -613,14 +607,14 @@ where let exact_word_docids = exact_word_docids.get_or_insert_with(|| { MergerBuilder::new( - merge_deladd_cbo_roaring_bitmaps as MergeFn, + MergeDeladdCboRoaringBitmaps, ) }); exact_word_docids.push(cloneable_chunk.into_cursor()?); let cloneable_chunk = unsafe { as_cloneable_grenad(&word_fid_docids_reader)? 
}; let word_fid_docids = word_fid_docids.get_or_insert_with(|| { - MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn) + MergerBuilder::new(MergeDeladdCboRoaringBitmaps) }); word_fid_docids.push(cloneable_chunk.into_cursor()?); TypedChunk::WordDocids { @@ -634,7 +628,7 @@ where let word_position_docids = word_position_docids.get_or_insert_with(|| { MergerBuilder::new( - merge_deladd_cbo_roaring_bitmaps as MergeFn, + MergeDeladdCboRoaringBitmaps, ) }); word_position_docids.push(cloneable_chunk.into_cursor()?); @@ -719,10 +713,10 @@ where )] pub fn execute_prefix_databases( self, - word_docids: Option>, - exact_word_docids: Option>, - word_position_docids: Option>, - word_fid_docids: Option>, + word_docids: Option>, + exact_word_docids: Option>, + word_position_docids: Option>, + word_fid_docids: Option>, ) -> Result<()> where FP: Fn(UpdateIndexingStep) + Sync, @@ -902,7 +896,7 @@ where )] fn execute_word_prefix_docids( txn: &mut heed::RwTxn<'_>, - merger: Merger, + merger: Merger, word_docids_db: Database, word_prefix_docids_db: Database, indexer_config: &IndexerConfig, diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index b9541e649..c3c48a6eb 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -5,6 +5,7 @@ use std::collections::{BTreeMap, HashMap, HashSet}; use std::fs::File; use std::io::{Read, Seek}; +use either::Either; use fxhash::FxHashMap; use itertools::Itertools; use obkv::{KvReader, KvReaderU16, KvWriter}; @@ -13,10 +14,10 @@ use serde_json::Value; use smartstring::SmartString; use super::helpers::{ - create_sorter, create_writer, keep_first, obkvs_keep_last_addition_merge_deletions, - obkvs_merge_additions_and_deletions, sorter_into_reader, MergeFn, + create_sorter, create_writer, sorter_into_reader, EitherObkvMerge, + ObkvsKeepLastAdditionMergeDeletions, ObkvsMergeAdditionsAndDeletions, }; -use super::{IndexDocumentsMethod, IndexerConfig}; +use super::{IndexDocumentsMethod, IndexerConfig, KeepFirst}; use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader}; use crate::error::{Error, InternalError, UserError}; use crate::index::{db_name, main_key}; @@ -59,8 +60,8 @@ pub struct Transform<'a, 'i> { // Both grenad follows the same format: // key | value // u32 | 1 byte for the Operation byte, the rest is the obkv of the document stored - original_sorter: grenad::Sorter, - flattened_sorter: grenad::Sorter, + original_sorter: grenad::Sorter, + flattened_sorter: grenad::Sorter, replaced_documents_ids: RoaringBitmap, new_documents_ids: RoaringBitmap, @@ -108,17 +109,19 @@ impl<'a, 'i> Transform<'a, 'i> { index_documents_method: IndexDocumentsMethod, _autogenerate_docids: bool, ) -> Result { + use IndexDocumentsMethod::{ReplaceDocuments, UpdateDocuments}; + // We must choose the appropriate merge function for when two or more documents // with the same user id must be merged or fully replaced in the same batch. let merge_function = match index_documents_method { - IndexDocumentsMethod::ReplaceDocuments => obkvs_keep_last_addition_merge_deletions, - IndexDocumentsMethod::UpdateDocuments => obkvs_merge_additions_and_deletions, + ReplaceDocuments => Either::Left(ObkvsKeepLastAdditionMergeDeletions), + UpdateDocuments => Either::Right(ObkvsMergeAdditionsAndDeletions), }; // We initialize the sorter with the user indexing settings. 
let original_sorter = create_sorter( grenad::SortAlgorithm::Stable, - merge_function, + merge_function.clone(), indexer_settings.chunk_compression_type, indexer_settings.chunk_compression_level, indexer_settings.max_nb_chunks, @@ -979,7 +982,7 @@ impl<'a, 'i> Transform<'a, 'i> { let mut original_sorter = if settings_diff.reindex_vectors() { Some(create_sorter( grenad::SortAlgorithm::Stable, - keep_first, + KeepFirst, self.indexer_settings.chunk_compression_type, self.indexer_settings.chunk_compression_level, self.indexer_settings.max_nb_chunks, @@ -1023,7 +1026,7 @@ impl<'a, 'i> Transform<'a, 'i> { if settings_diff.reindex_searchable() || settings_diff.reindex_facets() { Some(create_sorter( grenad::SortAlgorithm::Stable, - keep_first, + KeepFirst, self.indexer_settings.chunk_compression_type, self.indexer_settings.chunk_compression_level, self.indexer_settings.max_nb_chunks, @@ -1162,6 +1165,8 @@ fn drop_and_reuse(mut vec: Vec) -> Vec { #[cfg(test)] mod test { + use grenad::MergeFunction; + use super::*; #[test] @@ -1219,25 +1224,32 @@ mod test { .unwrap(); additive_doc_0_1.insert(0, Operation::Addition as u8); - let ret = obkvs_merge_additions_and_deletions(&[], &[Cow::from(additive_doc_0.as_slice())]) - .unwrap(); + let ret = MergeFunction::merge( + &ObkvsMergeAdditionsAndDeletions, + &[], + &[Cow::from(additive_doc_0.as_slice())], + ) + .unwrap(); assert_eq!(*ret, additive_doc_0); - let ret = obkvs_merge_additions_and_deletions( + let ret = MergeFunction::merge( + &ObkvsMergeAdditionsAndDeletions, &[], &[Cow::from(deletive_doc_0.as_slice()), Cow::from(additive_doc_0.as_slice())], ) .unwrap(); assert_eq!(*ret, del_add_doc_0); - let ret = obkvs_merge_additions_and_deletions( + let ret = MergeFunction::merge( + &ObkvsMergeAdditionsAndDeletions, &[], &[Cow::from(additive_doc_0.as_slice()), Cow::from(deletive_doc_0.as_slice())], ) .unwrap(); assert_eq!(*ret, deletive_doc_0); - let ret = obkvs_merge_additions_and_deletions( + let ret = MergeFunction::merge( + &ObkvsMergeAdditionsAndDeletions, &[], &[ Cow::from(additive_doc_1.as_slice()), @@ -1248,21 +1260,24 @@ mod test { .unwrap(); assert_eq!(*ret, del_add_doc_0); - let ret = obkvs_merge_additions_and_deletions( + let ret = MergeFunction::merge( + &ObkvsMergeAdditionsAndDeletions, &[], &[Cow::from(additive_doc_1.as_slice()), Cow::from(additive_doc_0.as_slice())], ) .unwrap(); assert_eq!(*ret, additive_doc_0_1); - let ret = obkvs_keep_last_addition_merge_deletions( + let ret = MergeFunction::merge( + &ObkvsKeepLastAdditionMergeDeletions, &[], &[Cow::from(additive_doc_1.as_slice()), Cow::from(additive_doc_0.as_slice())], ) .unwrap(); assert_eq!(*ret, additive_doc_0); - let ret = obkvs_keep_last_addition_merge_deletions( + let ret = MergeFunction::merge( + &ObkvsKeepLastAdditionMergeDeletions, &[], &[ Cow::from(deletive_doc_0.as_slice()), diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 9fe152348..592ace80f 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -4,18 +4,17 @@ use std::fs::File; use std::io::{self, BufReader}; use bytemuck::allocation::pod_collect_to_vec; -use grenad::{Merger, MergerBuilder}; +use grenad::{MergeFunction, Merger, MergerBuilder}; use heed::types::Bytes; use heed::{BytesDecode, RwTxn}; use obkv::{KvReader, KvWriter}; use roaring::RoaringBitmap; use super::helpers::{ - self, keep_first, merge_deladd_btreeset_string, merge_deladd_cbo_roaring_bitmaps, - 
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, merge_ignore_values, valid_lmdb_key, - CursorClonableMmap, + self, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, valid_lmdb_key, + CursorClonableMmap, KeepFirst, MergeDeladdBtreesetString, MergeDeladdCboRoaringBitmaps, + MergeIgnoreValues, }; -use super::MergeFn; use crate::external_documents_ids::{DocumentOperation, DocumentOperationKind}; use crate::facet::FacetType; use crate::index::db_name::DOCUMENTS; @@ -24,7 +23,7 @@ use crate::proximity::MAX_DISTANCE; use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvReaderDelAdd}; use crate::update::facet::FacetsUpdate; use crate::update::index_documents::helpers::{ - as_cloneable_grenad, keep_latest_obkv, try_split_array_at, + as_cloneable_grenad, try_split_array_at, KeepLatestObkv, }; use crate::update::settings::InnerIndexSettingsDiff; use crate::{ @@ -140,7 +139,7 @@ pub(crate) fn write_typed_chunk_into_index( let vectors_fid = fields_ids_map.id(crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME); - let mut builder = MergerBuilder::new(keep_latest_obkv as MergeFn); + let mut builder = MergerBuilder::new(KeepLatestObkv); for typed_chunk in typed_chunks { let TypedChunk::Documents(chunk) = typed_chunk else { unreachable!(); @@ -234,7 +233,7 @@ pub(crate) fn write_typed_chunk_into_index( tracing::trace_span!(target: "indexing::write_db", "field_id_word_count_docids"); let _entered = span.enter(); - let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn); + let mut builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps); for typed_chunk in typed_chunks { let TypedChunk::FieldIdWordCountDocids(chunk) = typed_chunk else { unreachable!(); @@ -257,13 +256,10 @@ pub(crate) fn write_typed_chunk_into_index( let span = tracing::trace_span!(target: "indexing::write_db", "word_docids"); let _entered = span.enter(); - let mut word_docids_builder = - MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn); - let mut exact_word_docids_builder = - MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn); - let mut word_fid_docids_builder = - MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn); - let mut fst_merger_builder = MergerBuilder::new(merge_ignore_values as MergeFn); + let mut word_docids_builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps); + let mut exact_word_docids_builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps); + let mut word_fid_docids_builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps); + let mut fst_merger_builder = MergerBuilder::new(MergeIgnoreValues); for typed_chunk in typed_chunks { let TypedChunk::WordDocids { word_docids_reader, @@ -328,7 +324,7 @@ pub(crate) fn write_typed_chunk_into_index( let span = tracing::trace_span!(target: "indexing::write_db", "word_position_docids"); let _entered = span.enter(); - let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn); + let mut builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps); for typed_chunk in typed_chunks { let TypedChunk::WordPositionDocids(chunk) = typed_chunk else { unreachable!(); @@ -352,7 +348,7 @@ pub(crate) fn write_typed_chunk_into_index( tracing::trace_span!(target: "indexing::write_db","field_id_facet_number_docids"); let _entered = span.enter(); - let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn); + let mut builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps); let mut data_size = 0; for typed_chunk in typed_chunks { let 
TypedChunk::FieldIdFacetNumberDocids(facet_id_number_docids) = typed_chunk @@ -374,10 +370,9 @@ pub(crate) fn write_typed_chunk_into_index( tracing::trace_span!(target: "indexing::write_db", "field_id_facet_string_docids"); let _entered = span.enter(); - let mut facet_id_string_builder = - MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn); + let mut facet_id_string_builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps); let mut normalized_facet_id_string_builder = - MergerBuilder::new(merge_deladd_btreeset_string as MergeFn); + MergerBuilder::new(MergeDeladdBtreesetString); let mut data_size = 0; for typed_chunk in typed_chunks { let TypedChunk::FieldIdFacetStringDocids(( @@ -411,7 +406,7 @@ pub(crate) fn write_typed_chunk_into_index( tracing::trace_span!(target: "indexing::write_db", "field_id_facet_exists_docids"); let _entered = span.enter(); - let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn); + let mut builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps); for typed_chunk in typed_chunks { let TypedChunk::FieldIdFacetExistsDocids(chunk) = typed_chunk else { unreachable!(); @@ -435,7 +430,7 @@ pub(crate) fn write_typed_chunk_into_index( tracing::trace_span!(target: "indexing::write_db", "field_id_facet_is_null_docids"); let _entered = span.enter(); - let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn); + let mut builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps); for typed_chunk in typed_chunks { let TypedChunk::FieldIdFacetIsNullDocids(chunk) = typed_chunk else { unreachable!(); @@ -458,7 +453,7 @@ pub(crate) fn write_typed_chunk_into_index( let span = tracing::trace_span!(target: "indexing::write_db", "field_id_facet_is_empty_docids"); let _entered = span.enter(); - let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn); + let mut builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps); for typed_chunk in typed_chunks { let TypedChunk::FieldIdFacetIsEmptyDocids(chunk) = typed_chunk else { unreachable!(); @@ -482,7 +477,7 @@ pub(crate) fn write_typed_chunk_into_index( tracing::trace_span!(target: "indexing::write_db", "word_pair_proximity_docids"); let _entered = span.enter(); - let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn); + let mut builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps); for typed_chunk in typed_chunks { let TypedChunk::WordPairProximityDocids(chunk) = typed_chunk else { unreachable!(); @@ -515,7 +510,7 @@ pub(crate) fn write_typed_chunk_into_index( tracing::trace_span!(target: "indexing::write_db", "field_id_docid_facet_numbers"); let _entered = span.enter(); - let mut builder = MergerBuilder::new(keep_first as MergeFn); + let mut builder = MergerBuilder::new(KeepFirst); for typed_chunk in typed_chunks { let TypedChunk::FieldIdDocidFacetNumbers(chunk) = typed_chunk else { unreachable!(); @@ -549,7 +544,7 @@ pub(crate) fn write_typed_chunk_into_index( tracing::trace_span!(target: "indexing::write_db", "field_id_docid_facet_strings"); let _entered = span.enter(); - let mut builder = MergerBuilder::new(keep_first as MergeFn); + let mut builder = MergerBuilder::new(KeepFirst); for typed_chunk in typed_chunks { let TypedChunk::FieldIdDocidFacetStrings(chunk) = typed_chunk else { unreachable!(); @@ -582,7 +577,7 @@ pub(crate) fn write_typed_chunk_into_index( let span = tracing::trace_span!(target: "indexing::write_db", "geo_points"); let _entered = span.enter(); - let mut builder = MergerBuilder::new(keep_first 
as MergeFn); + let mut builder = MergerBuilder::new(KeepFirst); for typed_chunk in typed_chunks { let TypedChunk::GeoPoints(chunk) = typed_chunk else { unreachable!(); @@ -619,9 +614,9 @@ pub(crate) fn write_typed_chunk_into_index( let span = tracing::trace_span!(target: "indexing::write_db", "vector_points"); let _entered = span.enter(); - let mut remove_vectors_builder = MergerBuilder::new(keep_first as MergeFn); - let mut manual_vectors_builder = MergerBuilder::new(keep_first as MergeFn); - let mut embeddings_builder = MergerBuilder::new(keep_first as MergeFn); + let mut remove_vectors_builder = MergerBuilder::new(KeepFirst); + let mut manual_vectors_builder = MergerBuilder::new(KeepFirst); + let mut embeddings_builder = MergerBuilder::new(KeepFirst); let mut add_to_user_provided = RoaringBitmap::new(); let mut remove_from_user_provided = RoaringBitmap::new(); let mut params = None; @@ -786,9 +781,13 @@ fn extract_geo_point(value: &[u8], docid: DocumentId) -> GeoPoint { GeoPoint::new(xyz_point, (docid, point)) } -fn merge_word_docids_reader_into_fst( - merger: Merger, -) -> Result>> { +fn merge_word_docids_reader_into_fst( + merger: Merger, +) -> Result>> +where + MF: MergeFunction, + crate::Error: From, +{ let mut iter = merger.into_stream_merger_iter()?; let mut builder = fst::SetBuilder::memory(); @@ -802,8 +801,8 @@ fn merge_word_docids_reader_into_fst( /// Write provided entries in database using serialize_value function. /// merge_values function is used if an entry already exist in the database. #[tracing::instrument(level = "trace", skip_all, target = "indexing::write_db")] -fn write_entries_into_database( - merger: Merger, +fn write_entries_into_database( + merger: Merger, database: &heed::Database, wtxn: &mut RwTxn<'_>, serialize_value: FS, @@ -813,6 +812,8 @@ where R: io::Read + io::Seek, FS: for<'a> Fn(&'a [u8], &'a mut Vec) -> Result<&'a [u8]>, FM: for<'a> Fn(&[u8], &[u8], &'a mut Vec) -> Result>, + MF: MergeFunction, + crate::Error: From, { let mut buffer = Vec::new(); let database = database.remap_types::(); @@ -839,13 +840,15 @@ where /// Akin to the `write_entries_into_database` function but specialized /// for the case when we only index additional searchable fields only. #[tracing::instrument(level = "trace", skip_all, target = "indexing::write_db")] -fn write_proximity_entries_into_database_additional_searchables( - merger: Merger, +fn write_proximity_entries_into_database_additional_searchables( + merger: Merger, database: &heed::Database, wtxn: &mut RwTxn<'_>, ) -> Result<()> where R: io::Read + io::Seek, + MF: MergeFunction, + crate::Error: From, { let mut iter = merger.into_stream_merger_iter()?; while let Some((key, value)) = iter.next()? 
{ diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs index adfc85174..9842002a4 100644 --- a/milli/src/update/mod.rs +++ b/milli/src/update/mod.rs @@ -2,10 +2,7 @@ pub use self::available_documents_ids::AvailableDocumentsIds; pub use self::clear_documents::ClearDocuments; pub use self::facet::bulk::FacetsUpdateBulk; pub use self::facet::incremental::FacetsUpdateIncrementalInner; -pub use self::index_documents::{ - merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, DocumentAdditionResult, DocumentId, - IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, MergeFn, -}; +pub use self::index_documents::*; pub use self::indexer_config::IndexerConfig; pub use self::settings::{validate_embedding_settings, Setting, Settings}; pub use self::update_step::UpdateIndexingStep; diff --git a/milli/src/update/new/channel.rs b/milli/src/update/new/channel.rs index 15239aa3e..0aeabcfa7 100644 --- a/milli/src/update/new/channel.rs +++ b/milli/src/update/new/channel.rs @@ -159,7 +159,7 @@ impl DocumentSender { } pub enum MergerOperation { - WordDocidsCursors(Vec>), + WordDocidsCursors(Vec>), } pub struct MergerReceiver(Receiver); diff --git a/milli/src/update/new/merge/del_add_roaring_bitmap_merger.rs b/milli/src/update/new/merge/del_add_roaring_bitmap_merger.rs index 5e6310170..5aa2c31f8 100644 --- a/milli/src/update/new/merge/del_add_roaring_bitmap_merger.rs +++ b/milli/src/update/new/merge/del_add_roaring_bitmap_merger.rs @@ -1,7 +1,7 @@ use std::borrow::Cow; use std::io; -use grenad2::MergeFunction; +use grenad::MergeFunction; use roaring::RoaringBitmap; use crate::update::del_add::DelAdd; diff --git a/milli/src/update/new/mod.rs b/milli/src/update/new/mod.rs index e32290c2b..fc587cb2a 100644 --- a/milli/src/update/new/mod.rs +++ b/milli/src/update/new/mod.rs @@ -198,7 +198,7 @@ mod indexer { } let items = Arc::new(ItemsPool::new(|| index.read_txn().map_err(crate::Error::from))); - Ok(docids_version_offsets.into_par_iter().map_with( + docids_version_offsets.into_par_iter().map_with( items, |context_pool, (external_docid, (internal_docid, operations))| { context_pool.with(|rtxn| match self.method { @@ -221,7 +221,9 @@ mod indexer { ), }) }, - )) + ); + + Ok(vec![].into_par_iter()) } } @@ -334,13 +336,13 @@ mod indexer { thread::scope(|s| { thread::Builder::new().name(S("indexer-extractors")).spawn_scoped(s, || { document_changes.into_par_iter().for_each(|_dc| ()); - }); + })?; // TODO manage the errors correctly thread::Builder::new().name(S("indexer-merger")).spawn_scoped(s, || { let rtxn = index.read_txn().unwrap(); merge_grenad_entries(merger_receiver, merger_sender, &rtxn, index).unwrap() - }); + })?; // TODO Split this code into another function for operation in writer_receiver { @@ -426,7 +428,7 @@ mod indexer { let sender = sender.word_docids(); let database = index.word_docids.remap_types::(); - let mut builder = grenad2::MergerBuilder::new(merge::DelAddRoaringBitmapMerger); + let mut builder = grenad::MergerBuilder::new(merge::DelAddRoaringBitmapMerger); builder.extend(cursors); /// TODO manage the error correctly let mut merger_iter = builder.build().into_stream_merger_iter().unwrap(); diff --git a/milli/src/update/word_prefix_docids.rs b/milli/src/update/word_prefix_docids.rs index 925635f80..f683146cf 100644 --- a/milli/src/update/word_prefix_docids.rs +++ b/milli/src/update/word_prefix_docids.rs @@ -6,9 +6,8 @@ use heed::Database; use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvWriterDelAdd}; use crate::update::index_documents::{ - create_sorter, 
merge_deladd_cbo_roaring_bitmaps, - merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, valid_lmdb_key, - write_sorter_into_database, CursorClonableMmap, MergeFn, + create_sorter, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, valid_lmdb_key, + write_sorter_into_database, CursorClonableMmap, MergeDeladdCboRoaringBitmaps, }; use crate::{CboRoaringBitmapCodec, Result}; @@ -47,7 +46,7 @@ impl<'t, 'i> WordPrefixDocids<'t, 'i> { )] pub fn execute( self, - new_word_docids: grenad::Merger, + new_word_docids: grenad::Merger, new_prefix_fst_words: &[String], common_prefix_fst_words: &[&[String]], del_prefix_fst_words: &HashSet>, @@ -56,7 +55,7 @@ impl<'t, 'i> WordPrefixDocids<'t, 'i> { // and write into it at the same time, therefore we write into another file. let mut prefix_docids_sorter = create_sorter( grenad::SortAlgorithm::Unstable, - merge_deladd_cbo_roaring_bitmaps, + MergeDeladdCboRoaringBitmaps, self.chunk_compression_type, self.chunk_compression_level, self.max_nb_chunks, @@ -139,7 +138,7 @@ impl<'t, 'i> WordPrefixDocids<'t, 'i> { fn write_prefixes_in_sorter( prefixes: &mut HashMap, Vec>>, - sorter: &mut grenad::Sorter, + sorter: &mut grenad::Sorter, ) -> Result<()> { for (key, data_slices) in prefixes.drain() { for data in data_slices { diff --git a/milli/src/update/words_prefix_integer_docids.rs b/milli/src/update/words_prefix_integer_docids.rs index 9b6aa21ae..28b9b1523 100644 --- a/milli/src/update/words_prefix_integer_docids.rs +++ b/milli/src/update/words_prefix_integer_docids.rs @@ -11,9 +11,8 @@ use crate::heed_codec::StrBEU16Codec; use crate::index::main_key::WORDS_PREFIXES_FST_KEY; use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvWriterDelAdd}; use crate::update::index_documents::{ - create_sorter, merge_deladd_cbo_roaring_bitmaps, - merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, valid_lmdb_key, - write_sorter_into_database, CursorClonableMmap, MergeFn, + create_sorter, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, valid_lmdb_key, + write_sorter_into_database, CursorClonableMmap, MergeDeladdCboRoaringBitmaps, }; use crate::{CboRoaringBitmapCodec, Result}; @@ -52,7 +51,7 @@ impl<'t, 'i> WordPrefixIntegerDocids<'t, 'i> { )] pub fn execute( self, - new_word_integer_docids: grenad::Merger, + new_word_integer_docids: grenad::Merger, new_prefix_fst_words: &[String], common_prefix_fst_words: &[&[String]], del_prefix_fst_words: &HashSet>, @@ -61,7 +60,7 @@ impl<'t, 'i> WordPrefixIntegerDocids<'t, 'i> { let mut prefix_integer_docids_sorter = create_sorter( grenad::SortAlgorithm::Unstable, - merge_deladd_cbo_roaring_bitmaps, + MergeDeladdCboRoaringBitmaps, self.chunk_compression_type, self.chunk_compression_level, self.max_nb_chunks, @@ -173,7 +172,7 @@ impl<'t, 'i> WordPrefixIntegerDocids<'t, 'i> { fn write_prefixes_in_sorter( prefixes: &mut HashMap, Vec>>, - sorter: &mut grenad::Sorter, + sorter: &mut grenad::Sorter, ) -> Result<()> { // TODO: Merge before insertion. 
for (key, data_slices) in prefixes.drain() { From 54f2eb4507401b241b858e54f153f688c9102734 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 30 Aug 2024 14:33:58 +0200 Subject: [PATCH 010/247] Remove duplication of grenad merger --- .../merge/del_add_roaring_bitmap_merger.rs | 61 ------------------- milli/src/update/new/merge/mod.rs | 3 - milli/src/update/new/mod.rs | 7 ++- 3 files changed, 4 insertions(+), 67 deletions(-) delete mode 100644 milli/src/update/new/merge/del_add_roaring_bitmap_merger.rs delete mode 100644 milli/src/update/new/merge/mod.rs diff --git a/milli/src/update/new/merge/del_add_roaring_bitmap_merger.rs b/milli/src/update/new/merge/del_add_roaring_bitmap_merger.rs deleted file mode 100644 index 5aa2c31f8..000000000 --- a/milli/src/update/new/merge/del_add_roaring_bitmap_merger.rs +++ /dev/null @@ -1,61 +0,0 @@ -use std::borrow::Cow; -use std::io; - -use grenad::MergeFunction; -use roaring::RoaringBitmap; - -use crate::update::del_add::DelAdd; -use crate::update::new::indexer::{KvReaderDelAdd, KvWriterDelAdd}; - -/// Do a union of CboRoaringBitmaps on both sides of a DelAdd obkv -/// separately and outputs a new DelAdd with both unions. -pub struct DelAddRoaringBitmapMerger; - -impl MergeFunction for DelAddRoaringBitmapMerger { - type Error = io::Error; - - fn merge<'a>( - &self, - _key: &[u8], - values: &[Cow<'a, [u8]>], - ) -> std::result::Result, Self::Error> { - if values.len() == 1 { - Ok(values[0].clone()) - } else { - // Retrieve the bitmaps from both sides - let mut del_bitmaps_bytes = Vec::new(); - let mut add_bitmaps_bytes = Vec::new(); - for value in values { - let obkv: &KvReaderDelAdd = value.as_ref().into(); - if let Some(bitmap_bytes) = obkv.get(DelAdd::Deletion) { - del_bitmaps_bytes.push(bitmap_bytes); - } - if let Some(bitmap_bytes) = obkv.get(DelAdd::Addition) { - add_bitmaps_bytes.push(bitmap_bytes); - } - } - - let mut output_deladd_obkv = KvWriterDelAdd::memory(); - - // Deletion - let mut buffer = Vec::new(); - let mut merged = RoaringBitmap::new(); - for bytes in del_bitmaps_bytes { - merged |= RoaringBitmap::deserialize_unchecked_from(bytes)?; - } - merged.serialize_into(&mut buffer)?; - output_deladd_obkv.insert(DelAdd::Deletion, &buffer)?; - - // Addition - buffer.clear(); - merged.clear(); - for bytes in add_bitmaps_bytes { - merged |= RoaringBitmap::deserialize_unchecked_from(bytes)?; - } - merged.serialize_into(&mut buffer)?; - output_deladd_obkv.insert(DelAdd::Addition, &buffer)?; - - output_deladd_obkv.into_inner().map(Cow::from).map_err(Into::into) - } - } -} diff --git a/milli/src/update/new/merge/mod.rs b/milli/src/update/new/merge/mod.rs deleted file mode 100644 index 6057b8d89..000000000 --- a/milli/src/update/new/merge/mod.rs +++ /dev/null @@ -1,3 +0,0 @@ -mod del_add_roaring_bitmap_merger; - -pub use del_add_roaring_bitmap_merger::DelAddRoaringBitmapMerger; diff --git a/milli/src/update/new/mod.rs b/milli/src/update/new/mod.rs index fc587cb2a..6dc600545 100644 --- a/milli/src/update/new/mod.rs +++ b/milli/src/update/new/mod.rs @@ -2,7 +2,6 @@ mod document_change; // mod extract; mod channel; mod items_pool; -mod merge; /// TODO remove this // mod global_fields_ids_map; @@ -38,7 +37,9 @@ mod indexer { }; use crate::update::del_add::DelAdd; use crate::update::new::channel::MergerOperation; - use crate::update::{AvailableDocumentsIds, IndexDocumentsMethod}; + use crate::update::{ + AvailableDocumentsIds, IndexDocumentsMethod, MergeDeladdCboRoaringBitmaps, + }; use crate::{ CboRoaringBitmapCodec, DocumentId, 
Error, FieldId, FieldsIdsMap, Index, InternalError, Result, UserError, @@ -428,7 +429,7 @@ mod indexer { let sender = sender.word_docids(); let database = index.word_docids.remap_types::(); - let mut builder = grenad::MergerBuilder::new(merge::DelAddRoaringBitmapMerger); + let mut builder = grenad::MergerBuilder::new(MergeDeladdCboRoaringBitmaps); builder.extend(cursors); /// TODO manage the error correctly let mut merger_iter = builder.build().into_stream_merger_iter().unwrap(); From 271ce91b3b8dfb15f77e156c6076d4b5b38446b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 30 Aug 2024 14:34:24 +0200 Subject: [PATCH 011/247] Add the rayon Threadpool to the index function parameter --- milli/src/update/new/mod.rs | 57 +++++++++++++++++++------------------ 1 file changed, 29 insertions(+), 28 deletions(-) diff --git a/milli/src/update/new/mod.rs b/milli/src/update/new/mod.rs index 6dc600545..02b61dde4 100644 --- a/milli/src/update/new/mod.rs +++ b/milli/src/update/new/mod.rs @@ -22,6 +22,7 @@ mod indexer { use heed::{RoTxn, RwTxn}; use memmap2::Mmap; use rayon::iter::{IntoParallelIterator, ParallelBridge, ParallelIterator}; + use rayon::ThreadPool; use roaring::RoaringBitmap; use serde_json::Value; @@ -31,7 +32,6 @@ mod indexer { }; use super::document_change::{Deletion, DocumentChange, Insertion, Update}; use super::items_pool::ItemsPool; - use super::merge; use crate::documents::{ obkv_to_object, DocumentIdExtractionError, DocumentsBatchReader, PrimaryKey, }; @@ -52,7 +52,7 @@ mod indexer { pub struct DocumentOperationIndexer { operations: Vec, - method: IndexDocumentsMethod, + index_documents_method: IndexDocumentsMethod, } enum Payload { @@ -81,7 +81,7 @@ mod indexer { impl DocumentOperationIndexer { pub fn new(method: IndexDocumentsMethod) -> Self { - Self { operations: Default::default(), method } + Self { operations: Default::default(), index_documents_method: method } } /// TODO please give me a type @@ -104,7 +104,7 @@ mod indexer { self, index: &'a Index, rtxn: &'a RoTxn, - mut fields_ids_map: FieldsIdsMap, + fields_ids_map: &'a mut FieldsIdsMap, primary_key: &'a PrimaryKey<'a>, ) -> Result>> + 'a> { let documents_ids = index.documents_ids(rtxn)?; @@ -198,33 +198,27 @@ mod indexer { } } - let items = Arc::new(ItemsPool::new(|| index.read_txn().map_err(crate::Error::from))); - docids_version_offsets.into_par_iter().map_with( - items, - |context_pool, (external_docid, (internal_docid, operations))| { - context_pool.with(|rtxn| match self.method { - IndexDocumentsMethod::ReplaceDocuments => merge_document_for_replacements( + Ok(docids_version_offsets.into_par_iter().map_with( + Arc::new(ItemsPool::new(|| index.read_txn().map_err(crate::Error::from))), + move |context_pool, (external_docid, (internal_docid, operations))| { + context_pool.with(|rtxn| { + use IndexDocumentsMethod as Idm; + let document_merge_function = match self.index_documents_method { + Idm::ReplaceDocuments => merge_document_for_replacements, + Idm::UpdateDocuments => merge_document_for_updates, + }; + + document_merge_function( rtxn, index, - &fields_ids_map, + fields_ids_map, internal_docid, external_docid, &operations, - ), - // TODO Remap the documents to match the db fields_ids_map - IndexDocumentsMethod::UpdateDocuments => merge_document_for_updates( - rtxn, - index, - &fields_ids_map, - internal_docid, - external_docid, - &operations, - ), + ) }) }, - ); - - Ok(vec![].into_par_iter()) + )) } } @@ -253,7 +247,7 @@ mod indexer { // process: "external_id_of", // }) // })?; - pub fn 
document_changes<'a, F>( + pub fn document_changes<'a>( self, index: &'a Index, fields: &'a FieldsIdsMap, @@ -263,7 +257,7 @@ mod indexer { Ok(self.to_delete.into_iter().par_bridge().map_with(items, |items, docid| { items.with(|rtxn| { let current = index.document(rtxn, docid)?; - let external_docid = match primary_key.document_id(¤t, fields)? { + let external_docid = match primary_key.document_id(current, fields)? { Ok(document_id) => Ok(document_id) as Result<_>, Err(_) => Err(InternalError::DocumentsError( crate::documents::Error::InvalidDocumentFormat, @@ -325,7 +319,12 @@ mod indexer { /// TODO return stats /// TODO take the rayon ThreadPool - pub fn index(wtxn: &mut RwTxn, index: &Index, document_changes: PI) -> Result<()> + pub fn index( + wtxn: &mut RwTxn, + index: &Index, + pool: &ThreadPool, + document_changes: PI, + ) -> Result<()> where PI: IntoParallelIterator> + Send, PI::Iter: Clone, @@ -336,7 +335,9 @@ mod indexer { thread::scope(|s| { thread::Builder::new().name(S("indexer-extractors")).spawn_scoped(s, || { - document_changes.into_par_iter().for_each(|_dc| ()); + pool.in_place_scope(|_s| { + document_changes.into_par_iter().for_each(|_dc| ()); + }) })?; // TODO manage the errors correctly From 6487a67f2b18c63c73a2fde04801b18c8ec95d23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 30 Aug 2024 15:06:50 +0200 Subject: [PATCH 012/247] Introduce the ConcurrentAvailableIds struct and rename the other to AvailableIds --- ...able_documents_ids.rs => available_ids.rs} | 16 ++--- milli/src/update/concurrent_available_ids.rs | 59 +++++++++++++++++++ milli/src/update/index_documents/transform.rs | 6 +- milli/src/update/mod.rs | 6 +- 4 files changed, 74 insertions(+), 13 deletions(-) rename milli/src/update/{available_documents_ids.rs => available_ids.rs} (74%) create mode 100644 milli/src/update/concurrent_available_ids.rs diff --git a/milli/src/update/available_documents_ids.rs b/milli/src/update/available_ids.rs similarity index 74% rename from milli/src/update/available_documents_ids.rs rename to milli/src/update/available_ids.rs index 3b05c5d6e..68e3dd5a6 100644 --- a/milli/src/update/available_documents_ids.rs +++ b/milli/src/update/available_ids.rs @@ -3,12 +3,12 @@ use std::ops::RangeInclusive; use roaring::bitmap::{IntoIter, RoaringBitmap}; -pub struct AvailableDocumentsIds { +pub struct AvailableIds { iter: Chain>, } -impl AvailableDocumentsIds { - pub fn from_documents_ids(docids: &RoaringBitmap) -> AvailableDocumentsIds { +impl AvailableIds { + pub fn new(docids: &RoaringBitmap) -> AvailableIds { match docids.max() { Some(last_id) => { let mut available = RoaringBitmap::from_iter(0..last_id); @@ -20,17 +20,17 @@ impl AvailableDocumentsIds { None => 1..=0, // empty range iterator }; - AvailableDocumentsIds { iter: available.into_iter().chain(iter) } + AvailableIds { iter: available.into_iter().chain(iter) } } None => { let empty = RoaringBitmap::new().into_iter(); - AvailableDocumentsIds { iter: empty.chain(0..=u32::MAX) } + AvailableIds { iter: empty.chain(0..=u32::MAX) } } } } } -impl Iterator for AvailableDocumentsIds { +impl Iterator for AvailableIds { type Item = u32; fn next(&mut self) -> Option { @@ -45,7 +45,7 @@ mod tests { #[test] fn empty() { let base = RoaringBitmap::new(); - let left = AvailableDocumentsIds::from_documents_ids(&base); + let left = AvailableIds::new(&base); let right = 0..=u32::MAX; left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r)); } @@ -58,7 +58,7 @@ mod tests { base.insert(100); base.insert(405); - let 
left = AvailableDocumentsIds::from_documents_ids(&base); + let left = AvailableIds::new(&base); let right = (0..=u32::MAX).filter(|&n| n != 0 && n != 10 && n != 100 && n != 405); left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r)); } diff --git a/milli/src/update/concurrent_available_ids.rs b/milli/src/update/concurrent_available_ids.rs new file mode 100644 index 000000000..f3b15ac45 --- /dev/null +++ b/milli/src/update/concurrent_available_ids.rs @@ -0,0 +1,59 @@ +use std::sync::atomic::{AtomicBool, AtomicU32, AtomicU64, Ordering}; + +use roaring::RoaringBitmap; + +/// A concurrent ID generate that will never return the same ID twice. +#[derive(Debug)] +pub struct ConcurrentAvailableIds { + /// The current tree node ID we should use if there is no other IDs available. + current: AtomicU32, + /// The total number of tree node IDs used. + used: AtomicU64, + + /// A list of IDs to exhaust before picking IDs from `current`. + available: RoaringBitmap, + /// The current Nth ID to select in the bitmap. + select_in_bitmap: AtomicU32, + /// Tells if you should look in the roaring bitmap or if all the IDs are already exhausted. + look_into_bitmap: AtomicBool, +} + +impl ConcurrentAvailableIds { + /// Creates an ID generator returning unique IDs, avoiding the specified used IDs. + pub fn new(used: RoaringBitmap) -> ConcurrentAvailableIds { + let last_id = used.max().map_or(0, |id| id + 1); + let used_ids = used.len(); + let available = RoaringBitmap::from_sorted_iter(0..last_id).unwrap() - used; + + ConcurrentAvailableIds { + current: AtomicU32::new(last_id), + used: AtomicU64::new(used_ids), + select_in_bitmap: AtomicU32::new(0), + look_into_bitmap: AtomicBool::new(!available.is_empty()), + available, + } + } + + /// Returns a new unique ID and increase the count of IDs used. + pub fn next(&self) -> Option { + if self.used.fetch_add(1, Ordering::Relaxed) > u32::MAX as u64 { + None + } else if self.look_into_bitmap.load(Ordering::Relaxed) { + let current = self.select_in_bitmap.fetch_add(1, Ordering::Relaxed); + match self.available.select(current) { + Some(id) => Some(id), + None => { + self.look_into_bitmap.store(false, Ordering::Relaxed); + Some(self.current.fetch_add(1, Ordering::Relaxed)) + } + } + } else { + Some(self.current.fetch_add(1, Ordering::Relaxed)) + } + } + + /// Returns the number of used ids in total. 
+ pub fn used(&self) -> u64 { + self.used.load(Ordering::Relaxed) + } +} diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index c3c48a6eb..49bada8e7 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -27,7 +27,7 @@ use crate::update::del_add::{ }; use crate::update::index_documents::GrenadParameters; use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff}; -use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; +use crate::update::{AvailableIds, UpdateIndexingStep}; use crate::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors}; use crate::vector::settings::{EmbedderAction, WriteBackToDocuments}; use crate::{ @@ -55,7 +55,7 @@ pub struct Transform<'a, 'i> { indexer_settings: &'a IndexerConfig, pub index_documents_method: IndexDocumentsMethod, - available_documents_ids: AvailableDocumentsIds, + available_documents_ids: AvailableIds, // Both grenad follows the same format: // key | value @@ -143,7 +143,7 @@ impl<'a, 'i> Transform<'a, 'i> { index, fields_ids_map: index.fields_ids_map(wtxn)?, indexer_settings, - available_documents_ids: AvailableDocumentsIds::from_documents_ids(&documents_ids), + available_documents_ids: AvailableIds::new(&documents_ids), original_sorter, flattened_sorter, index_documents_method, diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs index 9842002a4..c5e9272de 100644 --- a/milli/src/update/mod.rs +++ b/milli/src/update/mod.rs @@ -1,5 +1,6 @@ -pub use self::available_documents_ids::AvailableDocumentsIds; +pub use self::available_ids::AvailableIds; pub use self::clear_documents::ClearDocuments; +pub use self::concurrent_available_ids::ConcurrentAvailableIds; pub use self::facet::bulk::FacetsUpdateBulk; pub use self::facet::incremental::FacetsUpdateIncrementalInner; pub use self::index_documents::*; @@ -10,8 +11,9 @@ pub use self::word_prefix_docids::WordPrefixDocids; pub use self::words_prefix_integer_docids::WordPrefixIntegerDocids; pub use self::words_prefixes_fst::WordsPrefixesFst; -mod available_documents_ids; +mod available_ids; mod clear_documents; +mod concurrent_available_ids; pub(crate) mod del_add; pub(crate) mod facet; mod index_documents; From b625d31c7d411d57bc49f0d5aad76f2f90f809ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 30 Aug 2024 15:07:21 +0200 Subject: [PATCH 013/247] Introduce the PartialDumpIndexer indexer that generates document ids in parallel --- milli/src/update/new/mod.rs | 108 +++++++++++++++++++++++------------- 1 file changed, 68 insertions(+), 40 deletions(-) diff --git a/milli/src/update/new/mod.rs b/milli/src/update/new/mod.rs index 02b61dde4..3d9800657 100644 --- a/milli/src/update/new/mod.rs +++ b/milli/src/update/new/mod.rs @@ -21,6 +21,7 @@ mod indexer { use heed::types::Bytes; use heed::{RoTxn, RwTxn}; use memmap2::Mmap; + use obkv::KvWriter; use rayon::iter::{IntoParallelIterator, ParallelBridge, ParallelIterator}; use rayon::ThreadPool; use roaring::RoaringBitmap; @@ -35,14 +36,13 @@ mod indexer { use crate::documents::{ obkv_to_object, DocumentIdExtractionError, DocumentsBatchReader, PrimaryKey, }; + use crate::update::concurrent_available_ids::ConcurrentAvailableIds; use crate::update::del_add::DelAdd; use crate::update::new::channel::MergerOperation; - use crate::update::{ - AvailableDocumentsIds, IndexDocumentsMethod, MergeDeladdCboRoaringBitmaps, - }; + use crate::update::{AvailableIds, IndexDocumentsMethod, 
MergeDeladdCboRoaringBitmaps}; use crate::{ - CboRoaringBitmapCodec, DocumentId, Error, FieldId, FieldsIdsMap, Index, InternalError, - Result, UserError, + all_obkv_to_json, obkv_to_json, CboRoaringBitmapCodec, DocumentId, Error, FieldId, + FieldsIdsMap, Index, InternalError, Object, Result, UserError, }; pub type KvReaderFieldId = obkv::KvReader; @@ -108,7 +108,7 @@ mod indexer { primary_key: &'a PrimaryKey<'a>, ) -> Result>> + 'a> { let documents_ids = index.documents_ids(rtxn)?; - let mut available_docids = AvailableDocumentsIds::from_documents_ids(&documents_ids); + let mut available_docids = AvailableIds::new(&documents_ids); let mut docids_version_offsets = HashMap::::new(); for operation in self.operations { @@ -127,7 +127,7 @@ mod indexer { let mut offset: u32 = 0; while let Some(document) = batch_cursor.next_document()? { let external_document_id = - match primary_key.document_id(&document, &batch_index)? { + match primary_key.document_id(document, &batch_index)? { Ok(document_id) => Ok(document_id), Err(DocumentIdExtractionError::InvalidDocumentId( user_error, @@ -135,13 +135,13 @@ mod indexer { Err(DocumentIdExtractionError::MissingDocumentId) => { Err(UserError::MissingDocumentId { primary_key: primary_key.name().to_string(), - document: obkv_to_object(&document, &batch_index)?, + document: obkv_to_object(document, &batch_index)?, }) } Err(DocumentIdExtractionError::TooManyDocumentIds(_)) => { Err(UserError::TooManyDocumentIds { primary_key: primary_key.name().to_string(), - document: obkv_to_object(&document, &batch_index)?, + document: obkv_to_object(document, &batch_index)?, }) } }?; @@ -163,7 +163,7 @@ mod indexer { }; docids_version_offsets.insert( - external_document_id.into(), + external_document_id, (docid, vec![document_operation]), ); } @@ -275,43 +275,71 @@ mod indexer { } } - pub struct DumpIndexer; + pub struct PartialDumpIndexer { + iter: I, + } - impl DumpIndexer { - pub fn new() -> Self { - todo!() + impl PartialDumpIndexer + where + I: IntoIterator, + I::IntoIter: Send, + I::Item: Send, + { + pub fn new_from_jsonlines(iter: I) -> Self { + PartialDumpIndexer { iter } } - pub fn document_changes_from_json_iter( + /// Note for future self: + /// - the field ids map must already be valid so you must have to generate it beforehand. + /// - We should probably expose another method that generates the fields ids map from an iterator of JSON objects. + /// - We recommend sending chunks of documents in this `PartialDumpIndexer` we therefore need to create a custom take_while_size method (that doesn't drop items). 
+ pub fn document_changes<'a>( self, - iter: I, - index: &Index, - ) -> impl ParallelIterator + fields_ids_map: &'a FieldsIdsMap, + concurrent_available_ids: &'a ConcurrentAvailableIds, + primary_key: &'a PrimaryKey<'a>, + ) -> impl ParallelIterator>> + 'a where - I: IntoIterator, + // I don't like this, it will not fit in the future trait easily + I::IntoIter: 'a, { - // let items = Arc::new(ItemsPool::new(|| { - // let rtxn = index.read_txn()?; - // let fields = index.fields_ids_map(&rtxn)?; - // let primary_key = - // index.primary_key(&rtxn)?.ok_or(InternalError::DatabaseMissingEntry { - // db_name: db_name::MAIN, - // key: Some(main_key::PRIMARY_KEY_KEY), - // })?; - // let primary_key = PrimaryKey::new(primary_key, &fields).ok_or_else(|| { - // InternalError::FieldIdMapMissingEntry( - // crate::FieldIdMapMissingEntry::FieldName { - // field_name: primary_key.to_owned(), - // process: "external_id_of", - // }, - // ) - // })?; - // Ok(DeleteDocumentExternalDocumentIdGetter { rtxn, fields, primary_key }) - // as crate::Result<_> - // })); + self.iter.into_iter().par_bridge().map(|object| { + let docid = match concurrent_available_ids.next() { + Some(id) => id, + None => return Err(Error::UserError(UserError::DocumentLimitReached)), + }; - todo!(); - vec![].into_par_iter() + let mut writer = KvWriterFieldId::memory(); + object.iter().for_each(|(key, value)| { + let key = fields_ids_map.id(key).unwrap(); + /// TODO better error management + let value = serde_json::to_vec(&value).unwrap(); + writer.insert(key, value).unwrap(); + }); + + let document = writer.into_boxed(); + let external_docid = match primary_key.document_id(&document, fields_ids_map)? { + Ok(document_id) => Ok(document_id), + Err(DocumentIdExtractionError::InvalidDocumentId(user_error)) => { + Err(user_error) + } + Err(DocumentIdExtractionError::MissingDocumentId) => { + Err(UserError::MissingDocumentId { + primary_key: primary_key.name().to_string(), + document: all_obkv_to_json(&document, fields_ids_map)?, + }) + } + Err(DocumentIdExtractionError::TooManyDocumentIds(_)) => { + Err(UserError::TooManyDocumentIds { + primary_key: primary_key.name().to_string(), + document: all_obkv_to_json(&document, fields_ids_map)?, + }) + } + }?; + + let insertion = Insertion::create(docid, external_docid, document); + Ok(Some(DocumentChange::Insertion(insertion))) + }) } } From bb885a581001e3f299d62da05252d1a069d281c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sun, 1 Sep 2024 23:20:19 +0200 Subject: [PATCH 014/247] Fix the merge for roaring bitmap --- milli/src/update/new/mod.rs | 47 +++++++++++++++---------------------- 1 file changed, 19 insertions(+), 28 deletions(-) diff --git a/milli/src/update/new/mod.rs b/milli/src/update/new/mod.rs index 3d9800657..a6ac38abe 100644 --- a/milli/src/update/new/mod.rs +++ b/milli/src/update/new/mod.rs @@ -402,37 +402,28 @@ mod indexer { del: Option<&[u8]>, add: Option<&[u8]>, ) -> Result { - let bitmap = match current { - Some(current_bitmap_bytes) => { - let bitmap_without_del = match del { - Some(del_bytes) => { - let del_bitmap = CboRoaringBitmapCodec::deserialize_from(del_bytes)?; - CboRoaringBitmapCodec::intersection_with_serialized( - current_bitmap_bytes, - &del_bitmap, - )? 
- } - None => CboRoaringBitmapCodec::deserialize_from(current_bitmap_bytes)?, - }; + let current = current.map(CboRoaringBitmapCodec::deserialize_from).transpose()?; + let del = del.map(CboRoaringBitmapCodec::deserialize_from).transpose()?; + let add = add.map(CboRoaringBitmapCodec::deserialize_from).transpose()?; - match add { - Some(add_bytes) => { - let add = CboRoaringBitmapCodec::deserialize_from(add_bytes)?; - bitmap_without_del | add - } - None => bitmap_without_del, + match (current, del, add) { + (None, None, None) => Ok(Operation::Ignore), // but it's strange + (None, None, Some(add)) => Ok(Operation::Write(add)), + (None, Some(_del), None) => Ok(Operation::Ignore), // but it's strange + (None, Some(_del), Some(add)) => Ok(Operation::Write(add)), + (Some(_current), None, None) => Ok(Operation::Ignore), // but it's strange + (Some(current), None, Some(add)) => Ok(Operation::Write(current | add)), + (Some(current), Some(del), add) => { + let output = match add { + Some(add) => (current - del) | add, + None => current - del, + }; + if output.is_empty() { + Ok(Operation::Delete) + } else { + Ok(Operation::Write(output)) } } - None => match add { - Some(add_bytes) => CboRoaringBitmapCodec::deserialize_from(add_bytes)?, - None => return Ok(Operation::Ignore), - }, - }; - - if bitmap.is_empty() { - Ok(Operation::Delete) - } else { - Ok(Operation::Write(bitmap)) } } From e639ec79d1eb291803269c3091940d52ff0a5f43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 2 Sep 2024 10:42:19 +0200 Subject: [PATCH 015/247] Move the indexers into their own modules --- milli/src/update/new/channel.rs | 2 +- milli/src/update/new/document_change.rs | 2 +- .../update/new/indexer/document_deletion.rs | 53 ++ .../update/new/indexer/document_operation.rs | 325 +++++++++ milli/src/update/new/indexer/mod.rs | 82 +++ milli/src/update/new/indexer/partial_dump.rs | 73 ++ .../update/new/indexer/update_by_function.rs | 18 + milli/src/update/new/merger.rs | 97 +++ milli/src/update/new/mod.rs | 622 +----------------- 9 files changed, 668 insertions(+), 606 deletions(-) create mode 100644 milli/src/update/new/indexer/document_deletion.rs create mode 100644 milli/src/update/new/indexer/document_operation.rs create mode 100644 milli/src/update/new/indexer/mod.rs create mode 100644 milli/src/update/new/indexer/partial_dump.rs create mode 100644 milli/src/update/new/indexer/update_by_function.rs create mode 100644 milli/src/update/new/merger.rs diff --git a/milli/src/update/new/channel.rs b/milli/src/update/new/channel.rs index 0aeabcfa7..088303fb3 100644 --- a/milli/src/update/new/channel.rs +++ b/milli/src/update/new/channel.rs @@ -3,8 +3,8 @@ use std::fs::File; use crossbeam_channel::{IntoIter, Receiver, SendError, Sender}; use heed::types::Bytes; -use super::indexer::KvReaderFieldId; use super::StdResult; +use crate::update::new::KvReaderFieldId; use crate::{DocumentId, Index}; /// The capacity of the channel is currently in number of messages. 
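The decision table introduced in the "Fix the merge for roaring bitmap" patch above reduces to plain set operations on the deserialized bitmaps: apply the deletions to the current bitmap, union the additions, and drop the key entirely when the result comes out empty. A minimal, self-contained sketch of that behaviour, assuming only the `roaring` crate (the variable names are illustrative and not part of the patch):

use roaring::RoaringBitmap;

fn main() {
    let current: RoaringBitmap = [1u32, 2, 3].into_iter().collect();
    let del: RoaringBitmap = [2u32].into_iter().collect();
    let add: RoaringBitmap = [4u32].into_iter().collect();

    // (Some(current), Some(del), Some(add)) arm: (current - del) | add,
    // so the entry is rewritten with {1, 3, 4}.
    let mut output = &current - &del;
    output |= &add;
    let expected: RoaringBitmap = [1u32, 3, 4].into_iter().collect();
    assert_eq!(output, expected);

    // (Some(current), Some(del), None) arm where everything is deleted:
    // the result is empty, so the merger returns Operation::Delete and the
    // key is removed from the database instead of being rewritten.
    let emptied = &current - &current;
    assert!(emptied.is_empty());
}

In the patch itself the three inputs arrive as CBO-serialized bytes and are first decoded with `CboRoaringBitmapCodec::deserialize_from` before this decision is taken.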
diff --git a/milli/src/update/new/document_change.rs b/milli/src/update/new/document_change.rs index 1764b6ee7..6f9d767cb 100644 --- a/milli/src/update/new/document_change.rs +++ b/milli/src/update/new/document_change.rs @@ -1,7 +1,7 @@ use heed::RoTxn; use obkv::KvReader; -use super::indexer::KvReaderFieldId; +use crate::update::new::KvReaderFieldId; use crate::{DocumentId, FieldId}; pub enum DocumentChange { diff --git a/milli/src/update/new/indexer/document_deletion.rs b/milli/src/update/new/indexer/document_deletion.rs new file mode 100644 index 000000000..24ba0c671 --- /dev/null +++ b/milli/src/update/new/indexer/document_deletion.rs @@ -0,0 +1,53 @@ +use std::sync::Arc; + +use rayon::iter::{ParallelBridge, ParallelIterator}; +use roaring::RoaringBitmap; + +use super::Indexer; +use crate::documents::PrimaryKey; +use crate::update::new::{Deletion, DocumentChange, ItemsPool}; +use crate::{FieldsIdsMap, Index, InternalError, Result}; + +pub struct DocumentDeletionIndexer { + pub to_delete: RoaringBitmap, +} + +impl DocumentDeletionIndexer { + pub fn new() -> Self { + Self { to_delete: Default::default() } + } + + pub fn delete_documents_by_docids(&mut self, docids: RoaringBitmap) { + self.to_delete |= docids; + } +} + +impl<'p> Indexer<'p> for DocumentDeletionIndexer { + type Parameter = (&'p Index, &'p FieldsIdsMap, &'p PrimaryKey<'p>); + + fn document_changes( + self, + param: Self::Parameter, + ) -> Result>> + 'p> { + let (index, fields, primary_key) = param; + let items = Arc::new(ItemsPool::new(|| index.read_txn().map_err(crate::Error::from))); + Ok(self.to_delete.into_iter().par_bridge().map_with(items, |items, docid| { + items.with(|rtxn| { + let current = index.document(rtxn, docid)?; + let external_docid = match primary_key.document_id(current, fields)? { + Ok(document_id) => Ok(document_id) as Result<_>, + Err(_) => Err(InternalError::DocumentsError( + crate::documents::Error::InvalidDocumentFormat, + ) + .into()), + }?; + + Ok(Some(DocumentChange::Deletion(Deletion::create( + docid, + external_docid, + current.boxed(), + )))) + }) + })) + } +} diff --git a/milli/src/update/new/indexer/document_operation.rs b/milli/src/update/new/indexer/document_operation.rs new file mode 100644 index 000000000..ba93915ea --- /dev/null +++ b/milli/src/update/new/indexer/document_operation.rs @@ -0,0 +1,325 @@ +use std::borrow::Cow; +use std::collections::{BTreeMap, HashMap}; +use std::fs::File; +use std::io::Cursor; +use std::sync::Arc; + +use heed::types::Bytes; +use heed::RoTxn; +use memmap2::Mmap; +use rayon::iter::{IntoParallelIterator, ParallelIterator}; + +use super::super::document_change::DocumentChange; +use super::super::items_pool::ItemsPool; +use super::Indexer; +use crate::documents::{ + obkv_to_object, DocumentIdExtractionError, DocumentsBatchReader, PrimaryKey, +}; +use crate::update::new::{Deletion, Insertion, KvReaderFieldId, KvWriterFieldId, Update}; +use crate::update::{AvailableIds, IndexDocumentsMethod}; +use crate::{DocumentId, Error, FieldsIdsMap, Index, Result, UserError}; + +pub struct DocumentOperationIndexer { + pub(crate) operations: Vec, + pub(crate) index_documents_method: IndexDocumentsMethod, +} + +pub enum Payload { + Addition(File), + Deletion(Vec), +} + +pub struct PayloadStats { + pub document_count: usize, + pub bytes: u64, +} + +pub enum DocumentOperation { + Addition(DocumentOffset), + Deletion, +} + +/// Represents an offset where a document lives +/// in an mmapped grenad reader file. +pub struct DocumentOffset { + /// The mmapped grenad reader file. 
+ pub content: Arc, // grenad::Reader + /// The offset of the document in the file. + pub offset: u32, +} + +impl DocumentOperationIndexer { + pub fn new(method: IndexDocumentsMethod) -> Self { + Self { operations: Default::default(), index_documents_method: method } + } + + /// TODO please give me a type + /// The payload is expected to be in the grenad format + pub fn add_documents(&mut self, payload: File) -> Result { + let reader = DocumentsBatchReader::from_reader(&payload)?; + let bytes = payload.metadata()?.len(); + let document_count = reader.documents_count() as usize; + + self.operations.push(Payload::Addition(payload)); + + Ok(PayloadStats { bytes, document_count }) + } + + pub fn delete_documents(&mut self, to_delete: Vec) { + self.operations.push(Payload::Deletion(to_delete)) + } +} + +impl<'p> Indexer<'p> for DocumentOperationIndexer { + type Parameter = (&'p Index, &'p RoTxn<'static>, &'p mut FieldsIdsMap, &'p PrimaryKey<'p>); + + fn document_changes( + self, + param: Self::Parameter, + ) -> Result>> + 'p> { + let (index, rtxn, fields_ids_map, primary_key) = param; + + let documents_ids = index.documents_ids(rtxn)?; + let mut available_docids = AvailableIds::new(&documents_ids); + let mut docids_version_offsets = HashMap::::new(); + + for operation in self.operations { + match operation { + Payload::Addition(payload) => { + let content = unsafe { Mmap::map(&payload).map(Arc::new)? }; + let cursor = Cursor::new(content.as_ref()); + let reader = DocumentsBatchReader::from_reader(cursor)?; + + let (mut batch_cursor, batch_index) = reader.into_cursor_and_fields_index(); + // TODO Fetch all document fields to fill the fields ids map + batch_index.iter().for_each(|(_, name)| { + fields_ids_map.insert(name); + }); + + let mut offset: u32 = 0; + while let Some(document) = batch_cursor.next_document()? { + let external_document_id = + match primary_key.document_id(document, &batch_index)? { + Ok(document_id) => Ok(document_id), + Err(DocumentIdExtractionError::InvalidDocumentId(user_error)) => { + Err(user_error) + } + Err(DocumentIdExtractionError::MissingDocumentId) => { + Err(UserError::MissingDocumentId { + primary_key: primary_key.name().to_string(), + document: obkv_to_object(document, &batch_index)?, + }) + } + Err(DocumentIdExtractionError::TooManyDocumentIds(_)) => { + Err(UserError::TooManyDocumentIds { + primary_key: primary_key.name().to_string(), + document: obkv_to_object(document, &batch_index)?, + }) + } + }?; + + let content = content.clone(); + let document_offset = DocumentOffset { content, offset }; + let document_operation = DocumentOperation::Addition(document_offset); + + match docids_version_offsets.get_mut(&external_document_id) { + None => { + let docid = match index + .external_documents_ids() + .get(rtxn, &external_document_id)? + { + Some(docid) => docid, + None => available_docids + .next() + .ok_or(Error::UserError(UserError::DocumentLimitReached))?, + }; + + docids_version_offsets.insert( + external_document_id, + (docid, vec![document_operation]), + ); + } + Some((_, offsets)) => offsets.push(document_operation), + } + offset += 1; + } + } + Payload::Deletion(to_delete) => { + for external_document_id in to_delete { + match docids_version_offsets.get_mut(&external_document_id) { + None => { + let docid = match index + .external_documents_ids() + .get(rtxn, &external_document_id)? 
+ { + Some(docid) => docid, + None => available_docids + .next() + .ok_or(Error::UserError(UserError::DocumentLimitReached))?, + }; + + docids_version_offsets.insert( + external_document_id, + (docid, vec![DocumentOperation::Deletion]), + ); + } + Some((_, offsets)) => offsets.push(DocumentOperation::Deletion), + } + } + } + } + } + + Ok(docids_version_offsets.into_par_iter().map_with( + Arc::new(ItemsPool::new(|| index.read_txn().map_err(crate::Error::from))), + move |context_pool, (external_docid, (internal_docid, operations))| { + context_pool.with(|rtxn| { + use IndexDocumentsMethod as Idm; + let document_merge_function = match self.index_documents_method { + Idm::ReplaceDocuments => merge_document_for_replacements, + Idm::UpdateDocuments => merge_document_for_updates, + }; + + document_merge_function( + rtxn, + index, + fields_ids_map, + internal_docid, + external_docid, + &operations, + ) + }) + }, + )) + } +} + +/// Reads the previous version of a document from the database, the new versions +/// in the grenad update files and merges them to generate a new boxed obkv. +/// +/// This function is only meant to be used when doing an update and not a replacement. +fn merge_document_for_updates( + rtxn: &RoTxn, + index: &Index, + fields_ids_map: &FieldsIdsMap, + docid: DocumentId, + external_docid: String, + operations: &[DocumentOperation], +) -> Result> { + let mut document = BTreeMap::<_, Cow<_>>::new(); + let current = index.documents.remap_data_type::().get(rtxn, &docid)?; + let current: Option<&KvReaderFieldId> = current.map(Into::into); + + if let Some(current) = current { + current.into_iter().for_each(|(k, v)| { + document.insert(k, v.into()); + }); + } + + let last_deletion = + operations.iter().rposition(|operation| matches!(operation, DocumentOperation::Deletion)); + + let operations = &operations[last_deletion.map_or(0, |i| i + 1)..]; + + if operations.is_empty() { + match current { + Some(current) => { + return Ok(Some(DocumentChange::Deletion(Deletion::create( + docid, + external_docid, + current.boxed(), + )))); + } + None => return Ok(None), + } + } + + for operation in operations { + let DocumentOffset { content, offset } = match operation { + DocumentOperation::Addition(offset) => offset, + DocumentOperation::Deletion => { + unreachable!("Deletion in document operations") + } + }; + + let reader = DocumentsBatchReader::from_reader(Cursor::new(content.as_ref()))?; + let (mut cursor, batch_index) = reader.into_cursor_and_fields_index(); + let update = cursor.get(*offset)?.expect("must exists"); + + update.into_iter().for_each(|(k, v)| { + let field_name = batch_index.name(k).unwrap(); + let id = fields_ids_map.id(field_name).unwrap(); + document.insert(id, v.to_vec().into()); + }); + } + + let mut writer = KvWriterFieldId::memory(); + document.into_iter().for_each(|(id, value)| writer.insert(id, value).unwrap()); + let new = writer.into_boxed(); + + match current { + Some(current) => { + let update = Update::create(docid, external_docid, current.boxed(), new); + Ok(Some(DocumentChange::Update(update))) + } + None => { + let insertion = Insertion::create(docid, external_docid, new); + Ok(Some(DocumentChange::Insertion(insertion))) + } + } +} + +/// Returns only the most recent version of a document based on the updates from the payloads. +/// +/// This function is only meant to be used when doing a replacement and not an update. 
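
The update rule in merge_document_for_updates above boils down to: find the last Deletion in the per-document operation list and only apply the operations that come after it. A tiny self-contained analogue of that rposition logic, with hypothetical operations (not part of the patch):

    #[derive(Debug)]
    enum Op { Add(&'static str), Delete }

    fn main() {
        let ops = [Op::Add("v1"), Op::Delete, Op::Add("v2"), Op::Add("v3")];
        let last_deletion = ops.iter().rposition(|op| matches!(op, Op::Delete));
        // Only the operations after the last deletion remain: Add("v2"), Add("v3").
        let applied = &ops[last_deletion.map_or(0, |i| i + 1)..];
        println!("{applied:?}");
    }
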
+fn merge_document_for_replacements( + rtxn: &RoTxn, + index: &Index, + fields_ids_map: &FieldsIdsMap, + docid: DocumentId, + external_docid: String, + operations: &[DocumentOperation], +) -> Result> { + let current = index.documents.remap_data_type::().get(rtxn, &docid)?; + let current: Option<&KvReaderFieldId> = current.map(Into::into); + + match operations.last() { + Some(DocumentOperation::Addition(DocumentOffset { content, offset })) => { + let reader = DocumentsBatchReader::from_reader(Cursor::new(content.as_ref()))?; + let (mut cursor, batch_index) = reader.into_cursor_and_fields_index(); + let update = cursor.get(*offset)?.expect("must exists"); + + let mut document_entries = Vec::new(); + update.into_iter().for_each(|(k, v)| { + let field_name = batch_index.name(k).unwrap(); + let id = fields_ids_map.id(field_name).unwrap(); + document_entries.push((id, v)); + }); + + document_entries.sort_unstable_by_key(|(id, _)| *id); + + let mut writer = KvWriterFieldId::memory(); + document_entries.into_iter().for_each(|(id, value)| writer.insert(id, value).unwrap()); + let new = writer.into_boxed(); + + match current { + Some(current) => { + let update = Update::create(docid, external_docid, current.boxed(), new); + Ok(Some(DocumentChange::Update(update))) + } + None => { + let insertion = Insertion::create(docid, external_docid, new); + Ok(Some(DocumentChange::Insertion(insertion))) + } + } + } + Some(DocumentOperation::Deletion) => match current { + Some(current) => { + let deletion = Deletion::create(docid, external_docid, current.boxed()); + Ok(Some(DocumentChange::Deletion(deletion))) + } + None => Ok(None), + }, + None => Ok(None), + } +} diff --git a/milli/src/update/new/indexer/mod.rs b/milli/src/update/new/indexer/mod.rs new file mode 100644 index 000000000..998793b49 --- /dev/null +++ b/milli/src/update/new/indexer/mod.rs @@ -0,0 +1,82 @@ +use std::thread; + +use big_s::S; +pub use document_deletion::DocumentDeletionIndexer; +pub use document_operation::DocumentOperationIndexer; +use heed::RwTxn; +pub use partial_dump::PartialDumpIndexer; +use rayon::iter::{IntoParallelIterator, ParallelIterator}; +use rayon::ThreadPool; +pub use update_by_function::UpdateByFunctionIndexer; + +use super::channel::{ + extractors_merger_channels, merger_writer_channels, EntryOperation, ExtractorsMergerChannels, + WriterOperation, +}; +use super::document_change::DocumentChange; +use super::merger::merge_grenad_entries; +use crate::{Index, Result}; + +mod document_deletion; +mod document_operation; +mod partial_dump; +mod update_by_function; + +pub trait Indexer<'p> { + type Parameter: 'p; + + fn document_changes( + self, + param: Self::Parameter, + ) -> Result>> + 'p>; +} + +/// This is the main function of this crate. +/// +/// Give it the output of the [`Indexer::document_changes`] method and it will execute it in the [`rayon::ThreadPool`]. 
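
For context, a hypothetical call site (assumed names, not part of this patch) showing how one of the indexers is meant to be wired into the `index` entry point below; the caller builds the rayon pool and the document-changes iterator itself (modulo the Option wrapper around DocumentChange that a later patch in this series removes):

    // let pool = rayon::ThreadPoolBuilder::new().build()?;
    // let indexer = DocumentDeletionIndexer::new();
    // let changes = indexer.document_changes((&index, &fields_ids_map, &primary_key))?;
    // indexer::index(&mut wtxn, &index, &pool, changes)?;
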
+/// +/// TODO return stats +/// TODO take the rayon ThreadPool +pub fn index( + wtxn: &mut RwTxn, + index: &Index, + pool: &ThreadPool, + document_changes: PI, +) -> Result<()> +where + PI: IntoParallelIterator> + Send, + PI::Iter: Clone, +{ + let (merger_sender, writer_receiver) = merger_writer_channels(100); + let ExtractorsMergerChannels { merger_receiver, deladd_cbo_roaring_bitmap_sender } = + extractors_merger_channels(100); + + thread::scope(|s| { + // TODO manage the errors correctly + thread::Builder::new().name(S("indexer-extractors")).spawn_scoped(s, || { + pool.in_place_scope(|_s| { + document_changes.into_par_iter().for_each(|_dc| ()); + }) + })?; + + // TODO manage the errors correctly + thread::Builder::new().name(S("indexer-merger")).spawn_scoped(s, || { + let rtxn = index.read_txn().unwrap(); + merge_grenad_entries(merger_receiver, merger_sender, &rtxn, index).unwrap() + })?; + + // TODO Split this code into another function + for operation in writer_receiver { + let database = operation.database(index); + match operation { + WriterOperation::WordDocids(operation) => match operation { + EntryOperation::Delete(e) => database.delete(wtxn, e.entry()).map(drop)?, + EntryOperation::Write(e) => database.put(wtxn, e.key(), e.value())?, + }, + WriterOperation::Document(e) => database.put(wtxn, &e.key(), e.content())?, + } + } + + Ok(()) + }) +} diff --git a/milli/src/update/new/indexer/partial_dump.rs b/milli/src/update/new/indexer/partial_dump.rs new file mode 100644 index 000000000..24ba70bcb --- /dev/null +++ b/milli/src/update/new/indexer/partial_dump.rs @@ -0,0 +1,73 @@ +use rayon::iter::{ParallelBridge, ParallelIterator}; + +use super::Indexer; +use crate::documents::{DocumentIdExtractionError, PrimaryKey}; +use crate::update::concurrent_available_ids::ConcurrentAvailableIds; +use crate::update::new::{DocumentChange, Insertion, KvWriterFieldId}; +use crate::{all_obkv_to_json, Error, FieldsIdsMap, Object, Result, UserError}; + +pub struct PartialDumpIndexer { + pub iter: I, +} + +impl PartialDumpIndexer { + pub fn new_from_jsonlines(iter: I) -> Self { + PartialDumpIndexer { iter } + } +} + +impl<'p, I> Indexer<'p> for PartialDumpIndexer +where + I: IntoIterator, + I::IntoIter: Send + 'p, + I::Item: Send, +{ + type Parameter = (&'p FieldsIdsMap, &'p ConcurrentAvailableIds, &'p PrimaryKey<'p>); + + /// Note for future self: + /// - the field ids map must already be valid so you must have to generate it beforehand. + /// - We should probably expose another method that generates the fields ids map from an iterator of JSON objects. + /// - We recommend sending chunks of documents in this `PartialDumpIndexer` we therefore need to create a custom take_while_size method (that doesn't drop items). + fn document_changes( + self, + param: Self::Parameter, + ) -> Result>> + 'p> { + let (fields_ids_map, concurrent_available_ids, primary_key) = param; + + Ok(self.iter.into_iter().par_bridge().map(|object| { + let docid = match concurrent_available_ids.next() { + Some(id) => id, + None => return Err(Error::UserError(UserError::DocumentLimitReached)), + }; + + let mut writer = KvWriterFieldId::memory(); + object.iter().for_each(|(key, value)| { + let key = fields_ids_map.id(key).unwrap(); + /// TODO better error management + let value = serde_json::to_vec(&value).unwrap(); + writer.insert(key, value).unwrap(); + }); + + let document = writer.into_boxed(); + let external_docid = match primary_key.document_id(&document, fields_ids_map)? 
{ + Ok(document_id) => Ok(document_id), + Err(DocumentIdExtractionError::InvalidDocumentId(user_error)) => Err(user_error), + Err(DocumentIdExtractionError::MissingDocumentId) => { + Err(UserError::MissingDocumentId { + primary_key: primary_key.name().to_string(), + document: all_obkv_to_json(&document, fields_ids_map)?, + }) + } + Err(DocumentIdExtractionError::TooManyDocumentIds(_)) => { + Err(UserError::TooManyDocumentIds { + primary_key: primary_key.name().to_string(), + document: all_obkv_to_json(&document, fields_ids_map)?, + }) + } + }?; + + let insertion = Insertion::create(docid, external_docid, document); + Ok(Some(DocumentChange::Insertion(insertion))) + })) + } +} diff --git a/milli/src/update/new/indexer/update_by_function.rs b/milli/src/update/new/indexer/update_by_function.rs new file mode 100644 index 000000000..c8c434b72 --- /dev/null +++ b/milli/src/update/new/indexer/update_by_function.rs @@ -0,0 +1,18 @@ +use rayon::iter::{IntoParallelIterator, ParallelIterator}; + +use super::Indexer; +use crate::update::new::DocumentChange; +use crate::Result; + +pub struct UpdateByFunctionIndexer; + +impl<'p> Indexer<'p> for UpdateByFunctionIndexer { + type Parameter = (); + + fn document_changes( + self, + _param: Self::Parameter, + ) -> Result>> + 'p> { + Ok(vec![].into_par_iter()) + } +} diff --git a/milli/src/update/new/merger.rs b/milli/src/update/new/merger.rs new file mode 100644 index 000000000..97f9e6ac6 --- /dev/null +++ b/milli/src/update/new/merger.rs @@ -0,0 +1,97 @@ +use heed::types::Bytes; +use heed::RoTxn; +use roaring::RoaringBitmap; + +use super::channel::{MergerReceiver, MergerSender}; +use super::KvReaderDelAdd; +use crate::update::del_add::DelAdd; +use crate::update::new::channel::MergerOperation; +use crate::update::MergeDeladdCboRoaringBitmaps; +use crate::{CboRoaringBitmapCodec, Index, Result}; + +/// TODO We must return some infos/stats +pub fn merge_grenad_entries( + receiver: MergerReceiver, + sender: MergerSender, + rtxn: &RoTxn, + index: &Index, +) -> Result<()> { + let mut buffer = Vec::new(); + + for merger_operation in receiver { + match merger_operation { + MergerOperation::WordDocidsCursors(cursors) => { + let sender = sender.word_docids(); + let database = index.word_docids.remap_types::(); + + let mut builder = grenad::MergerBuilder::new(MergeDeladdCboRoaringBitmaps); + builder.extend(cursors); + /// TODO manage the error correctly + let mut merger_iter = builder.build().into_stream_merger_iter().unwrap(); + + // TODO manage the error correctly + while let Some((key, deladd)) = merger_iter.next().unwrap() { + let current = database.get(rtxn, key)?; + let deladd: &KvReaderDelAdd = deladd.into(); + let del = deladd.get(DelAdd::Deletion); + let add = deladd.get(DelAdd::Addition); + + match merge_cbo_bitmaps(current, del, add)? { + Operation::Write(bitmap) => { + let value = cbo_serialize_into_vec(&bitmap, &mut buffer); + sender.write(key, value).unwrap(); + } + Operation::Delete => sender.delete(key).unwrap(), + Operation::Ignore => (), + } + } + } + } + } + + Ok(()) +} + +enum Operation { + Write(RoaringBitmap), + Delete, + Ignore, +} + +/// A function that merges the DelAdd CboRoaringBitmaps with the current bitmap. 
+fn merge_cbo_bitmaps( + current: Option<&[u8]>, + del: Option<&[u8]>, + add: Option<&[u8]>, +) -> Result { + let current = current.map(CboRoaringBitmapCodec::deserialize_from).transpose()?; + let del = del.map(CboRoaringBitmapCodec::deserialize_from).transpose()?; + let add = add.map(CboRoaringBitmapCodec::deserialize_from).transpose()?; + + match (current, del, add) { + (None, None, None) => Ok(Operation::Ignore), // but it's strange + (None, None, Some(add)) => Ok(Operation::Write(add)), + (None, Some(_del), None) => Ok(Operation::Ignore), // but it's strange + (None, Some(_del), Some(add)) => Ok(Operation::Write(add)), + (Some(_current), None, None) => Ok(Operation::Ignore), // but it's strange + (Some(current), None, Some(add)) => Ok(Operation::Write(current | add)), + (Some(current), Some(del), add) => { + let output = match add { + Some(add) => (current - del) | add, + None => current - del, + }; + if output.is_empty() { + Ok(Operation::Delete) + } else { + Ok(Operation::Write(output)) + } + } + } +} + +/// Return the slice directly from the serialize_into method +fn cbo_serialize_into_vec<'b>(bitmap: &RoaringBitmap, buffer: &'b mut Vec) -> &'b [u8] { + buffer.clear(); + CboRoaringBitmapCodec::serialize_into(bitmap, buffer); + buffer.as_slice() +} diff --git a/milli/src/update/new/mod.rs b/milli/src/update/new/mod.rs index a6ac38abe..830565368 100644 --- a/milli/src/update/new/mod.rs +++ b/milli/src/update/new/mod.rs @@ -1,610 +1,24 @@ +pub use document_change::{Deletion, DocumentChange, Insertion, Update}; +pub use indexer::{ + index, DocumentDeletionIndexer, DocumentOperationIndexer, PartialDumpIndexer, + UpdateByFunctionIndexer, +}; +pub use items_pool::ItemsPool; + +use super::del_add::DelAdd; +use crate::FieldId; + mod document_change; +mod merger; // mod extract; +// mod global_fields_ids_map; mod channel; +mod indexer; mod items_pool; -/// TODO remove this -// mod global_fields_ids_map; - +/// TODO move them elsewhere pub type StdResult = std::result::Result; - -mod indexer { - use std::borrow::Cow; - use std::collections::{BTreeMap, HashMap}; - use std::fs::File; - use std::io::Cursor; - use std::os::unix::fs::MetadataExt; - use std::sync::Arc; - use std::thread; - - use big_s::S; - use heed::types::Bytes; - use heed::{RoTxn, RwTxn}; - use memmap2::Mmap; - use obkv::KvWriter; - use rayon::iter::{IntoParallelIterator, ParallelBridge, ParallelIterator}; - use rayon::ThreadPool; - use roaring::RoaringBitmap; - use serde_json::Value; - - use super::channel::{ - extractors_merger_channels, merger_writer_channels, EntryOperation, - ExtractorsMergerChannels, MergerReceiver, MergerSender, WriterOperation, - }; - use super::document_change::{Deletion, DocumentChange, Insertion, Update}; - use super::items_pool::ItemsPool; - use crate::documents::{ - obkv_to_object, DocumentIdExtractionError, DocumentsBatchReader, PrimaryKey, - }; - use crate::update::concurrent_available_ids::ConcurrentAvailableIds; - use crate::update::del_add::DelAdd; - use crate::update::new::channel::MergerOperation; - use crate::update::{AvailableIds, IndexDocumentsMethod, MergeDeladdCboRoaringBitmaps}; - use crate::{ - all_obkv_to_json, obkv_to_json, CboRoaringBitmapCodec, DocumentId, Error, FieldId, - FieldsIdsMap, Index, InternalError, Object, Result, UserError, - }; - - pub type KvReaderFieldId = obkv::KvReader; - pub type KvReaderDelAdd = obkv::KvReader; - pub type KvWriterFieldId = obkv::KvWriter; - pub type KvWriterDelAdd = obkv::KvWriter; - - pub struct DocumentOperationIndexer { - operations: Vec, - 
index_documents_method: IndexDocumentsMethod, - } - - enum Payload { - Addition(File), - Deletion(Vec), - } - - pub struct PayloadStats { - pub document_count: usize, - pub bytes: u64, - } - - enum DocumentOperation { - Addition(DocumentOffset), - Deletion, - } - - /// Represents an offset where a document lives - /// in an mmapped grenad reader file. - struct DocumentOffset { - /// The mmapped grenad reader file. - pub content: Arc, // grenad::Reader - /// The offset of the document in the file. - pub offset: u32, - } - - impl DocumentOperationIndexer { - pub fn new(method: IndexDocumentsMethod) -> Self { - Self { operations: Default::default(), index_documents_method: method } - } - - /// TODO please give me a type - /// The payload is expected to be in the grenad format - pub fn add_documents(&mut self, payload: File) -> Result { - let reader = DocumentsBatchReader::from_reader(&payload)?; - let bytes = payload.metadata()?.size(); - let document_count = reader.documents_count() as usize; - - self.operations.push(Payload::Addition(payload)); - - Ok(PayloadStats { bytes, document_count }) - } - - pub fn delete_documents(&mut self, to_delete: Vec) { - self.operations.push(Payload::Deletion(to_delete)) - } - - pub fn document_changes<'a>( - self, - index: &'a Index, - rtxn: &'a RoTxn, - fields_ids_map: &'a mut FieldsIdsMap, - primary_key: &'a PrimaryKey<'a>, - ) -> Result>> + 'a> { - let documents_ids = index.documents_ids(rtxn)?; - let mut available_docids = AvailableIds::new(&documents_ids); - let mut docids_version_offsets = HashMap::::new(); - - for operation in self.operations { - match operation { - Payload::Addition(payload) => { - let content = unsafe { Mmap::map(&payload).map(Arc::new)? }; - let cursor = Cursor::new(content.as_ref()); - let reader = DocumentsBatchReader::from_reader(cursor)?; - - let (mut batch_cursor, batch_index) = reader.into_cursor_and_fields_index(); - // TODO Fetch all document fields to fill the fields ids map - batch_index.iter().for_each(|(_, name)| { - fields_ids_map.insert(name); - }); - - let mut offset: u32 = 0; - while let Some(document) = batch_cursor.next_document()? { - let external_document_id = - match primary_key.document_id(document, &batch_index)? { - Ok(document_id) => Ok(document_id), - Err(DocumentIdExtractionError::InvalidDocumentId( - user_error, - )) => Err(user_error), - Err(DocumentIdExtractionError::MissingDocumentId) => { - Err(UserError::MissingDocumentId { - primary_key: primary_key.name().to_string(), - document: obkv_to_object(document, &batch_index)?, - }) - } - Err(DocumentIdExtractionError::TooManyDocumentIds(_)) => { - Err(UserError::TooManyDocumentIds { - primary_key: primary_key.name().to_string(), - document: obkv_to_object(document, &batch_index)?, - }) - } - }?; - - let content = content.clone(); - let document_offset = DocumentOffset { content, offset }; - let document_operation = DocumentOperation::Addition(document_offset); - - match docids_version_offsets.get_mut(&external_document_id) { - None => { - let docid = match index - .external_documents_ids() - .get(rtxn, &external_document_id)? 
- { - Some(docid) => docid, - None => available_docids.next().ok_or(Error::UserError( - UserError::DocumentLimitReached, - ))?, - }; - - docids_version_offsets.insert( - external_document_id, - (docid, vec![document_operation]), - ); - } - Some((_, offsets)) => offsets.push(document_operation), - } - offset += 1; - } - } - Payload::Deletion(to_delete) => { - for external_document_id in to_delete { - match docids_version_offsets.get_mut(&external_document_id) { - None => { - let docid = match index - .external_documents_ids() - .get(rtxn, &external_document_id)? - { - Some(docid) => docid, - None => available_docids.next().ok_or(Error::UserError( - UserError::DocumentLimitReached, - ))?, - }; - - docids_version_offsets.insert( - external_document_id, - (docid, vec![DocumentOperation::Deletion]), - ); - } - Some((_, offsets)) => offsets.push(DocumentOperation::Deletion), - } - } - } - } - } - - Ok(docids_version_offsets.into_par_iter().map_with( - Arc::new(ItemsPool::new(|| index.read_txn().map_err(crate::Error::from))), - move |context_pool, (external_docid, (internal_docid, operations))| { - context_pool.with(|rtxn| { - use IndexDocumentsMethod as Idm; - let document_merge_function = match self.index_documents_method { - Idm::ReplaceDocuments => merge_document_for_replacements, - Idm::UpdateDocuments => merge_document_for_updates, - }; - - document_merge_function( - rtxn, - index, - fields_ids_map, - internal_docid, - external_docid, - &operations, - ) - }) - }, - )) - } - } - - pub struct DeleteDocumentIndexer { - to_delete: RoaringBitmap, - } - - impl DeleteDocumentIndexer { - pub fn new() -> Self { - Self { to_delete: Default::default() } - } - - pub fn delete_documents_by_docids(&mut self, docids: RoaringBitmap) { - self.to_delete |= docids; - } - - // let fields = index.fields_ids_map(rtxn)?; - // let primary_key = - // index.primary_key(rtxn)?.ok_or(InternalError::DatabaseMissingEntry { - // db_name: db_name::MAIN, - // key: Some(main_key::PRIMARY_KEY_KEY), - // })?; - // let primary_key = PrimaryKey::new(primary_key, &fields).ok_or_else(|| { - // InternalError::FieldIdMapMissingEntry(crate::FieldIdMapMissingEntry::FieldName { - // field_name: primary_key.to_owned(), - // process: "external_id_of", - // }) - // })?; - pub fn document_changes<'a>( - self, - index: &'a Index, - fields: &'a FieldsIdsMap, - primary_key: &'a PrimaryKey<'a>, - ) -> Result> + 'a> { - let items = Arc::new(ItemsPool::new(|| index.read_txn().map_err(crate::Error::from))); - Ok(self.to_delete.into_iter().par_bridge().map_with(items, |items, docid| { - items.with(|rtxn| { - let current = index.document(rtxn, docid)?; - let external_docid = match primary_key.document_id(current, fields)? { - Ok(document_id) => Ok(document_id) as Result<_>, - Err(_) => Err(InternalError::DocumentsError( - crate::documents::Error::InvalidDocumentFormat, - ) - .into()), - }?; - - Ok(DocumentChange::Deletion(Deletion::create( - docid, - external_docid, - current.boxed(), - ))) - }) - })) - } - } - - pub struct PartialDumpIndexer { - iter: I, - } - - impl PartialDumpIndexer - where - I: IntoIterator, - I::IntoIter: Send, - I::Item: Send, - { - pub fn new_from_jsonlines(iter: I) -> Self { - PartialDumpIndexer { iter } - } - - /// Note for future self: - /// - the field ids map must already be valid so you must have to generate it beforehand. - /// - We should probably expose another method that generates the fields ids map from an iterator of JSON objects. 
- /// - We recommend sending chunks of documents in this `PartialDumpIndexer` we therefore need to create a custom take_while_size method (that doesn't drop items). - pub fn document_changes<'a>( - self, - fields_ids_map: &'a FieldsIdsMap, - concurrent_available_ids: &'a ConcurrentAvailableIds, - primary_key: &'a PrimaryKey<'a>, - ) -> impl ParallelIterator>> + 'a - where - // I don't like this, it will not fit in the future trait easily - I::IntoIter: 'a, - { - self.iter.into_iter().par_bridge().map(|object| { - let docid = match concurrent_available_ids.next() { - Some(id) => id, - None => return Err(Error::UserError(UserError::DocumentLimitReached)), - }; - - let mut writer = KvWriterFieldId::memory(); - object.iter().for_each(|(key, value)| { - let key = fields_ids_map.id(key).unwrap(); - /// TODO better error management - let value = serde_json::to_vec(&value).unwrap(); - writer.insert(key, value).unwrap(); - }); - - let document = writer.into_boxed(); - let external_docid = match primary_key.document_id(&document, fields_ids_map)? { - Ok(document_id) => Ok(document_id), - Err(DocumentIdExtractionError::InvalidDocumentId(user_error)) => { - Err(user_error) - } - Err(DocumentIdExtractionError::MissingDocumentId) => { - Err(UserError::MissingDocumentId { - primary_key: primary_key.name().to_string(), - document: all_obkv_to_json(&document, fields_ids_map)?, - }) - } - Err(DocumentIdExtractionError::TooManyDocumentIds(_)) => { - Err(UserError::TooManyDocumentIds { - primary_key: primary_key.name().to_string(), - document: all_obkv_to_json(&document, fields_ids_map)?, - }) - } - }?; - - let insertion = Insertion::create(docid, external_docid, document); - Ok(Some(DocumentChange::Insertion(insertion))) - }) - } - } - - pub struct UpdateByFunctionIndexer; - - /// TODO return stats - /// TODO take the rayon ThreadPool - pub fn index( - wtxn: &mut RwTxn, - index: &Index, - pool: &ThreadPool, - document_changes: PI, - ) -> Result<()> - where - PI: IntoParallelIterator> + Send, - PI::Iter: Clone, - { - let (merger_sender, writer_receiver) = merger_writer_channels(100); - let ExtractorsMergerChannels { merger_receiver, deladd_cbo_roaring_bitmap_sender } = - extractors_merger_channels(100); - - thread::scope(|s| { - thread::Builder::new().name(S("indexer-extractors")).spawn_scoped(s, || { - pool.in_place_scope(|_s| { - document_changes.into_par_iter().for_each(|_dc| ()); - }) - })?; - - // TODO manage the errors correctly - thread::Builder::new().name(S("indexer-merger")).spawn_scoped(s, || { - let rtxn = index.read_txn().unwrap(); - merge_grenad_entries(merger_receiver, merger_sender, &rtxn, index).unwrap() - })?; - - // TODO Split this code into another function - for operation in writer_receiver { - let database = operation.database(index); - match operation { - WriterOperation::WordDocids(operation) => match operation { - EntryOperation::Delete(e) => database.delete(wtxn, e.entry()).map(drop)?, - EntryOperation::Write(e) => database.put(wtxn, e.key(), e.value())?, - }, - WriterOperation::Document(e) => database.put(wtxn, &e.key(), e.content())?, - } - } - - Ok(()) - }) - } - - enum Operation { - Write(RoaringBitmap), - Delete, - Ignore, - } - - /// A function that merges the DelAdd CboRoaringBitmaps with the current bitmap. 
- fn merge_cbo_bitmaps( - current: Option<&[u8]>, - del: Option<&[u8]>, - add: Option<&[u8]>, - ) -> Result { - let current = current.map(CboRoaringBitmapCodec::deserialize_from).transpose()?; - let del = del.map(CboRoaringBitmapCodec::deserialize_from).transpose()?; - let add = add.map(CboRoaringBitmapCodec::deserialize_from).transpose()?; - - match (current, del, add) { - (None, None, None) => Ok(Operation::Ignore), // but it's strange - (None, None, Some(add)) => Ok(Operation::Write(add)), - (None, Some(_del), None) => Ok(Operation::Ignore), // but it's strange - (None, Some(_del), Some(add)) => Ok(Operation::Write(add)), - (Some(_current), None, None) => Ok(Operation::Ignore), // but it's strange - (Some(current), None, Some(add)) => Ok(Operation::Write(current | add)), - (Some(current), Some(del), add) => { - let output = match add { - Some(add) => (current - del) | add, - None => current - del, - }; - if output.is_empty() { - Ok(Operation::Delete) - } else { - Ok(Operation::Write(output)) - } - } - } - } - - /// Return the slice directly from the serialize_into method - fn cbo_serialize_into_vec<'b>(bitmap: &RoaringBitmap, buffer: &'b mut Vec) -> &'b [u8] { - buffer.clear(); - CboRoaringBitmapCodec::serialize_into(bitmap, buffer); - buffer.as_slice() - } - - /// TODO We must return some infos/stats - fn merge_grenad_entries( - receiver: MergerReceiver, - sender: MergerSender, - rtxn: &RoTxn, - index: &Index, - ) -> Result<()> { - let mut buffer = Vec::new(); - - for merger_operation in receiver { - match merger_operation { - MergerOperation::WordDocidsCursors(cursors) => { - let sender = sender.word_docids(); - let database = index.word_docids.remap_types::(); - - let mut builder = grenad::MergerBuilder::new(MergeDeladdCboRoaringBitmaps); - builder.extend(cursors); - /// TODO manage the error correctly - let mut merger_iter = builder.build().into_stream_merger_iter().unwrap(); - - // TODO manage the error correctly - while let Some((key, deladd)) = merger_iter.next().unwrap() { - let current = database.get(rtxn, key)?; - let deladd: &KvReaderDelAdd = deladd.into(); - let del = deladd.get(DelAdd::Deletion); - let add = deladd.get(DelAdd::Addition); - - match merge_cbo_bitmaps(current, del, add)? { - Operation::Write(bitmap) => { - let value = cbo_serialize_into_vec(&bitmap, &mut buffer); - sender.write(key, value).unwrap(); - } - Operation::Delete => sender.delete(key).unwrap(), - Operation::Ignore => (), - } - } - } - } - } - - Ok(()) - } - - /// Reads the previous version of a document from the database, the new versions - /// in the grenad update files and merges them to generate a new boxed obkv. - /// - /// This function is only meant to be used when doing an update and not a replacement. 
- fn merge_document_for_updates( - rtxn: &RoTxn, - index: &Index, - fields_ids_map: &FieldsIdsMap, - docid: DocumentId, - external_docid: String, - operations: &[DocumentOperation], - ) -> Result> { - let mut document = BTreeMap::<_, Cow<_>>::new(); - let current = index.documents.remap_data_type::().get(rtxn, &docid)?; - let current: Option<&KvReaderFieldId> = current.map(Into::into); - - if let Some(current) = current { - current.into_iter().for_each(|(k, v)| { - document.insert(k, v.into()); - }); - } - - let last_deletion = operations - .iter() - .rposition(|operation| matches!(operation, DocumentOperation::Deletion)); - - let operations = &operations[last_deletion.map_or(0, |i| i + 1)..]; - - if operations.is_empty() { - match current { - Some(current) => { - return Ok(Some(DocumentChange::Deletion(Deletion::create( - docid, - external_docid, - current.boxed(), - )))); - } - None => return Ok(None), - } - } - - for operation in operations { - let DocumentOffset { content, offset } = match operation { - DocumentOperation::Addition(offset) => offset, - DocumentOperation::Deletion => unreachable!("Deletion in document operations"), - }; - - let reader = DocumentsBatchReader::from_reader(Cursor::new(content.as_ref()))?; - let (mut cursor, batch_index) = reader.into_cursor_and_fields_index(); - let update = cursor.get(*offset)?.expect("must exists"); - - update.into_iter().for_each(|(k, v)| { - let field_name = batch_index.name(k).unwrap(); - let id = fields_ids_map.id(field_name).unwrap(); - document.insert(id, v.to_vec().into()); - }); - } - - let mut writer = KvWriterFieldId::memory(); - document.into_iter().for_each(|(id, value)| writer.insert(id, value).unwrap()); - let new = writer.into_boxed(); - - match current { - Some(current) => { - let update = Update::create(docid, external_docid, current.boxed(), new); - Ok(Some(DocumentChange::Update(update))) - } - None => { - let insertion = Insertion::create(docid, external_docid, new); - Ok(Some(DocumentChange::Insertion(insertion))) - } - } - } - - /// Returns only the most recent version of a document based on the updates from the payloads. - /// - /// This function is only meant to be used when doing a replacement and not an update. 
- fn merge_document_for_replacements( - rtxn: &RoTxn, - index: &Index, - fields_ids_map: &FieldsIdsMap, - docid: DocumentId, - external_docid: String, - operations: &[DocumentOperation], - ) -> Result> { - let current = index.documents.remap_data_type::().get(rtxn, &docid)?; - let current: Option<&KvReaderFieldId> = current.map(Into::into); - - match operations.last() { - Some(DocumentOperation::Addition(DocumentOffset { content, offset })) => { - let reader = DocumentsBatchReader::from_reader(Cursor::new(content.as_ref()))?; - let (mut cursor, batch_index) = reader.into_cursor_and_fields_index(); - let update = cursor.get(*offset)?.expect("must exists"); - - let mut document_entries = Vec::new(); - update.into_iter().for_each(|(k, v)| { - let field_name = batch_index.name(k).unwrap(); - let id = fields_ids_map.id(field_name).unwrap(); - document_entries.push((id, v)); - }); - - document_entries.sort_unstable_by_key(|(id, _)| *id); - - let mut writer = KvWriterFieldId::memory(); - document_entries - .into_iter() - .for_each(|(id, value)| writer.insert(id, value).unwrap()); - let new = writer.into_boxed(); - - match current { - Some(current) => { - let update = Update::create(docid, external_docid, current.boxed(), new); - Ok(Some(DocumentChange::Update(update))) - } - None => { - let insertion = Insertion::create(docid, external_docid, new); - Ok(Some(DocumentChange::Insertion(insertion))) - } - } - } - Some(DocumentOperation::Deletion) => match current { - Some(current) => { - let deletion = Deletion::create(docid, external_docid, current.boxed()); - Ok(Some(DocumentChange::Deletion(deletion))) - } - None => Ok(None), - }, - None => Ok(None), - } - } -} +pub type KvReaderDelAdd = obkv::KvReader; +pub type KvReaderFieldId = obkv::KvReader; +pub type KvWriterDelAdd = obkv::KvWriter; +pub type KvWriterFieldId = obkv::KvWriter; From 6526ce12084ace10e9fe93b70a5fc82b84697bea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 2 Sep 2024 14:41:20 +0200 Subject: [PATCH 016/247] Fix the merging of documents --- .../update/new/indexer/document_operation.rs | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/milli/src/update/new/indexer/document_operation.rs b/milli/src/update/new/indexer/document_operation.rs index ba93915ea..80e7de51a 100644 --- a/milli/src/update/new/indexer/document_operation.rs +++ b/milli/src/update/new/indexer/document_operation.rs @@ -210,17 +210,22 @@ fn merge_document_for_updates( let current = index.documents.remap_data_type::().get(rtxn, &docid)?; let current: Option<&KvReaderFieldId> = current.map(Into::into); - if let Some(current) = current { - current.into_iter().for_each(|(k, v)| { - document.insert(k, v.into()); - }); + if operations.is_empty() { + return Ok(None); // but it's strange } - let last_deletion = - operations.iter().rposition(|operation| matches!(operation, DocumentOperation::Deletion)); - + let last_deletion = operations.iter().rposition(|op| matches!(op, InnerDocOp::Deletion)); let operations = &operations[last_deletion.map_or(0, |i| i + 1)..]; + // If there was a deletion we must not start + // from the original document but from scratch. 
+ if last_deletion.is_none() { + if let Some(current) = current { + current.into_iter().for_each(|(k, v)| { + document.insert(k, v.into()); + }); + } + } if operations.is_empty() { match current { Some(current) => { From 72e7b7846e2926afa987c8e957b92cbba04ba6c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 2 Sep 2024 14:42:27 +0200 Subject: [PATCH 017/247] Renaming the indexers --- .../update/new/indexer/document_deletion.rs | 6 +-- .../update/new/indexer/document_operation.rs | 40 +++++++++---------- milli/src/update/new/indexer/mod.rs | 8 ++-- milli/src/update/new/indexer/partial_dump.rs | 9 +++-- .../update/new/indexer/update_by_function.rs | 5 ++- milli/src/update/new/mod.rs | 6 +-- 6 files changed, 35 insertions(+), 39 deletions(-) diff --git a/milli/src/update/new/indexer/document_deletion.rs b/milli/src/update/new/indexer/document_deletion.rs index 24ba0c671..2b4bdaeb7 100644 --- a/milli/src/update/new/indexer/document_deletion.rs +++ b/milli/src/update/new/indexer/document_deletion.rs @@ -8,11 +8,11 @@ use crate::documents::PrimaryKey; use crate::update::new::{Deletion, DocumentChange, ItemsPool}; use crate::{FieldsIdsMap, Index, InternalError, Result}; -pub struct DocumentDeletionIndexer { +pub struct DocumentDeletion { pub to_delete: RoaringBitmap, } -impl DocumentDeletionIndexer { +impl DocumentDeletion { pub fn new() -> Self { Self { to_delete: Default::default() } } @@ -22,7 +22,7 @@ impl DocumentDeletionIndexer { } } -impl<'p> Indexer<'p> for DocumentDeletionIndexer { +impl<'p> Indexer<'p> for DocumentDeletion { type Parameter = (&'p Index, &'p FieldsIdsMap, &'p PrimaryKey<'p>); fn document_changes( diff --git a/milli/src/update/new/indexer/document_operation.rs b/milli/src/update/new/indexer/document_operation.rs index 80e7de51a..fdcb84c7b 100644 --- a/milli/src/update/new/indexer/document_operation.rs +++ b/milli/src/update/new/indexer/document_operation.rs @@ -19,9 +19,9 @@ use crate::update::new::{Deletion, Insertion, KvReaderFieldId, KvWriterFieldId, use crate::update::{AvailableIds, IndexDocumentsMethod}; use crate::{DocumentId, Error, FieldsIdsMap, Index, Result, UserError}; -pub struct DocumentOperationIndexer { - pub(crate) operations: Vec, - pub(crate) index_documents_method: IndexDocumentsMethod, +pub struct DocumentOperation { + operations: Vec, + index_documents_method: IndexDocumentsMethod, } pub enum Payload { @@ -34,7 +34,7 @@ pub struct PayloadStats { pub bytes: u64, } -pub enum DocumentOperation { +enum InnerDocOp { Addition(DocumentOffset), Deletion, } @@ -48,7 +48,7 @@ pub struct DocumentOffset { pub offset: u32, } -impl DocumentOperationIndexer { +impl DocumentOperation { pub fn new(method: IndexDocumentsMethod) -> Self { Self { operations: Default::default(), index_documents_method: method } } @@ -70,7 +70,7 @@ impl DocumentOperationIndexer { } } -impl<'p> Indexer<'p> for DocumentOperationIndexer { +impl<'p> Indexer<'p> for DocumentOperation { type Parameter = (&'p Index, &'p RoTxn<'static>, &'p mut FieldsIdsMap, &'p PrimaryKey<'p>); fn document_changes( @@ -120,7 +120,7 @@ impl<'p> Indexer<'p> for DocumentOperationIndexer { let content = content.clone(); let document_offset = DocumentOffset { content, offset }; - let document_operation = DocumentOperation::Addition(document_offset); + let document_operation = InnerDocOp::Addition(document_offset); match docids_version_offsets.get_mut(&external_document_id) { None => { @@ -160,10 +160,10 @@ impl<'p> Indexer<'p> for DocumentOperationIndexer { docids_version_offsets.insert( 
external_document_id, - (docid, vec![DocumentOperation::Deletion]), + (docid, vec![InnerDocOp::Deletion]), ); } - Some((_, offsets)) => offsets.push(DocumentOperation::Deletion), + Some((_, offsets)) => offsets.push(InnerDocOp::Deletion), } } } @@ -204,7 +204,7 @@ fn merge_document_for_updates( fields_ids_map: &FieldsIdsMap, docid: DocumentId, external_docid: String, - operations: &[DocumentOperation], + operations: &[InnerDocOp], ) -> Result> { let mut document = BTreeMap::<_, Cow<_>>::new(); let current = index.documents.remap_data_type::().get(rtxn, &docid)?; @@ -226,14 +226,12 @@ fn merge_document_for_updates( }); } } + if operations.is_empty() { match current { Some(current) => { - return Ok(Some(DocumentChange::Deletion(Deletion::create( - docid, - external_docid, - current.boxed(), - )))); + let deletion = Deletion::create(docid, external_docid, current.boxed()); + return Ok(Some(DocumentChange::Deletion(deletion))); } None => return Ok(None), } @@ -241,8 +239,8 @@ fn merge_document_for_updates( for operation in operations { let DocumentOffset { content, offset } = match operation { - DocumentOperation::Addition(offset) => offset, - DocumentOperation::Deletion => { + InnerDocOp::Addition(offset) => offset, + InnerDocOp::Deletion => { unreachable!("Deletion in document operations") } }; @@ -283,13 +281,13 @@ fn merge_document_for_replacements( fields_ids_map: &FieldsIdsMap, docid: DocumentId, external_docid: String, - operations: &[DocumentOperation], + operations: &[InnerDocOp], ) -> Result> { let current = index.documents.remap_data_type::().get(rtxn, &docid)?; let current: Option<&KvReaderFieldId> = current.map(Into::into); match operations.last() { - Some(DocumentOperation::Addition(DocumentOffset { content, offset })) => { + Some(InnerDocOp::Addition(DocumentOffset { content, offset })) => { let reader = DocumentsBatchReader::from_reader(Cursor::new(content.as_ref()))?; let (mut cursor, batch_index) = reader.into_cursor_and_fields_index(); let update = cursor.get(*offset)?.expect("must exists"); @@ -318,13 +316,13 @@ fn merge_document_for_replacements( } } } - Some(DocumentOperation::Deletion) => match current { + Some(InnerDocOp::Deletion) => match current { Some(current) => { let deletion = Deletion::create(docid, external_docid, current.boxed()); Ok(Some(DocumentChange::Deletion(deletion))) } None => Ok(None), }, - None => Ok(None), + None => Ok(None), // but it's strange } } diff --git a/milli/src/update/new/indexer/mod.rs b/milli/src/update/new/indexer/mod.rs index 998793b49..85d4dbcb1 100644 --- a/milli/src/update/new/indexer/mod.rs +++ b/milli/src/update/new/indexer/mod.rs @@ -1,13 +1,13 @@ use std::thread; use big_s::S; -pub use document_deletion::DocumentDeletionIndexer; -pub use document_operation::DocumentOperationIndexer; +pub use document_deletion::DocumentDeletion; +pub use document_operation::DocumentOperation; use heed::RwTxn; -pub use partial_dump::PartialDumpIndexer; +pub use partial_dump::PartialDump; use rayon::iter::{IntoParallelIterator, ParallelIterator}; use rayon::ThreadPool; -pub use update_by_function::UpdateByFunctionIndexer; +pub use update_by_function::UpdateByFunction; use super::channel::{ extractors_merger_channels, merger_writer_channels, EntryOperation, ExtractorsMergerChannels, diff --git a/milli/src/update/new/indexer/partial_dump.rs b/milli/src/update/new/indexer/partial_dump.rs index 24ba70bcb..7afb96d65 100644 --- a/milli/src/update/new/indexer/partial_dump.rs +++ b/milli/src/update/new/indexer/partial_dump.rs @@ -6,17 +6,17 @@ use 
crate::update::concurrent_available_ids::ConcurrentAvailableIds; use crate::update::new::{DocumentChange, Insertion, KvWriterFieldId}; use crate::{all_obkv_to_json, Error, FieldsIdsMap, Object, Result, UserError}; -pub struct PartialDumpIndexer { +pub struct PartialDump { pub iter: I, } -impl PartialDumpIndexer { +impl PartialDump { pub fn new_from_jsonlines(iter: I) -> Self { - PartialDumpIndexer { iter } + PartialDump { iter } } } -impl<'p, I> Indexer<'p> for PartialDumpIndexer +impl<'p, I> Indexer<'p> for PartialDump where I: IntoIterator, I::IntoIter: Send + 'p, @@ -45,6 +45,7 @@ where let key = fields_ids_map.id(key).unwrap(); /// TODO better error management let value = serde_json::to_vec(&value).unwrap(); + /// TODO it is not ordered writer.insert(key, value).unwrap(); }); diff --git a/milli/src/update/new/indexer/update_by_function.rs b/milli/src/update/new/indexer/update_by_function.rs index c8c434b72..e9bdf3640 100644 --- a/milli/src/update/new/indexer/update_by_function.rs +++ b/milli/src/update/new/indexer/update_by_function.rs @@ -4,15 +4,16 @@ use super::Indexer; use crate::update::new::DocumentChange; use crate::Result; -pub struct UpdateByFunctionIndexer; +pub struct UpdateByFunction; -impl<'p> Indexer<'p> for UpdateByFunctionIndexer { +impl<'p> Indexer<'p> for UpdateByFunction { type Parameter = (); fn document_changes( self, _param: Self::Parameter, ) -> Result>> + 'p> { + todo!(); Ok(vec![].into_par_iter()) } } diff --git a/milli/src/update/new/mod.rs b/milli/src/update/new/mod.rs index 830565368..cd94bd5d2 100644 --- a/milli/src/update/new/mod.rs +++ b/milli/src/update/new/mod.rs @@ -1,8 +1,4 @@ pub use document_change::{Deletion, DocumentChange, Insertion, Update}; -pub use indexer::{ - index, DocumentDeletionIndexer, DocumentOperationIndexer, PartialDumpIndexer, - UpdateByFunctionIndexer, -}; pub use items_pool::ItemsPool; use super::del_add::DelAdd; @@ -13,7 +9,7 @@ mod merger; // mod extract; // mod global_fields_ids_map; mod channel; -mod indexer; +pub mod indexer; mod items_pool; /// TODO move them elsewhere From 521775f788aa8dd65b14f3e0b18ce48e367b4fdb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 2 Sep 2024 15:10:21 +0200 Subject: [PATCH 018/247] I push for Many --- milli/src/update/new/channel.rs | 2 +- .../update/new/indexer/document_deletion.rs | 4 +-- .../update/new/indexer/document_operation.rs | 4 +-- milli/src/update/new/indexer/mod.rs | 33 ++++++++++++------- milli/src/update/new/indexer/partial_dump.rs | 4 +-- .../update/new/indexer/update_by_function.rs | 4 +-- 6 files changed, 30 insertions(+), 21 deletions(-) diff --git a/milli/src/update/new/channel.rs b/milli/src/update/new/channel.rs index 088303fb3..4041fcc6a 100644 --- a/milli/src/update/new/channel.rs +++ b/milli/src/update/new/channel.rs @@ -8,7 +8,7 @@ use crate::update::new::KvReaderFieldId; use crate::{DocumentId, Index}; /// The capacity of the channel is currently in number of messages. 
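
About the "TODO it is not ordered" note added in partial_dump.rs above: the other write paths in this series feed the obkv writer with ascending field ids (a BTreeMap in merge_document_for_updates, a sort_unstable_by_key in merge_document_for_replacements), while PartialDump still inserts fields in the object's iteration order. One possible shape for that fix, reusing the names from PartialDump::document_changes (a sketch only, not part of the patch):

    // Stage the fields in a BTreeMap so ids reach the writer in ascending order.
    let mut ordered = std::collections::BTreeMap::new();
    for (key, value) in object.iter() {
        let id = fields_ids_map.id(key).unwrap();
        ordered.insert(id, serde_json::to_vec(value).unwrap());
    }
    let mut writer = KvWriterFieldId::memory();
    for (id, value) in ordered {
        writer.insert(id, value).unwrap();
    }
    let document = writer.into_boxed();
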
-pub fn merger_writer_channels(cap: usize) -> (MergerSender, WriterReceiver) { +pub fn merger_writer_channel(cap: usize) -> (MergerSender, WriterReceiver) { let (sender, receiver) = crossbeam_channel::bounded(cap); (MergerSender(sender), WriterReceiver(receiver)) } diff --git a/milli/src/update/new/indexer/document_deletion.rs b/milli/src/update/new/indexer/document_deletion.rs index 2b4bdaeb7..c16299e9a 100644 --- a/milli/src/update/new/indexer/document_deletion.rs +++ b/milli/src/update/new/indexer/document_deletion.rs @@ -3,7 +3,7 @@ use std::sync::Arc; use rayon::iter::{ParallelBridge, ParallelIterator}; use roaring::RoaringBitmap; -use super::Indexer; +use super::DocumentChanges; use crate::documents::PrimaryKey; use crate::update::new::{Deletion, DocumentChange, ItemsPool}; use crate::{FieldsIdsMap, Index, InternalError, Result}; @@ -22,7 +22,7 @@ impl DocumentDeletion { } } -impl<'p> Indexer<'p> for DocumentDeletion { +impl<'p> DocumentChanges<'p> for DocumentDeletion { type Parameter = (&'p Index, &'p FieldsIdsMap, &'p PrimaryKey<'p>); fn document_changes( diff --git a/milli/src/update/new/indexer/document_operation.rs b/milli/src/update/new/indexer/document_operation.rs index fdcb84c7b..5d9755211 100644 --- a/milli/src/update/new/indexer/document_operation.rs +++ b/milli/src/update/new/indexer/document_operation.rs @@ -11,7 +11,7 @@ use rayon::iter::{IntoParallelIterator, ParallelIterator}; use super::super::document_change::DocumentChange; use super::super::items_pool::ItemsPool; -use super::Indexer; +use super::DocumentChanges; use crate::documents::{ obkv_to_object, DocumentIdExtractionError, DocumentsBatchReader, PrimaryKey, }; @@ -70,7 +70,7 @@ impl DocumentOperation { } } -impl<'p> Indexer<'p> for DocumentOperation { +impl<'p> DocumentChanges<'p> for DocumentOperation { type Parameter = (&'p Index, &'p RoTxn<'static>, &'p mut FieldsIdsMap, &'p PrimaryKey<'p>); fn document_changes( diff --git a/milli/src/update/new/indexer/mod.rs b/milli/src/update/new/indexer/mod.rs index 85d4dbcb1..69ccc0451 100644 --- a/milli/src/update/new/indexer/mod.rs +++ b/milli/src/update/new/indexer/mod.rs @@ -10,7 +10,7 @@ use rayon::ThreadPool; pub use update_by_function::UpdateByFunction; use super::channel::{ - extractors_merger_channels, merger_writer_channels, EntryOperation, ExtractorsMergerChannels, + extractors_merger_channels, merger_writer_channel, EntryOperation, ExtractorsMergerChannels, WriterOperation, }; use super::document_change::DocumentChange; @@ -22,7 +22,7 @@ mod document_operation; mod partial_dump; mod update_by_function; -pub trait Indexer<'p> { +pub trait DocumentChanges<'p> { type Parameter: 'p; fn document_changes( @@ -36,7 +36,6 @@ pub trait Indexer<'p> { /// Give it the output of the [`Indexer::document_changes`] method and it will execute it in the [`rayon::ThreadPool`]. 
/// /// TODO return stats -/// TODO take the rayon ThreadPool pub fn index( wtxn: &mut RwTxn, index: &Index, @@ -44,25 +43,31 @@ pub fn index( document_changes: PI, ) -> Result<()> where - PI: IntoParallelIterator> + Send, + PI: IntoParallelIterator>> + Send, PI::Iter: Clone, { - let (merger_sender, writer_receiver) = merger_writer_channels(100); + let (merger_sender, writer_receiver) = merger_writer_channel(100); let ExtractorsMergerChannels { merger_receiver, deladd_cbo_roaring_bitmap_sender } = extractors_merger_channels(100); thread::scope(|s| { // TODO manage the errors correctly - thread::Builder::new().name(S("indexer-extractors")).spawn_scoped(s, || { - pool.in_place_scope(|_s| { - document_changes.into_par_iter().for_each(|_dc| ()); - }) - })?; + let handle = + thread::Builder::new().name(S("indexer-extractors")).spawn_scoped(s, || { + pool.in_place_scope(|_s| { + // word docids + // document_changes.into_par_iter().try_for_each(|_dc| Ok(()) as Result<_>) + // let grenads = extractor_function(document_changes)?; + // deladd_cbo_roaring_bitmap_sender.word_docids(grenads)?; + + Ok(()) as Result<_> + }) + })?; // TODO manage the errors correctly - thread::Builder::new().name(S("indexer-merger")).spawn_scoped(s, || { + let handle2 = thread::Builder::new().name(S("indexer-merger")).spawn_scoped(s, || { let rtxn = index.read_txn().unwrap(); - merge_grenad_entries(merger_receiver, merger_sender, &rtxn, index).unwrap() + merge_grenad_entries(merger_receiver, merger_sender, &rtxn, index) })?; // TODO Split this code into another function @@ -77,6 +82,10 @@ where } } + /// TODO handle the panicking threads + handle.join().unwrap()?; + handle2.join().unwrap()?; + Ok(()) }) } diff --git a/milli/src/update/new/indexer/partial_dump.rs b/milli/src/update/new/indexer/partial_dump.rs index 7afb96d65..11c9fbd0e 100644 --- a/milli/src/update/new/indexer/partial_dump.rs +++ b/milli/src/update/new/indexer/partial_dump.rs @@ -1,6 +1,6 @@ use rayon::iter::{ParallelBridge, ParallelIterator}; -use super::Indexer; +use super::DocumentChanges; use crate::documents::{DocumentIdExtractionError, PrimaryKey}; use crate::update::concurrent_available_ids::ConcurrentAvailableIds; use crate::update::new::{DocumentChange, Insertion, KvWriterFieldId}; @@ -16,7 +16,7 @@ impl PartialDump { } } -impl<'p, I> Indexer<'p> for PartialDump +impl<'p, I> DocumentChanges<'p> for PartialDump where I: IntoIterator, I::IntoIter: Send + 'p, diff --git a/milli/src/update/new/indexer/update_by_function.rs b/milli/src/update/new/indexer/update_by_function.rs index e9bdf3640..035f95c02 100644 --- a/milli/src/update/new/indexer/update_by_function.rs +++ b/milli/src/update/new/indexer/update_by_function.rs @@ -1,12 +1,12 @@ use rayon::iter::{IntoParallelIterator, ParallelIterator}; -use super::Indexer; +use super::DocumentChanges; use crate::update::new::DocumentChange; use crate::Result; pub struct UpdateByFunction; -impl<'p> Indexer<'p> for UpdateByFunction { +impl<'p> DocumentChanges<'p> for UpdateByFunction { type Parameter = (); fn document_changes( From ab01679a8f3e1849a862a21ea3f681df0900d7d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 2 Sep 2024 15:21:00 +0200 Subject: [PATCH 019/247] Remove the useless option from the document changes --- .../update/new/indexer/document_deletion.rs | 6 +- .../update/new/indexer/document_operation.rs | 155 +++++++++--------- milli/src/update/new/indexer/mod.rs | 27 ++- milli/src/update/new/indexer/partial_dump.rs | 4 +- .../update/new/indexer/update_by_function.rs 
| 2 +- 5 files changed, 98 insertions(+), 96 deletions(-) diff --git a/milli/src/update/new/indexer/document_deletion.rs b/milli/src/update/new/indexer/document_deletion.rs index c16299e9a..5e43b5816 100644 --- a/milli/src/update/new/indexer/document_deletion.rs +++ b/milli/src/update/new/indexer/document_deletion.rs @@ -28,7 +28,7 @@ impl<'p> DocumentChanges<'p> for DocumentDeletion { fn document_changes( self, param: Self::Parameter, - ) -> Result>> + 'p> { + ) -> Result> + 'p> { let (index, fields, primary_key) = param; let items = Arc::new(ItemsPool::new(|| index.read_txn().map_err(crate::Error::from))); Ok(self.to_delete.into_iter().par_bridge().map_with(items, |items, docid| { @@ -42,11 +42,11 @@ impl<'p> DocumentChanges<'p> for DocumentDeletion { .into()), }?; - Ok(Some(DocumentChange::Deletion(Deletion::create( + Ok(DocumentChange::Deletion(Deletion::create( docid, external_docid, current.boxed(), - )))) + ))) }) })) } diff --git a/milli/src/update/new/indexer/document_operation.rs b/milli/src/update/new/indexer/document_operation.rs index 5d9755211..26228c354 100644 --- a/milli/src/update/new/indexer/document_operation.rs +++ b/milli/src/update/new/indexer/document_operation.rs @@ -76,7 +76,7 @@ impl<'p> DocumentChanges<'p> for DocumentOperation { fn document_changes( self, param: Self::Parameter, - ) -> Result>> + 'p> { + ) -> Result> + 'p> { let (index, rtxn, fields_ids_map, primary_key) = param; let documents_ids = index.documents_ids(rtxn)?; @@ -170,27 +170,85 @@ impl<'p> DocumentChanges<'p> for DocumentOperation { } } - Ok(docids_version_offsets.into_par_iter().map_with( - Arc::new(ItemsPool::new(|| index.read_txn().map_err(crate::Error::from))), - move |context_pool, (external_docid, (internal_docid, operations))| { - context_pool.with(|rtxn| { - use IndexDocumentsMethod as Idm; - let document_merge_function = match self.index_documents_method { - Idm::ReplaceDocuments => merge_document_for_replacements, - Idm::UpdateDocuments => merge_document_for_updates, - }; + Ok(docids_version_offsets + .into_par_iter() + .map_with( + Arc::new(ItemsPool::new(|| index.read_txn().map_err(crate::Error::from))), + move |context_pool, (external_docid, (internal_docid, operations))| { + context_pool.with(|rtxn| { + use IndexDocumentsMethod as Idm; + let document_merge_function = match self.index_documents_method { + Idm::ReplaceDocuments => merge_document_for_replacements, + Idm::UpdateDocuments => merge_document_for_updates, + }; - document_merge_function( - rtxn, - index, - fields_ids_map, - internal_docid, - external_docid, - &operations, - ) - }) - }, - )) + document_merge_function( + rtxn, + index, + fields_ids_map, + internal_docid, + external_docid, + &operations, + ) + }) + }, + ) + .filter_map(Result::transpose)) + } +} + +/// Returns only the most recent version of a document based on the updates from the payloads. +/// +/// This function is only meant to be used when doing a replacement and not an update. 
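
The new `.filter_map(Result::transpose)` above is what makes dropping the Option from document_changes possible: Ok(None) items (documents whose operations cancel out) silently disappear from the parallel stream, while errors and real changes are kept. A minimal standalone illustration with hypothetical values:

    fn main() {
        let items: Vec<Result<Option<u32>, String>> =
            vec![Ok(Some(1)), Ok(None), Err("boom".into())];
        let kept: Vec<Result<u32, String>> =
            items.into_iter().filter_map(Result::transpose).collect();
        // Ok(None) is filtered out; Ok(Some(1)) and Err("boom") remain.
        assert_eq!(kept, vec![Ok(1), Err("boom".into())]);
    }
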
+fn merge_document_for_replacements( + rtxn: &RoTxn, + index: &Index, + fields_ids_map: &FieldsIdsMap, + docid: DocumentId, + external_docid: String, + operations: &[InnerDocOp], +) -> Result> { + let current = index.documents.remap_data_type::().get(rtxn, &docid)?; + let current: Option<&KvReaderFieldId> = current.map(Into::into); + + match operations.last() { + Some(InnerDocOp::Addition(DocumentOffset { content, offset })) => { + let reader = DocumentsBatchReader::from_reader(Cursor::new(content.as_ref()))?; + let (mut cursor, batch_index) = reader.into_cursor_and_fields_index(); + let update = cursor.get(*offset)?.expect("must exists"); + + let mut document_entries = Vec::new(); + update.into_iter().for_each(|(k, v)| { + let field_name = batch_index.name(k).unwrap(); + let id = fields_ids_map.id(field_name).unwrap(); + document_entries.push((id, v)); + }); + + document_entries.sort_unstable_by_key(|(id, _)| *id); + + let mut writer = KvWriterFieldId::memory(); + document_entries.into_iter().for_each(|(id, value)| writer.insert(id, value).unwrap()); + let new = writer.into_boxed(); + + match current { + Some(current) => { + let update = Update::create(docid, external_docid, current.boxed(), new); + Ok(Some(DocumentChange::Update(update))) + } + None => { + let insertion = Insertion::create(docid, external_docid, new); + Ok(Some(DocumentChange::Insertion(insertion))) + } + } + } + Some(InnerDocOp::Deletion) => match current { + Some(current) => { + let deletion = Deletion::create(docid, external_docid, current.boxed()); + Ok(Some(DocumentChange::Deletion(deletion))) + } + None => Ok(None), + }, + None => Ok(None), // but it's strange } } @@ -271,58 +329,3 @@ fn merge_document_for_updates( } } } - -/// Returns only the most recent version of a document based on the updates from the payloads. -/// -/// This function is only meant to be used when doing a replacement and not an update. 
-fn merge_document_for_replacements( - rtxn: &RoTxn, - index: &Index, - fields_ids_map: &FieldsIdsMap, - docid: DocumentId, - external_docid: String, - operations: &[InnerDocOp], -) -> Result> { - let current = index.documents.remap_data_type::().get(rtxn, &docid)?; - let current: Option<&KvReaderFieldId> = current.map(Into::into); - - match operations.last() { - Some(InnerDocOp::Addition(DocumentOffset { content, offset })) => { - let reader = DocumentsBatchReader::from_reader(Cursor::new(content.as_ref()))?; - let (mut cursor, batch_index) = reader.into_cursor_and_fields_index(); - let update = cursor.get(*offset)?.expect("must exists"); - - let mut document_entries = Vec::new(); - update.into_iter().for_each(|(k, v)| { - let field_name = batch_index.name(k).unwrap(); - let id = fields_ids_map.id(field_name).unwrap(); - document_entries.push((id, v)); - }); - - document_entries.sort_unstable_by_key(|(id, _)| *id); - - let mut writer = KvWriterFieldId::memory(); - document_entries.into_iter().for_each(|(id, value)| writer.insert(id, value).unwrap()); - let new = writer.into_boxed(); - - match current { - Some(current) => { - let update = Update::create(docid, external_docid, current.boxed(), new); - Ok(Some(DocumentChange::Update(update))) - } - None => { - let insertion = Insertion::create(docid, external_docid, new); - Ok(Some(DocumentChange::Insertion(insertion))) - } - } - } - Some(InnerDocOp::Deletion) => match current { - Some(current) => { - let deletion = Deletion::create(docid, external_docid, current.boxed()); - Ok(Some(DocumentChange::Deletion(deletion))) - } - None => Ok(None), - }, - None => Ok(None), // but it's strange - } -} diff --git a/milli/src/update/new/indexer/mod.rs b/milli/src/update/new/indexer/mod.rs index 69ccc0451..ba4356288 100644 --- a/milli/src/update/new/indexer/mod.rs +++ b/milli/src/update/new/indexer/mod.rs @@ -1,4 +1,4 @@ -use std::thread; +use std::thread::{self, Builder}; use big_s::S; pub use document_deletion::DocumentDeletion; @@ -28,7 +28,7 @@ pub trait DocumentChanges<'p> { fn document_changes( self, param: Self::Parameter, - ) -> Result>> + 'p>; + ) -> Result> + 'p>; } /// This is the main function of this crate. 
@@ -43,7 +43,7 @@ pub fn index( document_changes: PI, ) -> Result<()> where - PI: IntoParallelIterator>> + Send, + PI: IntoParallelIterator> + Send, PI::Iter: Clone, { let (merger_sender, writer_receiver) = merger_writer_channel(100); @@ -52,20 +52,19 @@ where thread::scope(|s| { // TODO manage the errors correctly - let handle = - thread::Builder::new().name(S("indexer-extractors")).spawn_scoped(s, || { - pool.in_place_scope(|_s| { - // word docids - // document_changes.into_par_iter().try_for_each(|_dc| Ok(()) as Result<_>) - // let grenads = extractor_function(document_changes)?; - // deladd_cbo_roaring_bitmap_sender.word_docids(grenads)?; + let handle = Builder::new().name(S("indexer-extractors")).spawn_scoped(s, || { + pool.in_place_scope(|_s| { + // word docids + // document_changes.into_par_iter().try_for_each(|_dc| Ok(()) as Result<_>) + // let grenads = extractor_function(document_changes)?; + // deladd_cbo_roaring_bitmap_sender.word_docids(grenads)?; - Ok(()) as Result<_> - }) - })?; + Ok(()) as Result<_> + }) + })?; // TODO manage the errors correctly - let handle2 = thread::Builder::new().name(S("indexer-merger")).spawn_scoped(s, || { + let handle2 = Builder::new().name(S("indexer-merger")).spawn_scoped(s, || { let rtxn = index.read_txn().unwrap(); merge_grenad_entries(merger_receiver, merger_sender, &rtxn, index) })?; diff --git a/milli/src/update/new/indexer/partial_dump.rs b/milli/src/update/new/indexer/partial_dump.rs index 11c9fbd0e..d324322a7 100644 --- a/milli/src/update/new/indexer/partial_dump.rs +++ b/milli/src/update/new/indexer/partial_dump.rs @@ -31,7 +31,7 @@ where fn document_changes( self, param: Self::Parameter, - ) -> Result>> + 'p> { + ) -> Result> + 'p> { let (fields_ids_map, concurrent_available_ids, primary_key) = param; Ok(self.iter.into_iter().par_bridge().map(|object| { @@ -68,7 +68,7 @@ where }?; let insertion = Insertion::create(docid, external_docid, document); - Ok(Some(DocumentChange::Insertion(insertion))) + Ok(DocumentChange::Insertion(insertion)) })) } } diff --git a/milli/src/update/new/indexer/update_by_function.rs b/milli/src/update/new/indexer/update_by_function.rs index 035f95c02..91e1fd4ee 100644 --- a/milli/src/update/new/indexer/update_by_function.rs +++ b/milli/src/update/new/indexer/update_by_function.rs @@ -12,7 +12,7 @@ impl<'p> DocumentChanges<'p> for UpdateByFunction { fn document_changes( self, _param: Self::Parameter, - ) -> Result>> + 'p> { + ) -> Result> + 'p> { todo!(); Ok(vec![].into_par_iter()) } From 9b7858fb90c3e3df3541c46672a89563006d5ddf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 2 Sep 2024 15:21:59 +0200 Subject: [PATCH 020/247] Expose the new indexer --- milli/src/update/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs index c5e9272de..772a73236 100644 --- a/milli/src/update/mod.rs +++ b/milli/src/update/mod.rs @@ -18,7 +18,7 @@ pub(crate) mod del_add; pub(crate) mod facet; mod index_documents; mod indexer_config; -mod new; +pub mod new; mod settings; mod update_step; mod word_prefix_docids; From bcb1aa3d2294aa54d77e581f04b23ccde5cbb553 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 2 Sep 2024 19:39:48 +0200 Subject: [PATCH 021/247] Find a temporary solution to par into iter on an HashMap Spoiler: Do not use an HashMap but drain it into a Vec --- Cargo.lock | 3 +- Cargo.toml | 3 + index-scheduler/src/batch.rs | 156 ++++++++++-------- milli/src/fields_ids_map.rs | 2 + 
milli/src/fields_ids_map/global.rs | 84 ++++++++++ milli/src/update/new/global_fields_ids_map.rs | 65 -------- .../update/new/indexer/document_deletion.rs | 2 +- .../update/new/indexer/document_operation.rs | 12 +- milli/src/update/new/indexer/mod.rs | 66 +++++++- milli/src/update/new/indexer/partial_dump.rs | 6 +- .../update/new/indexer/update_by_function.rs | 5 +- milli/src/update/new/mod.rs | 2 +- 12 files changed, 254 insertions(+), 152 deletions(-) create mode 100644 milli/src/fields_ids_map/global.rs delete mode 100644 milli/src/update/new/global_fields_ids_map.rs diff --git a/Cargo.lock b/Cargo.lock index 281c0ab9d..e169dbd52 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4657,8 +4657,7 @@ dependencies = [ [[package]] name = "roaring" version = "0.10.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f4b84ba6e838ceb47b41de5194a60244fac43d9fe03b71dbe8c5a201081d6d1" +source = "git+https://github.com/RoaringBitmap/roaring-rs?branch=clone-iter-slice#348e58c2312fc37c0f351373cc7338cea86cf828" dependencies = [ "bytemuck", "byteorder", diff --git a/Cargo.toml b/Cargo.toml index 0fbfa9b12..3b9219ebc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -64,3 +64,6 @@ opt-level = 3 opt-level = 3 [profile.bench.package.yada] opt-level = 3 + +[patch.crates-io] +roaring = { git = "https://github.com/RoaringBitmap/roaring-rs", branch = "clone-iter-slice" } diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs index 1a056dde9..6ec2b17bf 100644 --- a/index-scheduler/src/batch.rs +++ b/index-scheduler/src/batch.rs @@ -22,19 +22,21 @@ use std::ffi::OsStr; use std::fmt; use std::fs::{self, File}; use std::io::BufWriter; +use std::sync::RwLock; use dump::IndexMetadata; use meilisearch_types::error::Code; use meilisearch_types::heed::{RoTxn, RwTxn}; -use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader}; +use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader, PrimaryKey}; use meilisearch_types::milli::heed::CompactionOption; +use meilisearch_types::milli::update::new::indexer::{self, guess_primary_key, DocumentChanges}; use meilisearch_types::milli::update::{ - IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings as MilliSettings, + self, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings as MilliSettings, }; use meilisearch_types::milli::vector::parsed_vectors::{ ExplicitVectors, VectorOrArrayOfVectors, RESERVED_VECTORS_FIELD_NAME, }; -use meilisearch_types::milli::{self, Filter, Object}; +use meilisearch_types::milli::{self, Filter, Object, UserError}; use meilisearch_types::settings::{apply_settings_to_builder, Settings, Unchecked}; use meilisearch_types::tasks::{Details, IndexSwap, Kind, KindWithContent, Status, Task}; use meilisearch_types::{compression, Index, VERSION_FILE_NAME}; @@ -1284,58 +1286,72 @@ impl IndexScheduler { let must_stop_processing = self.must_stop_processing.clone(); let indexer_config = self.index_mapper.indexer_config(); - if let Some(primary_key) = primary_key { - match index.primary_key(index_wtxn)? { - // if a primary key was set AND had already been defined in the index - // but to a different value, we can make the whole batch fail. - Some(pk) => { - if primary_key != pk { - return Err(milli::Error::from( - milli::UserError::PrimaryKeyCannotBeChanged(pk.to_string()), - ) - .into()); - } - } - // if the primary key was set and there was no primary key set for this index - // we set it to the received value before starting the indexing process. 
- None => { - let mut builder = - milli::update::Settings::new(index_wtxn, index, indexer_config); - builder.set_primary_key(primary_key); - builder.execute( - |indexing_step| tracing::debug!(update = ?indexing_step), - || must_stop_processing.clone().get(), - )?; - primary_key_has_been_set = true; - } - } - } + /// TODO manage errors correctly + let rtxn = index.read_txn()?; + let first_addition_uuid = operations + .iter() + .find_map(|op| match op { + DocumentOperation::Add(content_uuid) => Some(content_uuid), + _ => None, + }) + .unwrap(); + let content_file = self.file_store.get_update(*first_addition_uuid)?; + let reader = + DocumentsBatchReader::from_reader(content_file).map_err(milli::Error::from)?; + let (cursor, documents_batch_index) = reader.into_cursor_and_fields_index(); + let primary_key = + guess_primary_key(&rtxn, index, cursor, &documents_batch_index)?.unwrap(); - let config = IndexDocumentsConfig { update_method: method, ..Default::default() }; + // if let Some(primary_key) = primary_key { + // match index.primary_key(index_wtxn)? { + // // if a primary key was set AND had already been defined in the index + // // but to a different value, we can make the whole batch fail. + // Some(pk) => { + // if primary_key != pk { + // return Err(milli::Error::from( + // milli::UserError::PrimaryKeyCannotBeChanged(pk.to_string()), + // ) + // .into()); + // } + // } + // // if the primary key was set and there was no primary key set for this index + // // we set it to the received value before starting the indexing process. + // None => { + // todo!(); + // let mut builder = + // milli::update::Settings::new(index_wtxn, index, indexer_config); + // builder.set_primary_key(primary_key); + // builder.execute( + // |indexing_step| tracing::debug!(update = ?indexing_step), + // || must_stop_processing.clone().get(), + // )?; + // primary_key_has_been_set = true; + // } + // } + // } - let embedder_configs = index.embedding_configs(index_wtxn)?; - // TODO: consider Arc'ing the map too (we only need read access + we'll be cloning it multiple times, so really makes sense) - let embedders = self.embedders(embedder_configs)?; + // let config = IndexDocumentsConfig { update_method: method, ..Default::default() }; - let mut builder = milli::update::IndexDocuments::new( - index_wtxn, - index, - indexer_config, - config, - |indexing_step| tracing::trace!(?indexing_step, "Update"), - || must_stop_processing.get(), - )?; + // let embedder_configs = index.embedding_configs(index_wtxn)?; + // // TODO: consider Arc'ing the map too (we only need read access + we'll be cloning it multiple times, so really makes sense) + // let embedders = self.embedders(embedder_configs)?; + // let mut builder = milli::update::IndexDocuments::new( + // index_wtxn, + // index, + // indexer_config, + // config, + // |indexing_step| tracing::trace!(?indexing_step, "Update"), + // || must_stop_processing.get(), + // )?; + + let mut indexer = indexer::DocumentOperation::new(method); for (operation, task) in operations.into_iter().zip(tasks.iter_mut()) { match operation { DocumentOperation::Add(content_uuid) => { let content_file = self.file_store.get_update(content_uuid)?; - let reader = DocumentsBatchReader::from_reader(content_file) - .map_err(milli::Error::from)?; - let (new_builder, user_result) = builder.add_documents(reader)?; - builder = new_builder; - - builder = builder.with_embedders(embedders.clone()); + let stats = indexer.add_documents(content_file)?; + // builder = builder.with_embedders(embedders.clone()); let 
received_documents = if let Some(Details::DocumentAdditionOrUpdate { @@ -1349,30 +1365,17 @@ impl IndexScheduler { unreachable!(); }; - match user_result { - Ok(count) => { - task.status = Status::Succeeded; - task.details = Some(Details::DocumentAdditionOrUpdate { - received_documents, - indexed_documents: Some(count), - }) - } - Err(e) => { - task.status = Status::Failed; - task.details = Some(Details::DocumentAdditionOrUpdate { - received_documents, - indexed_documents: Some(0), - }); - task.error = Some(milli::Error::from(e).into()); - } - } + task.status = Status::Succeeded; + task.details = Some(Details::DocumentAdditionOrUpdate { + received_documents, + indexed_documents: Some(stats.document_count as u64), + }) } DocumentOperation::Delete(document_ids) => { - let (new_builder, user_result) = - builder.remove_documents(document_ids)?; - builder = new_builder; + let count = document_ids.len(); + indexer.delete_documents(document_ids); // Uses Invariant: remove documents actually always returns Ok for the inner result - let count = user_result.unwrap(); + // let count = user_result.unwrap(); let provided_ids = if let Some(Details::DocumentDeletion { provided_ids, .. }) = task.details @@ -1386,15 +1389,26 @@ impl IndexScheduler { task.status = Status::Succeeded; task.details = Some(Details::DocumentDeletion { provided_ids, - deleted_documents: Some(count), + deleted_documents: Some(count as u64), }); } } } if !tasks.iter().all(|res| res.error.is_some()) { - let addition = builder.execute()?; - tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done"); + let mut fields_ids_map = index.fields_ids_map(&rtxn)?; + /// TODO create a pool if needed + // let pool = indexer_config.thread_pool.unwrap(); + let pool = rayon::ThreadPoolBuilder::new().build().unwrap(); + // let fields_ids_map = RwLock::new(fields_ids_map); + let param = (index, &rtxn, &mut fields_ids_map, &primary_key); + let document_changes = indexer.document_changes(param)?; + indexer::index(index_wtxn, index, &pool, document_changes)?; + + /// TODO we must store it or not? + let fields_ids_map = fields_ids_map; + + // tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done"); } else if primary_key_has_been_set { // Everything failed but we've set a primary key. // We need to remove it. 
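A minimal sketch of the indexing flow that the batch.rs hunks above introduce, pulled together in one place and assuming exactly the calls shown in this patch (`indexer::DocumentOperation::new`, `add_documents`, `delete_documents`, `DocumentChanges::document_changes`, `guess_primary_key`, `indexer::index`); the helper name, its parameters, and the shortcut error handling are illustrative assumptions rather than code from the repository:

use std::fs::File;

use meilisearch_types::heed::RwTxn;
use meilisearch_types::milli::documents::DocumentsBatchReader;
use meilisearch_types::milli::update::new::indexer::{self, guess_primary_key, DocumentChanges};
use meilisearch_types::milli::update::IndexDocumentsMethod;
use meilisearch_types::milli::{self, Index};

/// Illustrative helper only: `payload_for_pk` and `payload` stand for the two
/// `file_store.get_update(..)` handles onto the same update file fetched above,
/// and `to_delete` for the ids of a `DocumentOperation::Delete`.
fn index_one_batch(
    index: &Index,
    index_wtxn: &mut RwTxn,
    payload_for_pk: File,
    payload: File,
    to_delete: Vec<String>,
) -> milli::Result<()> {
    let rtxn = index.read_txn()?;

    // Guess the primary key from the first addition payload, as done above
    // (the unwrap mirrors the "TODO manage errors correctly" of the patch).
    let reader = DocumentsBatchReader::from_reader(payload_for_pk).map_err(milli::Error::from)?;
    let (cursor, documents_batch_index) = reader.into_cursor_and_fields_index();
    let primary_key = guess_primary_key(&rtxn, index, cursor, &documents_batch_index)?.unwrap();

    // Accumulate the payload operations into the new DocumentOperation indexer.
    let mut indexer = indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments);
    let _stats = indexer.add_documents(payload)?;
    indexer.delete_documents(to_delete);

    // Materialize the operations as a parallel stream of `DocumentChange`s and
    // hand them to the new indexing entry point on a rayon thread pool.
    let mut fields_ids_map = index.fields_ids_map(&rtxn)?;
    let pool = rayon::ThreadPoolBuilder::new().build().unwrap();
    let param = (index, &rtxn, &mut fields_ids_map, &primary_key);
    let document_changes = indexer.document_changes(param)?;
    indexer::index(index_wtxn, index, &pool, document_changes)?;

    Ok(())
}

As the indexer/mod.rs hunks earlier in the series show, the resulting `DocumentChange` stream is then consumed inside the rayon pool on one scoped extractor thread while a second scoped thread merges the grenad entries into the index.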
diff --git a/milli/src/fields_ids_map.rs b/milli/src/fields_ids_map.rs index f9d7c3704..39d67f20c 100644 --- a/milli/src/fields_ids_map.rs +++ b/milli/src/fields_ids_map.rs @@ -4,6 +4,8 @@ use serde::{Deserialize, Serialize}; use crate::FieldId; +mod global; + #[derive(Debug, Clone, Serialize, Deserialize)] pub struct FieldsIdsMap { names_ids: BTreeMap, diff --git a/milli/src/fields_ids_map/global.rs b/milli/src/fields_ids_map/global.rs new file mode 100644 index 000000000..857d13a2a --- /dev/null +++ b/milli/src/fields_ids_map/global.rs @@ -0,0 +1,84 @@ +use std::collections::BTreeMap; +use std::sync::RwLock; + +use crate::{FieldId, FieldsIdsMap}; + +/// A fields ids map that can be globally updated to add fields +pub struct GlobalFieldsIdsMap<'indexing> { + global: &'indexing RwLock, + local: LocalFieldsIdsMap, +} + +struct LocalFieldsIdsMap { + names_ids: BTreeMap, + ids_names: BTreeMap, +} + +impl LocalFieldsIdsMap { + fn new(global: &RwLock) -> Self { + let global = global.read().unwrap(); + Self { names_ids: global.names_ids.clone(), ids_names: global.ids_names.clone() } + } + + fn insert(&mut self, name: &str, field_id: FieldId) { + self.names_ids.insert(name.to_owned(), field_id); + self.ids_names.insert(field_id, name.to_owned()); + } + + fn name(&self, id: FieldId) -> Option<&str> { + self.ids_names.get(&id).map(String::as_str) + } + + fn id(&self, name: &str) -> Option { + self.names_ids.get(name).copied() + } +} + +impl<'indexing> GlobalFieldsIdsMap<'indexing> { + pub fn new(global: &'indexing RwLock) -> Self { + Self { local: LocalFieldsIdsMap::new(global), global } + } + + /// Returns the field id related to a field name, it will create a new field id if the + /// name is not already known. Returns `None` if the maximum field id as been reached. + pub fn id_or_insert(&mut self, name: &str) -> Option { + if let Some(field_id) = self.local.id(name) { + return Some(field_id); + } + + { + // optimistically lookup the global map + let global = self.global.read().unwrap(); + + if let Some(field_id) = global.id(name) { + self.local.insert(name, field_id); + return Some(field_id); + } + } + + { + let mut global = self.global.write().unwrap(); + + if let Some(field_id) = global.id(name) { + self.local.insert(name, field_id); + return Some(field_id); + } + + let field_id = global.insert(name)?; + self.local.insert(name, field_id); + Some(field_id) + } + } + + /// Get the name of a field based on its id. + pub fn name(&mut self, id: FieldId) -> Option<&str> { + if self.local.name(id).is_none() { + let global = self.global.read().unwrap(); + + let name = global.name(id)?; + self.local.insert(name, id); + } + + self.local.name(id) + } +} diff --git a/milli/src/update/new/global_fields_ids_map.rs b/milli/src/update/new/global_fields_ids_map.rs deleted file mode 100644 index 4bd7b27d9..000000000 --- a/milli/src/update/new/global_fields_ids_map.rs +++ /dev/null @@ -1,65 +0,0 @@ -use std::sync::{Arc, RwLock}; - -use crate::{FieldId, FieldsIdsMap}; - -/// A fields ids map that can be globally updated to add fields -pub struct GlobalFieldsIdsMap { - global: Arc>, - local: FieldsIdsMap, -} - -impl GlobalFieldsIdsMap { - pub fn new(global: FieldsIdsMap) -> Self { - Self { local: global.clone(), global: Arc::new(RwLock::new(global)) } - } - - /// Returns the number of fields ids in the map. - pub fn global_len(&self) -> usize { - todo!() - } - - /// Returns `true` if the map is empty. 
- pub fn global_is_empty(&self) -> bool { - todo!() - } - - /// Returns the field id related to a field name, it will create a new field id if the - /// name is not already known. Returns `None` if the maximum field id as been reached. - pub fn insert(&mut self, name: &str) -> Option { - match self.names_ids.get(name) { - Some(id) => Some(*id), - None => { - let id = self.next_id?; - self.next_id = id.checked_add(1); - self.names_ids.insert(name.to_owned(), id); - self.ids_names.insert(id, name.to_owned()); - Some(id) - } - } - } - - /// Get the id of a field based on its name. - pub fn id(&self, name: &str) -> Option { - self.names_ids.get(name).copied() - } - - /// Get the name of a field based on its id. - pub fn name(&self, id: FieldId) -> Option<&str> { - self.ids_names.get(&id).map(String::as_str) - } - - /// Iterate over the ids and names in the ids order. - pub fn iter(&self) -> impl Iterator { - self.ids_names.iter().map(|(id, name)| (*id, name.as_str())) - } - - /// Iterate over the ids in the order of the ids. - pub fn ids(&'_ self) -> impl Iterator + '_ { - self.ids_names.keys().copied() - } - - /// Iterate over the names in the order of the ids. - pub fn names(&self) -> impl Iterator { - self.ids_names.values().map(AsRef::as_ref) - } -} diff --git a/milli/src/update/new/indexer/document_deletion.rs b/milli/src/update/new/indexer/document_deletion.rs index 5e43b5816..3444d58f7 100644 --- a/milli/src/update/new/indexer/document_deletion.rs +++ b/milli/src/update/new/indexer/document_deletion.rs @@ -28,7 +28,7 @@ impl<'p> DocumentChanges<'p> for DocumentDeletion { fn document_changes( self, param: Self::Parameter, - ) -> Result> + 'p> { + ) -> Result> + Clone + 'p> { let (index, fields, primary_key) = param; let items = Arc::new(ItemsPool::new(|| index.read_txn().map_err(crate::Error::from))); Ok(self.to_delete.into_iter().par_bridge().map_with(items, |items, docid| { diff --git a/milli/src/update/new/indexer/document_operation.rs b/milli/src/update/new/indexer/document_operation.rs index 26228c354..568df654e 100644 --- a/milli/src/update/new/indexer/document_operation.rs +++ b/milli/src/update/new/indexer/document_operation.rs @@ -34,6 +34,7 @@ pub struct PayloadStats { pub bytes: u64, } +#[derive(Clone)] enum InnerDocOp { Addition(DocumentOffset), Deletion, @@ -41,6 +42,7 @@ enum InnerDocOp { /// Represents an offset where a document lives /// in an mmapped grenad reader file. +#[derive(Clone)] pub struct DocumentOffset { /// The mmapped grenad reader file. pub content: Arc, // grenad::Reader @@ -76,7 +78,7 @@ impl<'p> DocumentChanges<'p> for DocumentOperation { fn document_changes( self, param: Self::Parameter, - ) -> Result> + 'p> { + ) -> Result> + Clone + 'p> { let (index, rtxn, fields_ids_map, primary_key) = param; let documents_ids = index.documents_ids(rtxn)?; @@ -170,6 +172,11 @@ impl<'p> DocumentChanges<'p> for DocumentOperation { } } + /// TODO is it the best way to provide FieldsIdsMap to the parallel iterator? 
+ let fields_ids_map = fields_ids_map.clone(); + // We must drain the HashMap into a Vec because rayon::hash_map::IntoIter: !Clone + let docids_version_offsets: Vec<_> = docids_version_offsets.drain().collect(); + Ok(docids_version_offsets .into_par_iter() .map_with( @@ -177,6 +184,7 @@ impl<'p> DocumentChanges<'p> for DocumentOperation { move |context_pool, (external_docid, (internal_docid, operations))| { context_pool.with(|rtxn| { use IndexDocumentsMethod as Idm; + let document_merge_function = match self.index_documents_method { Idm::ReplaceDocuments => merge_document_for_replacements, Idm::UpdateDocuments => merge_document_for_updates, @@ -185,7 +193,7 @@ impl<'p> DocumentChanges<'p> for DocumentOperation { document_merge_function( rtxn, index, - fields_ids_map, + &fields_ids_map, internal_docid, external_docid, &operations, diff --git a/milli/src/update/new/indexer/mod.rs b/milli/src/update/new/indexer/mod.rs index ba4356288..ca5bb71eb 100644 --- a/milli/src/update/new/indexer/mod.rs +++ b/milli/src/update/new/indexer/mod.rs @@ -1,9 +1,10 @@ +use std::fs::File; use std::thread::{self, Builder}; use big_s::S; pub use document_deletion::DocumentDeletion; pub use document_operation::DocumentOperation; -use heed::RwTxn; +use heed::{RoTxn, RwTxn}; pub use partial_dump::PartialDump; use rayon::iter::{IntoParallelIterator, ParallelIterator}; use rayon::ThreadPool; @@ -15,7 +16,11 @@ use super::channel::{ }; use super::document_change::DocumentChange; use super::merger::merge_grenad_entries; -use crate::{Index, Result}; +use super::StdResult; +use crate::documents::{ + obkv_to_object, DocumentsBatchCursor, DocumentsBatchIndex, PrimaryKey, DEFAULT_PRIMARY_KEY, +}; +use crate::{Index, Result, UserError}; mod document_deletion; mod document_operation; @@ -28,7 +33,7 @@ pub trait DocumentChanges<'p> { fn document_changes( self, param: Self::Parameter, - ) -> Result> + 'p>; + ) -> Result> + Clone + 'p>; } /// This is the main function of this crate. @@ -40,7 +45,7 @@ pub fn index( wtxn: &mut RwTxn, index: &Index, pool: &ThreadPool, - document_changes: PI, + _document_changes: PI, ) -> Result<()> where PI: IntoParallelIterator> + Send, @@ -88,3 +93,56 @@ where Ok(()) }) } + +/// TODO move this elsewhere +pub fn guess_primary_key<'a>( + rtxn: &'a RoTxn<'a>, + index: &Index, + mut cursor: DocumentsBatchCursor, + documents_batch_index: &'a DocumentsBatchIndex, +) -> Result, UserError>> { + // The primary key *field id* that has already been set for this index or the one + // we will guess by searching for the first key that contains "id" as a substring. + match index.primary_key(rtxn)? { + Some(primary_key) => match PrimaryKey::new(primary_key, documents_batch_index) { + Some(primary_key) => Ok(Ok(primary_key)), + None => match cursor.next_document()? { + Some(first_document) => Ok(Err(UserError::MissingDocumentId { + primary_key: primary_key.to_string(), + document: obkv_to_object(first_document, documents_batch_index)?, + })), + None => unreachable!("Called with reader.is_empty()"), + }, + }, + None => { + let mut guesses: Vec<(u16, &str)> = documents_batch_index + .iter() + .filter(|(_, name)| name.to_lowercase().ends_with(DEFAULT_PRIMARY_KEY)) + .map(|(field_id, name)| (*field_id, name.as_str())) + .collect(); + + // sort the keys in a deterministic, obvious way, so that fields are always in the same order. 
+ guesses.sort_by(|(_, left_name), (_, right_name)| { + // shortest name first + left_name.len().cmp(&right_name.len()).then_with( + // then alphabetical order + || left_name.cmp(right_name), + ) + }); + + match guesses.as_slice() { + [] => Ok(Err(UserError::NoPrimaryKeyCandidateFound)), + [(field_id, name)] => { + tracing::info!("Primary key was not specified in index. Inferred to '{name}'"); + Ok(Ok(PrimaryKey::Flat { name, field_id: *field_id })) + } + multiple => Ok(Err(UserError::MultiplePrimaryKeyCandidatesFound { + candidates: multiple + .iter() + .map(|(_, candidate)| candidate.to_string()) + .collect(), + })), + } + } + } +} diff --git a/milli/src/update/new/indexer/partial_dump.rs b/milli/src/update/new/indexer/partial_dump.rs index d324322a7..6699a6ba7 100644 --- a/milli/src/update/new/indexer/partial_dump.rs +++ b/milli/src/update/new/indexer/partial_dump.rs @@ -7,7 +7,7 @@ use crate::update::new::{DocumentChange, Insertion, KvWriterFieldId}; use crate::{all_obkv_to_json, Error, FieldsIdsMap, Object, Result, UserError}; pub struct PartialDump { - pub iter: I, + iter: I, } impl PartialDump { @@ -19,7 +19,7 @@ impl PartialDump { impl<'p, I> DocumentChanges<'p> for PartialDump where I: IntoIterator, - I::IntoIter: Send + 'p, + I::IntoIter: Send + Clone + 'p, I::Item: Send, { type Parameter = (&'p FieldsIdsMap, &'p ConcurrentAvailableIds, &'p PrimaryKey<'p>); @@ -31,7 +31,7 @@ where fn document_changes( self, param: Self::Parameter, - ) -> Result> + 'p> { + ) -> Result> + Clone + 'p> { let (fields_ids_map, concurrent_available_ids, primary_key) = param; Ok(self.iter.into_iter().par_bridge().map(|object| { diff --git a/milli/src/update/new/indexer/update_by_function.rs b/milli/src/update/new/indexer/update_by_function.rs index 91e1fd4ee..fc908e31a 100644 --- a/milli/src/update/new/indexer/update_by_function.rs +++ b/milli/src/update/new/indexer/update_by_function.rs @@ -12,8 +12,7 @@ impl<'p> DocumentChanges<'p> for UpdateByFunction { fn document_changes( self, _param: Self::Parameter, - ) -> Result> + 'p> { - todo!(); - Ok(vec![].into_par_iter()) + ) -> Result> + Clone + 'p> { + Ok((0..100).into_par_iter().map(|_| todo!())) } } diff --git a/milli/src/update/new/mod.rs b/milli/src/update/new/mod.rs index cd94bd5d2..ad61d8343 100644 --- a/milli/src/update/new/mod.rs +++ b/milli/src/update/new/mod.rs @@ -7,8 +7,8 @@ use crate::FieldId; mod document_change; mod merger; // mod extract; -// mod global_fields_ids_map; mod channel; +//mod global_fields_ids_map; pub mod indexer; mod items_pool; From 5369bf4a62bf411573094008f9141f6784bede0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 2 Sep 2024 19:51:22 +0200 Subject: [PATCH 022/247] Change some lifetimes --- milli/src/update/new/indexer/document_operation.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/update/new/indexer/document_operation.rs b/milli/src/update/new/indexer/document_operation.rs index 568df654e..f5dcfcfe6 100644 --- a/milli/src/update/new/indexer/document_operation.rs +++ b/milli/src/update/new/indexer/document_operation.rs @@ -73,7 +73,7 @@ impl DocumentOperation { } impl<'p> DocumentChanges<'p> for DocumentOperation { - type Parameter = (&'p Index, &'p RoTxn<'static>, &'p mut FieldsIdsMap, &'p PrimaryKey<'p>); + type Parameter = (&'p Index, &'p RoTxn<'p>, &'p mut FieldsIdsMap, &'p PrimaryKey<'p>); fn document_changes( self, From c50d3edc4a0cb0cc55b6b59c906342aef2dbbcf5 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 3 Sep 2024 11:02:39 +0200 Subject: 
[PATCH 023/247] Integrate first searchable exctrator --- milli/src/update/new/channel.rs | 17 +- milli/src/update/new/document_change.rs | 26 +- milli/src/update/new/extract/cache.rs | 4 +- .../update/new/extract/extract_word_docids.rs | 250 +++++++++++----- milli/src/update/new/extract/mod.rs | 4 + .../update/new/extract/tokenize_document.rs | 268 +++++++++++++----- milli/src/update/new/indexer/mod.rs | 18 +- milli/src/update/new/merger.rs | 6 +- milli/src/update/new/mod.rs | 7 +- 9 files changed, 419 insertions(+), 181 deletions(-) diff --git a/milli/src/update/new/channel.rs b/milli/src/update/new/channel.rs index 4041fcc6a..d94b2cc00 100644 --- a/milli/src/update/new/channel.rs +++ b/milli/src/update/new/channel.rs @@ -1,10 +1,12 @@ use std::fs::File; use crossbeam_channel::{IntoIter, Receiver, SendError, Sender}; +use grenad::Merger; use heed::types::Bytes; use super::StdResult; use crate::update::new::KvReaderFieldId; +use crate::update::MergeDeladdCboRoaringBitmaps; use crate::{DocumentId, Index}; /// The capacity of the channel is currently in number of messages. @@ -159,7 +161,7 @@ impl DocumentSender { } pub enum MergerOperation { - WordDocidsCursors(Vec>), + WordDocidsMerger(Merger), } pub struct MergerReceiver(Receiver); @@ -175,3 +177,16 @@ impl IntoIterator for MergerReceiver { #[derive(Clone)] pub struct DeladdCboRoaringBitmapSender(Sender); + +impl DeladdCboRoaringBitmapSender { + pub fn word_docids( + &self, + merger: Merger, + ) -> StdResult<(), SendError<()>> { + let operation = MergerOperation::WordDocidsMerger(merger); + match self.0.send(operation) { + Ok(()) => Ok(()), + Err(SendError(_)) => Err(SendError(())), + } + } +} diff --git a/milli/src/update/new/document_change.rs b/milli/src/update/new/document_change.rs index 6f9d767cb..9076f32db 100644 --- a/milli/src/update/new/document_change.rs +++ b/milli/src/update/new/document_change.rs @@ -2,7 +2,7 @@ use heed::RoTxn; use obkv::KvReader; use crate::update::new::KvReaderFieldId; -use crate::{DocumentId, FieldId}; +use crate::{DocumentId, FieldId, Index}; pub enum DocumentChange { Deletion(Deletion), @@ -12,14 +12,14 @@ pub enum DocumentChange { pub struct Deletion { docid: DocumentId, - external_docid: String, // ? - current: Box, + external_docid: String, // ? + current: Box, // ? } pub struct Update { docid: DocumentId, - external_docid: String, // ? - current: Box, + external_docid: String, // ? + current: Box, // ? 
new: Box, } @@ -30,7 +30,7 @@ pub struct Insertion { } impl DocumentChange { - fn docid(&self) -> DocumentId { + pub fn docid(&self) -> DocumentId { match &self { Self::Deletion(inner) => inner.docid(), Self::Update(inner) => inner.docid(), @@ -48,11 +48,11 @@ impl Deletion { Self { docid, external_docid, current } } - fn docid(&self) -> DocumentId { + pub fn docid(&self) -> DocumentId { self.docid } - fn current(&self, rtxn: &RoTxn) -> &KvReader { + pub fn current(&self, rtxn: &RoTxn, index: &Index) -> &KvReader { unimplemented!() } } @@ -62,11 +62,11 @@ impl Insertion { Insertion { docid, external_docid, new } } - fn docid(&self) -> DocumentId { + pub fn docid(&self) -> DocumentId { self.docid } - fn new(&self) -> &KvReader { + pub fn new(&self) -> &KvReader { unimplemented!() } } @@ -81,15 +81,15 @@ impl Update { Update { docid, external_docid, current, new } } - fn docid(&self) -> DocumentId { + pub fn docid(&self) -> DocumentId { self.docid } - fn current(&self, rtxn: &RoTxn) -> &KvReader { + pub fn current(&self, rtxn: &RoTxn, index: &Index) -> &KvReader { unimplemented!() } - fn new(&self) -> &KvReader { + pub fn new(&self) -> &KvReader { unimplemented!() } } diff --git a/milli/src/update/new/extract/cache.rs b/milli/src/update/new/extract/cache.rs index 0d72a5a8d..878150eb3 100644 --- a/milli/src/update/new/extract/cache.rs +++ b/milli/src/update/new/extract/cache.rs @@ -2,12 +2,12 @@ use std::borrow::Cow; use std::num::NonZeroUsize; use std::{io, mem}; -use grenad2::{MergeFunction, Sorter}; +use grenad::{MergeFunction, Sorter}; use lru::LruCache; use roaring::RoaringBitmap; use smallvec::SmallVec; -use crate::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; +use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; #[derive(Debug)] pub struct CachedSorter { diff --git a/milli/src/update/new/extract/extract_word_docids.rs b/milli/src/update/new/extract/extract_word_docids.rs index e2e1520bc..e2261748a 100644 --- a/milli/src/update/new/extract/extract_word_docids.rs +++ b/milli/src/update/new/extract/extract_word_docids.rs @@ -1,84 +1,180 @@ -pub fn extract_word_docids( - document_change: DocumentChange, - _tokenizer: &Tokenizer, - output: &mut CachedSorter, -) -> grenad::Result<(), io::Error> { - match document_change { - DocumentChange::Deletion(inner) => { - unimplemented!() - } - DocumentChange::Update(inner) => { - unimplemented!() - } - DocumentChange::Insertion(inner) => { - unimplemented!() +use std::fs::File; + +use charabia::TokenizerBuilder; +use grenad::Merger; +use grenad::ReaderCursor; +use heed::RoTxn; +use rayon::iter::IntoParallelIterator; +use rayon::iter::ParallelBridge; +use rayon::iter::ParallelIterator; + +use crate::update::MergeDeladdCboRoaringBitmaps; +use crate::{ + update::{ + create_sorter, + new::{DocumentChange, ItemsPool}, + GrenadParameters, + }, + FieldsIdsMap, Index, Result, MAX_POSITION_PER_ATTRIBUTE, +}; + +use super::{ + cache::{CachedSorter, DelAddRoaringBitmapMerger}, + tokenize_document::DocumentTokenizer, +}; + +pub trait SearchableExtractor { + fn run_extraction( + index: &Index, + fields_ids_map: &FieldsIdsMap, + indexer: GrenadParameters, + document_changes: impl IntoParallelIterator>, + ) -> Result> { + let max_memory = indexer.max_memory_by_thread(); + + let rtxn = index.read_txn()?; + let stop_words = index.stop_words(&rtxn)?; + let allowed_separators = index.allowed_separators(&rtxn)?; + let allowed_separators: Option> = + allowed_separators.as_ref().map(|s| s.iter().map(String::as_str).collect()); + let dictionary = 
index.dictionary(&rtxn)?; + let dictionary: Option> = + dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect()); + let builder = tokenizer_builder( + stop_words.as_ref(), + allowed_separators.as_deref(), + dictionary.as_deref(), + ); + let tokenizer = builder.into_tokenizer(); + + let user_defined_searchable_fields = index.user_defined_searchable_fields(&rtxn)?; + let localized_attributes_rules = + index.localized_attributes_rules(&rtxn)?.unwrap_or_default(); + + let document_tokenizer = DocumentTokenizer { + tokenizer: &tokenizer, + searchable_attributes: user_defined_searchable_fields.as_deref(), + localized_attributes_rules: &localized_attributes_rules, + max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE, + }; + + let context_pool = ItemsPool::new(|| { + Ok(( + index.read_txn()?, + &document_tokenizer, + CachedSorter::new( + // TODO use a better value + 100.try_into().unwrap(), + create_sorter( + grenad::SortAlgorithm::Stable, + DelAddRoaringBitmapMerger, + indexer.chunk_compression_type, + indexer.chunk_compression_level, + indexer.max_nb_chunks, + max_memory, + ), + ), + )) + }); + + document_changes.into_par_iter().try_for_each(|document_change| { + context_pool.with(|(rtxn, document_tokenizer, cached_sorter)| { + Self::extract_document_change( + &*rtxn, + index, + document_tokenizer, + &fields_ids_map, + cached_sorter, + document_change?, + ) + }) + })?; + + let mut builder = grenad::MergerBuilder::new(MergeDeladdCboRoaringBitmaps); + for (_rtxn, _tokenizer, cache) in context_pool.into_items() { + let sorter = cache.into_sorter()?; + let readers = sorter.into_reader_cursors()?; + builder.extend(readers); } + + Ok(builder.build()) } - let normalizer_options = NormalizerOption::default(); - - if let Some(previous_doc) = previous_doc { - for (_, v) in previous_doc.iter() { - // Only manage the direct JSON strings - // TODO manage the JSON strings correctly (escaped chars) - if v.first().zip(v.last()) == Some((&b'"', &b'"')) { - let s = std::str::from_utf8(&v[1..v.len() - 1]).unwrap(); - // for token in tokenizer.tokenize(s).filter(|t| t.is_word()) { - // let key = token.lemma().normalize(&normalizer_options); - for token in s.split_whitespace() { - let key = token.normalize(&normalizer_options); - output.insert_del_u32(key.as_bytes(), docid)?; - } - } - } - } - - for (_, v) in new_doc.iter() { - // Only manage the direct JSON strings - // TODO manage the JSON strings correctly (escaped chars) - if v.first().zip(v.last()) == Some((&b'"', &b'"')) { - let s = std::str::from_utf8(&v[1..v.len() - 1]).unwrap(); - // for token in tokenizer.tokenize(s).filter(|t| t.is_word()) { - // let key = token.lemma().normalize(&normalizer_options); - for token in s.split_whitespace() { - let key = token.normalize(&normalizer_options); - output.insert_add_u32(key.as_bytes(), docid)?; - } - } - } - - Ok(()) + fn extract_document_change( + rtxn: &RoTxn, + index: &Index, + document_tokenizer: &DocumentTokenizer, + fields_ids_map: &FieldsIdsMap, + cached_sorter: &mut CachedSorter, + document_change: DocumentChange, + ) -> Result<()>; } -/// take an iterator on tokens and compute their relative position depending on separator kinds -/// if it's an `Hard` separator we add an additional relative proximity of 8 between words, -/// else we keep the standard proximity of 1 between words. 
-fn process_tokens<'a>( - tokens: impl Iterator>, -) -> impl Iterator)> { - tokens - .skip_while(|token| token.is_separator()) - .scan((0, None), |(offset, prev_kind), mut token| { - match token.kind { - TokenKind::Word | TokenKind::StopWord if !token.lemma().is_empty() => { - *offset += match *prev_kind { - Some(TokenKind::Separator(SeparatorKind::Hard)) => 8, - Some(_) => 1, - None => 0, - }; - *prev_kind = Some(token.kind) - } - TokenKind::Separator(SeparatorKind::Hard) => { - *prev_kind = Some(token.kind); - } - TokenKind::Separator(SeparatorKind::Soft) - if *prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) => - { - *prev_kind = Some(token.kind); - } - _ => token.kind = TokenKind::Unknown, +pub struct WordDocidsExtractor; +impl SearchableExtractor for WordDocidsExtractor { + fn extract_document_change( + rtxn: &RoTxn, + index: &Index, + document_tokenizer: &DocumentTokenizer, + fields_ids_map: &FieldsIdsMap, + // TODO: DelAddRoaringBitmapMerger should be CBO + cached_sorter: &mut CachedSorter, + document_change: DocumentChange, + ) -> crate::Result<()> { + match document_change { + DocumentChange::Deletion(inner) => { + let mut token_fn = |_fid, _pos: u16, word: &str| { + cached_sorter.insert_del_u32(word.as_bytes(), inner.docid()).unwrap(); + }; + document_tokenizer.tokenize_document( + inner.current(rtxn, index), + fields_ids_map, + &mut token_fn, + )?; } - Some((*offset, token)) - }) - .filter(|(_, t)| t.is_word()) + DocumentChange::Update(inner) => { + let mut token_fn = |_fid, _pos, word: &str| { + cached_sorter.insert_del_u32(word.as_bytes(), inner.docid()).unwrap(); + }; + document_tokenizer.tokenize_document( + inner.current(rtxn, index), + fields_ids_map, + &mut token_fn, + )?; + + let mut token_fn = |_fid, _pos, word: &str| { + cached_sorter.insert_add_u32(word.as_bytes(), inner.docid()).unwrap(); + }; + document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?; + } + DocumentChange::Insertion(inner) => { + let mut token_fn = |_fid, _pos, word: &str| { + cached_sorter.insert_add_u32(word.as_bytes(), inner.docid()).unwrap(); + }; + document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?; + } + } + + Ok(()) + } +} + +/// Factorize tokenizer building. 
+fn tokenizer_builder<'a>( + stop_words: Option<&'a fst::Set<&'a [u8]>>, + allowed_separators: Option<&'a [&str]>, + dictionary: Option<&'a [&str]>, +) -> TokenizerBuilder<'a, &'a [u8]> { + let mut tokenizer_builder = TokenizerBuilder::new(); + if let Some(stop_words) = stop_words { + tokenizer_builder.stop_words(stop_words); + } + if let Some(dictionary) = dictionary { + tokenizer_builder.words_dict(dictionary); + } + if let Some(separators) = allowed_separators { + tokenizer_builder.separators(separators); + } + + tokenizer_builder } diff --git a/milli/src/update/new/extract/mod.rs b/milli/src/update/new/extract/mod.rs index 26732d4c8..3124068d9 100644 --- a/milli/src/update/new/extract/mod.rs +++ b/milli/src/update/new/extract/mod.rs @@ -1,2 +1,6 @@ mod cache; mod extract_word_docids; +mod tokenize_document; + +pub use extract_word_docids::SearchableExtractor; +pub use extract_word_docids::WordDocidsExtractor; diff --git a/milli/src/update/new/extract/tokenize_document.rs b/milli/src/update/new/extract/tokenize_document.rs index 8793063b0..40f0b4374 100644 --- a/milli/src/update/new/extract/tokenize_document.rs +++ b/milli/src/update/new/extract/tokenize_document.rs @@ -1,56 +1,71 @@ -pub struct DocumentTokenizer { - tokenizer: &Tokenizer, - searchable_attributes: Option<&[String]>, - localized_attributes_rules: &[LocalizedAttributesRule], - max_positions_per_attributes: u32, +use crate::{ + update::new::KvReaderFieldId, FieldId, FieldsIdsMap, Index, InternalError, + LocalizedAttributesRule, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH, +}; +use charabia::{SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder}; +use heed::RoTxn; +use serde_json::Value; +use std::collections::HashMap; + +pub struct DocumentTokenizer<'a> { + pub tokenizer: &'a Tokenizer<'a>, + pub searchable_attributes: Option<&'a [&'a str]>, + pub localized_attributes_rules: &'a [LocalizedAttributesRule], + pub max_positions_per_attributes: u32, } -impl DocumentTokenizer { - // pub fn new(tokenizer: &Tokenizer, settings: &InnerIndexSettings) -> Self { - // Self { tokenizer, settings } - // } - - pub fn tokenize_document<'a>( - obkv: &KvReader<'a, FieldId>, +impl<'a> DocumentTokenizer<'a> { + pub fn tokenize_document( + &self, + obkv: &KvReaderFieldId, field_id_map: &FieldsIdsMap, - token_fn: impl Fn(FieldId, u16, &str), - ) { - let mut field_position = Hashmap::new(); + token_fn: &mut impl FnMut(FieldId, u16, &str), + ) -> Result<()> { + let mut field_position = HashMap::new(); for (field_id, field_bytes) in obkv { - let field_name = field_id_map.name(field_id); + let Some(field_name) = field_id_map.name(field_id) else { + unreachable!("field id not found in field id map"); + }; + + let mut tokenize_field = |name: &str, value: &Value| { + let Some(field_id) = field_id_map.id(name) else { + unreachable!("field name not found in field id map"); + }; + + let position = + field_position.entry(field_id).and_modify(|counter| *counter += 8).or_insert(0); + if *position as u32 >= self.max_positions_per_attributes { + return; + } - let tokenize_field = |name, value| { - let field_id = field_id_map.id(name); match value { - Number(n) => { + Value::Number(n) => { let token = n.to_string(); - let position = field_position - .entry(field_id) - .and_modify(|counter| *counter += 8) - .or_insert(0); - token_fn(field_id, position, token.as_str()); + if let Ok(position) = (*position).try_into() { + token_fn(field_id, position, token.as_str()); + } } - String(text) => { + Value::String(text) => { // create an iterator of 
token with their positions. let locales = self .localized_attributes_rules .iter() - .first(|rule| rule.match_str(field_name)) - .map(|rule| rule.locales(field_id)); - let tokens = - process_tokens(tokenizer.tokenize_with_allow_list(field, locales)) - .take_while(|(p, _)| { - (*p as u32) < self.max_positions_per_attributes - }); + .find(|rule| rule.match_str(field_name)) + .map(|rule| rule.locales()); + let tokens = process_tokens( + *position, + self.tokenizer.tokenize_with_allow_list(text.as_str(), locales), + ) + .take_while(|(p, _)| (*p as u32) < self.max_positions_per_attributes); for (index, token) in tokens { // keep a word only if it is not empty and fit in a LMDB key. let token = token.lemma().trim(); if !token.is_empty() && token.len() <= MAX_WORD_LENGTH { - let position: u16 = index - .try_into() - .map_err(|_| SerializationError::InvalidNumberSerialization)?; - writer.insert(position, token.as_bytes())?; + *position = index; + if let Ok(position) = (*position).try_into() { + token_fn(field_id, position, token); + } } } } @@ -59,21 +74,28 @@ impl DocumentTokenizer { }; // if the current field is searchable or contains a searchable attribute - if searchable_attributes.map_or(true, |attributes| { - attributes.iter().any(|name| contained_in(name, field_name)) + if self.searchable_attributes.map_or(true, |attributes| { + attributes.iter().any(|name| perm_json_p::contained_in(name, field_name)) }) { // parse json. match serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)? { - Value::Object(object) => { - seek_leaf_values_in_object(object, selectors, &field_name, tokenize_field) - } - Value::Array(array) => { - seek_leaf_values_in_array(array, selectors, &field_name, tokenize_field) - } - value => tokenize_field(&base_key, value), + Value::Object(object) => perm_json_p::seek_leaf_values_in_object( + &object, + self.searchable_attributes.as_deref(), + &field_name, + &mut tokenize_field, + ), + Value::Array(array) => perm_json_p::seek_leaf_values_in_array( + &array, + self.searchable_attributes.as_deref(), + &field_name, + &mut tokenize_field, + ), + value => tokenize_field(&field_name, &value), } } } + Ok(()) } } @@ -81,11 +103,12 @@ impl DocumentTokenizer { /// if it's an `Hard` separator we add an additional relative proximity of 8 between words, /// else we keep the standard proximity of 1 between words. fn process_tokens<'a>( + start_offset: usize, tokens: impl Iterator>, ) -> impl Iterator)> { tokens .skip_while(|token| token.is_separator()) - .scan((0, None), |(offset, prev_kind), mut token| { + .scan((start_offset, None), |(offset, prev_kind), mut token| { match token.kind { TokenKind::Word | TokenKind::StopWord if !token.lemma().is_empty() => { *offset += match *prev_kind { @@ -110,42 +133,45 @@ fn process_tokens<'a>( .filter(|(_, t)| t.is_word()) } -/// Returns `true` if the `selector` match the `key`. 
-/// -/// ```text -/// Example: -/// `animaux` match `animaux` -/// `animaux.chien` match `animaux` -/// `animaux.chien` match `animaux` -/// `animaux.chien.nom` match `animaux` -/// `animaux.chien.nom` match `animaux.chien` -/// ----------------------------------------- -/// `animaux` doesn't match `animaux.chien` -/// `animaux.` doesn't match `animaux` -/// `animaux.ch` doesn't match `animaux.chien` -/// `animau` doesn't match `animaux` -/// ``` -fn contained_in(selector: &str, key: &str) -> bool { - selector.starts_with(key) - && selector[key.len()..].chars().next().map(|c| c == SPLIT_SYMBOL).unwrap_or(true) -} - /// TODO move in permissive json pointer mod perm_json_p { + use serde_json::{Map, Value}; + const SPLIT_SYMBOL: char = '.'; + + /// Returns `true` if the `selector` match the `key`. + /// + /// ```text + /// Example: + /// `animaux` match `animaux` + /// `animaux.chien` match `animaux` + /// `animaux.chien` match `animaux` + /// `animaux.chien.nom` match `animaux` + /// `animaux.chien.nom` match `animaux.chien` + /// ----------------------------------------- + /// `animaux` doesn't match `animaux.chien` + /// `animaux.` doesn't match `animaux` + /// `animaux.ch` doesn't match `animaux.chien` + /// `animau` doesn't match `animaux` + /// ``` + pub fn contained_in(selector: &str, key: &str) -> bool { + selector.starts_with(key) + && selector[key.len()..].chars().next().map(|c| c == SPLIT_SYMBOL).unwrap_or(true) + } + pub fn seek_leaf_values<'a>( value: &Map, selectors: impl IntoIterator, - seeker: impl Fn(&str, &Value), + seeker: &mut impl FnMut(&str, &Value), ) { let selectors: Vec<_> = selectors.into_iter().collect(); - seek_leaf_values_in_object(value, &selectors, "", &seeker); + seek_leaf_values_in_object(value, Some(&selectors), "", seeker); } pub fn seek_leaf_values_in_object( value: &Map, - selectors: &[&str], + selectors: Option<&[&str]>, base_key: &str, - seeker: &impl Fn(&str, &Value), + seeker: &mut impl FnMut(&str, &Value), ) { for (key, value) in value.iter() { let base_key = if base_key.is_empty() { @@ -156,8 +182,10 @@ mod perm_json_p { // here if the user only specified `doggo` we need to iterate in all the fields of `doggo` // so we check the contained_in on both side - let should_continue = selectors.iter().any(|selector| { - contained_in(selector, &base_key) || contained_in(&base_key, selector) + let should_continue = selectors.map_or(true, |selectors| { + selectors.iter().any(|selector| { + contained_in(selector, &base_key) || contained_in(&base_key, selector) + }) }); if should_continue { @@ -175,12 +203,12 @@ mod perm_json_p { } pub fn seek_leaf_values_in_array( - values: &mut [Value], - selectors: &[&str], + values: &[Value], + selectors: Option<&[&str]>, base_key: &str, - seeker: &impl Fn(&str, &Value), + seeker: &mut impl FnMut(&str, &Value), ) { - for value in values.iter_mut() { + for value in values { match value { Value::Object(object) => { seek_leaf_values_in_object(object, selectors, base_key, seeker) @@ -193,3 +221,91 @@ mod perm_json_p { } } } + +#[cfg(test)] +mod test { + use super::*; + use charabia::TokenizerBuilder; + use meili_snap::snapshot; + use obkv::KvReader; + use serde_json::json; + #[test] + fn test_tokenize_document() { + let mut fields_ids_map = FieldsIdsMap::new(); + + let field_1 = json!({ + "name": "doggo", + "age": 10, + }); + + let field_2 = json!({ + "catto": { + "name": "pesti", + "age": 23, + } + }); + + let field_3 = json!(["doggo", "catto"]); + + let mut obkv = obkv::KvWriter::memory(); + let field_1_id = 
fields_ids_map.insert("doggo").unwrap(); + let field_1 = serde_json::to_string(&field_1).unwrap(); + obkv.insert(field_1_id, field_1.as_bytes()).unwrap(); + let field_2_id = fields_ids_map.insert("catto").unwrap(); + let field_2 = serde_json::to_string(&field_2).unwrap(); + obkv.insert(field_2_id, field_2.as_bytes()).unwrap(); + let field_3_id = fields_ids_map.insert("doggo.name").unwrap(); + let field_3 = serde_json::to_string(&field_3).unwrap(); + obkv.insert(field_3_id, field_3.as_bytes()).unwrap(); + let value = obkv.into_inner().unwrap(); + let obkv = KvReader::from_slice(value.as_slice()); + + fields_ids_map.insert("doggo.age"); + fields_ids_map.insert("catto.catto.name"); + fields_ids_map.insert("catto.catto.age"); + + let mut tb = TokenizerBuilder::default(); + let document_tokenizer = DocumentTokenizer { + tokenizer: &tb.build(), + searchable_attributes: None, + localized_attributes_rules: &[], + max_positions_per_attributes: 1000, + }; + + let mut words = std::collections::BTreeMap::new(); + document_tokenizer + .tokenize_document(obkv, &fields_ids_map, &mut |fid, pos, word| { + words.insert([fid, pos], word.to_string()); + }) + .unwrap(); + + snapshot!(format!("{:#?}", words), @r###" + { + [ + 2, + 0, + ]: "doggo", + [ + 2, + 8, + ]: "doggo", + [ + 2, + 16, + ]: "catto", + [ + 3, + 0, + ]: "10", + [ + 4, + 0, + ]: "pesti", + [ + 5, + 0, + ]: "23", + } + "###); + } +} diff --git a/milli/src/update/new/indexer/mod.rs b/milli/src/update/new/indexer/mod.rs index ca5bb71eb..ebbb8582c 100644 --- a/milli/src/update/new/indexer/mod.rs +++ b/milli/src/update/new/indexer/mod.rs @@ -15,11 +15,13 @@ use super::channel::{ WriterOperation, }; use super::document_change::DocumentChange; +use super::extract::{SearchableExtractor, WordDocidsExtractor}; use super::merger::merge_grenad_entries; use super::StdResult; use crate::documents::{ obkv_to_object, DocumentsBatchCursor, DocumentsBatchIndex, PrimaryKey, DEFAULT_PRIMARY_KEY, }; +use crate::update::GrenadParameters; use crate::{Index, Result, UserError}; mod document_deletion; @@ -45,7 +47,7 @@ pub fn index( wtxn: &mut RwTxn, index: &Index, pool: &ThreadPool, - _document_changes: PI, + document_changes: PI, ) -> Result<()> where PI: IntoParallelIterator> + Send, @@ -59,10 +61,18 @@ where // TODO manage the errors correctly let handle = Builder::new().name(S("indexer-extractors")).spawn_scoped(s, || { pool.in_place_scope(|_s| { + let document_changes = document_changes.into_par_iter(); // word docids - // document_changes.into_par_iter().try_for_each(|_dc| Ok(()) as Result<_>) - // let grenads = extractor_function(document_changes)?; - // deladd_cbo_roaring_bitmap_sender.word_docids(grenads)?; + let merger = WordDocidsExtractor::run_extraction( + index, + todo!(), + /// TODO: GrenadParameters::default() should be removed in favor a passed parameter + GrenadParameters::default(), + document_changes.clone(), + )?; + + /// TODO: manage the errors correctly + deladd_cbo_roaring_bitmap_sender.word_docids(merger).unwrap(); Ok(()) as Result<_> }) diff --git a/milli/src/update/new/merger.rs b/milli/src/update/new/merger.rs index 97f9e6ac6..89d0762f0 100644 --- a/milli/src/update/new/merger.rs +++ b/milli/src/update/new/merger.rs @@ -20,14 +20,12 @@ pub fn merge_grenad_entries( for merger_operation in receiver { match merger_operation { - MergerOperation::WordDocidsCursors(cursors) => { + MergerOperation::WordDocidsMerger(merger) => { let sender = sender.word_docids(); let database = index.word_docids.remap_types::(); - let mut builder = 
grenad::MergerBuilder::new(MergeDeladdCboRoaringBitmaps); - builder.extend(cursors); /// TODO manage the error correctly - let mut merger_iter = builder.build().into_stream_merger_iter().unwrap(); + let mut merger_iter = merger.into_stream_merger_iter().unwrap(); // TODO manage the error correctly while let Some((key, deladd)) = merger_iter.next().unwrap() { diff --git a/milli/src/update/new/mod.rs b/milli/src/update/new/mod.rs index ad61d8343..31a017c12 100644 --- a/milli/src/update/new/mod.rs +++ b/milli/src/update/new/mod.rs @@ -4,13 +4,12 @@ pub use items_pool::ItemsPool; use super::del_add::DelAdd; use crate::FieldId; -mod document_change; -mod merger; -// mod extract; mod channel; -//mod global_fields_ids_map; +mod document_change; +mod extract; pub mod indexer; mod items_pool; +mod merger; /// TODO move them elsewhere pub type StdResult = std::result::Result; From c1557734dc59c90241d630cac81dfdac2bdda225 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 3 Sep 2024 12:01:01 +0200 Subject: [PATCH 024/247] Use the GlobalFieldsIdsMap everywhere and write it to disk Co-authored-by: Dureuill Co-authored-by: ManyTheFish --- index-scheduler/src/batch.rs | 55 ++----------------- milli/src/fields_ids_map.rs | 1 + milli/src/fields_ids_map/global.rs | 2 + milli/src/lib.rs | 2 +- .../update/new/extract/extract_word_docids.rs | 46 ++++++---------- .../update/new/extract/tokenize_document.rs | 32 +++++++---- .../update/new/indexer/document_deletion.rs | 1 + .../update/new/indexer/document_operation.rs | 7 ++- milli/src/update/new/indexer/mod.rs | 19 +++++-- milli/src/update/new/indexer/partial_dump.rs | 1 + .../update/new/indexer/update_by_function.rs | 3 +- 11 files changed, 70 insertions(+), 99 deletions(-) diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs index 6ec2b17bf..ecb44fc14 100644 --- a/index-scheduler/src/batch.rs +++ b/index-scheduler/src/batch.rs @@ -36,7 +36,7 @@ use meilisearch_types::milli::update::{ use meilisearch_types::milli::vector::parsed_vectors::{ ExplicitVectors, VectorOrArrayOfVectors, RESERVED_VECTORS_FIELD_NAME, }; -use meilisearch_types::milli::{self, Filter, Object, UserError}; +use meilisearch_types::milli::{self, Filter, GlobalFieldsIdsMap, Object, UserError}; use meilisearch_types::settings::{apply_settings_to_builder, Settings, Unchecked}; use meilisearch_types::tasks::{Details, IndexSwap, Kind, KindWithContent, Status, Task}; use meilisearch_types::{compression, Index, VERSION_FILE_NAME}; @@ -1302,49 +1302,6 @@ impl IndexScheduler { let primary_key = guess_primary_key(&rtxn, index, cursor, &documents_batch_index)?.unwrap(); - // if let Some(primary_key) = primary_key { - // match index.primary_key(index_wtxn)? { - // // if a primary key was set AND had already been defined in the index - // // but to a different value, we can make the whole batch fail. - // Some(pk) => { - // if primary_key != pk { - // return Err(milli::Error::from( - // milli::UserError::PrimaryKeyCannotBeChanged(pk.to_string()), - // ) - // .into()); - // } - // } - // // if the primary key was set and there was no primary key set for this index - // // we set it to the received value before starting the indexing process. 
- // None => { - // todo!(); - // let mut builder = - // milli::update::Settings::new(index_wtxn, index, indexer_config); - // builder.set_primary_key(primary_key); - // builder.execute( - // |indexing_step| tracing::debug!(update = ?indexing_step), - // || must_stop_processing.clone().get(), - // )?; - // primary_key_has_been_set = true; - // } - // } - // } - - // let config = IndexDocumentsConfig { update_method: method, ..Default::default() }; - - // let embedder_configs = index.embedding_configs(index_wtxn)?; - // // TODO: consider Arc'ing the map too (we only need read access + we'll be cloning it multiple times, so really makes sense) - // let embedders = self.embedders(embedder_configs)?; - - // let mut builder = milli::update::IndexDocuments::new( - // index_wtxn, - // index, - // indexer_config, - // config, - // |indexing_step| tracing::trace!(?indexing_step, "Update"), - // || must_stop_processing.get(), - // )?; - let mut indexer = indexer::DocumentOperation::new(method); for (operation, task) in operations.into_iter().zip(tasks.iter_mut()) { match operation { @@ -1401,12 +1358,10 @@ impl IndexScheduler { // let pool = indexer_config.thread_pool.unwrap(); let pool = rayon::ThreadPoolBuilder::new().build().unwrap(); // let fields_ids_map = RwLock::new(fields_ids_map); - let param = (index, &rtxn, &mut fields_ids_map, &primary_key); - let document_changes = indexer.document_changes(param)?; - indexer::index(index_wtxn, index, &pool, document_changes)?; - - /// TODO we must store it or not? - let fields_ids_map = fields_ids_map; + let param = (index, &rtxn, &primary_key); + let document_changes = indexer.document_changes(&mut fields_ids_map, param)?; + /// TODO pass/write the FieldsIdsMap + indexer::index(index_wtxn, index, fields_ids_map, &pool, document_changes)?; // tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done"); } else if primary_key_has_been_set { diff --git a/milli/src/fields_ids_map.rs b/milli/src/fields_ids_map.rs index 39d67f20c..52e02045d 100644 --- a/milli/src/fields_ids_map.rs +++ b/milli/src/fields_ids_map.rs @@ -5,6 +5,7 @@ use serde::{Deserialize, Serialize}; use crate::FieldId; mod global; +pub use global::GlobalFieldsIdsMap; #[derive(Debug, Clone, Serialize, Deserialize)] pub struct FieldsIdsMap { diff --git a/milli/src/fields_ids_map/global.rs b/milli/src/fields_ids_map/global.rs index 857d13a2a..93908aea8 100644 --- a/milli/src/fields_ids_map/global.rs +++ b/milli/src/fields_ids_map/global.rs @@ -4,11 +4,13 @@ use std::sync::RwLock; use crate::{FieldId, FieldsIdsMap}; /// A fields ids map that can be globally updated to add fields +#[derive(Debug, Clone)] pub struct GlobalFieldsIdsMap<'indexing> { global: &'indexing RwLock, local: LocalFieldsIdsMap, } +#[derive(Debug, Clone)] struct LocalFieldsIdsMap { names_ids: BTreeMap, ids_names: BTreeMap, diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 8b2468bea..45418c074 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -55,7 +55,7 @@ pub use self::error::{ }; pub use self::external_documents_ids::ExternalDocumentsIds; pub use self::fieldids_weights_map::FieldidsWeightsMap; -pub use self::fields_ids_map::FieldsIdsMap; +pub use self::fields_ids_map::{FieldsIdsMap, GlobalFieldsIdsMap}; pub use self::heed_codec::{ BEU16StrCodec, BEU32StrCodec, BoRoaringBitmapCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, FieldIdWordCountCodec, ObkvCodec, diff --git a/milli/src/update/new/extract/extract_word_docids.rs 
b/milli/src/update/new/extract/extract_word_docids.rs index e2261748a..1f52ee086 100644 --- a/milli/src/update/new/extract/extract_word_docids.rs +++ b/milli/src/update/new/extract/extract_word_docids.rs @@ -1,32 +1,20 @@ use std::fs::File; use charabia::TokenizerBuilder; -use grenad::Merger; -use grenad::ReaderCursor; +use grenad::{Merger, ReaderCursor}; use heed::RoTxn; -use rayon::iter::IntoParallelIterator; -use rayon::iter::ParallelBridge; -use rayon::iter::ParallelIterator; +use rayon::iter::{IntoParallelIterator, ParallelBridge, ParallelIterator}; -use crate::update::MergeDeladdCboRoaringBitmaps; -use crate::{ - update::{ - create_sorter, - new::{DocumentChange, ItemsPool}, - GrenadParameters, - }, - FieldsIdsMap, Index, Result, MAX_POSITION_PER_ATTRIBUTE, -}; - -use super::{ - cache::{CachedSorter, DelAddRoaringBitmapMerger}, - tokenize_document::DocumentTokenizer, -}; +use super::cache::CachedSorter; +use super::tokenize_document::DocumentTokenizer; +use crate::update::new::{DocumentChange, ItemsPool}; +use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps}; +use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, Result, MAX_POSITION_PER_ATTRIBUTE}; pub trait SearchableExtractor { fn run_extraction( index: &Index, - fields_ids_map: &FieldsIdsMap, + fields_ids_map: &GlobalFieldsIdsMap, indexer: GrenadParameters, document_changes: impl IntoParallelIterator>, ) -> Result> { @@ -62,12 +50,13 @@ pub trait SearchableExtractor { Ok(( index.read_txn()?, &document_tokenizer, + fields_ids_map.clone(), CachedSorter::new( // TODO use a better value 100.try_into().unwrap(), create_sorter( grenad::SortAlgorithm::Stable, - DelAddRoaringBitmapMerger, + MergeDeladdCboRoaringBitmaps, indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, @@ -78,12 +67,12 @@ pub trait SearchableExtractor { }); document_changes.into_par_iter().try_for_each(|document_change| { - context_pool.with(|(rtxn, document_tokenizer, cached_sorter)| { + context_pool.with(|(rtxn, document_tokenizer, fields_ids_map, cached_sorter)| { Self::extract_document_change( &*rtxn, index, document_tokenizer, - &fields_ids_map, + fields_ids_map, cached_sorter, document_change?, ) @@ -91,7 +80,7 @@ pub trait SearchableExtractor { })?; let mut builder = grenad::MergerBuilder::new(MergeDeladdCboRoaringBitmaps); - for (_rtxn, _tokenizer, cache) in context_pool.into_items() { + for (_rtxn, _tokenizer, _fields_ids_map, cache) in context_pool.into_items() { let sorter = cache.into_sorter()?; let readers = sorter.into_reader_cursors()?; builder.extend(readers); @@ -104,8 +93,8 @@ pub trait SearchableExtractor { rtxn: &RoTxn, index: &Index, document_tokenizer: &DocumentTokenizer, - fields_ids_map: &FieldsIdsMap, - cached_sorter: &mut CachedSorter, + fields_ids_map: &mut GlobalFieldsIdsMap, + cached_sorter: &mut CachedSorter, document_change: DocumentChange, ) -> Result<()>; } @@ -116,9 +105,8 @@ impl SearchableExtractor for WordDocidsExtractor { rtxn: &RoTxn, index: &Index, document_tokenizer: &DocumentTokenizer, - fields_ids_map: &FieldsIdsMap, - // TODO: DelAddRoaringBitmapMerger should be CBO - cached_sorter: &mut CachedSorter, + fields_ids_map: &mut GlobalFieldsIdsMap, + cached_sorter: &mut CachedSorter, document_change: DocumentChange, ) -> crate::Result<()> { match document_change { diff --git a/milli/src/update/new/extract/tokenize_document.rs b/milli/src/update/new/extract/tokenize_document.rs index 40f0b4374..9f0a1c4d8 100644 --- a/milli/src/update/new/extract/tokenize_document.rs 
+++ b/milli/src/update/new/extract/tokenize_document.rs @@ -1,11 +1,14 @@ -use crate::{ - update::new::KvReaderFieldId, FieldId, FieldsIdsMap, Index, InternalError, - LocalizedAttributesRule, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH, -}; +use std::collections::HashMap; + use charabia::{SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder}; use heed::RoTxn; use serde_json::Value; -use std::collections::HashMap; + +use crate::update::new::KvReaderFieldId; +use crate::{ + FieldId, FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, LocalizedAttributesRule, + Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH, +}; pub struct DocumentTokenizer<'a> { pub tokenizer: &'a Tokenizer<'a>, @@ -18,18 +21,24 @@ impl<'a> DocumentTokenizer<'a> { pub fn tokenize_document( &self, obkv: &KvReaderFieldId, - field_id_map: &FieldsIdsMap, + field_id_map: &mut GlobalFieldsIdsMap, token_fn: &mut impl FnMut(FieldId, u16, &str), ) -> Result<()> { let mut field_position = HashMap::new(); + let mut field_name = String::new(); for (field_id, field_bytes) in obkv { - let Some(field_name) = field_id_map.name(field_id) else { + let Some(field_name) = field_id_map.name(field_id).map(|s| { + field_name.clear(); + field_name.push_str(s); + &field_name + }) else { unreachable!("field id not found in field id map"); }; let mut tokenize_field = |name: &str, value: &Value| { - let Some(field_id) = field_id_map.id(name) else { - unreachable!("field name not found in field id map"); + let Some(field_id) = field_id_map.id_or_insert(name) else { + /// TODO: better error + panic!("it's over 9000"); }; let position = @@ -75,7 +84,7 @@ impl<'a> DocumentTokenizer<'a> { // if the current field is searchable or contains a searchable attribute if self.searchable_attributes.map_or(true, |attributes| { - attributes.iter().any(|name| perm_json_p::contained_in(name, field_name)) + attributes.iter().any(|name| perm_json_p::contained_in(name, &field_name)) }) { // parse json. match serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)? 
{ @@ -224,11 +233,12 @@ mod perm_json_p { #[cfg(test)] mod test { - use super::*; use charabia::TokenizerBuilder; use meili_snap::snapshot; use obkv::KvReader; use serde_json::json; + + use super::*; #[test] fn test_tokenize_document() { let mut fields_ids_map = FieldsIdsMap::new(); diff --git a/milli/src/update/new/indexer/document_deletion.rs b/milli/src/update/new/indexer/document_deletion.rs index 3444d58f7..b4336c14a 100644 --- a/milli/src/update/new/indexer/document_deletion.rs +++ b/milli/src/update/new/indexer/document_deletion.rs @@ -27,6 +27,7 @@ impl<'p> DocumentChanges<'p> for DocumentDeletion { fn document_changes( self, + _fields_ids_map: &mut FieldsIdsMap, param: Self::Parameter, ) -> Result> + Clone + 'p> { let (index, fields, primary_key) = param; diff --git a/milli/src/update/new/indexer/document_operation.rs b/milli/src/update/new/indexer/document_operation.rs index f5dcfcfe6..c54ffd140 100644 --- a/milli/src/update/new/indexer/document_operation.rs +++ b/milli/src/update/new/indexer/document_operation.rs @@ -73,13 +73,14 @@ impl DocumentOperation { } impl<'p> DocumentChanges<'p> for DocumentOperation { - type Parameter = (&'p Index, &'p RoTxn<'p>, &'p mut FieldsIdsMap, &'p PrimaryKey<'p>); + type Parameter = (&'p Index, &'p RoTxn<'p>, &'p PrimaryKey<'p>); fn document_changes( self, + fields_ids_map: &mut FieldsIdsMap, param: Self::Parameter, ) -> Result> + Clone + 'p> { - let (index, rtxn, fields_ids_map, primary_key) = param; + let (index, rtxn, primary_key) = param; let documents_ids = index.documents_ids(rtxn)?; let mut available_docids = AvailableIds::new(&documents_ids); @@ -174,7 +175,7 @@ impl<'p> DocumentChanges<'p> for DocumentOperation { /// TODO is it the best way to provide FieldsIdsMap to the parallel iterator? let fields_ids_map = fields_ids_map.clone(); - // We must drain the HashMap into a Vec because rayon::hash_map::IntoIter: !Clone + // TODO We must drain the HashMap into a Vec because rayon::hash_map::IntoIter: !Clone let docids_version_offsets: Vec<_> = docids_version_offsets.drain().collect(); Ok(docids_version_offsets diff --git a/milli/src/update/new/indexer/mod.rs b/milli/src/update/new/indexer/mod.rs index ebbb8582c..50bb5a401 100644 --- a/milli/src/update/new/indexer/mod.rs +++ b/milli/src/update/new/indexer/mod.rs @@ -1,4 +1,5 @@ use std::fs::File; +use std::sync::RwLock; use std::thread::{self, Builder}; use big_s::S; @@ -22,7 +23,7 @@ use crate::documents::{ obkv_to_object, DocumentsBatchCursor, DocumentsBatchIndex, PrimaryKey, DEFAULT_PRIMARY_KEY, }; use crate::update::GrenadParameters; -use crate::{Index, Result, UserError}; +use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, Result, UserError}; mod document_deletion; mod document_operation; @@ -34,6 +35,7 @@ pub trait DocumentChanges<'p> { fn document_changes( self, + fields_ids_map: &mut FieldsIdsMap, param: Self::Parameter, ) -> Result> + Clone + 'p>; } @@ -46,6 +48,7 @@ pub trait DocumentChanges<'p> { pub fn index( wtxn: &mut RwTxn, index: &Index, + fields_ids_map: FieldsIdsMap, pool: &ThreadPool, document_changes: PI, ) -> Result<()> @@ -57,6 +60,9 @@ where let ExtractorsMergerChannels { merger_receiver, deladd_cbo_roaring_bitmap_sender } = extractors_merger_channels(100); + let fields_ids_map_lock = RwLock::new(fields_ids_map); + let global_fields_ids_map = GlobalFieldsIdsMap::new(&fields_ids_map_lock); + thread::scope(|s| { // TODO manage the errors correctly let handle = Builder::new().name(S("indexer-extractors")).spawn_scoped(s, || { @@ -65,7 +71,7 @@ where // word docids let 
merger = WordDocidsExtractor::run_extraction( index, - todo!(), + &global_fields_ids_map, /// TODO: GrenadParameters::default() should be removed in favor a passed parameter GrenadParameters::default(), document_changes.clone(), @@ -100,8 +106,13 @@ where handle.join().unwrap()?; handle2.join().unwrap()?; - Ok(()) - }) + Ok(()) as Result<_> + })?; + + let fields_ids_map = fields_ids_map_lock.into_inner().unwrap(); + index.put_fields_ids_map(wtxn, &fields_ids_map)?; + + Ok(()) } /// TODO move this elsewhere diff --git a/milli/src/update/new/indexer/partial_dump.rs b/milli/src/update/new/indexer/partial_dump.rs index 6699a6ba7..fe49ffdd7 100644 --- a/milli/src/update/new/indexer/partial_dump.rs +++ b/milli/src/update/new/indexer/partial_dump.rs @@ -30,6 +30,7 @@ where /// - We recommend sending chunks of documents in this `PartialDumpIndexer` we therefore need to create a custom take_while_size method (that doesn't drop items). fn document_changes( self, + _fields_ids_map: &mut FieldsIdsMap, param: Self::Parameter, ) -> Result> + Clone + 'p> { let (fields_ids_map, concurrent_available_ids, primary_key) = param; diff --git a/milli/src/update/new/indexer/update_by_function.rs b/milli/src/update/new/indexer/update_by_function.rs index fc908e31a..36ff432f8 100644 --- a/milli/src/update/new/indexer/update_by_function.rs +++ b/milli/src/update/new/indexer/update_by_function.rs @@ -2,7 +2,7 @@ use rayon::iter::{IntoParallelIterator, ParallelIterator}; use super::DocumentChanges; use crate::update::new::DocumentChange; -use crate::Result; +use crate::{FieldsIdsMap, Result}; pub struct UpdateByFunction; @@ -11,6 +11,7 @@ impl<'p> DocumentChanges<'p> for UpdateByFunction { fn document_changes( self, + _fields_ids_map: &mut FieldsIdsMap, _param: Self::Parameter, ) -> Result> + Clone + 'p> { Ok((0..100).into_par_iter().map(|_| todo!())) From fe69385bd75ff90d15889a2d5d675cfc00b825ba Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 3 Sep 2024 14:24:37 +0200 Subject: [PATCH 025/247] Fix tokenizer test --- milli/src/update/new/extract/tokenize_document.rs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/milli/src/update/new/extract/tokenize_document.rs b/milli/src/update/new/extract/tokenize_document.rs index 9f0a1c4d8..1494dd4b2 100644 --- a/milli/src/update/new/extract/tokenize_document.rs +++ b/milli/src/update/new/extract/tokenize_document.rs @@ -270,10 +270,6 @@ mod test { let value = obkv.into_inner().unwrap(); let obkv = KvReader::from_slice(value.as_slice()); - fields_ids_map.insert("doggo.age"); - fields_ids_map.insert("catto.catto.name"); - fields_ids_map.insert("catto.catto.age"); - let mut tb = TokenizerBuilder::default(); let document_tokenizer = DocumentTokenizer { tokenizer: &tb.build(), @@ -282,9 +278,12 @@ mod test { max_positions_per_attributes: 1000, }; + let fields_ids_map_lock = std::sync::RwLock::new(fields_ids_map); + let mut global_fields_ids_map = GlobalFieldsIdsMap::new(&fields_ids_map_lock); + let mut words = std::collections::BTreeMap::new(); document_tokenizer - .tokenize_document(obkv, &fields_ids_map, &mut |fid, pos, word| { + .tokenize_document(obkv, &mut global_fields_ids_map, &mut |fid, pos, word| { words.insert([fid, pos], word.to_string()); }) .unwrap(); From da61408e529a7d92315b9c149e6a9cf320ef9860 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 3 Sep 2024 15:14:16 +0200 Subject: [PATCH 026/247] Remove unimplemented from document changes --- milli/src/update/new/document_change.rs | 22 +++++++++++++------ 
.../update/new/extract/extract_word_docids.rs | 4 ++-- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/milli/src/update/new/document_change.rs b/milli/src/update/new/document_change.rs index 9076f32db..b4eb4d1d2 100644 --- a/milli/src/update/new/document_change.rs +++ b/milli/src/update/new/document_change.rs @@ -2,7 +2,7 @@ use heed::RoTxn; use obkv::KvReader; use crate::update::new::KvReaderFieldId; -use crate::{DocumentId, FieldId, Index}; +use crate::{DocumentId, FieldId, Index, Result}; pub enum DocumentChange { Deletion(Deletion), @@ -52,8 +52,12 @@ impl Deletion { self.docid } - pub fn current(&self, rtxn: &RoTxn, index: &Index) -> &KvReader { - unimplemented!() + pub fn current<'a>( + &self, + rtxn: &'a RoTxn, + index: &'a Index, + ) -> Result>> { + index.documents.get(rtxn, &self.docid).map_err(crate::Error::from) } } @@ -67,7 +71,7 @@ impl Insertion { } pub fn new(&self) -> &KvReader { - unimplemented!() + self.new.as_ref() } } @@ -85,11 +89,15 @@ impl Update { self.docid } - pub fn current(&self, rtxn: &RoTxn, index: &Index) -> &KvReader { - unimplemented!() + pub fn current<'a>( + &self, + rtxn: &'a RoTxn, + index: &'a Index, + ) -> Result>> { + index.documents.get(rtxn, &self.docid).map_err(crate::Error::from) } pub fn new(&self) -> &KvReader { - unimplemented!() + self.new.as_ref() } } diff --git a/milli/src/update/new/extract/extract_word_docids.rs b/milli/src/update/new/extract/extract_word_docids.rs index 1f52ee086..55f13f221 100644 --- a/milli/src/update/new/extract/extract_word_docids.rs +++ b/milli/src/update/new/extract/extract_word_docids.rs @@ -115,7 +115,7 @@ impl SearchableExtractor for WordDocidsExtractor { cached_sorter.insert_del_u32(word.as_bytes(), inner.docid()).unwrap(); }; document_tokenizer.tokenize_document( - inner.current(rtxn, index), + inner.current(rtxn, index)?.unwrap(), fields_ids_map, &mut token_fn, )?; @@ -125,7 +125,7 @@ impl SearchableExtractor for WordDocidsExtractor { cached_sorter.insert_del_u32(word.as_bytes(), inner.docid()).unwrap(); }; document_tokenizer.tokenize_document( - inner.current(rtxn, index), + inner.current(rtxn, index)?.unwrap(), fields_ids_map, &mut token_fn, )?; From 52d32b4ee9ddf3ed62050ac1ad0da77561406da4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 3 Sep 2024 16:08:33 +0200 Subject: [PATCH 027/247] Move the channel sender in the closure to stop the merger thread --- index-scheduler/src/batch.rs | 28 ++++++++++++++-------------- milli/src/update/new/indexer/mod.rs | 4 ++-- milli/src/update/new/merger.rs | 1 - 3 files changed, 16 insertions(+), 17 deletions(-) diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs index ecb44fc14..129dbec10 100644 --- a/index-scheduler/src/batch.rs +++ b/index-scheduler/src/batch.rs @@ -22,21 +22,20 @@ use std::ffi::OsStr; use std::fmt; use std::fs::{self, File}; use std::io::BufWriter; -use std::sync::RwLock; use dump::IndexMetadata; use meilisearch_types::error::Code; use meilisearch_types::heed::{RoTxn, RwTxn}; -use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader, PrimaryKey}; +use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader}; use meilisearch_types::milli::heed::CompactionOption; use meilisearch_types::milli::update::new::indexer::{self, guess_primary_key, DocumentChanges}; use meilisearch_types::milli::update::{ - self, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings as MilliSettings, + IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, 
Settings as MilliSettings, }; use meilisearch_types::milli::vector::parsed_vectors::{ ExplicitVectors, VectorOrArrayOfVectors, RESERVED_VECTORS_FIELD_NAME, }; -use meilisearch_types::milli::{self, Filter, GlobalFieldsIdsMap, Object, UserError}; +use meilisearch_types::milli::{self, Filter, Object}; use meilisearch_types::settings::{apply_settings_to_builder, Settings, Unchecked}; use meilisearch_types::tasks::{Details, IndexSwap, Kind, KindWithContent, Status, Task}; use meilisearch_types::{compression, Index, VERSION_FILE_NAME}; @@ -1364,17 +1363,18 @@ impl IndexScheduler { indexer::index(index_wtxn, index, fields_ids_map, &pool, document_changes)?; // tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done"); - } else if primary_key_has_been_set { - // Everything failed but we've set a primary key. - // We need to remove it. - let mut builder = - milli::update::Settings::new(index_wtxn, index, indexer_config); - builder.reset_primary_key(); - builder.execute( - |indexing_step| tracing::trace!(update = ?indexing_step), - || must_stop_processing.clone().get(), - )?; } + // else if primary_key_has_been_set { + // // Everything failed but we've set a primary key. + // // We need to remove it. + // let mut builder = + // milli::update::Settings::new(index_wtxn, index, indexer_config); + // builder.reset_primary_key(); + // builder.execute( + // |indexing_step| tracing::trace!(update = ?indexing_step), + // || must_stop_processing.clone().get(), + // )?; + // } Ok(tasks) } diff --git a/milli/src/update/new/indexer/mod.rs b/milli/src/update/new/indexer/mod.rs index 50bb5a401..1b763f5f9 100644 --- a/milli/src/update/new/indexer/mod.rs +++ b/milli/src/update/new/indexer/mod.rs @@ -65,7 +65,7 @@ where thread::scope(|s| { // TODO manage the errors correctly - let handle = Builder::new().name(S("indexer-extractors")).spawn_scoped(s, || { + let handle = Builder::new().name(S("indexer-extractors")).spawn_scoped(s, move || { pool.in_place_scope(|_s| { let document_changes = document_changes.into_par_iter(); // word docids @@ -85,7 +85,7 @@ where })?; // TODO manage the errors correctly - let handle2 = Builder::new().name(S("indexer-merger")).spawn_scoped(s, || { + let handle2 = Builder::new().name(S("indexer-merger")).spawn_scoped(s, move || { let rtxn = index.read_txn().unwrap(); merge_grenad_entries(merger_receiver, merger_sender, &rtxn, index) })?; diff --git a/milli/src/update/new/merger.rs b/milli/src/update/new/merger.rs index 89d0762f0..e07262de8 100644 --- a/milli/src/update/new/merger.rs +++ b/milli/src/update/new/merger.rs @@ -6,7 +6,6 @@ use super::channel::{MergerReceiver, MergerSender}; use super::KvReaderDelAdd; use crate::update::del_add::DelAdd; use crate::update::new::channel::MergerOperation; -use crate::update::MergeDeladdCboRoaringBitmaps; use crate::{CboRoaringBitmapCodec, Index, Result}; /// TODO We must return some infos/stats From 27b4cab8575f8a7f84ef0709c7645c142bd72f02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 4 Sep 2024 09:59:19 +0200 Subject: [PATCH 028/247] Extract and write the documents and words fst in the database --- milli/src/update/new/channel.rs | 133 ++++++++++++++++-- .../update/new/extract/extract_word_docids.rs | 6 +- .../update/new/extract/tokenize_document.rs | 8 +- milli/src/update/new/indexer/mod.rs | 44 ++++-- milli/src/update/new/merger.rs | 72 +++++++++- 5 files changed, 225 insertions(+), 38 deletions(-) diff --git a/milli/src/update/new/channel.rs 
b/milli/src/update/new/channel.rs index d94b2cc00..d5739a75e 100644 --- a/milli/src/update/new/channel.rs +++ b/milli/src/update/new/channel.rs @@ -5,6 +5,7 @@ use grenad::Merger; use heed::types::Bytes; use super::StdResult; +use crate::index::main_key::{DOCUMENTS_IDS_KEY, WORDS_FST_KEY}; use crate::update::new::KvReaderFieldId; use crate::update::MergeDeladdCboRoaringBitmaps; use crate::{DocumentId, Index}; @@ -22,12 +23,14 @@ pub fn extractors_merger_channels(cap: usize) -> ExtractorsMergerChannels { ExtractorsMergerChannels { merger_receiver: MergerReceiver(receiver), deladd_cbo_roaring_bitmap_sender: DeladdCboRoaringBitmapSender(sender.clone()), + extracted_documents_sender: ExtractedDocumentsSender(sender.clone()), } } pub struct ExtractorsMergerChannels { pub merger_receiver: MergerReceiver, pub deladd_cbo_roaring_bitmap_sender: DeladdCboRoaringBitmapSender, + pub extracted_documents_sender: ExtractedDocumentsSender, } pub struct KeyValueEntry { @@ -95,18 +98,37 @@ impl DocumentEntry { } } -pub enum WriterOperation { - WordDocids(EntryOperation), - Document(DocumentEntry), +pub struct DocumentDeletionEntry(DocumentId); + +impl DocumentDeletionEntry { + pub fn key(&self) -> [u8; 4] { + self.0.to_be_bytes() + } +} + +pub struct WriterOperation { + database: Database, + entry: EntryOperation, +} + +pub enum Database { + WordDocids, + Documents, + Main, } impl WriterOperation { pub fn database(&self, index: &Index) -> heed::Database { - match self { - WriterOperation::WordDocids(_) => index.word_docids.remap_types(), - WriterOperation::Document(_) => index.documents.remap_types(), + match self.database { + Database::Main => index.main.remap_types(), + Database::Documents => index.documents.remap_types(), + Database::WordDocids => index.word_docids.remap_types(), } } + + pub fn entry(self) -> EntryOperation { + self.entry + } } pub struct WriterReceiver(Receiver); @@ -123,37 +145,93 @@ impl IntoIterator for WriterReceiver { pub struct MergerSender(Sender); impl MergerSender { + pub fn main(&self) -> MainSender<'_> { + MainSender(&self.0) + } + pub fn word_docids(&self) -> WordDocidsSender<'_> { WordDocidsSender(&self.0) } + + pub fn documents(&self) -> DocumentsSender<'_> { + DocumentsSender(&self.0) + } + + pub fn send_documents_ids(&self, bitmap: &[u8]) -> StdResult<(), SendError<()>> { + let entry = EntryOperation::Write(KeyValueEntry::from_key_value( + DOCUMENTS_IDS_KEY.as_bytes(), + bitmap, + )); + match self.0.send(WriterOperation { database: Database::Main, entry }) { + Ok(()) => Ok(()), + Err(SendError(_)) => Err(SendError(())), + } + } +} + +pub struct MainSender<'a>(&'a Sender); + +impl MainSender<'_> { + pub fn write_words_fst(&self, value: &[u8]) -> StdResult<(), SendError<()>> { + let entry = + EntryOperation::Write(KeyValueEntry::from_key_value(WORDS_FST_KEY.as_bytes(), value)); + match self.0.send(WriterOperation { database: Database::Main, entry }) { + Ok(()) => Ok(()), + Err(SendError(_)) => Err(SendError(())), + } + } + + pub fn delete(&self, key: &[u8]) -> StdResult<(), SendError<()>> { + let entry = EntryOperation::Delete(KeyEntry::from_key(key)); + match self.0.send(WriterOperation { database: Database::Main, entry }) { + Ok(()) => Ok(()), + Err(SendError(_)) => Err(SendError(())), + } + } } pub struct WordDocidsSender<'a>(&'a Sender); impl WordDocidsSender<'_> { pub fn write(&self, key: &[u8], value: &[u8]) -> StdResult<(), SendError<()>> { - let operation = EntryOperation::Write(KeyValueEntry::from_key_value(key, value)); - match 
self.0.send(WriterOperation::WordDocids(operation)) { + let entry = EntryOperation::Write(KeyValueEntry::from_key_value(key, value)); + match self.0.send(WriterOperation { database: Database::WordDocids, entry }) { Ok(()) => Ok(()), Err(SendError(_)) => Err(SendError(())), } } pub fn delete(&self, key: &[u8]) -> StdResult<(), SendError<()>> { - let operation = EntryOperation::Delete(KeyEntry::from_key(key)); - match self.0.send(WriterOperation::WordDocids(operation)) { + let entry = EntryOperation::Delete(KeyEntry::from_key(key)); + match self.0.send(WriterOperation { database: Database::WordDocids, entry }) { Ok(()) => Ok(()), Err(SendError(_)) => Err(SendError(())), } } } -#[derive(Clone)] -pub struct DocumentSender(Sender); +pub struct DocumentsSender<'a>(&'a Sender); -impl DocumentSender { - pub fn send(&self, document: DocumentEntry) -> StdResult<(), SendError<()>> { - match self.0.send(WriterOperation::Document(document)) { +impl DocumentsSender<'_> { + /// TODO do that efficiently + pub fn uncompressed( + &self, + docid: DocumentId, + document: &KvReaderFieldId, + ) -> StdResult<(), SendError<()>> { + let entry = EntryOperation::Write(KeyValueEntry::from_key_value( + &docid.to_be_bytes(), + document.as_bytes(), + )); + match self.0.send(WriterOperation { database: Database::Documents, entry }) { + Ok(()) => Ok(()), + Err(SendError(_)) => Err(SendError(())), + } + } + + pub fn delete(&self, docid: DocumentId) -> StdResult<(), SendError<()>> { + let entry = EntryOperation::Delete(KeyEntry::from_key(&docid.to_be_bytes())); + match self.0.send(WriterOperation { database: Database::Documents, entry }) { Ok(()) => Ok(()), Err(SendError(_)) => Err(SendError(())), } @@ -162,6 +240,8 @@ impl DocumentSender { pub enum MergerOperation { WordDocidsMerger(Merger), + InsertDocument { docid: DocumentId, document: Box }, + DeleteDocument { docid: DocumentId }, } pub struct MergerReceiver(Receiver); @@ -190,3 +270,26 @@ impl DeladdCboRoaringBitmapSender { } } } + +#[derive(Clone)] +pub struct ExtractedDocumentsSender(Sender); + +impl ExtractedDocumentsSender { + pub fn insert( + &self, + docid: DocumentId, + document: Box, + ) -> StdResult<(), SendError<()>> { + match self.0.send(MergerOperation::InsertDocument { docid, document }) { + Ok(()) => Ok(()), + Err(SendError(_)) => Err(SendError(())), + } + } + + pub fn delete(&self, docid: DocumentId) -> StdResult<(), SendError<()>> { + match self.0.send(MergerOperation::DeleteDocument { docid }) { + Ok(()) => Ok(()), + Err(SendError(_)) => Err(SendError(())), + } + } +} diff --git a/milli/src/update/new/extract/extract_word_docids.rs b/milli/src/update/new/extract/extract_word_docids.rs index 55f13f221..cbb28b956 100644 --- a/milli/src/update/new/extract/extract_word_docids.rs +++ b/milli/src/update/new/extract/extract_word_docids.rs @@ -1,15 +1,15 @@ use std::fs::File; use charabia::TokenizerBuilder; -use grenad::{Merger, ReaderCursor}; +use grenad::Merger; use heed::RoTxn; -use rayon::iter::{IntoParallelIterator, ParallelBridge, ParallelIterator}; +use rayon::iter::{IntoParallelIterator, ParallelIterator}; use super::cache::CachedSorter; use super::tokenize_document::DocumentTokenizer; use crate::update::new::{DocumentChange, ItemsPool}; use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps}; -use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, Result, MAX_POSITION_PER_ATTRIBUTE}; +use crate::{GlobalFieldsIdsMap, Index, Result, MAX_POSITION_PER_ATTRIBUTE}; pub trait SearchableExtractor { fn run_extraction( diff --git 
a/milli/src/update/new/extract/tokenize_document.rs b/milli/src/update/new/extract/tokenize_document.rs index 1494dd4b2..ed4e6b89d 100644 --- a/milli/src/update/new/extract/tokenize_document.rs +++ b/milli/src/update/new/extract/tokenize_document.rs @@ -1,13 +1,11 @@ use std::collections::HashMap; -use charabia::{SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder}; -use heed::RoTxn; +use charabia::{SeparatorKind, Token, TokenKind, Tokenizer}; use serde_json::Value; use crate::update::new::KvReaderFieldId; use crate::{ - FieldId, FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, LocalizedAttributesRule, - Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH, + FieldId, GlobalFieldsIdsMap, InternalError, LocalizedAttributesRule, Result, MAX_WORD_LENGTH, }; pub struct DocumentTokenizer<'a> { @@ -239,6 +237,8 @@ mod test { use serde_json::json; use super::*; + use crate::FieldsIdsMap; + #[test] fn test_tokenize_document() { let mut fields_ids_map = FieldsIdsMap::new(); diff --git a/milli/src/update/new/indexer/mod.rs b/milli/src/update/new/indexer/mod.rs index 1b763f5f9..7a9999c28 100644 --- a/milli/src/update/new/indexer/mod.rs +++ b/milli/src/update/new/indexer/mod.rs @@ -13,7 +13,6 @@ pub use update_by_function::UpdateByFunction; use super::channel::{ extractors_merger_channels, merger_writer_channel, EntryOperation, ExtractorsMergerChannels, - WriterOperation, }; use super::document_change::DocumentChange; use super::extract::{SearchableExtractor, WordDocidsExtractor}; @@ -57,8 +56,11 @@ where PI::Iter: Clone, { let (merger_sender, writer_receiver) = merger_writer_channel(100); - let ExtractorsMergerChannels { merger_receiver, deladd_cbo_roaring_bitmap_sender } = - extractors_merger_channels(100); + let ExtractorsMergerChannels { + merger_receiver, + deladd_cbo_roaring_bitmap_sender, + extracted_documents_sender, + } = extractors_merger_channels(100); let fields_ids_map_lock = RwLock::new(fields_ids_map); let global_fields_ids_map = GlobalFieldsIdsMap::new(&fields_ids_map_lock); @@ -68,6 +70,28 @@ where let handle = Builder::new().name(S("indexer-extractors")).spawn_scoped(s, move || { pool.in_place_scope(|_s| { let document_changes = document_changes.into_par_iter(); + + // document but we need to create a function that collects and compresses documents. + document_changes.clone().into_par_iter().try_for_each(|result| { + match result? 
{ + DocumentChange::Deletion(deletion) => { + let docid = deletion.docid(); + extracted_documents_sender.delete(docid).unwrap(); + } + DocumentChange::Update(update) => { + let docid = update.docid(); + let content = update.new(); + extracted_documents_sender.insert(docid, content.boxed()).unwrap(); + } + DocumentChange::Insertion(insertion) => { + let docid = insertion.docid(); + let content = insertion.new(); + extracted_documents_sender.insert(docid, content.boxed()).unwrap(); + } + } + Ok(()) as Result<_> + })?; + // word docids let merger = WordDocidsExtractor::run_extraction( index, @@ -90,15 +114,15 @@ where merge_grenad_entries(merger_receiver, merger_sender, &rtxn, index) })?; - // TODO Split this code into another function for operation in writer_receiver { let database = operation.database(index); - match operation { - WriterOperation::WordDocids(operation) => match operation { - EntryOperation::Delete(e) => database.delete(wtxn, e.entry()).map(drop)?, - EntryOperation::Write(e) => database.put(wtxn, e.key(), e.value())?, - }, - WriterOperation::Document(e) => database.put(wtxn, &e.key(), e.content())?, + match operation.entry() { + EntryOperation::Delete(e) => { + if !database.delete(wtxn, e.entry())? { + unreachable!("We tried to delete an unknown key") + } + } + EntryOperation::Write(e) => database.put(wtxn, e.key(), e.value())?, } } diff --git a/milli/src/update/new/merger.rs b/milli/src/update/new/merger.rs index e07262de8..b21f20b0f 100644 --- a/milli/src/update/new/merger.rs +++ b/milli/src/update/new/merger.rs @@ -1,9 +1,14 @@ +use fst::set::OpBuilder; +use fst::{Set, SetBuilder}; use heed::types::Bytes; use heed::RoTxn; +use memmap2::Mmap; use roaring::RoaringBitmap; +use tempfile::tempfile; use super::channel::{MergerReceiver, MergerSender}; use super::KvReaderDelAdd; +use crate::index::main_key::WORDS_FST_KEY; use crate::update::del_add::DelAdd; use crate::update::new::channel::MergerOperation; use crate::{CboRoaringBitmapCodec, Index, Result}; @@ -16,12 +21,15 @@ pub fn merge_grenad_entries( index: &Index, ) -> Result<()> { let mut buffer = Vec::new(); + let mut documents_ids = index.documents_ids(rtxn)?; for merger_operation in receiver { match merger_operation { MergerOperation::WordDocidsMerger(merger) => { - let sender = sender.word_docids(); + let word_docids_sender = sender.word_docids(); let database = index.word_docids.remap_types::(); + let mut add_words_fst = SetBuilder::new(tempfile()?)?; + let mut del_words_fst = SetBuilder::new(tempfile()?)?; /// TODO manage the error correctly let mut merger_iter = merger.into_stream_merger_iter().unwrap(); @@ -35,17 +43,62 @@ pub fn merge_grenad_entries( match merge_cbo_bitmaps(current, del, add)? { Operation::Write(bitmap) => { - let value = cbo_serialize_into_vec(&bitmap, &mut buffer); - sender.write(key, value).unwrap(); + let value = cbo_bitmap_serialize_into_vec(&bitmap, &mut buffer); + word_docids_sender.write(key, value).unwrap(); + add_words_fst.insert(key)?; + } + Operation::Delete => { + word_docids_sender.delete(key).unwrap(); + del_words_fst.insert(key)?; } - Operation::Delete => sender.delete(key).unwrap(), Operation::Ignore => (), } } + + // Move that into a dedicated function + let words_fst = index.words_fst(rtxn)?; + + let add_words_fst_file = add_words_fst.into_inner()?; + let add_words_fst_mmap = unsafe { Mmap::map(&add_words_fst_file)? 
}; + let add_words_fst = Set::new(&add_words_fst_mmap)?; + + let del_words_fst_file = del_words_fst.into_inner()?; + let del_words_fst_mmap = unsafe { Mmap::map(&del_words_fst_file)? }; + let del_words_fst = Set::new(&del_words_fst_mmap)?; + + // TO BE IMPROVED @many + let diff = words_fst.op().add(&del_words_fst).difference(); + let stream = add_words_fst.op().add(diff).union(); + + let mut words_fst = SetBuilder::new(tempfile()?)?; + words_fst.extend_stream(stream)?; + let words_fst_file = words_fst.into_inner()?; + let words_fst_mmap = unsafe { Mmap::map(&words_fst_file)? }; + + // PLEASE SEND THIS AS AN MMAP + let main_sender = sender.main(); + main_sender.write_words_fst(&words_fst_mmap).unwrap(); + } + MergerOperation::InsertDocument { docid, document } => { + documents_ids.insert(docid); + sender.documents().uncompressed(docid, &document).unwrap(); + } + MergerOperation::DeleteDocument { docid } => { + if !documents_ids.remove(docid) { + unreachable!("Tried deleting a document that we do not know about"); + } + sender.documents().delete(docid).unwrap(); } } } + // Send the documents ids unionized with the current one + /// TODO return the slice of bytes directly + serialize_bitmap_into_vec(&documents_ids, &mut buffer); + sender.send_documents_ids(&buffer).unwrap(); + + // ... + Ok(()) } @@ -86,9 +139,16 @@ fn merge_cbo_bitmaps( } } -/// Return the slice directly from the serialize_into method -fn cbo_serialize_into_vec<'b>(bitmap: &RoaringBitmap, buffer: &'b mut Vec) -> &'b [u8] { +/// TODO Return the slice directly from the serialize_into method +fn cbo_bitmap_serialize_into_vec<'b>(bitmap: &RoaringBitmap, buffer: &'b mut Vec) -> &'b [u8] { buffer.clear(); CboRoaringBitmapCodec::serialize_into(bitmap, buffer); buffer.as_slice() } + +/// TODO Return the slice directly from the serialize_into method +fn serialize_bitmap_into_vec<'b>(bitmap: &RoaringBitmap, buffer: &'b mut Vec) { + buffer.clear(); + bitmap.serialize_into(buffer).unwrap(); + // buffer.as_slice() +} From 6a399556b5581d146bec5a03e0b0cd87b9ab5fc6 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 4 Sep 2024 10:20:18 +0200 Subject: [PATCH 029/247] Implement more searchable extractor --- milli/src/update/new/extract/mod.rs | 7 +- .../extract/searchable/extract_word_docids.rs | 100 +++++ .../src/update/new/extract/searchable/mod.rs | 156 ++++++++ .../extract/searchable/tokenize_document.rs | 364 ++++++++++++++++++ 4 files changed, 623 insertions(+), 4 deletions(-) create mode 100644 milli/src/update/new/extract/searchable/extract_word_docids.rs create mode 100644 milli/src/update/new/extract/searchable/mod.rs create mode 100644 milli/src/update/new/extract/searchable/tokenize_document.rs diff --git a/milli/src/update/new/extract/mod.rs b/milli/src/update/new/extract/mod.rs index 3124068d9..5e6c02c65 100644 --- a/milli/src/update/new/extract/mod.rs +++ b/milli/src/update/new/extract/mod.rs @@ -1,6 +1,5 @@ mod cache; -mod extract_word_docids; -mod tokenize_document; +mod searchable; -pub use extract_word_docids::SearchableExtractor; -pub use extract_word_docids::WordDocidsExtractor; +pub use searchable::SearchableExtractor; +pub use searchable::WordDocidsExtractor; diff --git a/milli/src/update/new/extract/searchable/extract_word_docids.rs b/milli/src/update/new/extract/searchable/extract_word_docids.rs new file mode 100644 index 000000000..f8b495538 --- /dev/null +++ b/milli/src/update/new/extract/searchable/extract_word_docids.rs @@ -0,0 +1,100 @@ +use std::borrow::Cow; + +use heed::RoTxn; + +use 
super::SearchableExtractor; +use crate::{bucketed_position, FieldId, Index, Result}; + +pub struct WordDocidsExtractor; +impl SearchableExtractor for WordDocidsExtractor { + fn attributes_to_extract<'a>( + rtxn: &'a RoTxn, + index: &'a Index, + ) -> Result>> { + index.user_defined_searchable_fields(rtxn).map_err(Into::into) + } + + fn attributes_to_skip<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result> { + // exact attributes must be skipped and stored in a separate DB, see `ExactWordDocidsExtractor`. + index.exact_attributes(rtxn).map_err(Into::into) + } + + fn build_key<'a>(_field_id: FieldId, _position: u16, word: &'a str) -> Cow<'a, [u8]> { + Cow::Borrowed(word.as_bytes()) + } +} + +pub struct ExactWordDocidsExtractor; +impl SearchableExtractor for ExactWordDocidsExtractor { + fn attributes_to_extract<'a>( + rtxn: &'a RoTxn, + index: &'a Index, + ) -> Result>> { + let exact_attributes = index.exact_attributes(rtxn)?; + // If there are no user-defined searchable fields, we return all exact attributes. + // Otherwise, we return the intersection of exact attributes and user-defined searchable fields. + if let Some(searchable_attributes) = index.user_defined_searchable_fields(rtxn)? { + let attributes = exact_attributes + .into_iter() + .filter(|attr| searchable_attributes.contains(attr)) + .collect(); + Ok(Some(attributes)) + } else { + Ok(Some(exact_attributes)) + } + } + + fn attributes_to_skip<'a>(_rtxn: &'a RoTxn, _index: &'a Index) -> Result> { + Ok(vec![]) + } + + fn build_key<'a>(_field_id: FieldId, _position: u16, word: &'a str) -> Cow<'a, [u8]> { + Cow::Borrowed(word.as_bytes()) + } +} + +pub struct WordFidDocidsExtractor; +impl SearchableExtractor for WordFidDocidsExtractor { + fn attributes_to_extract<'a>( + rtxn: &'a RoTxn, + index: &'a Index, + ) -> Result>> { + index.user_defined_searchable_fields(rtxn).map_err(Into::into) + } + + fn attributes_to_skip<'a>(_rtxn: &'a RoTxn, _index: &'a Index) -> Result> { + Ok(vec![]) + } + + fn build_key<'a>(field_id: FieldId, _position: u16, word: &'a str) -> Cow<'a, [u8]> { + let mut key = Vec::new(); + key.extend_from_slice(word.as_bytes()); + key.push(0); + key.extend_from_slice(&field_id.to_be_bytes()); + Cow::Owned(key) + } +} + +pub struct WordPositionDocidsExtractor; +impl SearchableExtractor for WordPositionDocidsExtractor { + fn attributes_to_extract<'a>( + rtxn: &'a RoTxn, + index: &'a Index, + ) -> Result>> { + index.user_defined_searchable_fields(rtxn).map_err(Into::into) + } + + fn attributes_to_skip<'a>(_rtxn: &'a RoTxn, _index: &'a Index) -> Result> { + Ok(vec![]) + } + + fn build_key<'a>(_field_id: FieldId, position: u16, word: &'a str) -> Cow<'a, [u8]> { + // position must be bucketed to reduce the number of keys in the DB. 
+ let position = bucketed_position(position); + let mut key = Vec::new(); + key.extend_from_slice(word.as_bytes()); + key.push(0); + key.extend_from_slice(&position.to_be_bytes()); + Cow::Owned(key) + } +} diff --git a/milli/src/update/new/extract/searchable/mod.rs b/milli/src/update/new/extract/searchable/mod.rs new file mode 100644 index 000000000..106455a7b --- /dev/null +++ b/milli/src/update/new/extract/searchable/mod.rs @@ -0,0 +1,156 @@ +mod extract_word_docids; +mod tokenize_document; + +pub use extract_word_docids::{ + ExactWordDocidsExtractor, WordDocidsExtractor, WordFidDocidsExtractor, + WordPositionDocidsExtractor, +}; +use std::borrow::Cow; +use std::fs::File; + +use grenad::Merger; +use heed::RoTxn; +use rayon::iter::{IntoParallelIterator, ParallelIterator}; + +use super::cache::CachedSorter; +use crate::update::new::{DocumentChange, ItemsPool}; +use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps}; +use crate::{FieldId, GlobalFieldsIdsMap, Index, Result, MAX_POSITION_PER_ATTRIBUTE}; +use tokenize_document::{tokenizer_builder, DocumentTokenizer}; + +pub trait SearchableExtractor { + fn run_extraction( + index: &Index, + fields_ids_map: &GlobalFieldsIdsMap, + indexer: GrenadParameters, + document_changes: impl IntoParallelIterator>, + ) -> Result> { + let max_memory = indexer.max_memory_by_thread(); + + let rtxn = index.read_txn()?; + let stop_words = index.stop_words(&rtxn)?; + let allowed_separators = index.allowed_separators(&rtxn)?; + let allowed_separators: Option> = + allowed_separators.as_ref().map(|s| s.iter().map(String::as_str).collect()); + let dictionary = index.dictionary(&rtxn)?; + let dictionary: Option> = + dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect()); + let builder = tokenizer_builder( + stop_words.as_ref(), + allowed_separators.as_deref(), + dictionary.as_deref(), + ); + let tokenizer = builder.into_tokenizer(); + + let attributes_to_extract = Self::attributes_to_extract(&rtxn, index)?; + let attributes_to_skip = Self::attributes_to_skip(&rtxn, index)?; + let localized_attributes_rules = + index.localized_attributes_rules(&rtxn)?.unwrap_or_default(); + + let document_tokenizer = DocumentTokenizer { + tokenizer: &tokenizer, + attribute_to_extract: attributes_to_extract.as_deref(), + attribute_to_skip: attributes_to_skip.as_slice(), + localized_attributes_rules: &localized_attributes_rules, + max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE, + }; + + let context_pool = ItemsPool::new(|| { + Ok(( + index.read_txn()?, + &document_tokenizer, + fields_ids_map.clone(), + CachedSorter::new( + // TODO use a better value + 100.try_into().unwrap(), + create_sorter( + grenad::SortAlgorithm::Stable, + MergeDeladdCboRoaringBitmaps, + indexer.chunk_compression_type, + indexer.chunk_compression_level, + indexer.max_nb_chunks, + max_memory, + ), + ), + )) + }); + + document_changes.into_par_iter().try_for_each(|document_change| { + context_pool.with(|(rtxn, document_tokenizer, fields_ids_map, cached_sorter)| { + Self::extract_document_change( + &*rtxn, + index, + document_tokenizer, + fields_ids_map, + cached_sorter, + document_change?, + ) + }) + })?; + + let mut builder = grenad::MergerBuilder::new(MergeDeladdCboRoaringBitmaps); + for (_rtxn, _tokenizer, _fields_ids_map, cache) in context_pool.into_items() { + let sorter = cache.into_sorter()?; + let readers = sorter.into_reader_cursors()?; + builder.extend(readers); + } + + Ok(builder.build()) + } + + fn extract_document_change( + rtxn: &RoTxn, + index: &Index, 
+ document_tokenizer: &DocumentTokenizer, + fields_ids_map: &mut GlobalFieldsIdsMap, + cached_sorter: &mut CachedSorter, + document_change: DocumentChange, + ) -> Result<()> { + match document_change { + DocumentChange::Deletion(inner) => { + let mut token_fn = |fid, pos: u16, word: &str| { + let key = Self::build_key(fid, pos, word); + cached_sorter.insert_del_u32(&key, inner.docid()).unwrap(); + }; + document_tokenizer.tokenize_document( + inner.current(rtxn, index)?.unwrap(), + fields_ids_map, + &mut token_fn, + )?; + } + DocumentChange::Update(inner) => { + let mut token_fn = |fid, pos, word: &str| { + let key = Self::build_key(fid, pos, word); + cached_sorter.insert_del_u32(&key, inner.docid()).unwrap(); + }; + document_tokenizer.tokenize_document( + inner.current(rtxn, index)?.unwrap(), + fields_ids_map, + &mut token_fn, + )?; + + let mut token_fn = |fid, pos, word: &str| { + let key = Self::build_key(fid, pos, word); + cached_sorter.insert_add_u32(&key, inner.docid()).unwrap(); + }; + document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?; + } + DocumentChange::Insertion(inner) => { + let mut token_fn = |fid, pos, word: &str| { + let key = Self::build_key(fid, pos, word); + cached_sorter.insert_add_u32(&key, inner.docid()).unwrap(); + }; + document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?; + } + } + + Ok(()) + } + + fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) + -> Result>>; + + fn attributes_to_skip<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result>; + + fn build_key<'a>(field_id: FieldId, position: u16, word: &'a str) -> Cow<'a, [u8]>; +} diff --git a/milli/src/update/new/extract/searchable/tokenize_document.rs b/milli/src/update/new/extract/searchable/tokenize_document.rs new file mode 100644 index 000000000..e20e52406 --- /dev/null +++ b/milli/src/update/new/extract/searchable/tokenize_document.rs @@ -0,0 +1,364 @@ +use std::collections::HashMap; + +use charabia::{SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder}; +use heed::RoTxn; +use serde_json::Value; + +use crate::update::new::KvReaderFieldId; +use crate::{ + FieldId, FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, LocalizedAttributesRule, + Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH, +}; + +pub struct DocumentTokenizer<'a> { + pub tokenizer: &'a Tokenizer<'a>, + pub attribute_to_extract: Option<&'a [&'a str]>, + pub attribute_to_skip: &'a [&'a str], + pub localized_attributes_rules: &'a [LocalizedAttributesRule], + pub max_positions_per_attributes: u32, +} + +impl<'a> DocumentTokenizer<'a> { + pub fn tokenize_document( + &self, + obkv: &KvReaderFieldId, + field_id_map: &mut GlobalFieldsIdsMap, + token_fn: &mut impl FnMut(FieldId, u16, &str), + ) -> Result<()> { + let mut field_position = HashMap::new(); + let mut field_name = String::new(); + for (field_id, field_bytes) in obkv { + let Some(field_name) = field_id_map.name(field_id).map(|s| { + field_name.clear(); + field_name.push_str(s); + &field_name + }) else { + unreachable!("field id not found in field id map"); + }; + + let mut tokenize_field = |name: &str, value: &Value| { + let Some(field_id) = field_id_map.id_or_insert(name) else { + /// TODO: better error + panic!("it's over 9000"); + }; + + let position = + field_position.entry(field_id).and_modify(|counter| *counter += 8).or_insert(0); + if *position as u32 >= self.max_positions_per_attributes { + return; + } + + match value { + Value::Number(n) => { + let token = n.to_string(); + if let Ok(position) = 
(*position).try_into() { + token_fn(field_id, position, token.as_str()); + } + } + Value::String(text) => { + // create an iterator of token with their positions. + let locales = self + .localized_attributes_rules + .iter() + .find(|rule| rule.match_str(field_name)) + .map(|rule| rule.locales()); + let tokens = process_tokens( + *position, + self.tokenizer.tokenize_with_allow_list(text.as_str(), locales), + ) + .take_while(|(p, _)| (*p as u32) < self.max_positions_per_attributes); + + for (index, token) in tokens { + // keep a word only if it is not empty and fit in a LMDB key. + let token = token.lemma().trim(); + if !token.is_empty() && token.len() <= MAX_WORD_LENGTH { + *position = index; + if let Ok(position) = (*position).try_into() { + token_fn(field_id, position, token); + } + } + } + } + _ => (), + } + }; + + // if the current field is searchable or contains a searchable attribute + if perm_json_p::select_field( + &field_name, + self.attribute_to_extract.as_deref(), + self.attribute_to_skip, + ) { + // parse json. + match serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)? { + Value::Object(object) => perm_json_p::seek_leaf_values_in_object( + &object, + self.attribute_to_extract.as_deref(), + self.attribute_to_skip, + &field_name, + &mut tokenize_field, + ), + Value::Array(array) => perm_json_p::seek_leaf_values_in_array( + &array, + self.attribute_to_extract.as_deref(), + self.attribute_to_skip, + &field_name, + &mut tokenize_field, + ), + value => tokenize_field(&field_name, &value), + } + } + } + Ok(()) + } +} + +/// take an iterator on tokens and compute their relative position depending on separator kinds +/// if it's an `Hard` separator we add an additional relative proximity of 8 between words, +/// else we keep the standard proximity of 1 between words. +fn process_tokens<'a>( + start_offset: usize, + tokens: impl Iterator>, +) -> impl Iterator)> { + tokens + .skip_while(|token| token.is_separator()) + .scan((start_offset, None), |(offset, prev_kind), mut token| { + match token.kind { + TokenKind::Word | TokenKind::StopWord if !token.lemma().is_empty() => { + *offset += match *prev_kind { + Some(TokenKind::Separator(SeparatorKind::Hard)) => 8, + Some(_) => 1, + None => 0, + }; + *prev_kind = Some(token.kind) + } + TokenKind::Separator(SeparatorKind::Hard) => { + *prev_kind = Some(token.kind); + } + TokenKind::Separator(SeparatorKind::Soft) + if *prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) => + { + *prev_kind = Some(token.kind); + } + _ => token.kind = TokenKind::Unknown, + } + Some((*offset, token)) + }) + .filter(|(_, t)| t.is_word()) +} + +/// Factorize tokenizer building. +pub fn tokenizer_builder<'a>( + stop_words: Option<&'a fst::Set<&'a [u8]>>, + allowed_separators: Option<&'a [&str]>, + dictionary: Option<&'a [&str]>, +) -> TokenizerBuilder<'a, &'a [u8]> { + let mut tokenizer_builder = TokenizerBuilder::new(); + if let Some(stop_words) = stop_words { + tokenizer_builder.stop_words(stop_words); + } + if let Some(dictionary) = dictionary { + tokenizer_builder.words_dict(dictionary); + } + if let Some(separators) = allowed_separators { + tokenizer_builder.separators(separators); + } + + tokenizer_builder +} + +/// TODO move in permissive json pointer +mod perm_json_p { + use serde_json::{Map, Value}; + const SPLIT_SYMBOL: char = '.'; + + /// Returns `true` if the `selector` match the `key`. 
+ /// + /// ```text + /// Example: + /// `animaux` match `animaux` + /// `animaux.chien` match `animaux` + /// `animaux.chien` match `animaux` + /// `animaux.chien.nom` match `animaux` + /// `animaux.chien.nom` match `animaux.chien` + /// ----------------------------------------- + /// `animaux` doesn't match `animaux.chien` + /// `animaux.` doesn't match `animaux` + /// `animaux.ch` doesn't match `animaux.chien` + /// `animau` doesn't match `animaux` + /// ``` + pub fn contained_in(selector: &str, key: &str) -> bool { + selector.starts_with(key) + && selector[key.len()..].chars().next().map(|c| c == SPLIT_SYMBOL).unwrap_or(true) + } + + pub fn seek_leaf_values_in_object( + value: &Map, + selectors: Option<&[&str]>, + skip_selectors: &[&str], + base_key: &str, + seeker: &mut impl FnMut(&str, &Value), + ) { + for (key, value) in value.iter() { + let base_key = if base_key.is_empty() { + key.to_string() + } else { + format!("{}{}{}", base_key, SPLIT_SYMBOL, key) + }; + + // here if the user only specified `doggo` we need to iterate in all the fields of `doggo` + // so we check the contained_in on both side + let should_continue = select_field(&base_key, selectors, skip_selectors); + if should_continue { + match value { + Value::Object(object) => seek_leaf_values_in_object( + object, + selectors, + skip_selectors, + &base_key, + seeker, + ), + Value::Array(array) => seek_leaf_values_in_array( + array, + selectors, + skip_selectors, + &base_key, + seeker, + ), + value => seeker(&base_key, value), + } + } + } + } + + pub fn seek_leaf_values_in_array( + values: &[Value], + selectors: Option<&[&str]>, + skip_selectors: &[&str], + base_key: &str, + seeker: &mut impl FnMut(&str, &Value), + ) { + for value in values { + match value { + Value::Object(object) => { + seek_leaf_values_in_object(object, selectors, skip_selectors, base_key, seeker) + } + Value::Array(array) => { + seek_leaf_values_in_array(array, selectors, skip_selectors, base_key, seeker) + } + value => seeker(base_key, value), + } + } + } + + pub fn select_field( + field_name: &str, + selectors: Option<&[&str]>, + skip_selectors: &[&str], + ) -> bool { + selectors.map_or(true, |selectors| { + selectors.iter().any(|selector| { + contained_in(selector, &field_name) || contained_in(&field_name, selector) + }) + }) && !skip_selectors.iter().any(|skip_selector| { + contained_in(skip_selector, &field_name) || contained_in(&field_name, skip_selector) + }) + } +} + +#[cfg(test)] +mod test { + use charabia::TokenizerBuilder; + use meili_snap::snapshot; + use obkv::KvReader; + use serde_json::json; + + use super::*; + #[test] + fn test_tokenize_document() { + let mut fields_ids_map = FieldsIdsMap::new(); + + let field_1 = json!({ + "name": "doggo", + "age": 10, + }); + + let field_2 = json!({ + "catto": { + "name": "pesti", + "age": 23, + } + }); + + let field_3 = json!(["doggo", "catto"]); + let field_4 = json!("UNSEARCHABLE"); + let field_5 = json!({"nope": "unsearchable"}); + + let mut obkv = obkv::KvWriter::memory(); + let field_1_id = fields_ids_map.insert("doggo").unwrap(); + let field_1 = serde_json::to_string(&field_1).unwrap(); + obkv.insert(field_1_id, field_1.as_bytes()).unwrap(); + let field_2_id = fields_ids_map.insert("catto").unwrap(); + let field_2 = serde_json::to_string(&field_2).unwrap(); + obkv.insert(field_2_id, field_2.as_bytes()).unwrap(); + let field_3_id = fields_ids_map.insert("doggo.name").unwrap(); + let field_3 = serde_json::to_string(&field_3).unwrap(); + obkv.insert(field_3_id, field_3.as_bytes()).unwrap(); + let 
field_4_id = fields_ids_map.insert("not-me").unwrap(); + let field_4 = serde_json::to_string(&field_4).unwrap(); + obkv.insert(field_4_id, field_4.as_bytes()).unwrap(); + let field_5_id = fields_ids_map.insert("me-nether").unwrap(); + let field_5 = serde_json::to_string(&field_5).unwrap(); + obkv.insert(field_5_id, field_5.as_bytes()).unwrap(); + let value = obkv.into_inner().unwrap(); + let obkv = KvReader::from_slice(value.as_slice()); + + let mut tb = TokenizerBuilder::default(); + let document_tokenizer = DocumentTokenizer { + tokenizer: &tb.build(), + attribute_to_extract: None, + attribute_to_skip: &["not-me", "me-nether.nope"], + localized_attributes_rules: &[], + max_positions_per_attributes: 1000, + }; + + let fields_ids_map_lock = std::sync::RwLock::new(fields_ids_map); + let mut global_fields_ids_map = GlobalFieldsIdsMap::new(&fields_ids_map_lock); + + let mut words = std::collections::BTreeMap::new(); + document_tokenizer + .tokenize_document(obkv, &mut global_fields_ids_map, &mut |fid, pos, word| { + words.insert([fid, pos], word.to_string()); + }) + .unwrap(); + + snapshot!(format!("{:#?}", words), @r###" + { + [ + 2, + 0, + ]: "doggo", + [ + 2, + 8, + ]: "doggo", + [ + 2, + 16, + ]: "catto", + [ + 3, + 0, + ]: "10", + [ + 4, + 0, + ]: "pesti", + [ + 5, + 0, + ]: "23", + } + "###); + } +} From 781a186f757885d6fb2aaa65ac7cd1e7ff1b05bd Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 4 Sep 2024 10:28:31 +0200 Subject: [PATCH 030/247] remove milli/src/update/new/extract/extract_word_docids.rs --- .../update/new/extract/extract_word_docids.rs | 168 ------------------ 1 file changed, 168 deletions(-) delete mode 100644 milli/src/update/new/extract/extract_word_docids.rs diff --git a/milli/src/update/new/extract/extract_word_docids.rs b/milli/src/update/new/extract/extract_word_docids.rs deleted file mode 100644 index cbb28b956..000000000 --- a/milli/src/update/new/extract/extract_word_docids.rs +++ /dev/null @@ -1,168 +0,0 @@ -use std::fs::File; - -use charabia::TokenizerBuilder; -use grenad::Merger; -use heed::RoTxn; -use rayon::iter::{IntoParallelIterator, ParallelIterator}; - -use super::cache::CachedSorter; -use super::tokenize_document::DocumentTokenizer; -use crate::update::new::{DocumentChange, ItemsPool}; -use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps}; -use crate::{GlobalFieldsIdsMap, Index, Result, MAX_POSITION_PER_ATTRIBUTE}; - -pub trait SearchableExtractor { - fn run_extraction( - index: &Index, - fields_ids_map: &GlobalFieldsIdsMap, - indexer: GrenadParameters, - document_changes: impl IntoParallelIterator>, - ) -> Result> { - let max_memory = indexer.max_memory_by_thread(); - - let rtxn = index.read_txn()?; - let stop_words = index.stop_words(&rtxn)?; - let allowed_separators = index.allowed_separators(&rtxn)?; - let allowed_separators: Option> = - allowed_separators.as_ref().map(|s| s.iter().map(String::as_str).collect()); - let dictionary = index.dictionary(&rtxn)?; - let dictionary: Option> = - dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect()); - let builder = tokenizer_builder( - stop_words.as_ref(), - allowed_separators.as_deref(), - dictionary.as_deref(), - ); - let tokenizer = builder.into_tokenizer(); - - let user_defined_searchable_fields = index.user_defined_searchable_fields(&rtxn)?; - let localized_attributes_rules = - index.localized_attributes_rules(&rtxn)?.unwrap_or_default(); - - let document_tokenizer = DocumentTokenizer { - tokenizer: &tokenizer, - searchable_attributes: 
user_defined_searchable_fields.as_deref(), - localized_attributes_rules: &localized_attributes_rules, - max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE, - }; - - let context_pool = ItemsPool::new(|| { - Ok(( - index.read_txn()?, - &document_tokenizer, - fields_ids_map.clone(), - CachedSorter::new( - // TODO use a better value - 100.try_into().unwrap(), - create_sorter( - grenad::SortAlgorithm::Stable, - MergeDeladdCboRoaringBitmaps, - indexer.chunk_compression_type, - indexer.chunk_compression_level, - indexer.max_nb_chunks, - max_memory, - ), - ), - )) - }); - - document_changes.into_par_iter().try_for_each(|document_change| { - context_pool.with(|(rtxn, document_tokenizer, fields_ids_map, cached_sorter)| { - Self::extract_document_change( - &*rtxn, - index, - document_tokenizer, - fields_ids_map, - cached_sorter, - document_change?, - ) - }) - })?; - - let mut builder = grenad::MergerBuilder::new(MergeDeladdCboRoaringBitmaps); - for (_rtxn, _tokenizer, _fields_ids_map, cache) in context_pool.into_items() { - let sorter = cache.into_sorter()?; - let readers = sorter.into_reader_cursors()?; - builder.extend(readers); - } - - Ok(builder.build()) - } - - fn extract_document_change( - rtxn: &RoTxn, - index: &Index, - document_tokenizer: &DocumentTokenizer, - fields_ids_map: &mut GlobalFieldsIdsMap, - cached_sorter: &mut CachedSorter, - document_change: DocumentChange, - ) -> Result<()>; -} - -pub struct WordDocidsExtractor; -impl SearchableExtractor for WordDocidsExtractor { - fn extract_document_change( - rtxn: &RoTxn, - index: &Index, - document_tokenizer: &DocumentTokenizer, - fields_ids_map: &mut GlobalFieldsIdsMap, - cached_sorter: &mut CachedSorter, - document_change: DocumentChange, - ) -> crate::Result<()> { - match document_change { - DocumentChange::Deletion(inner) => { - let mut token_fn = |_fid, _pos: u16, word: &str| { - cached_sorter.insert_del_u32(word.as_bytes(), inner.docid()).unwrap(); - }; - document_tokenizer.tokenize_document( - inner.current(rtxn, index)?.unwrap(), - fields_ids_map, - &mut token_fn, - )?; - } - DocumentChange::Update(inner) => { - let mut token_fn = |_fid, _pos, word: &str| { - cached_sorter.insert_del_u32(word.as_bytes(), inner.docid()).unwrap(); - }; - document_tokenizer.tokenize_document( - inner.current(rtxn, index)?.unwrap(), - fields_ids_map, - &mut token_fn, - )?; - - let mut token_fn = |_fid, _pos, word: &str| { - cached_sorter.insert_add_u32(word.as_bytes(), inner.docid()).unwrap(); - }; - document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?; - } - DocumentChange::Insertion(inner) => { - let mut token_fn = |_fid, _pos, word: &str| { - cached_sorter.insert_add_u32(word.as_bytes(), inner.docid()).unwrap(); - }; - document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?; - } - } - - Ok(()) - } -} - -/// Factorize tokenizer building. 
-fn tokenizer_builder<'a>( - stop_words: Option<&'a fst::Set<&'a [u8]>>, - allowed_separators: Option<&'a [&str]>, - dictionary: Option<&'a [&str]>, -) -> TokenizerBuilder<'a, &'a [u8]> { - let mut tokenizer_builder = TokenizerBuilder::new(); - if let Some(stop_words) = stop_words { - tokenizer_builder.stop_words(stop_words); - } - if let Some(dictionary) = dictionary { - tokenizer_builder.words_dict(dictionary); - } - if let Some(separators) = allowed_separators { - tokenizer_builder.separators(separators); - } - - tokenizer_builder -} From 3b82d8b5b952dad4b5c323a25bb0a6d0896876f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 4 Sep 2024 10:55:06 +0200 Subject: [PATCH 031/247] Fix the cache to serialize entries correctly --- milli/src/update/new/extract/cache.rs | 69 ++++----------------------- 1 file changed, 9 insertions(+), 60 deletions(-) diff --git a/milli/src/update/new/extract/cache.rs b/milli/src/update/new/extract/cache.rs index 878150eb3..684b67daa 100644 --- a/milli/src/update/new/extract/cache.rs +++ b/milli/src/update/new/extract/cache.rs @@ -1,13 +1,13 @@ -use std::borrow::Cow; use std::num::NonZeroUsize; -use std::{io, mem}; +use std::mem; use grenad::{MergeFunction, Sorter}; use lru::LruCache; use roaring::RoaringBitmap; use smallvec::SmallVec; -use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; +use crate::update::del_add::{DelAdd, KvWriterDelAdd}; +use crate::CboRoaringBitmapCodec; #[derive(Debug)] pub struct CachedSorter { @@ -123,26 +123,27 @@ impl CachedSorter { key: A, deladd: DelAddRoaringBitmap, ) -> grenad::Result<(), MF::Error> { + /// TODO we must create a serialization trait to correctly serialize bitmaps self.deladd_buffer.clear(); let mut value_writer = KvWriterDelAdd::new(&mut self.deladd_buffer); match deladd { DelAddRoaringBitmap { del: Some(del), add: None } => { self.cbo_buffer.clear(); - RoaringBitmap::serialize_into(&del, &mut self.cbo_buffer)?; + CboRoaringBitmapCodec::serialize_into(&del, &mut self.cbo_buffer); value_writer.insert(DelAdd::Deletion, &self.cbo_buffer)?; } DelAddRoaringBitmap { del: None, add: Some(add) } => { self.cbo_buffer.clear(); - RoaringBitmap::serialize_into(&add, &mut self.cbo_buffer)?; + CboRoaringBitmapCodec::serialize_into(&add, &mut self.cbo_buffer); value_writer.insert(DelAdd::Addition, &self.cbo_buffer)?; } DelAddRoaringBitmap { del: Some(del), add: Some(add) } => { self.cbo_buffer.clear(); - RoaringBitmap::serialize_into(&del, &mut self.cbo_buffer)?; + CboRoaringBitmapCodec::serialize_into(&del, &mut self.cbo_buffer); value_writer.insert(DelAdd::Deletion, &self.cbo_buffer)?; self.cbo_buffer.clear(); - RoaringBitmap::serialize_into(&add, &mut self.cbo_buffer)?; + CboRoaringBitmapCodec::serialize_into(&add, &mut self.cbo_buffer); value_writer.insert(DelAdd::Addition, &self.cbo_buffer)?; } DelAddRoaringBitmap { del: None, add: None } => return Ok(()), @@ -193,56 +194,4 @@ impl DelAddRoaringBitmap { fn new_add_u32(n: u32) -> Self { DelAddRoaringBitmap { del: None, add: Some(RoaringBitmap::from([n])) } } -} - -/// Do a union of CboRoaringBitmaps on both sides of a DelAdd obkv -/// separately and outputs a new DelAdd with both unions. 
-pub struct DelAddRoaringBitmapMerger; - -impl MergeFunction for DelAddRoaringBitmapMerger { - type Error = io::Error; - - fn merge<'a>( - &self, - _key: &[u8], - values: &[Cow<'a, [u8]>], - ) -> std::result::Result, Self::Error> { - if values.len() == 1 { - Ok(values[0].clone()) - } else { - // Retrieve the bitmaps from both sides - let mut del_bitmaps_bytes = Vec::new(); - let mut add_bitmaps_bytes = Vec::new(); - for value in values { - let obkv: &KvReaderDelAdd = value.as_ref().into(); - if let Some(bitmap_bytes) = obkv.get(DelAdd::Deletion) { - del_bitmaps_bytes.push(bitmap_bytes); - } - if let Some(bitmap_bytes) = obkv.get(DelAdd::Addition) { - add_bitmaps_bytes.push(bitmap_bytes); - } - } - - let mut output_deladd_obkv = KvWriterDelAdd::memory(); - - // Deletion - let mut buffer = Vec::new(); - let mut merged = RoaringBitmap::new(); - for bytes in del_bitmaps_bytes { - merged |= RoaringBitmap::deserialize_unchecked_from(bytes)?; - } - merged.serialize_into(&mut buffer)?; - output_deladd_obkv.insert(DelAdd::Deletion, &buffer)?; - - // Addition - buffer.clear(); - merged.clear(); - for bytes in add_bitmaps_bytes { - merged |= RoaringBitmap::deserialize_unchecked_from(bytes)?; - } - output_deladd_obkv.insert(DelAdd::Addition, &buffer)?; - - output_deladd_obkv.into_inner().map(Cow::from).map_err(Into::into) - } - } -} +} \ No newline at end of file From 1eb75a1040e6b8c68053d70e5e04670ba286c6da Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 4 Sep 2024 11:40:26 +0200 Subject: [PATCH 032/247] remove milli/src/update/new/extract/tokenize_document.rs --- .../update/new/extract/tokenize_document.rs | 320 ------------------ 1 file changed, 320 deletions(-) delete mode 100644 milli/src/update/new/extract/tokenize_document.rs diff --git a/milli/src/update/new/extract/tokenize_document.rs b/milli/src/update/new/extract/tokenize_document.rs deleted file mode 100644 index ed4e6b89d..000000000 --- a/milli/src/update/new/extract/tokenize_document.rs +++ /dev/null @@ -1,320 +0,0 @@ -use std::collections::HashMap; - -use charabia::{SeparatorKind, Token, TokenKind, Tokenizer}; -use serde_json::Value; - -use crate::update::new::KvReaderFieldId; -use crate::{ - FieldId, GlobalFieldsIdsMap, InternalError, LocalizedAttributesRule, Result, MAX_WORD_LENGTH, -}; - -pub struct DocumentTokenizer<'a> { - pub tokenizer: &'a Tokenizer<'a>, - pub searchable_attributes: Option<&'a [&'a str]>, - pub localized_attributes_rules: &'a [LocalizedAttributesRule], - pub max_positions_per_attributes: u32, -} - -impl<'a> DocumentTokenizer<'a> { - pub fn tokenize_document( - &self, - obkv: &KvReaderFieldId, - field_id_map: &mut GlobalFieldsIdsMap, - token_fn: &mut impl FnMut(FieldId, u16, &str), - ) -> Result<()> { - let mut field_position = HashMap::new(); - let mut field_name = String::new(); - for (field_id, field_bytes) in obkv { - let Some(field_name) = field_id_map.name(field_id).map(|s| { - field_name.clear(); - field_name.push_str(s); - &field_name - }) else { - unreachable!("field id not found in field id map"); - }; - - let mut tokenize_field = |name: &str, value: &Value| { - let Some(field_id) = field_id_map.id_or_insert(name) else { - /// TODO: better error - panic!("it's over 9000"); - }; - - let position = - field_position.entry(field_id).and_modify(|counter| *counter += 8).or_insert(0); - if *position as u32 >= self.max_positions_per_attributes { - return; - } - - match value { - Value::Number(n) => { - let token = n.to_string(); - if let Ok(position) = (*position).try_into() { - token_fn(field_id, 
position, token.as_str()); - } - } - Value::String(text) => { - // create an iterator of token with their positions. - let locales = self - .localized_attributes_rules - .iter() - .find(|rule| rule.match_str(field_name)) - .map(|rule| rule.locales()); - let tokens = process_tokens( - *position, - self.tokenizer.tokenize_with_allow_list(text.as_str(), locales), - ) - .take_while(|(p, _)| (*p as u32) < self.max_positions_per_attributes); - - for (index, token) in tokens { - // keep a word only if it is not empty and fit in a LMDB key. - let token = token.lemma().trim(); - if !token.is_empty() && token.len() <= MAX_WORD_LENGTH { - *position = index; - if let Ok(position) = (*position).try_into() { - token_fn(field_id, position, token); - } - } - } - } - _ => (), - } - }; - - // if the current field is searchable or contains a searchable attribute - if self.searchable_attributes.map_or(true, |attributes| { - attributes.iter().any(|name| perm_json_p::contained_in(name, &field_name)) - }) { - // parse json. - match serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)? { - Value::Object(object) => perm_json_p::seek_leaf_values_in_object( - &object, - self.searchable_attributes.as_deref(), - &field_name, - &mut tokenize_field, - ), - Value::Array(array) => perm_json_p::seek_leaf_values_in_array( - &array, - self.searchable_attributes.as_deref(), - &field_name, - &mut tokenize_field, - ), - value => tokenize_field(&field_name, &value), - } - } - } - Ok(()) - } -} - -/// take an iterator on tokens and compute their relative position depending on separator kinds -/// if it's an `Hard` separator we add an additional relative proximity of 8 between words, -/// else we keep the standard proximity of 1 between words. -fn process_tokens<'a>( - start_offset: usize, - tokens: impl Iterator>, -) -> impl Iterator)> { - tokens - .skip_while(|token| token.is_separator()) - .scan((start_offset, None), |(offset, prev_kind), mut token| { - match token.kind { - TokenKind::Word | TokenKind::StopWord if !token.lemma().is_empty() => { - *offset += match *prev_kind { - Some(TokenKind::Separator(SeparatorKind::Hard)) => 8, - Some(_) => 1, - None => 0, - }; - *prev_kind = Some(token.kind) - } - TokenKind::Separator(SeparatorKind::Hard) => { - *prev_kind = Some(token.kind); - } - TokenKind::Separator(SeparatorKind::Soft) - if *prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) => - { - *prev_kind = Some(token.kind); - } - _ => token.kind = TokenKind::Unknown, - } - Some((*offset, token)) - }) - .filter(|(_, t)| t.is_word()) -} - -/// TODO move in permissive json pointer -mod perm_json_p { - use serde_json::{Map, Value}; - const SPLIT_SYMBOL: char = '.'; - - /// Returns `true` if the `selector` match the `key`. 
- /// - /// ```text - /// Example: - /// `animaux` match `animaux` - /// `animaux.chien` match `animaux` - /// `animaux.chien` match `animaux` - /// `animaux.chien.nom` match `animaux` - /// `animaux.chien.nom` match `animaux.chien` - /// ----------------------------------------- - /// `animaux` doesn't match `animaux.chien` - /// `animaux.` doesn't match `animaux` - /// `animaux.ch` doesn't match `animaux.chien` - /// `animau` doesn't match `animaux` - /// ``` - pub fn contained_in(selector: &str, key: &str) -> bool { - selector.starts_with(key) - && selector[key.len()..].chars().next().map(|c| c == SPLIT_SYMBOL).unwrap_or(true) - } - - pub fn seek_leaf_values<'a>( - value: &Map, - selectors: impl IntoIterator, - seeker: &mut impl FnMut(&str, &Value), - ) { - let selectors: Vec<_> = selectors.into_iter().collect(); - seek_leaf_values_in_object(value, Some(&selectors), "", seeker); - } - - pub fn seek_leaf_values_in_object( - value: &Map, - selectors: Option<&[&str]>, - base_key: &str, - seeker: &mut impl FnMut(&str, &Value), - ) { - for (key, value) in value.iter() { - let base_key = if base_key.is_empty() { - key.to_string() - } else { - format!("{}{}{}", base_key, SPLIT_SYMBOL, key) - }; - - // here if the user only specified `doggo` we need to iterate in all the fields of `doggo` - // so we check the contained_in on both side - let should_continue = selectors.map_or(true, |selectors| { - selectors.iter().any(|selector| { - contained_in(selector, &base_key) || contained_in(&base_key, selector) - }) - }); - - if should_continue { - match value { - Value::Object(object) => { - seek_leaf_values_in_object(object, selectors, &base_key, seeker) - } - Value::Array(array) => { - seek_leaf_values_in_array(array, selectors, &base_key, seeker) - } - value => seeker(&base_key, value), - } - } - } - } - - pub fn seek_leaf_values_in_array( - values: &[Value], - selectors: Option<&[&str]>, - base_key: &str, - seeker: &mut impl FnMut(&str, &Value), - ) { - for value in values { - match value { - Value::Object(object) => { - seek_leaf_values_in_object(object, selectors, base_key, seeker) - } - Value::Array(array) => { - seek_leaf_values_in_array(array, selectors, base_key, seeker) - } - value => seeker(base_key, value), - } - } - } -} - -#[cfg(test)] -mod test { - use charabia::TokenizerBuilder; - use meili_snap::snapshot; - use obkv::KvReader; - use serde_json::json; - - use super::*; - use crate::FieldsIdsMap; - - #[test] - fn test_tokenize_document() { - let mut fields_ids_map = FieldsIdsMap::new(); - - let field_1 = json!({ - "name": "doggo", - "age": 10, - }); - - let field_2 = json!({ - "catto": { - "name": "pesti", - "age": 23, - } - }); - - let field_3 = json!(["doggo", "catto"]); - - let mut obkv = obkv::KvWriter::memory(); - let field_1_id = fields_ids_map.insert("doggo").unwrap(); - let field_1 = serde_json::to_string(&field_1).unwrap(); - obkv.insert(field_1_id, field_1.as_bytes()).unwrap(); - let field_2_id = fields_ids_map.insert("catto").unwrap(); - let field_2 = serde_json::to_string(&field_2).unwrap(); - obkv.insert(field_2_id, field_2.as_bytes()).unwrap(); - let field_3_id = fields_ids_map.insert("doggo.name").unwrap(); - let field_3 = serde_json::to_string(&field_3).unwrap(); - obkv.insert(field_3_id, field_3.as_bytes()).unwrap(); - let value = obkv.into_inner().unwrap(); - let obkv = KvReader::from_slice(value.as_slice()); - - let mut tb = TokenizerBuilder::default(); - let document_tokenizer = DocumentTokenizer { - tokenizer: &tb.build(), - searchable_attributes: None, - 
localized_attributes_rules: &[], - max_positions_per_attributes: 1000, - }; - - let fields_ids_map_lock = std::sync::RwLock::new(fields_ids_map); - let mut global_fields_ids_map = GlobalFieldsIdsMap::new(&fields_ids_map_lock); - - let mut words = std::collections::BTreeMap::new(); - document_tokenizer - .tokenize_document(obkv, &mut global_fields_ids_map, &mut |fid, pos, word| { - words.insert([fid, pos], word.to_string()); - }) - .unwrap(); - - snapshot!(format!("{:#?}", words), @r###" - { - [ - 2, - 0, - ]: "doggo", - [ - 2, - 8, - ]: "doggo", - [ - 2, - 16, - ]: "catto", - [ - 3, - 0, - ]: "10", - [ - 4, - 0, - ]: "pesti", - [ - 5, - 0, - ]: "23", - } - "###); - } -} From 6d74fb02294a10eb538257c779885f1ac5e5b8cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 4 Sep 2024 11:39:53 +0200 Subject: [PATCH 033/247] Introduce the WordFidWordDocids database --- milli/src/update/new/channel.rs | 104 +++++++++++++++++----------- milli/src/update/new/extract/mod.rs | 6 +- milli/src/update/new/indexer/mod.rs | 35 ++++++---- milli/src/update/new/merger.rs | 26 +++++++ 4 files changed, 116 insertions(+), 55 deletions(-) diff --git a/milli/src/update/new/channel.rs b/milli/src/update/new/channel.rs index d5739a75e..acea02316 100644 --- a/milli/src/update/new/channel.rs +++ b/milli/src/update/new/channel.rs @@ -1,4 +1,5 @@ use std::fs::File; +use std::marker::PhantomData; use crossbeam_channel::{IntoIter, Receiver, SendError, Sender}; use grenad::Merger; @@ -17,20 +18,9 @@ pub fn merger_writer_channel(cap: usize) -> (MergerSender, WriterReceiver) { } /// The capacity of the channel is currently in number of messages. -pub fn extractors_merger_channels(cap: usize) -> ExtractorsMergerChannels { +pub fn extractors_merger_channels(cap: usize) -> (ExtractorSender, MergerReceiver) { let (sender, receiver) = crossbeam_channel::bounded(cap); - - ExtractorsMergerChannels { - merger_receiver: MergerReceiver(receiver), - deladd_cbo_roaring_bitmap_sender: DeladdCboRoaringBitmapSender(sender.clone()), - extracted_documents_sender: ExtractedDocumentsSender(sender.clone()), - } -} - -pub struct ExtractorsMergerChannels { - pub merger_receiver: MergerReceiver, - pub deladd_cbo_roaring_bitmap_sender: DeladdCboRoaringBitmapSender, - pub extracted_documents_sender: ExtractedDocumentsSender, + (ExtractorSender(sender), MergerReceiver(receiver)) } pub struct KeyValueEntry { @@ -113,6 +103,7 @@ pub struct WriterOperation { pub enum Database { WordDocids, + WordFidDocids, Documents, Main, } @@ -123,6 +114,7 @@ impl WriterOperation { Database::Main => index.main.remap_types(), Database::Documents => index.documents.remap_types(), Database::WordDocids => index.word_docids.remap_types(), + Database::WordFidDocids => index.word_fid_docids.remap_types(), } } @@ -149,8 +141,12 @@ impl MergerSender { MainSender(&self.0) } - pub fn word_docids(&self) -> WordDocidsSender<'_> { - WordDocidsSender(&self.0) + pub fn word_docids(&self) -> DocidsSender<'_, WordDocids> { + DocidsSender { sender: &self.0, _marker: PhantomData } + } + + pub fn word_fid_docids(&self) -> DocidsSender<'_, WordFidDocids> { + DocidsSender { sender: &self.0, _marker: PhantomData } } pub fn documents(&self) -> DocumentsSender<'_> { @@ -190,12 +186,34 @@ impl MainSender<'_> { } } -pub struct WordDocidsSender<'a>(&'a Sender); +pub enum WordDocids {} +pub enum WordFidDocids {} -impl WordDocidsSender<'_> { +pub trait DatabaseType { + fn database() -> Database; +} + +impl DatabaseType for WordDocids { + fn database() -> Database { + 
Database::WordDocids + } +} + +impl DatabaseType for WordFidDocids { + fn database() -> Database { + Database::WordFidDocids + } +} + +pub struct DocidsSender<'a, D> { + sender: &'a Sender, + _marker: PhantomData, +} + +impl DocidsSender<'_, D> { pub fn write(&self, key: &[u8], value: &[u8]) -> StdResult<(), SendError<()>> { let entry = EntryOperation::Write(KeyValueEntry::from_key_value(key, value)); - match self.0.send(WriterOperation { database: Database::WordDocids, entry }) { + match self.sender.send(WriterOperation { database: D::database(), entry }) { Ok(()) => Ok(()), Err(SendError(_)) => Err(SendError(())), } @@ -203,7 +221,7 @@ impl WordDocidsSender<'_> { pub fn delete(&self, key: &[u8]) -> StdResult<(), SendError<()>> { let entry = EntryOperation::Delete(KeyEntry::from_key(key)); - match self.0.send(WriterOperation { database: Database::WordDocids, entry }) { + match self.sender.send(WriterOperation { database: D::database(), entry }) { Ok(()) => Ok(()), Err(SendError(_)) => Err(SendError(())), } @@ -240,6 +258,7 @@ impl DocumentsSender<'_> { pub enum MergerOperation { WordDocidsMerger(Merger), + WordFidDocidsMerger(Merger), InsertDocument { docid: DocumentId, document: Box }, DeleteDocument { docid: DocumentId }, } @@ -255,27 +274,10 @@ impl IntoIterator for MergerReceiver { } } -#[derive(Clone)] -pub struct DeladdCboRoaringBitmapSender(Sender); +pub struct ExtractorSender(Sender); -impl DeladdCboRoaringBitmapSender { - pub fn word_docids( - &self, - merger: Merger, - ) -> StdResult<(), SendError<()>> { - let operation = MergerOperation::WordDocidsMerger(merger); - match self.0.send(operation) { - Ok(()) => Ok(()), - Err(SendError(_)) => Err(SendError(())), - } - } -} - -#[derive(Clone)] -pub struct ExtractedDocumentsSender(Sender); - -impl ExtractedDocumentsSender { - pub fn insert( +impl ExtractorSender { + pub fn document_insert( &self, docid: DocumentId, document: Box, @@ -286,10 +288,32 @@ impl ExtractedDocumentsSender { } } - pub fn delete(&self, docid: DocumentId) -> StdResult<(), SendError<()>> { + pub fn document_delete(&self, docid: DocumentId) -> StdResult<(), SendError<()>> { match self.0.send(MergerOperation::DeleteDocument { docid }) { Ok(()) => Ok(()), Err(SendError(_)) => Err(SendError(())), } } + + pub fn word_docids( + &self, + merger: Merger, + ) -> StdResult<(), SendError<()>> { + let operation = MergerOperation::WordDocidsMerger(merger); + match self.0.send(operation) { + Ok(()) => Ok(()), + Err(SendError(_)) => Err(SendError(())), + } + } + + pub fn word_fid_docids( + &self, + merger: Merger, + ) -> StdResult<(), SendError<()>> { + let operation = MergerOperation::WordFidDocidsMerger(merger); + match self.0.send(operation) { + Ok(()) => Ok(()), + Err(SendError(_)) => Err(SendError(())), + } + } } diff --git a/milli/src/update/new/extract/mod.rs b/milli/src/update/new/extract/mod.rs index 5e6c02c65..1964b88fc 100644 --- a/milli/src/update/new/extract/mod.rs +++ b/milli/src/update/new/extract/mod.rs @@ -1,5 +1,7 @@ mod cache; mod searchable; -pub use searchable::SearchableExtractor; -pub use searchable::WordDocidsExtractor; +pub use searchable::{ + ExactWordDocidsExtractor, SearchableExtractor, WordDocidsExtractor, WordFidDocidsExtractor, + WordPositionDocidsExtractor, +}; diff --git a/milli/src/update/new/indexer/mod.rs b/milli/src/update/new/indexer/mod.rs index 7a9999c28..539b6d602 100644 --- a/milli/src/update/new/indexer/mod.rs +++ b/milli/src/update/new/indexer/mod.rs @@ -11,11 +11,9 @@ use rayon::iter::{IntoParallelIterator, ParallelIterator}; use 
rayon::ThreadPool; pub use update_by_function::UpdateByFunction; -use super::channel::{ - extractors_merger_channels, merger_writer_channel, EntryOperation, ExtractorsMergerChannels, -}; +use super::channel::{extractors_merger_channels, merger_writer_channel, EntryOperation}; use super::document_change::DocumentChange; -use super::extract::{SearchableExtractor, WordDocidsExtractor}; +use super::extract::{SearchableExtractor, WordDocidsExtractor, WordFidDocidsExtractor}; use super::merger::merge_grenad_entries; use super::StdResult; use crate::documents::{ @@ -56,11 +54,8 @@ where PI::Iter: Clone, { let (merger_sender, writer_receiver) = merger_writer_channel(100); - let ExtractorsMergerChannels { - merger_receiver, - deladd_cbo_roaring_bitmap_sender, - extracted_documents_sender, - } = extractors_merger_channels(100); + // This channel acts as a rendezvous point to ensure that we are one task ahead + let (extractor_sender, merger_receiver) = extractors_merger_channels(0); let fields_ids_map_lock = RwLock::new(fields_ids_map); let global_fields_ids_map = GlobalFieldsIdsMap::new(&fields_ids_map_lock); @@ -76,17 +71,19 @@ where match result? { DocumentChange::Deletion(deletion) => { let docid = deletion.docid(); - extracted_documents_sender.delete(docid).unwrap(); + extractor_sender.document_delete(docid).unwrap(); } DocumentChange::Update(update) => { let docid = update.docid(); let content = update.new(); - extracted_documents_sender.insert(docid, content.boxed()).unwrap(); + extractor_sender.document_insert(docid, content.boxed()).unwrap(); } DocumentChange::Insertion(insertion) => { let docid = insertion.docid(); let content = insertion.new(); - extracted_documents_sender.insert(docid, content.boxed()).unwrap(); + extractor_sender.document_insert(docid, content.boxed()).unwrap(); + + // extracted_dictionary_sender.send(self, dictionary: &[u8]); } } Ok(()) as Result<_> @@ -102,7 +99,19 @@ where )?; /// TODO: manage the errors correctly - deladd_cbo_roaring_bitmap_sender.word_docids(merger).unwrap(); + extractor_sender.word_docids(merger).unwrap(); + + // word fid docids + let merger = WordFidDocidsExtractor::run_extraction( + index, + &global_fields_ids_map, + /// TODO: GrenadParameters::default() should be removed in favor a passed parameter + GrenadParameters::default(), + document_changes.clone(), + )?; + + /// TODO: manage the errors correctly + extractor_sender.word_fid_docids(merger).unwrap(); Ok(()) as Result<_> }) diff --git a/milli/src/update/new/merger.rs b/milli/src/update/new/merger.rs index b21f20b0f..c7f1a4385 100644 --- a/milli/src/update/new/merger.rs +++ b/milli/src/update/new/merger.rs @@ -79,6 +79,32 @@ pub fn merge_grenad_entries( let main_sender = sender.main(); main_sender.write_words_fst(&words_fst_mmap).unwrap(); } + MergerOperation::WordFidDocidsMerger(merger) => { + let word_docids_sender = sender.word_fid_docids(); + let database = index.word_fid_docids.remap_types::(); + + /// TODO manage the error correctly + let mut merger_iter = merger.into_stream_merger_iter().unwrap(); + + // TODO manage the error correctly + while let Some((key, deladd)) = merger_iter.next().unwrap() { + let current = database.get(rtxn, key)?; + let deladd: &KvReaderDelAdd = deladd.into(); + let del = deladd.get(DelAdd::Deletion); + let add = deladd.get(DelAdd::Addition); + + match merge_cbo_bitmaps(current, del, add)? 
{ + Operation::Write(bitmap) => { + let value = cbo_bitmap_serialize_into_vec(&bitmap, &mut buffer); + word_docids_sender.write(key, value).unwrap(); + } + Operation::Delete => { + word_docids_sender.delete(key).unwrap(); + } + Operation::Ignore => (), + } + } + } MergerOperation::InsertDocument { docid, document } => { documents_ids.insert(docid); sender.documents().uncompressed(docid, &document).unwrap(); From 98e48371c35da0b30716d946959c7fd50e7b284d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 4 Sep 2024 12:17:13 +0200 Subject: [PATCH 034/247] Factorize some stuff --- milli/src/update/new/channel.rs | 68 ++++++++------ milli/src/update/new/indexer/mod.rs | 56 +++++++++--- milli/src/update/new/merger.rs | 135 +++++++++++++++++----------- 3 files changed, 166 insertions(+), 93 deletions(-) diff --git a/milli/src/update/new/channel.rs b/milli/src/update/new/channel.rs index acea02316..8888132e3 100644 --- a/milli/src/update/new/channel.rs +++ b/milli/src/update/new/channel.rs @@ -9,7 +9,7 @@ use super::StdResult; use crate::index::main_key::{DOCUMENTS_IDS_KEY, WORDS_FST_KEY}; use crate::update::new::KvReaderFieldId; use crate::update::MergeDeladdCboRoaringBitmaps; -use crate::{DocumentId, Index}; +use crate::{CboRoaringBitmapCodec, DocumentId, Index}; /// The capacity of the channel is currently in number of messages. pub fn merger_writer_channel(cap: usize) -> (MergerSender, WriterReceiver) { @@ -103,7 +103,9 @@ pub struct WriterOperation { pub enum Database { WordDocids, + ExactWordDocids, WordFidDocids, + WordPositionDocids, Documents, Main, } @@ -114,7 +116,9 @@ impl WriterOperation { Database::Main => index.main.remap_types(), Database::Documents => index.documents.remap_types(), Database::WordDocids => index.word_docids.remap_types(), + Database::ExactWordDocids => index.exact_word_docids.remap_types(), Database::WordFidDocids => index.word_fid_docids.remap_types(), + Database::WordPositionDocids => index.word_position_docids.remap_types(), } } @@ -141,11 +145,7 @@ impl MergerSender { MainSender(&self.0) } - pub fn word_docids(&self) -> DocidsSender<'_, WordDocids> { - DocidsSender { sender: &self.0, _marker: PhantomData } - } - - pub fn word_fid_docids(&self) -> DocidsSender<'_, WordFidDocids> { + pub fn docids(&self) -> DocidsSender<'_, D> { DocidsSender { sender: &self.0, _marker: PhantomData } } @@ -187,21 +187,45 @@ impl MainSender<'_> { } pub enum WordDocids {} +pub enum ExactWordDocids {} pub enum WordFidDocids {} +pub enum WordPositionDocids {} pub trait DatabaseType { - fn database() -> Database; + const DATABASE: Database; + + fn new_merger_operation(merger: Merger) -> MergerOperation; } impl DatabaseType for WordDocids { - fn database() -> Database { - Database::WordDocids + const DATABASE: Database = Database::WordDocids; + + fn new_merger_operation(merger: Merger) -> MergerOperation { + MergerOperation::WordDocidsMerger(merger) + } +} + +impl DatabaseType for ExactWordDocids { + const DATABASE: Database = Database::ExactWordDocids; + + fn new_merger_operation(merger: Merger) -> MergerOperation { + MergerOperation::ExactWordDocidsMerger(merger) } } impl DatabaseType for WordFidDocids { - fn database() -> Database { - Database::WordFidDocids + const DATABASE: Database = Database::WordFidDocids; + + fn new_merger_operation(merger: Merger) -> MergerOperation { + MergerOperation::WordFidDocidsMerger(merger) + } +} + +impl DatabaseType for WordPositionDocids { + const DATABASE: Database = Database::WordPositionDocids; + + fn 
new_merger_operation(merger: Merger) -> MergerOperation { + MergerOperation::WordPositionDocidsMerger(merger) } } @@ -213,7 +237,7 @@ pub struct DocidsSender<'a, D> { impl DocidsSender<'_, D> { pub fn write(&self, key: &[u8], value: &[u8]) -> StdResult<(), SendError<()>> { let entry = EntryOperation::Write(KeyValueEntry::from_key_value(key, value)); - match self.sender.send(WriterOperation { database: D::database(), entry }) { + match self.sender.send(WriterOperation { database: D::DATABASE, entry }) { Ok(()) => Ok(()), Err(SendError(_)) => Err(SendError(())), } @@ -221,7 +245,7 @@ impl DocidsSender<'_, D> { pub fn delete(&self, key: &[u8]) -> StdResult<(), SendError<()>> { let entry = EntryOperation::Delete(KeyEntry::from_key(key)); - match self.sender.send(WriterOperation { database: D::database(), entry }) { + match self.sender.send(WriterOperation { database: D::DATABASE, entry }) { Ok(()) => Ok(()), Err(SendError(_)) => Err(SendError(())), } @@ -258,7 +282,9 @@ impl DocumentsSender<'_> { pub enum MergerOperation { WordDocidsMerger(Merger), + ExactWordDocidsMerger(Merger), WordFidDocidsMerger(Merger), + WordPositionDocidsMerger(Merger), InsertDocument { docid: DocumentId, document: Box }, DeleteDocument { docid: DocumentId }, } @@ -295,23 +321,11 @@ impl ExtractorSender { } } - pub fn word_docids( + pub fn send_searchable( &self, merger: Merger, ) -> StdResult<(), SendError<()>> { - let operation = MergerOperation::WordDocidsMerger(merger); - match self.0.send(operation) { - Ok(()) => Ok(()), - Err(SendError(_)) => Err(SendError(())), - } - } - - pub fn word_fid_docids( - &self, - merger: Merger, - ) -> StdResult<(), SendError<()>> { - let operation = MergerOperation::WordFidDocidsMerger(merger); - match self.0.send(operation) { + match self.0.send(D::new_merger_operation(merger)) { Ok(()) => Ok(()), Err(SendError(_)) => Err(SendError(())), } diff --git a/milli/src/update/new/indexer/mod.rs b/milli/src/update/new/indexer/mod.rs index 539b6d602..3b1fc97c5 100644 --- a/milli/src/update/new/indexer/mod.rs +++ b/milli/src/update/new/indexer/mod.rs @@ -11,14 +11,21 @@ use rayon::iter::{IntoParallelIterator, ParallelIterator}; use rayon::ThreadPool; pub use update_by_function::UpdateByFunction; -use super::channel::{extractors_merger_channels, merger_writer_channel, EntryOperation}; +use super::channel::{ + extractors_merger_channels, merger_writer_channel, EntryOperation, ExactWordDocids, WordDocids, + WordFidDocids, WordPositionDocids, +}; use super::document_change::DocumentChange; -use super::extract::{SearchableExtractor, WordDocidsExtractor, WordFidDocidsExtractor}; +use super::extract::{ + ExactWordDocidsExtractor, SearchableExtractor, WordDocidsExtractor, WordFidDocidsExtractor, + WordPositionDocidsExtractor, +}; use super::merger::merge_grenad_entries; use super::StdResult; use crate::documents::{ obkv_to_object, DocumentsBatchCursor, DocumentsBatchIndex, PrimaryKey, DEFAULT_PRIMARY_KEY, }; +use crate::update::new::channel::{DatabaseType, ExtractorSender}; use crate::update::GrenadParameters; use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, Result, UserError}; @@ -82,36 +89,43 @@ where let docid = insertion.docid(); let content = insertion.new(); extractor_sender.document_insert(docid, content.boxed()).unwrap(); - // extracted_dictionary_sender.send(self, dictionary: &[u8]); } } Ok(()) as Result<_> })?; - // word docids - let merger = WordDocidsExtractor::run_extraction( + extract_and_send_docids::( index, &global_fields_ids_map, - /// TODO: GrenadParameters::default() should be 
removed in favor a passed parameter GrenadParameters::default(), document_changes.clone(), + &extractor_sender, )?; - /// TODO: manage the errors correctly - extractor_sender.word_docids(merger).unwrap(); - - // word fid docids - let merger = WordFidDocidsExtractor::run_extraction( + extract_and_send_docids::( index, &global_fields_ids_map, - /// TODO: GrenadParameters::default() should be removed in favor a passed parameter GrenadParameters::default(), document_changes.clone(), + &extractor_sender, )?; - /// TODO: manage the errors correctly - extractor_sender.word_fid_docids(merger).unwrap(); + extract_and_send_docids::( + index, + &global_fields_ids_map, + GrenadParameters::default(), + document_changes.clone(), + &extractor_sender, + )?; + + extract_and_send_docids::( + index, + &global_fields_ids_map, + GrenadParameters::default(), + document_changes.clone(), + &extractor_sender, + )?; Ok(()) as Result<_> }) @@ -148,6 +162,20 @@ where Ok(()) } +/// TODO: GrenadParameters::default() should be removed in favor a passed parameter +/// TODO: manage the errors correctly +/// TODO: we must have a single trait that also gives the extractor type +fn extract_and_send_docids( + index: &Index, + fields_ids_map: &GlobalFieldsIdsMap, + indexer: GrenadParameters, + document_changes: impl IntoParallelIterator>, + sender: &ExtractorSender, +) -> Result<()> { + let merger = E::run_extraction(index, fields_ids_map, indexer, document_changes)?; + Ok(sender.send_searchable::(merger).unwrap()) +} + /// TODO move this elsewhere pub fn guess_primary_key<'a>( rtxn: &'a RoTxn<'a>, diff --git a/milli/src/update/new/merger.rs b/milli/src/update/new/merger.rs index c7f1a4385..976fe435f 100644 --- a/milli/src/update/new/merger.rs +++ b/milli/src/update/new/merger.rs @@ -1,16 +1,24 @@ +use std::fs::File; +use std::io; + use fst::set::OpBuilder; use fst::{Set, SetBuilder}; +use grenad::Merger; use heed::types::Bytes; -use heed::RoTxn; +use heed::{Database, RoTxn}; use memmap2::Mmap; use roaring::RoaringBitmap; use tempfile::tempfile; -use super::channel::{MergerReceiver, MergerSender}; +use super::channel::{ + DatabaseType, DocidsSender, ExactWordDocids, MergerReceiver, MergerSender, WordDocids, + WordFidDocids, WordPositionDocids, +}; use super::KvReaderDelAdd; use crate::index::main_key::WORDS_FST_KEY; use crate::update::del_add::DelAdd; use crate::update::new::channel::MergerOperation; +use crate::update::MergeDeladdCboRoaringBitmaps; use crate::{CboRoaringBitmapCodec, Index, Result}; /// TODO We must return some infos/stats @@ -26,34 +34,18 @@ pub fn merge_grenad_entries( for merger_operation in receiver { match merger_operation { MergerOperation::WordDocidsMerger(merger) => { - let word_docids_sender = sender.word_docids(); - let database = index.word_docids.remap_types::(); let mut add_words_fst = SetBuilder::new(tempfile()?)?; let mut del_words_fst = SetBuilder::new(tempfile()?)?; - /// TODO manage the error correctly - let mut merger_iter = merger.into_stream_merger_iter().unwrap(); - - // TODO manage the error correctly - while let Some((key, deladd)) = merger_iter.next().unwrap() { - let current = database.get(rtxn, key)?; - let deladd: &KvReaderDelAdd = deladd.into(); - let del = deladd.get(DelAdd::Deletion); - let add = deladd.get(DelAdd::Addition); - - match merge_cbo_bitmaps(current, del, add)? 
{ - Operation::Write(bitmap) => { - let value = cbo_bitmap_serialize_into_vec(&bitmap, &mut buffer); - word_docids_sender.write(key, value).unwrap(); - add_words_fst.insert(key)?; - } - Operation::Delete => { - word_docids_sender.delete(key).unwrap(); - del_words_fst.insert(key)?; - } - Operation::Ignore => (), - } - } + merge_and_send_docids( + merger, + index.word_docids.remap_types(), + rtxn, + &mut buffer, + sender.docids::(), + |key| add_words_fst.insert(key), + |key| del_words_fst.insert(key), + )?; // Move that into a dedicated function let words_fst = index.words_fst(rtxn)?; @@ -66,7 +58,6 @@ pub fn merge_grenad_entries( let del_words_fst_mmap = unsafe { Mmap::map(&del_words_fst_file)? }; let del_words_fst = Set::new(&del_words_fst_mmap)?; - // TO BE IMPROVED @many let diff = words_fst.op().add(&del_words_fst).difference(); let stream = add_words_fst.op().add(diff).union(); @@ -79,31 +70,38 @@ pub fn merge_grenad_entries( let main_sender = sender.main(); main_sender.write_words_fst(&words_fst_mmap).unwrap(); } + MergerOperation::ExactWordDocidsMerger(merger) => { + merge_and_send_docids( + merger, + index.exact_word_docids.remap_types(), + rtxn, + &mut buffer, + sender.docids::(), + |_key| Ok(()), + |_key| Ok(()), + )?; + } MergerOperation::WordFidDocidsMerger(merger) => { - let word_docids_sender = sender.word_fid_docids(); - let database = index.word_fid_docids.remap_types::(); - - /// TODO manage the error correctly - let mut merger_iter = merger.into_stream_merger_iter().unwrap(); - - // TODO manage the error correctly - while let Some((key, deladd)) = merger_iter.next().unwrap() { - let current = database.get(rtxn, key)?; - let deladd: &KvReaderDelAdd = deladd.into(); - let del = deladd.get(DelAdd::Deletion); - let add = deladd.get(DelAdd::Addition); - - match merge_cbo_bitmaps(current, del, add)? { - Operation::Write(bitmap) => { - let value = cbo_bitmap_serialize_into_vec(&bitmap, &mut buffer); - word_docids_sender.write(key, value).unwrap(); - } - Operation::Delete => { - word_docids_sender.delete(key).unwrap(); - } - Operation::Ignore => (), - } - } + merge_and_send_docids( + merger, + index.word_fid_docids.remap_types(), + rtxn, + &mut buffer, + sender.docids::(), + |_key| Ok(()), + |_key| Ok(()), + )?; + } + MergerOperation::WordPositionDocidsMerger(merger) => { + merge_and_send_docids( + merger, + index.word_position_docids.remap_types(), + rtxn, + &mut buffer, + sender.docids::(), + |_key| Ok(()), + |_key| Ok(()), + )?; } MergerOperation::InsertDocument { docid, document } => { documents_ids.insert(docid); @@ -128,6 +126,39 @@ pub fn merge_grenad_entries( Ok(()) } +fn merge_and_send_docids( + merger: Merger, + database: Database, + rtxn: &RoTxn<'_>, + buffer: &mut Vec, + word_docids_sender: DocidsSender<'_, D>, + mut add_key: impl FnMut(&[u8]) -> fst::Result<()>, + mut del_key: impl FnMut(&[u8]) -> fst::Result<()>, +) -> Result<()> { + let mut merger_iter = merger.into_stream_merger_iter().unwrap(); + while let Some((key, deladd)) = merger_iter.next().unwrap() { + let current = database.get(rtxn, key)?; + let deladd: &KvReaderDelAdd = deladd.into(); + let del = deladd.get(DelAdd::Deletion); + let add = deladd.get(DelAdd::Addition); + + match merge_cbo_bitmaps(current, del, add)? 
{ + Operation::Write(bitmap) => { + let value = cbo_bitmap_serialize_into_vec(&bitmap, buffer); + word_docids_sender.write(key, value).unwrap(); + add_key(key)?; + } + Operation::Delete => { + word_docids_sender.delete(key).unwrap(); + del_key(key)?; + } + Operation::Ignore => (), + } + } + + Ok(()) +} + enum Operation { Write(RoaringBitmap), Delete, From 1d59c19cd2fa06fa5fd822acbaa4a352f9ad219f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 4 Sep 2024 14:30:09 +0200 Subject: [PATCH 035/247] Send the WordsFst by using an Mmap --- milli/src/update/new/channel.rs | 40 ++++++++++++++++++---------- milli/src/update/new/merger.rs | 46 +++++++++++++++++++-------------- 2 files changed, 52 insertions(+), 34 deletions(-) diff --git a/milli/src/update/new/channel.rs b/milli/src/update/new/channel.rs index 8888132e3..e9a795bf5 100644 --- a/milli/src/update/new/channel.rs +++ b/milli/src/update/new/channel.rs @@ -4,12 +4,13 @@ use std::marker::PhantomData; use crossbeam_channel::{IntoIter, Receiver, SendError, Sender}; use grenad::Merger; use heed::types::Bytes; +use memmap2::Mmap; use super::StdResult; use crate::index::main_key::{DOCUMENTS_IDS_KEY, WORDS_FST_KEY}; use crate::update::new::KvReaderFieldId; use crate::update::MergeDeladdCboRoaringBitmaps; -use crate::{CboRoaringBitmapCodec, DocumentId, Index}; +use crate::{DocumentId, Index}; /// The capacity of the channel is currently in number of messages. pub fn merger_writer_channel(cap: usize) -> (MergerSender, WriterReceiver) { @@ -23,26 +24,35 @@ pub fn extractors_merger_channels(cap: usize) -> (ExtractorSender, MergerReceive (ExtractorSender(sender), MergerReceiver(receiver)) } -pub struct KeyValueEntry { - key_length: usize, - data: Box<[u8]>, +pub enum KeyValueEntry { + SmallInMemory { key_length: usize, data: Box<[u8]> }, + LargeOnDisk { key: Box<[u8]>, value: Mmap }, } impl KeyValueEntry { - pub fn from_key_value(key: &[u8], value: &[u8]) -> Self { + pub fn from_small_key_value(key: &[u8], value: &[u8]) -> Self { let mut data = Vec::with_capacity(key.len() + value.len()); data.extend_from_slice(key); data.extend_from_slice(value); + KeyValueEntry::SmallInMemory { key_length: key.len(), data: data.into_boxed_slice() } + } - KeyValueEntry { key_length: key.len(), data: data.into_boxed_slice() } + pub fn from_large_key_value(key: &[u8], value: Mmap) -> Self { + KeyValueEntry::LargeOnDisk { key: key.to_vec().into_boxed_slice(), value } } pub fn key(&self) -> &[u8] { - &self.data.as_ref()[..self.key_length] + match self { + KeyValueEntry::SmallInMemory { key_length, data } => &data.as_ref()[..*key_length], + KeyValueEntry::LargeOnDisk { key, value: _ } => key.as_ref(), + } } pub fn value(&self) -> &[u8] { - &self.data.as_ref()[self.key_length..] 
+ match self { + KeyValueEntry::SmallInMemory { key_length, data } => &data.as_ref()[*key_length..], + KeyValueEntry::LargeOnDisk { key: _, value } => value.as_ref(), + } } } @@ -154,7 +164,7 @@ impl MergerSender { } pub fn send_documents_ids(&self, bitmap: &[u8]) -> StdResult<(), SendError<()>> { - let entry = EntryOperation::Write(KeyValueEntry::from_key_value( + let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value( DOCUMENTS_IDS_KEY.as_bytes(), bitmap, )); @@ -168,9 +178,11 @@ impl MergerSender { pub struct MainSender<'a>(&'a Sender); impl MainSender<'_> { - pub fn write_words_fst(&self, value: &[u8]) -> StdResult<(), SendError<()>> { - let entry = - EntryOperation::Write(KeyValueEntry::from_key_value(WORDS_FST_KEY.as_bytes(), value)); + pub fn write_words_fst(&self, value: Mmap) -> StdResult<(), SendError<()>> { + let entry = EntryOperation::Write(KeyValueEntry::from_large_key_value( + WORDS_FST_KEY.as_bytes(), + value, + )); match self.0.send(WriterOperation { database: Database::Main, entry }) { Ok(()) => Ok(()), Err(SendError(_)) => Err(SendError(())), @@ -236,7 +248,7 @@ pub struct DocidsSender<'a, D> { impl DocidsSender<'_, D> { pub fn write(&self, key: &[u8], value: &[u8]) -> StdResult<(), SendError<()>> { - let entry = EntryOperation::Write(KeyValueEntry::from_key_value(key, value)); + let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value(key, value)); match self.sender.send(WriterOperation { database: D::DATABASE, entry }) { Ok(()) => Ok(()), Err(SendError(_)) => Err(SendError(())), @@ -261,7 +273,7 @@ impl DocumentsSender<'_> { docid: DocumentId, document: &KvReaderFieldId, ) -> StdResult<(), SendError<()>> { - let entry = EntryOperation::Write(KeyValueEntry::from_key_value( + let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value( &docid.to_be_bytes(), document.as_bytes(), )); diff --git a/milli/src/update/new/merger.rs b/milli/src/update/new/merger.rs index 976fe435f..35449b475 100644 --- a/milli/src/update/new/merger.rs +++ b/milli/src/update/new/merger.rs @@ -49,26 +49,8 @@ pub fn merge_grenad_entries( // Move that into a dedicated function let words_fst = index.words_fst(rtxn)?; - - let add_words_fst_file = add_words_fst.into_inner()?; - let add_words_fst_mmap = unsafe { Mmap::map(&add_words_fst_file)? }; - let add_words_fst = Set::new(&add_words_fst_mmap)?; - - let del_words_fst_file = del_words_fst.into_inner()?; - let del_words_fst_mmap = unsafe { Mmap::map(&del_words_fst_file)? }; - let del_words_fst = Set::new(&del_words_fst_mmap)?; - - let diff = words_fst.op().add(&del_words_fst).difference(); - let stream = add_words_fst.op().add(diff).union(); - - let mut words_fst = SetBuilder::new(tempfile()?)?; - words_fst.extend_stream(stream)?; - let words_fst_file = words_fst.into_inner()?; - let words_fst_mmap = unsafe { Mmap::map(&words_fst_file)? }; - - // PLEASE SEND THIS AS AN MMAP - let main_sender = sender.main(); - main_sender.write_words_fst(&words_fst_mmap).unwrap(); + let mmap = compute_new_words_fst(add_words_fst, del_words_fst, words_fst)?; + sender.main().write_words_fst(mmap).unwrap(); } MergerOperation::ExactWordDocidsMerger(merger) => { merge_and_send_docids( @@ -126,6 +108,30 @@ pub fn merge_grenad_entries( Ok(()) } +fn compute_new_words_fst( + add_words_fst: SetBuilder, + del_words_fst: SetBuilder, + words_fst: Set>, +) -> Result { + let add_words_fst_file = add_words_fst.into_inner()?; + let add_words_fst_mmap = unsafe { Mmap::map(&add_words_fst_file)? 
}; + let add_words_fst = Set::new(&add_words_fst_mmap)?; + + let del_words_fst_file = del_words_fst.into_inner()?; + let del_words_fst_mmap = unsafe { Mmap::map(&del_words_fst_file)? }; + let del_words_fst = Set::new(&del_words_fst_mmap)?; + + let diff = words_fst.op().add(&del_words_fst).difference(); + let stream = add_words_fst.op().add(diff).union(); + + let mut words_fst = SetBuilder::new(tempfile()?)?; + words_fst.extend_stream(stream)?; + let words_fst_file = words_fst.into_inner()?; + let words_fst_mmap = unsafe { Mmap::map(&words_fst_file)? }; + + Ok(words_fst_mmap) +} + fn merge_and_send_docids( merger: Merger, database: Database, From 19d937ab2191d8d8b796e28a051085de8d6391b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 4 Sep 2024 17:03:09 +0200 Subject: [PATCH 036/247] Introduce the facet extractors --- milli/src/update/new/document_change.rs | 1 + milli/src/update/new/extract/cache.rs | 12 +- milli/src/update/new/extract/faceted/mod.rs | 271 ++++++++++++++++++ milli/src/update/new/extract/mod.rs | 107 +++++++ .../extract/searchable/extract_word_docids.rs | 1 + .../src/update/new/extract/searchable/mod.rs | 24 +- .../extract/searchable/tokenize_document.rs | 148 ++-------- milli/src/update/new/indexer/mod.rs | 13 + milli/src/update/new/merger.rs | 5 +- 9 files changed, 443 insertions(+), 139 deletions(-) create mode 100644 milli/src/update/new/extract/faceted/mod.rs diff --git a/milli/src/update/new/document_change.rs b/milli/src/update/new/document_change.rs index b4eb4d1d2..aa37593c9 100644 --- a/milli/src/update/new/document_change.rs +++ b/milli/src/update/new/document_change.rs @@ -52,6 +52,7 @@ impl Deletion { self.docid } + // TODO shouldn't we use the one in self? pub fn current<'a>( &self, rtxn: &'a RoTxn, diff --git a/milli/src/update/new/extract/cache.rs b/milli/src/update/new/extract/cache.rs index 684b67daa..5c3c4a735 100644 --- a/milli/src/update/new/extract/cache.rs +++ b/milli/src/update/new/extract/cache.rs @@ -1,5 +1,5 @@ -use std::num::NonZeroUsize; use std::mem; +use std::num::NonZeroUsize; use grenad::{MergeFunction, Sorter}; use lru::LruCache; @@ -10,16 +10,16 @@ use crate::update::del_add::{DelAdd, KvWriterDelAdd}; use crate::CboRoaringBitmapCodec; #[derive(Debug)] -pub struct CachedSorter { +pub struct CboCachedSorter { cache: lru::LruCache, DelAddRoaringBitmap>, sorter: Sorter, deladd_buffer: Vec, cbo_buffer: Vec, } -impl CachedSorter { +impl CboCachedSorter { pub fn new(cap: NonZeroUsize, sorter: Sorter) -> Self { - CachedSorter { + CboCachedSorter { cache: lru::LruCache::new(cap), sorter, deladd_buffer: Vec::new(), @@ -28,7 +28,7 @@ impl CachedSorter { } } -impl CachedSorter { +impl CboCachedSorter { pub fn insert_del_u32(&mut self, key: &[u8], n: u32) -> grenad::Result<(), MF::Error> { match self.cache.get_mut(key) { Some(DelAddRoaringBitmap { del, add: _ }) => { @@ -194,4 +194,4 @@ impl DelAddRoaringBitmap { fn new_add_u32(n: u32) -> Self { DelAddRoaringBitmap { del: None, add: Some(RoaringBitmap::from([n])) } } -} \ No newline at end of file +} diff --git a/milli/src/update/new/extract/faceted/mod.rs b/milli/src/update/new/extract/faceted/mod.rs new file mode 100644 index 000000000..b230549c1 --- /dev/null +++ b/milli/src/update/new/extract/faceted/mod.rs @@ -0,0 +1,271 @@ +use std::collections::HashSet; +use std::fs::File; + +use grenad::Merger; +use heed::RoTxn; +use rayon::iter::{IntoParallelIterator, ParallelIterator}; +use serde_json::Value; + +use super::cache::CboCachedSorter; +use super::perm_json_p; +use 
crate::facet::value_encoding::f64_into_bytes; +use crate::update::new::{DocumentChange, ItemsPool, KvReaderFieldId}; +use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps}; +use crate::{ + normalize_facet, FieldId, GlobalFieldsIdsMap, Index, InternalError, Result, UserError, + MAX_FACET_VALUE_LENGTH, +}; + +pub trait FacetedExtractor { + fn run_extraction( + index: &Index, + fields_ids_map: &GlobalFieldsIdsMap, + indexer: GrenadParameters, + document_changes: impl IntoParallelIterator>, + ) -> Result> { + let max_memory = indexer.max_memory_by_thread(); + + let rtxn = index.read_txn()?; + let attributes_to_extract = Self::attributes_to_extract(&rtxn, index)?; + let attributes_to_extract: Vec<_> = + attributes_to_extract.iter().map(|s| s.as_ref()).collect(); + + let context_pool = ItemsPool::new(|| { + Ok(( + index.read_txn()?, + fields_ids_map.clone(), + Vec::new(), + CboCachedSorter::new( + // TODO use a better value + 100.try_into().unwrap(), + create_sorter( + grenad::SortAlgorithm::Stable, + MergeDeladdCboRoaringBitmaps, + indexer.chunk_compression_type, + indexer.chunk_compression_level, + indexer.max_nb_chunks, + max_memory, + ), + ), + )) + }); + + document_changes.into_par_iter().try_for_each(|document_change| { + context_pool.with(|(rtxn, fields_ids_map, buffer, cached_sorter)| { + Self::extract_document_change( + &*rtxn, + index, + buffer, + fields_ids_map, + &attributes_to_extract, + cached_sorter, + document_change?, + ) + }) + })?; + + let mut builder = grenad::MergerBuilder::new(MergeDeladdCboRoaringBitmaps); + for (_rtxn, _fields_ids_map, _buffer, cache) in context_pool.into_items() { + let sorter = cache.into_sorter()?; + let readers = sorter.into_reader_cursors()?; + builder.extend(readers); + } + + Ok(builder.build()) + } + + fn extract_document_change( + rtxn: &RoTxn, + index: &Index, + buffer: &mut Vec, + fields_ids_map: &mut GlobalFieldsIdsMap, + attributes_to_extract: &[&str], + cached_sorter: &mut CboCachedSorter, + document_change: DocumentChange, + ) -> Result<()> { + match document_change { + DocumentChange::Deletion(inner) => { + let mut facet_del_fn = |fid, value: &Value| -> Result<()> { + buffer.clear(); + match Self::build_key(fid, value, buffer) { + // TODO manage errors + Some(key) => Ok(cached_sorter.insert_del_u32(&key, inner.docid()).unwrap()), + None => Ok(()), + } + }; + + extract_document_facets( + attributes_to_extract, + inner.current(rtxn, index)?.unwrap(), + fields_ids_map, + &mut facet_del_fn, + ) + } + DocumentChange::Update(inner) => { + let mut facet_del_fn = |fid, value: &Value| -> Result<()> { + buffer.clear(); + match Self::build_key(fid, value, buffer) { + // TODO manage errors + Some(key) => Ok(cached_sorter.insert_del_u32(&key, inner.docid()).unwrap()), + None => Ok(()), + } + }; + + extract_document_facets( + attributes_to_extract, + inner.current(rtxn, index)?.unwrap(), + fields_ids_map, + &mut facet_del_fn, + )?; + + let mut facet_add_fn = |fid, value: &Value| -> Result<()> { + buffer.clear(); + match Self::build_key(fid, value, buffer) { + // TODO manage errors + Some(key) => Ok(cached_sorter.insert_add_u32(&key, inner.docid()).unwrap()), + None => Ok(()), + } + }; + + extract_document_facets( + attributes_to_extract, + inner.new(), + fields_ids_map, + &mut facet_add_fn, + ) + } + DocumentChange::Insertion(inner) => { + let mut facet_add_fn = |fid, value: &Value| -> Result<()> { + buffer.clear(); + match Self::build_key(fid, value, buffer) { + // TODO manage errors + Some(key) => 
Ok(cached_sorter.insert_add_u32(&key, inner.docid()).unwrap()), + None => Ok(()), + } + }; + + extract_document_facets( + attributes_to_extract, + inner.new(), + fields_ids_map, + &mut facet_add_fn, + ) + } + } + } + + // TODO avoid owning the strings here. + fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result>; + + fn build_key<'b>(field_id: FieldId, value: &Value, output: &'b mut Vec) + -> Option<&'b [u8]>; +} + +pub struct FieldIdFacetNumberDocidsExtractor; +impl FacetedExtractor for FieldIdFacetNumberDocidsExtractor { + fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result> { + index.user_defined_faceted_fields(rtxn) + } + + fn build_key<'b>( + field_id: FieldId, + value: &Value, + output: &'b mut Vec, + ) -> Option<&'b [u8]> { + let number = value.as_number()?; + let n = number.as_f64()?; + let ordered = f64_into_bytes(n)?; + + // fid - level - orderedf64 - orignalf64 + output.extend_from_slice(&field_id.to_be_bytes()); + output.push(1); // level 0 + output.extend_from_slice(&ordered); + output.extend_from_slice(&n.to_be_bytes()); + + Some(&*output) + } +} + +/// TODO It doesn't keep the original string in the value +pub struct FieldIdFacetStringDocidsExtractor; +impl FacetedExtractor for FieldIdFacetStringDocidsExtractor { + fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result> { + index.user_defined_faceted_fields(rtxn) + } + + fn build_key<'b>( + field_id: FieldId, + value: &Value, + output: &'b mut Vec, + ) -> Option<&'b [u8]> { + let string = value.as_str()?; + let normalize = normalize_facet(string); + let truncated = truncate_str(&normalize); + + // fid - level - normalized string + output.extend_from_slice(&field_id.to_be_bytes()); + output.push(1); // level 0 + output.extend_from_slice(truncated.as_bytes()); + + Some(&*output) + } +} + +pub fn extract_document_facets( + attributes_to_extract: &[&str], + obkv: &KvReaderFieldId, + field_id_map: &mut GlobalFieldsIdsMap, + facet_fn: &mut impl FnMut(FieldId, &Value) -> Result<()>, +) -> Result<()> { + let mut field_name = String::new(); + for (field_id, field_bytes) in obkv { + let Some(field_name) = field_id_map.name(field_id).map(|s| { + field_name.clear(); + field_name.push_str(s); + &field_name + }) else { + unreachable!("field id not found in field id map"); + }; + + let mut tokenize_field = |name: &str, value: &Value| match field_id_map.id_or_insert(name) { + Some(field_id) => facet_fn(field_id, value), + None => Err(UserError::AttributeLimitReached.into()), + }; + + // if the current field is searchable or contains a searchable attribute + if perm_json_p::select_field(field_name, Some(attributes_to_extract), &[]) { + // parse json. + match serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)? { + Value::Object(object) => perm_json_p::seek_leaf_values_in_object( + &object, + Some(attributes_to_extract), + &[], // skip no attributes + field_name, + &mut tokenize_field, + )?, + Value::Array(array) => perm_json_p::seek_leaf_values_in_array( + &array, + Some(attributes_to_extract), + &[], // skip no attributes + field_name, + &mut tokenize_field, + )?, + value => tokenize_field(field_name, &value)?, + } + } + } + + Ok(()) +} + +/// Truncates a string to the biggest valid LMDB key size. 
+fn truncate_str(s: &str) -> &str { + let index = s + .char_indices() + .map(|(idx, _)| idx) + .chain(std::iter::once(s.len())) + .take_while(|idx| idx <= &MAX_FACET_VALUE_LENGTH) + .last(); + + &s[..index.unwrap_or(0)] +} diff --git a/milli/src/update/new/extract/mod.rs b/milli/src/update/new/extract/mod.rs index 1964b88fc..fee4f42f6 100644 --- a/milli/src/update/new/extract/mod.rs +++ b/milli/src/update/new/extract/mod.rs @@ -1,7 +1,114 @@ mod cache; +mod faceted; mod searchable; +pub use faceted::FacetedExtractor; pub use searchable::{ ExactWordDocidsExtractor, SearchableExtractor, WordDocidsExtractor, WordFidDocidsExtractor, WordPositionDocidsExtractor, }; + +/// TODO move in permissive json pointer +pub mod perm_json_p { + use serde_json::{Map, Value}; + + use crate::Result; + const SPLIT_SYMBOL: char = '.'; + + /// Returns `true` if the `selector` match the `key`. + /// + /// ```text + /// Example: + /// `animaux` match `animaux` + /// `animaux.chien` match `animaux` + /// `animaux.chien` match `animaux` + /// `animaux.chien.nom` match `animaux` + /// `animaux.chien.nom` match `animaux.chien` + /// ----------------------------------------- + /// `animaux` doesn't match `animaux.chien` + /// `animaux.` doesn't match `animaux` + /// `animaux.ch` doesn't match `animaux.chien` + /// `animau` doesn't match `animaux` + /// ``` + pub fn contained_in(selector: &str, key: &str) -> bool { + selector.starts_with(key) + && selector[key.len()..].chars().next().map(|c| c == SPLIT_SYMBOL).unwrap_or(true) + } + + pub fn seek_leaf_values_in_object( + value: &Map, + selectors: Option<&[&str]>, + skip_selectors: &[&str], + base_key: &str, + seeker: &mut impl FnMut(&str, &Value) -> Result<()>, + ) -> Result<()> { + for (key, value) in value.iter() { + let base_key = if base_key.is_empty() { + key.to_string() + } else { + format!("{}{}{}", base_key, SPLIT_SYMBOL, key) + }; + + // here if the user only specified `doggo` we need to iterate in all the fields of `doggo` + // so we check the contained_in on both side + let should_continue = select_field(&base_key, selectors, skip_selectors); + if should_continue { + match value { + Value::Object(object) => seek_leaf_values_in_object( + object, + selectors, + skip_selectors, + &base_key, + seeker, + ), + Value::Array(array) => seek_leaf_values_in_array( + array, + selectors, + skip_selectors, + &base_key, + seeker, + ), + value => seeker(&base_key, value), + }?; + } + } + + Ok(()) + } + + pub fn seek_leaf_values_in_array( + values: &[Value], + selectors: Option<&[&str]>, + skip_selectors: &[&str], + base_key: &str, + seeker: &mut impl FnMut(&str, &Value) -> Result<()>, + ) -> Result<()> { + for value in values { + match value { + Value::Object(object) => { + seek_leaf_values_in_object(object, selectors, skip_selectors, base_key, seeker) + } + Value::Array(array) => { + seek_leaf_values_in_array(array, selectors, skip_selectors, base_key, seeker) + } + value => seeker(base_key, value), + }?; + } + + Ok(()) + } + + pub fn select_field( + field_name: &str, + selectors: Option<&[&str]>, + skip_selectors: &[&str], + ) -> bool { + selectors.map_or(true, |selectors| { + selectors.iter().any(|selector| { + contained_in(selector, &field_name) || contained_in(&field_name, selector) + }) + }) && !skip_selectors.iter().any(|skip_selector| { + contained_in(skip_selector, &field_name) || contained_in(&field_name, skip_selector) + }) + } +} diff --git a/milli/src/update/new/extract/searchable/extract_word_docids.rs 
b/milli/src/update/new/extract/searchable/extract_word_docids.rs index f8b495538..70f9c4e47 100644 --- a/milli/src/update/new/extract/searchable/extract_word_docids.rs +++ b/milli/src/update/new/extract/searchable/extract_word_docids.rs @@ -19,6 +19,7 @@ impl SearchableExtractor for WordDocidsExtractor { index.exact_attributes(rtxn).map_err(Into::into) } + /// TODO write in an external Vec buffer fn build_key<'a>(_field_id: FieldId, _position: u16, word: &'a str) -> Cow<'a, [u8]> { Cow::Borrowed(word.as_bytes()) } diff --git a/milli/src/update/new/extract/searchable/mod.rs b/milli/src/update/new/extract/searchable/mod.rs index 106455a7b..078d06150 100644 --- a/milli/src/update/new/extract/searchable/mod.rs +++ b/milli/src/update/new/extract/searchable/mod.rs @@ -1,22 +1,22 @@ mod extract_word_docids; mod tokenize_document; +use std::borrow::Cow; +use std::fs::File; + pub use extract_word_docids::{ ExactWordDocidsExtractor, WordDocidsExtractor, WordFidDocidsExtractor, WordPositionDocidsExtractor, }; -use std::borrow::Cow; -use std::fs::File; - use grenad::Merger; use heed::RoTxn; use rayon::iter::{IntoParallelIterator, ParallelIterator}; +use tokenize_document::{tokenizer_builder, DocumentTokenizer}; -use super::cache::CachedSorter; +use super::cache::CboCachedSorter; use crate::update::new::{DocumentChange, ItemsPool}; use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps}; use crate::{FieldId, GlobalFieldsIdsMap, Index, Result, MAX_POSITION_PER_ATTRIBUTE}; -use tokenize_document::{tokenizer_builder, DocumentTokenizer}; pub trait SearchableExtractor { fn run_extraction( @@ -60,7 +60,7 @@ pub trait SearchableExtractor { index.read_txn()?, &document_tokenizer, fields_ids_map.clone(), - CachedSorter::new( + CboCachedSorter::new( // TODO use a better value 100.try_into().unwrap(), create_sorter( @@ -103,14 +103,16 @@ pub trait SearchableExtractor { index: &Index, document_tokenizer: &DocumentTokenizer, fields_ids_map: &mut GlobalFieldsIdsMap, - cached_sorter: &mut CachedSorter, + cached_sorter: &mut CboCachedSorter, document_change: DocumentChange, ) -> Result<()> { match document_change { DocumentChange::Deletion(inner) => { let mut token_fn = |fid, pos: u16, word: &str| { let key = Self::build_key(fid, pos, word); + /// TODO manage the error cached_sorter.insert_del_u32(&key, inner.docid()).unwrap(); + Ok(()) }; document_tokenizer.tokenize_document( inner.current(rtxn, index)?.unwrap(), @@ -121,7 +123,9 @@ pub trait SearchableExtractor { DocumentChange::Update(inner) => { let mut token_fn = |fid, pos, word: &str| { let key = Self::build_key(fid, pos, word); + /// TODO manage the error cached_sorter.insert_del_u32(&key, inner.docid()).unwrap(); + Ok(()) }; document_tokenizer.tokenize_document( inner.current(rtxn, index)?.unwrap(), @@ -131,14 +135,18 @@ pub trait SearchableExtractor { let mut token_fn = |fid, pos, word: &str| { let key = Self::build_key(fid, pos, word); + /// TODO manage the error cached_sorter.insert_add_u32(&key, inner.docid()).unwrap(); + Ok(()) }; document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?; } DocumentChange::Insertion(inner) => { let mut token_fn = |fid, pos, word: &str| { let key = Self::build_key(fid, pos, word); + /// TODO manage the error cached_sorter.insert_add_u32(&key, inner.docid()).unwrap(); + Ok(()) }; document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?; } @@ -152,5 +160,5 @@ pub trait SearchableExtractor { fn attributes_to_skip<'a>(rtxn: &'a RoTxn, index: &'a Index) 
-> Result>; - fn build_key<'a>(field_id: FieldId, position: u16, word: &'a str) -> Cow<'a, [u8]>; + fn build_key(field_id: FieldId, position: u16, word: &str) -> Cow<'_, [u8]>; } diff --git a/milli/src/update/new/extract/searchable/tokenize_document.rs b/milli/src/update/new/extract/searchable/tokenize_document.rs index e20e52406..1d19354db 100644 --- a/milli/src/update/new/extract/searchable/tokenize_document.rs +++ b/milli/src/update/new/extract/searchable/tokenize_document.rs @@ -1,13 +1,15 @@ use std::collections::HashMap; use charabia::{SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder}; -use heed::RoTxn; use serde_json::Value; +use crate::update::new::extract::perm_json_p::{ + seek_leaf_values_in_array, seek_leaf_values_in_object, select_field, +}; use crate::update::new::KvReaderFieldId; use crate::{ - FieldId, FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, LocalizedAttributesRule, - Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH, + FieldId, GlobalFieldsIdsMap, InternalError, LocalizedAttributesRule, Result, UserError, + MAX_WORD_LENGTH, }; pub struct DocumentTokenizer<'a> { @@ -23,7 +25,7 @@ impl<'a> DocumentTokenizer<'a> { &self, obkv: &KvReaderFieldId, field_id_map: &mut GlobalFieldsIdsMap, - token_fn: &mut impl FnMut(FieldId, u16, &str), + token_fn: &mut impl FnMut(FieldId, u16, &str) -> Result<()>, ) -> Result<()> { let mut field_position = HashMap::new(); let mut field_name = String::new(); @@ -38,22 +40,23 @@ impl<'a> DocumentTokenizer<'a> { let mut tokenize_field = |name: &str, value: &Value| { let Some(field_id) = field_id_map.id_or_insert(name) else { - /// TODO: better error - panic!("it's over 9000"); + return Err(UserError::AttributeLimitReached.into()); }; let position = field_position.entry(field_id).and_modify(|counter| *counter += 8).or_insert(0); if *position as u32 >= self.max_positions_per_attributes { - return; + return Ok(()); } match value { Value::Number(n) => { let token = n.to_string(); if let Ok(position) = (*position).try_into() { - token_fn(field_id, position, token.as_str()); + token_fn(field_id, position, token.as_str())?; } + + Ok(()) } Value::String(text) => { // create an iterator of token with their positions. @@ -74,41 +77,40 @@ impl<'a> DocumentTokenizer<'a> { if !token.is_empty() && token.len() <= MAX_WORD_LENGTH { *position = index; if let Ok(position) = (*position).try_into() { - token_fn(field_id, position, token); + token_fn(field_id, position, token)?; } } } + + Ok(()) } - _ => (), + _ => Ok(()), } }; // if the current field is searchable or contains a searchable attribute - if perm_json_p::select_field( - &field_name, - self.attribute_to_extract.as_deref(), - self.attribute_to_skip, - ) { + if select_field(&field_name, self.attribute_to_extract, self.attribute_to_skip) { // parse json. match serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)? 
{ - Value::Object(object) => perm_json_p::seek_leaf_values_in_object( + Value::Object(object) => seek_leaf_values_in_object( &object, - self.attribute_to_extract.as_deref(), + self.attribute_to_extract, self.attribute_to_skip, &field_name, &mut tokenize_field, - ), - Value::Array(array) => perm_json_p::seek_leaf_values_in_array( + )?, + Value::Array(array) => seek_leaf_values_in_array( &array, - self.attribute_to_extract.as_deref(), + self.attribute_to_extract, self.attribute_to_skip, &field_name, &mut tokenize_field, - ), - value => tokenize_field(&field_name, &value), + )?, + value => tokenize_field(&field_name, &value)?, } } } + Ok(()) } } @@ -167,105 +169,6 @@ pub fn tokenizer_builder<'a>( tokenizer_builder } -/// TODO move in permissive json pointer -mod perm_json_p { - use serde_json::{Map, Value}; - const SPLIT_SYMBOL: char = '.'; - - /// Returns `true` if the `selector` match the `key`. - /// - /// ```text - /// Example: - /// `animaux` match `animaux` - /// `animaux.chien` match `animaux` - /// `animaux.chien` match `animaux` - /// `animaux.chien.nom` match `animaux` - /// `animaux.chien.nom` match `animaux.chien` - /// ----------------------------------------- - /// `animaux` doesn't match `animaux.chien` - /// `animaux.` doesn't match `animaux` - /// `animaux.ch` doesn't match `animaux.chien` - /// `animau` doesn't match `animaux` - /// ``` - pub fn contained_in(selector: &str, key: &str) -> bool { - selector.starts_with(key) - && selector[key.len()..].chars().next().map(|c| c == SPLIT_SYMBOL).unwrap_or(true) - } - - pub fn seek_leaf_values_in_object( - value: &Map, - selectors: Option<&[&str]>, - skip_selectors: &[&str], - base_key: &str, - seeker: &mut impl FnMut(&str, &Value), - ) { - for (key, value) in value.iter() { - let base_key = if base_key.is_empty() { - key.to_string() - } else { - format!("{}{}{}", base_key, SPLIT_SYMBOL, key) - }; - - // here if the user only specified `doggo` we need to iterate in all the fields of `doggo` - // so we check the contained_in on both side - let should_continue = select_field(&base_key, selectors, skip_selectors); - if should_continue { - match value { - Value::Object(object) => seek_leaf_values_in_object( - object, - selectors, - skip_selectors, - &base_key, - seeker, - ), - Value::Array(array) => seek_leaf_values_in_array( - array, - selectors, - skip_selectors, - &base_key, - seeker, - ), - value => seeker(&base_key, value), - } - } - } - } - - pub fn seek_leaf_values_in_array( - values: &[Value], - selectors: Option<&[&str]>, - skip_selectors: &[&str], - base_key: &str, - seeker: &mut impl FnMut(&str, &Value), - ) { - for value in values { - match value { - Value::Object(object) => { - seek_leaf_values_in_object(object, selectors, skip_selectors, base_key, seeker) - } - Value::Array(array) => { - seek_leaf_values_in_array(array, selectors, skip_selectors, base_key, seeker) - } - value => seeker(base_key, value), - } - } - } - - pub fn select_field( - field_name: &str, - selectors: Option<&[&str]>, - skip_selectors: &[&str], - ) -> bool { - selectors.map_or(true, |selectors| { - selectors.iter().any(|selector| { - contained_in(selector, &field_name) || contained_in(&field_name, selector) - }) - }) && !skip_selectors.iter().any(|skip_selector| { - contained_in(skip_selector, &field_name) || contained_in(&field_name, skip_selector) - }) - } -} - #[cfg(test)] mod test { use charabia::TokenizerBuilder; @@ -274,6 +177,8 @@ mod test { use serde_json::json; use super::*; + use crate::FieldsIdsMap; + #[test] fn test_tokenize_document() { 
let mut fields_ids_map = FieldsIdsMap::new(); @@ -329,6 +234,7 @@ mod test { document_tokenizer .tokenize_document(obkv, &mut global_fields_ids_map, &mut |fid, pos, word| { words.insert([fid, pos], word.to_string()); + Ok(()) }) .unwrap(); diff --git a/milli/src/update/new/indexer/mod.rs b/milli/src/update/new/indexer/mod.rs index 3b1fc97c5..21e28fc84 100644 --- a/milli/src/update/new/indexer/mod.rs +++ b/milli/src/update/new/indexer/mod.rs @@ -127,6 +127,19 @@ where &extractor_sender, )?; + // TODO THIS IS TOO MUCH + // Extract fieldid docid facet number + // Extract fieldid docid facet string + // Extract facetid string fst + + // Extract fieldid facet isempty docids + // Extract fieldid facet isnull docids + // Extract fieldid facet exists docids + + // TODO This is the normal system + // Extract fieldid facet number docids + // Extract fieldid facet string docids + Ok(()) as Result<_> }) })?; diff --git a/milli/src/update/new/merger.rs b/milli/src/update/new/merger.rs index 35449b475..25f09441c 100644 --- a/milli/src/update/new/merger.rs +++ b/milli/src/update/new/merger.rs @@ -1,7 +1,5 @@ use std::fs::File; -use std::io; -use fst::set::OpBuilder; use fst::{Set, SetBuilder}; use grenad::Merger; use heed::types::Bytes; @@ -15,7 +13,6 @@ use super::channel::{ WordFidDocids, WordPositionDocids, }; use super::KvReaderDelAdd; -use crate::index::main_key::WORDS_FST_KEY; use crate::update::del_add::DelAdd; use crate::update::new::channel::MergerOperation; use crate::update::MergeDeladdCboRoaringBitmaps; @@ -210,7 +207,7 @@ fn cbo_bitmap_serialize_into_vec<'b>(bitmap: &RoaringBitmap, buffer: &'b mut Vec } /// TODO Return the slice directly from the serialize_into method -fn serialize_bitmap_into_vec<'b>(bitmap: &RoaringBitmap, buffer: &'b mut Vec) { +fn serialize_bitmap_into_vec(bitmap: &RoaringBitmap, buffer: &mut Vec) { buffer.clear(); bitmap.serialize_into(buffer).unwrap(); // buffer.as_slice() From 0b061f1e7090e62d1f7b52bd434ac389831ecf7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 4 Sep 2024 17:40:24 +0200 Subject: [PATCH 037/247] Introduce the FieldIdFacetIsEmptyDocidsExtractor --- milli/src/update/new/extract/faceted/mod.rs | 32 ++++++++++++++++++++- milli/src/update/new/indexer/mod.rs | 2 ++ 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/milli/src/update/new/extract/faceted/mod.rs b/milli/src/update/new/extract/faceted/mod.rs index b230549c1..e3c89b0e4 100644 --- a/milli/src/update/new/extract/faceted/mod.rs +++ b/milli/src/update/new/extract/faceted/mod.rs @@ -186,7 +186,6 @@ impl FacetedExtractor for FieldIdFacetNumberDocidsExtractor { } } -/// TODO It doesn't keep the original string in the value pub struct FieldIdFacetStringDocidsExtractor; impl FacetedExtractor for FieldIdFacetStringDocidsExtractor { fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result> { @@ -211,6 +210,37 @@ impl FacetedExtractor for FieldIdFacetStringDocidsExtractor { } } +// Extract fieldid facet isempty docids +// Extract fieldid facet isnull docids +// Extract fieldid facet exists docids + +pub struct FieldIdFacetIsEmptyDocidsExtractor; +impl FacetedExtractor for FieldIdFacetIsEmptyDocidsExtractor { + fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result> { + index.user_defined_faceted_fields(rtxn) + } + + fn build_key<'b>( + field_id: FieldId, + value: &Value, + output: &'b mut Vec, + ) -> Option<&'b [u8]> { + let is_empty = match value { + Value::Null | Value::Bool(_) | Value::Number(_) => false, + Value::String(s) => 
s.is_empty(), + Value::Array(a) => a.is_empty(), + Value::Object(o) => o.is_empty(), + }; + + if is_empty { + output.extend_from_slice(&field_id.to_be_bytes()); + Some(&*output) + } else { + None + } + } +} + pub fn extract_document_facets( attributes_to_extract: &[&str], obkv: &KvReaderFieldId, diff --git a/milli/src/update/new/indexer/mod.rs b/milli/src/update/new/indexer/mod.rs index 21e28fc84..ed42c03b1 100644 --- a/milli/src/update/new/indexer/mod.rs +++ b/milli/src/update/new/indexer/mod.rs @@ -131,7 +131,9 @@ where // Extract fieldid docid facet number // Extract fieldid docid facet string // Extract facetid string fst + // Extract facetid normalized string strings + // TODO Inverted Indexes again // Extract fieldid facet isempty docids // Extract fieldid facet isnull docids // Extract fieldid facet exists docids From 9c0a1cd9fdbad815937d5835720c6b3fd20009fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 4 Sep 2024 17:48:49 +0200 Subject: [PATCH 038/247] Introduce the FieldIdFacetExistsDocidsExtractor --- milli/src/update/new/extract/faceted/mod.rs | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/milli/src/update/new/extract/faceted/mod.rs b/milli/src/update/new/extract/faceted/mod.rs index e3c89b0e4..c885fd610 100644 --- a/milli/src/update/new/extract/faceted/mod.rs +++ b/milli/src/update/new/extract/faceted/mod.rs @@ -210,9 +210,23 @@ impl FacetedExtractor for FieldIdFacetStringDocidsExtractor { } } -// Extract fieldid facet isempty docids // Extract fieldid facet isnull docids -// Extract fieldid facet exists docids + +pub struct FieldIdFacetExistsDocidsExtractor; +impl FacetedExtractor for FieldIdFacetExistsDocidsExtractor { + fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result> { + index.user_defined_faceted_fields(rtxn) + } + + fn build_key<'b>( + field_id: FieldId, + _value: &Value, + output: &'b mut Vec, + ) -> Option<&'b [u8]> { + output.extend_from_slice(&field_id.to_be_bytes()); + Some(&*output) + } +} pub struct FieldIdFacetIsEmptyDocidsExtractor; impl FacetedExtractor for FieldIdFacetIsEmptyDocidsExtractor { From b33ec9ba3ff098ea08603bc262b6f1d0578e210a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 4 Sep 2024 17:50:08 +0200 Subject: [PATCH 039/247] Introduce the FieldIdFacetIsNullDocidsExtractor --- milli/src/update/new/extract/faceted/mod.rs | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/milli/src/update/new/extract/faceted/mod.rs b/milli/src/update/new/extract/faceted/mod.rs index c885fd610..b54219fd3 100644 --- a/milli/src/update/new/extract/faceted/mod.rs +++ b/milli/src/update/new/extract/faceted/mod.rs @@ -210,7 +210,25 @@ impl FacetedExtractor for FieldIdFacetStringDocidsExtractor { } } -// Extract fieldid facet isnull docids +pub struct FieldIdFacetIsNullDocidsExtractor; +impl FacetedExtractor for FieldIdFacetIsNullDocidsExtractor { + fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result> { + index.user_defined_faceted_fields(rtxn) + } + + fn build_key<'b>( + field_id: FieldId, + value: &Value, + output: &'b mut Vec, + ) -> Option<&'b [u8]> { + if value.is_null() { + output.extend_from_slice(&field_id.to_be_bytes()); + Some(&*output) + } else { + None + } + } +} pub struct FieldIdFacetExistsDocidsExtractor; impl FacetedExtractor for FieldIdFacetExistsDocidsExtractor { From 27308eaab175b040be55892b43bf240535686877 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: 
Wed, 4 Sep 2024 17:58:15 +0200 Subject: [PATCH 040/247] Import the facet extractors --- milli/src/update/new/extract/mod.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/milli/src/update/new/extract/mod.rs b/milli/src/update/new/extract/mod.rs index fee4f42f6..69081e251 100644 --- a/milli/src/update/new/extract/mod.rs +++ b/milli/src/update/new/extract/mod.rs @@ -2,7 +2,11 @@ mod cache; mod faceted; mod searchable; -pub use faceted::FacetedExtractor; +pub use faceted::{ + FacetedExtractor, FieldIdFacetExistsDocidsExtractor, FieldIdFacetIsEmptyDocidsExtractor, + FieldIdFacetIsNullDocidsExtractor, FieldIdFacetNumberDocidsExtractor, + FieldIdFacetStringDocidsExtractor, +}; pub use searchable::{ ExactWordDocidsExtractor, SearchableExtractor, WordDocidsExtractor, WordFidDocidsExtractor, WordPositionDocidsExtractor, From 34f11e33808d5e7413bf0752b5dde028a9447a04 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 5 Sep 2024 10:30:39 +0200 Subject: [PATCH 041/247] Implement word count and word pair proximity extractors --- .../extract_fid_word_count_docids.rs | 135 +++++++++++++ .../extract_word_pair_proximity_docids.rs | 182 ++++++++++++++++++ .../src/update/new/extract/searchable/mod.rs | 4 + .../extract/searchable/tokenize_document.rs | 17 +- 4 files changed, 331 insertions(+), 7 deletions(-) create mode 100644 milli/src/update/new/extract/searchable/extract_fid_word_count_docids.rs create mode 100644 milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs diff --git a/milli/src/update/new/extract/searchable/extract_fid_word_count_docids.rs b/milli/src/update/new/extract/searchable/extract_fid_word_count_docids.rs new file mode 100644 index 000000000..08160155e --- /dev/null +++ b/milli/src/update/new/extract/searchable/extract_fid_word_count_docids.rs @@ -0,0 +1,135 @@ +use std::{borrow::Cow, collections::HashMap}; + +use heed::RoTxn; + +use super::{tokenize_document::DocumentTokenizer, SearchableExtractor}; +use crate::{ + update::{ + new::{extract::cache::CboCachedSorter, DocumentChange}, + MergeDeladdCboRoaringBitmaps, + }, + FieldId, GlobalFieldsIdsMap, Index, Result, +}; + +const MAX_COUNTED_WORDS: usize = 30; + +pub struct FidWordCountDocidsExtractor; +impl SearchableExtractor for FidWordCountDocidsExtractor { + fn attributes_to_extract<'a>( + rtxn: &'a RoTxn, + index: &'a Index, + ) -> Result>> { + index.user_defined_searchable_fields(rtxn).map_err(Into::into) + } + + fn attributes_to_skip<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result> { + Ok(vec![]) + } + + /// This case is unreachable because extract_document_change has been reimplemented to not call this function. + fn build_key<'a>(_field_id: FieldId, _position: u16, _word: &'a str) -> Cow<'a, [u8]> { + unreachable!() + } + + // This method is reimplemented to count the number of words in the document in each field + // and to store the docids of the documents that have a number of words in a given field equal to or under than MAX_COUNTED_WORDS. 
+ fn extract_document_change( + rtxn: &RoTxn, + index: &Index, + document_tokenizer: &DocumentTokenizer, + fields_ids_map: &mut GlobalFieldsIdsMap, + cached_sorter: &mut CboCachedSorter, + document_change: DocumentChange, + ) -> Result<()> { + let mut key_buffer = Vec::new(); + match document_change { + DocumentChange::Deletion(inner) => { + let mut fid_word_count = HashMap::new(); + let mut token_fn = |fid: FieldId, pos: u16, word: &str| { + fid_word_count.entry(fid).and_modify(|count| *count += 1).or_insert(1); + Ok(()) + }; + document_tokenizer.tokenize_document( + inner.current(rtxn, index)?.unwrap(), + fields_ids_map, + &mut token_fn, + )?; + + // The docids of the documents that have a number of words in a given field equal to or under than MAX_COUNTED_WORDS are deleted. + for (fid, count) in fid_word_count.iter() { + if *count <= MAX_COUNTED_WORDS { + let key = build_key(*fid, *count as u8, &mut key_buffer); + /// TODO manage the error + cached_sorter.insert_del_u32(key, inner.docid()).unwrap(); + } + } + } + DocumentChange::Update(inner) => { + let mut fid_word_count = HashMap::new(); + let mut token_fn = |fid: FieldId, pos: u16, word: &str| { + fid_word_count + .entry(fid) + .and_modify(|(current_count, new_count)| *current_count += 1) + .or_insert((1, 0)); + Ok(()) + }; + document_tokenizer.tokenize_document( + inner.current(rtxn, index)?.unwrap(), + fields_ids_map, + &mut token_fn, + )?; + + let mut token_fn = |fid: FieldId, pos: u16, word: &str| { + fid_word_count + .entry(fid) + .and_modify(|(current_count, new_count)| *new_count += 1) + .or_insert((0, 1)); + Ok(()) + }; + document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?; + + // Only the fields that have a change in the number of words are updated. + for (fid, (current_count, new_count)) in fid_word_count.iter() { + if *current_count != *new_count { + if *current_count <= MAX_COUNTED_WORDS { + let key = build_key(*fid, *current_count as u8, &mut key_buffer); + /// TODO manage the error + cached_sorter.insert_del_u32(key, inner.docid()).unwrap(); + } + if *new_count <= MAX_COUNTED_WORDS { + let key = build_key(*fid, *new_count as u8, &mut key_buffer); + /// TODO manage the error + cached_sorter.insert_add_u32(key, inner.docid()).unwrap(); + } + } + } + } + DocumentChange::Insertion(inner) => { + let mut fid_word_count = HashMap::new(); + let mut token_fn = |fid: FieldId, pos: u16, word: &str| { + fid_word_count.entry(fid).and_modify(|count| *count += 1).or_insert(1); + Ok(()) + }; + document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?; + + // The docids of the documents that have a number of words in a given field equal to or under than MAX_COUNTED_WORDS are stored. 
+ for (fid, count) in fid_word_count.iter() { + if *count <= MAX_COUNTED_WORDS { + let key = build_key(*fid, *count as u8, &mut key_buffer); + /// TODO manage the error + cached_sorter.insert_add_u32(key, inner.docid()).unwrap(); + } + } + } + } + + Ok(()) + } +} + +fn build_key(fid: FieldId, count: u8, key_buffer: &mut Vec) -> &[u8] { + key_buffer.clear(); + key_buffer.extend_from_slice(&fid.to_be_bytes()); + key_buffer.push(count); + key_buffer.as_slice() +} diff --git a/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs b/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs new file mode 100644 index 000000000..e170a6486 --- /dev/null +++ b/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs @@ -0,0 +1,182 @@ +use std::{ + borrow::Cow, + collections::{BTreeMap, VecDeque}, +}; + +use heed::RoTxn; +use itertools::merge_join_by; +use obkv::KvReader; + +use super::{tokenize_document::DocumentTokenizer, SearchableExtractor}; +use crate::{ + proximity::{index_proximity, MAX_DISTANCE}, + update::{ + new::{extract::cache::CboCachedSorter, DocumentChange}, + MergeDeladdCboRoaringBitmaps, + }, + FieldId, GlobalFieldsIdsMap, Index, Result, +}; + +pub struct WordPairProximityDocidsExtractor; +impl SearchableExtractor for WordPairProximityDocidsExtractor { + fn attributes_to_extract<'a>( + rtxn: &'a RoTxn, + index: &'a Index, + ) -> Result>> { + index.user_defined_searchable_fields(rtxn).map_err(Into::into) + } + + fn attributes_to_skip<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result> { + Ok(vec![]) + } + + /// This case is unreachable because extract_document_change has been reimplemented to not call this function. + fn build_key<'a>(_field_id: FieldId, _position: u16, _word: &'a str) -> Cow<'a, [u8]> { + unreachable!() + } + + // This method is reimplemented to count the number of words in the document in each field + // and to store the docids of the documents that have a number of words in a given field equal to or under than MAX_COUNTED_WORDS. 
+ fn extract_document_change( + rtxn: &RoTxn, + index: &Index, + document_tokenizer: &DocumentTokenizer, + fields_ids_map: &mut GlobalFieldsIdsMap, + cached_sorter: &mut CboCachedSorter, + document_change: DocumentChange, + ) -> Result<()> { + /// TODO: mutualize those buffers + let mut key_buffer = Vec::new(); + let mut add_word_pair_proximity = BTreeMap::new(); + let mut del_word_pair_proximity = BTreeMap::new(); + let mut word_positions: VecDeque<(String, u16)> = + VecDeque::with_capacity(MAX_DISTANCE as usize); + + let docid = document_change.docid(); + match document_change { + DocumentChange::Deletion(inner) => { + let document = inner.current(rtxn, index)?.unwrap(); + process_document_tokens( + document, + document_tokenizer, + fields_ids_map, + &mut word_positions, + &mut del_word_pair_proximity, + )?; + } + DocumentChange::Update(inner) => { + let document = inner.current(rtxn, index)?.unwrap(); + process_document_tokens( + &document, + document_tokenizer, + fields_ids_map, + &mut word_positions, + &mut del_word_pair_proximity, + )?; + let document = inner.new(); + process_document_tokens( + document, + document_tokenizer, + fields_ids_map, + &mut word_positions, + &mut add_word_pair_proximity, + )?; + } + DocumentChange::Insertion(inner) => { + let document = inner.new(); + process_document_tokens( + document, + document_tokenizer, + fields_ids_map, + &mut word_positions, + &mut add_word_pair_proximity, + )?; + } + } + + use itertools::EitherOrBoth::*; + for eob in + merge_join_by(del_word_pair_proximity.iter(), add_word_pair_proximity.iter(), |d, a| { + d.cmp(a) + }) + { + match eob { + Left(((w1, w2), prox)) => { + let key = build_key(*prox, w1, w2, &mut key_buffer); + cached_sorter.insert_del_u32(key, docid).unwrap(); + } + Right(((w1, w2), prox)) => { + let key = build_key(*prox, w1, w2, &mut key_buffer); + cached_sorter.insert_add_u32(key, docid).unwrap(); + } + Both(((w1, w2), del_prox), (_, add_prox)) => { + if del_prox != add_prox { + let key = build_key(*del_prox, w1, w2, &mut key_buffer); + cached_sorter.insert_del_u32(key, docid).unwrap(); + let key = build_key(*add_prox, w1, w2, &mut key_buffer); + cached_sorter.insert_add_u32(key, docid).unwrap(); + } + } + }; + } + + Ok(()) + } +} + +fn build_key<'a>(prox: u8, w1: &str, w2: &str, key_buffer: &'a mut Vec) -> &'a [u8] { + key_buffer.clear(); + key_buffer.push(prox); + key_buffer.extend_from_slice(w1.as_bytes()); + key_buffer.push(0); + key_buffer.extend_from_slice(w2.as_bytes()); + key_buffer.as_slice() +} + +fn word_positions_into_word_pair_proximity( + word_positions: &mut VecDeque<(String, u16)>, + word_pair_proximity: &mut BTreeMap<(String, String), u8>, +) -> Result<()> { + let (head_word, head_position) = word_positions.pop_front().unwrap(); + for (word, position) in word_positions.iter() { + let prox = index_proximity(head_position as u32, *position as u32) as u8; + if prox > 0 && prox < MAX_DISTANCE as u8 { + word_pair_proximity + .entry((head_word.clone(), word.clone())) + .and_modify(|p| { + *p = std::cmp::min(*p, prox); + }) + .or_insert(prox); + } + } + Ok(()) +} + +fn process_document_tokens( + document: &KvReader, + document_tokenizer: &DocumentTokenizer, + fields_ids_map: &mut GlobalFieldsIdsMap, + word_positions: &mut VecDeque<(String, u16)>, + word_pair_proximity: &mut BTreeMap<(String, String), u8>, +) -> Result<()> { + let mut token_fn = |fid: FieldId, pos: u16, word: &str| { + // drain the proximity window until the head word is considered close to the word we are inserting. 
+ while word_positions + .front() + .map_or(false, |(_w, p)| index_proximity(*p as u32, pos as u32) >= MAX_DISTANCE) + { + word_positions_into_word_pair_proximity(word_positions, word_pair_proximity)?; + } + + // insert the new word. + word_positions.push_back((word.to_string(), pos)); + Ok(()) + }; + document_tokenizer.tokenize_document(document, fields_ids_map, &mut token_fn)?; + + while !word_positions.is_empty() { + word_positions_into_word_pair_proximity(word_positions, word_pair_proximity)?; + } + + Ok(()) +} diff --git a/milli/src/update/new/extract/searchable/mod.rs b/milli/src/update/new/extract/searchable/mod.rs index 078d06150..ba4731d73 100644 --- a/milli/src/update/new/extract/searchable/mod.rs +++ b/milli/src/update/new/extract/searchable/mod.rs @@ -1,13 +1,17 @@ +mod extract_fid_word_count_docids; mod extract_word_docids; +mod extract_word_pair_proximity_docids; mod tokenize_document; use std::borrow::Cow; use std::fs::File; +pub use extract_fid_word_count_docids::FidWordCountDocidsExtractor; pub use extract_word_docids::{ ExactWordDocidsExtractor, WordDocidsExtractor, WordFidDocidsExtractor, WordPositionDocidsExtractor, }; +pub use extract_word_pair_proximity_docids::WordPairProximityDocidsExtractor; use grenad::Merger; use heed::RoTxn; use rayon::iter::{IntoParallelIterator, ParallelIterator}; diff --git a/milli/src/update/new/extract/searchable/tokenize_document.rs b/milli/src/update/new/extract/searchable/tokenize_document.rs index 1d19354db..7e23c9301 100644 --- a/milli/src/update/new/extract/searchable/tokenize_document.rs +++ b/milli/src/update/new/extract/searchable/tokenize_document.rs @@ -3,6 +3,7 @@ use std::collections::HashMap; use charabia::{SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder}; use serde_json::Value; +use crate::proximity::MAX_DISTANCE; use crate::update::new::extract::perm_json_p::{ seek_leaf_values_in_array, seek_leaf_values_in_object, select_field, }; @@ -43,8 +44,10 @@ impl<'a> DocumentTokenizer<'a> { return Err(UserError::AttributeLimitReached.into()); }; - let position = - field_position.entry(field_id).and_modify(|counter| *counter += 8).or_insert(0); + let position = field_position + .entry(field_id) + .and_modify(|counter| *counter += MAX_DISTANCE) + .or_insert(0); if *position as u32 >= self.max_positions_per_attributes { return Ok(()); } @@ -116,19 +119,19 @@ impl<'a> DocumentTokenizer<'a> { } /// take an iterator on tokens and compute their relative position depending on separator kinds -/// if it's an `Hard` separator we add an additional relative proximity of 8 between words, +/// if it's an `Hard` separator we add an additional relative proximity of MAX_DISTANCE between words, /// else we keep the standard proximity of 1 between words. 
fn process_tokens<'a>( - start_offset: usize, + start_offset: u32, tokens: impl Iterator>, -) -> impl Iterator)> { +) -> impl Iterator)> { tokens .skip_while(|token| token.is_separator()) .scan((start_offset, None), |(offset, prev_kind), mut token| { match token.kind { TokenKind::Word | TokenKind::StopWord if !token.lemma().is_empty() => { *offset += match *prev_kind { - Some(TokenKind::Separator(SeparatorKind::Hard)) => 8, + Some(TokenKind::Separator(SeparatorKind::Hard)) => MAX_DISTANCE, Some(_) => 1, None => 0, }; @@ -246,7 +249,7 @@ mod test { ]: "doggo", [ 2, - 8, + MAX_DISTANCE, ]: "doggo", [ 2, From 0fc02f7351fa7d46b0d75f615b07cb6509729b0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 5 Sep 2024 10:32:22 +0200 Subject: [PATCH 042/247] Move the facet extraction to dedicated modules --- .../new/extract/faceted/extract_facets.rs | 137 ++++++++ .../new/extract/faceted/facet_document.rs | 51 +++ milli/src/update/new/extract/faceted/mod.rs | 315 +++++------------- milli/src/update/new/extract/mod.rs | 5 +- 4 files changed, 271 insertions(+), 237 deletions(-) create mode 100644 milli/src/update/new/extract/faceted/extract_facets.rs create mode 100644 milli/src/update/new/extract/faceted/facet_document.rs diff --git a/milli/src/update/new/extract/faceted/extract_facets.rs b/milli/src/update/new/extract/faceted/extract_facets.rs new file mode 100644 index 000000000..9471c753b --- /dev/null +++ b/milli/src/update/new/extract/faceted/extract_facets.rs @@ -0,0 +1,137 @@ +use std::collections::HashSet; + +use heed::RoTxn; +use serde_json::Value; + +use super::FacetedExtractor; +use crate::facet::value_encoding::f64_into_bytes; +use crate::{normalize_facet, FieldId, Index, Result, MAX_FACET_VALUE_LENGTH}; + +pub struct FieldIdFacetNumberDocidsExtractor; + +impl FacetedExtractor for FieldIdFacetNumberDocidsExtractor { + fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result> { + index.user_defined_faceted_fields(rtxn) + } + + fn build_key<'b>( + field_id: FieldId, + value: &Value, + output: &'b mut Vec, + ) -> Option<&'b [u8]> { + let number = value.as_number()?; + let n = number.as_f64()?; + let ordered = f64_into_bytes(n)?; + + // fid - level - orderedf64 - orignalf64 + output.extend_from_slice(&field_id.to_be_bytes()); + output.push(1); // level 0 + output.extend_from_slice(&ordered); + output.extend_from_slice(&n.to_be_bytes()); + + Some(&*output) + } +} + +pub struct FieldIdFacetStringDocidsExtractor; + +impl FacetedExtractor for FieldIdFacetStringDocidsExtractor { + fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result> { + index.user_defined_faceted_fields(rtxn) + } + + fn build_key<'b>( + field_id: FieldId, + value: &Value, + output: &'b mut Vec, + ) -> Option<&'b [u8]> { + let string = value.as_str()?; + let normalize = normalize_facet(string); + let truncated = truncate_str(&normalize); + + // fid - level - normalized string + output.extend_from_slice(&field_id.to_be_bytes()); + output.push(1); // level 0 + output.extend_from_slice(truncated.as_bytes()); + + Some(&*output) + } +} + +/// Truncates a string to the biggest valid LMDB key size. 
+fn truncate_str(s: &str) -> &str { + let index = s + .char_indices() + .map(|(idx, _)| idx) + .chain(std::iter::once(s.len())) + .take_while(|idx| idx <= &MAX_FACET_VALUE_LENGTH) + .last(); + + &s[..index.unwrap_or(0)] +} + +pub struct FieldIdFacetIsNullDocidsExtractor; + +impl FacetedExtractor for FieldIdFacetIsNullDocidsExtractor { + fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result> { + index.user_defined_faceted_fields(rtxn) + } + + fn build_key<'b>( + field_id: FieldId, + value: &Value, + output: &'b mut Vec, + ) -> Option<&'b [u8]> { + if value.is_null() { + output.extend_from_slice(&field_id.to_be_bytes()); + Some(&*output) + } else { + None + } + } +} + +pub struct FieldIdFacetExistsDocidsExtractor; + +impl FacetedExtractor for FieldIdFacetExistsDocidsExtractor { + fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result> { + index.user_defined_faceted_fields(rtxn) + } + + fn build_key<'b>( + field_id: FieldId, + _value: &Value, + output: &'b mut Vec, + ) -> Option<&'b [u8]> { + output.extend_from_slice(&field_id.to_be_bytes()); + Some(&*output) + } +} + +pub struct FieldIdFacetIsEmptyDocidsExtractor; + +impl FacetedExtractor for FieldIdFacetIsEmptyDocidsExtractor { + fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result> { + index.user_defined_faceted_fields(rtxn) + } + + fn build_key<'b>( + field_id: FieldId, + value: &Value, + output: &'b mut Vec, + ) -> Option<&'b [u8]> { + let is_empty = match value { + Value::Null | Value::Bool(_) | Value::Number(_) => false, + Value::String(s) => s.is_empty(), + Value::Array(a) => a.is_empty(), + Value::Object(o) => o.is_empty(), + }; + + if is_empty { + output.extend_from_slice(&field_id.to_be_bytes()); + Some(&*output) + } else { + None + } + } +} diff --git a/milli/src/update/new/extract/faceted/facet_document.rs b/milli/src/update/new/extract/faceted/facet_document.rs new file mode 100644 index 000000000..849fa8f29 --- /dev/null +++ b/milli/src/update/new/extract/faceted/facet_document.rs @@ -0,0 +1,51 @@ +use serde_json::Value; + +use crate::update::new::KvReaderFieldId; +use crate::{FieldId, GlobalFieldsIdsMap, InternalError, Result, UserError}; + +pub fn extract_document_facets( + attributes_to_extract: &[&str], + obkv: &KvReaderFieldId, + field_id_map: &mut GlobalFieldsIdsMap, + facet_fn: &mut impl FnMut(FieldId, &Value) -> Result<()>, +) -> Result<()> { + let mut field_name = String::new(); + for (field_id, field_bytes) in obkv { + let Some(field_name) = field_id_map.name(field_id).map(|s| { + field_name.clear(); + field_name.push_str(s); + &field_name + }) else { + unreachable!("field id not found in field id map"); + }; + + let mut tokenize_field = |name: &str, value: &Value| match field_id_map.id_or_insert(name) { + Some(field_id) => facet_fn(field_id, value), + None => Err(UserError::AttributeLimitReached.into()), + }; + + // if the current field is searchable or contains a searchable attribute + if perm_json_p::select_field(field_name, Some(attributes_to_extract), &[]) { + // parse json. + match serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)? 
{ + Value::Object(object) => perm_json_p::seek_leaf_values_in_object( + &object, + Some(attributes_to_extract), + &[], // skip no attributes + field_name, + &mut tokenize_field, + )?, + Value::Array(array) => perm_json_p::seek_leaf_values_in_array( + &array, + Some(attributes_to_extract), + &[], // skip no attributes + field_name, + &mut tokenize_field, + )?, + value => tokenize_field(field_name, &value)?, + } + } + } + + Ok(()) +} diff --git a/milli/src/update/new/extract/faceted/mod.rs b/milli/src/update/new/extract/faceted/mod.rs index b54219fd3..19aeb031c 100644 --- a/milli/src/update/new/extract/faceted/mod.rs +++ b/milli/src/update/new/extract/faceted/mod.rs @@ -1,20 +1,19 @@ use std::collections::HashSet; +use std::fmt::Debug; use std::fs::File; -use grenad::Merger; +use grenad::{MergeFunction, Merger}; use heed::RoTxn; use rayon::iter::{IntoParallelIterator, ParallelIterator}; use serde_json::Value; use super::cache::CboCachedSorter; -use super::perm_json_p; -use crate::facet::value_encoding::f64_into_bytes; -use crate::update::new::{DocumentChange, ItemsPool, KvReaderFieldId}; +use crate::update::new::{DocumentChange, ItemsPool}; use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps}; -use crate::{ - normalize_facet, FieldId, GlobalFieldsIdsMap, Index, InternalError, Result, UserError, - MAX_FACET_VALUE_LENGTH, -}; +use crate::{DocumentId, FieldId, GlobalFieldsIdsMap, Index, Result}; + +mod extract_facets; +mod facet_document; pub trait FacetedExtractor { fn run_extraction( @@ -74,6 +73,27 @@ pub trait FacetedExtractor { Ok(builder.build()) } + // TODO Shorten this + fn facet_fn_with_options( + buffer: &mut Vec, + cached_sorter: &mut CboCachedSorter, + cache_fn: impl Fn(&mut CboCachedSorter, &[u8], u32) -> grenad::Result<(), MF::Error>, + docid: DocumentId, + fid: FieldId, + value: &Value, + ) -> Result<()> + where + MF: MergeFunction, + MF::Error: Debug, + { + buffer.clear(); + match Self::build_key(fid, value, buffer) { + // TODO manage errors + Some(key) => Ok(cache_fn(cached_sorter, &key, docid).unwrap()), + None => Ok(()), + } + } + fn extract_document_change( rtxn: &RoTxn, index: &Index, @@ -84,73 +104,69 @@ pub trait FacetedExtractor { document_change: DocumentChange, ) -> Result<()> { match document_change { - DocumentChange::Deletion(inner) => { - let mut facet_del_fn = |fid, value: &Value| -> Result<()> { - buffer.clear(); - match Self::build_key(fid, value, buffer) { - // TODO manage errors - Some(key) => Ok(cached_sorter.insert_del_u32(&key, inner.docid()).unwrap()), - None => Ok(()), - } - }; - - extract_document_facets( - attributes_to_extract, - inner.current(rtxn, index)?.unwrap(), - fields_ids_map, - &mut facet_del_fn, - ) - } + DocumentChange::Deletion(inner) => facet_document::extract_document_facets( + attributes_to_extract, + inner.current(rtxn, index)?.unwrap(), + fields_ids_map, + &mut |fid, value| { + Self::facet_fn_with_options( + buffer, + cached_sorter, + CboCachedSorter::insert_del_u32, + inner.docid(), + fid, + value, + ) + }, + ), DocumentChange::Update(inner) => { - let mut facet_del_fn = |fid, value: &Value| -> Result<()> { - buffer.clear(); - match Self::build_key(fid, value, buffer) { - // TODO manage errors - Some(key) => Ok(cached_sorter.insert_del_u32(&key, inner.docid()).unwrap()), - None => Ok(()), - } - }; - - extract_document_facets( + facet_document::extract_document_facets( attributes_to_extract, inner.current(rtxn, index)?.unwrap(), fields_ids_map, - &mut facet_del_fn, + &mut |fid, value| { + 
Self::facet_fn_with_options( + buffer, + cached_sorter, + CboCachedSorter::insert_del_u32, + inner.docid(), + fid, + value, + ) + }, )?; - let mut facet_add_fn = |fid, value: &Value| -> Result<()> { - buffer.clear(); - match Self::build_key(fid, value, buffer) { - // TODO manage errors - Some(key) => Ok(cached_sorter.insert_add_u32(&key, inner.docid()).unwrap()), - None => Ok(()), - } - }; - - extract_document_facets( + facet_document::extract_document_facets( attributes_to_extract, inner.new(), fields_ids_map, - &mut facet_add_fn, - ) - } - DocumentChange::Insertion(inner) => { - let mut facet_add_fn = |fid, value: &Value| -> Result<()> { - buffer.clear(); - match Self::build_key(fid, value, buffer) { - // TODO manage errors - Some(key) => Ok(cached_sorter.insert_add_u32(&key, inner.docid()).unwrap()), - None => Ok(()), - } - }; - - extract_document_facets( - attributes_to_extract, - inner.new(), - fields_ids_map, - &mut facet_add_fn, + &mut |fid, value| { + Self::facet_fn_with_options( + buffer, + cached_sorter, + CboCachedSorter::insert_add_u32, + inner.docid(), + fid, + value, + ) + }, ) } + DocumentChange::Insertion(inner) => facet_document::extract_document_facets( + attributes_to_extract, + inner.new(), + fields_ids_map, + &mut |fid, value| { + Self::facet_fn_with_options( + buffer, + cached_sorter, + CboCachedSorter::insert_add_u32, + inner.docid(), + fid, + value, + ) + }, + ), } } @@ -160,174 +176,3 @@ pub trait FacetedExtractor { fn build_key<'b>(field_id: FieldId, value: &Value, output: &'b mut Vec) -> Option<&'b [u8]>; } - -pub struct FieldIdFacetNumberDocidsExtractor; -impl FacetedExtractor for FieldIdFacetNumberDocidsExtractor { - fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result> { - index.user_defined_faceted_fields(rtxn) - } - - fn build_key<'b>( - field_id: FieldId, - value: &Value, - output: &'b mut Vec, - ) -> Option<&'b [u8]> { - let number = value.as_number()?; - let n = number.as_f64()?; - let ordered = f64_into_bytes(n)?; - - // fid - level - orderedf64 - orignalf64 - output.extend_from_slice(&field_id.to_be_bytes()); - output.push(1); // level 0 - output.extend_from_slice(&ordered); - output.extend_from_slice(&n.to_be_bytes()); - - Some(&*output) - } -} - -pub struct FieldIdFacetStringDocidsExtractor; -impl FacetedExtractor for FieldIdFacetStringDocidsExtractor { - fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result> { - index.user_defined_faceted_fields(rtxn) - } - - fn build_key<'b>( - field_id: FieldId, - value: &Value, - output: &'b mut Vec, - ) -> Option<&'b [u8]> { - let string = value.as_str()?; - let normalize = normalize_facet(string); - let truncated = truncate_str(&normalize); - - // fid - level - normalized string - output.extend_from_slice(&field_id.to_be_bytes()); - output.push(1); // level 0 - output.extend_from_slice(truncated.as_bytes()); - - Some(&*output) - } -} - -pub struct FieldIdFacetIsNullDocidsExtractor; -impl FacetedExtractor for FieldIdFacetIsNullDocidsExtractor { - fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result> { - index.user_defined_faceted_fields(rtxn) - } - - fn build_key<'b>( - field_id: FieldId, - value: &Value, - output: &'b mut Vec, - ) -> Option<&'b [u8]> { - if value.is_null() { - output.extend_from_slice(&field_id.to_be_bytes()); - Some(&*output) - } else { - None - } - } -} - -pub struct FieldIdFacetExistsDocidsExtractor; -impl FacetedExtractor for FieldIdFacetExistsDocidsExtractor { - fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) -> 
Result> { - index.user_defined_faceted_fields(rtxn) - } - - fn build_key<'b>( - field_id: FieldId, - _value: &Value, - output: &'b mut Vec, - ) -> Option<&'b [u8]> { - output.extend_from_slice(&field_id.to_be_bytes()); - Some(&*output) - } -} - -pub struct FieldIdFacetIsEmptyDocidsExtractor; -impl FacetedExtractor for FieldIdFacetIsEmptyDocidsExtractor { - fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result> { - index.user_defined_faceted_fields(rtxn) - } - - fn build_key<'b>( - field_id: FieldId, - value: &Value, - output: &'b mut Vec, - ) -> Option<&'b [u8]> { - let is_empty = match value { - Value::Null | Value::Bool(_) | Value::Number(_) => false, - Value::String(s) => s.is_empty(), - Value::Array(a) => a.is_empty(), - Value::Object(o) => o.is_empty(), - }; - - if is_empty { - output.extend_from_slice(&field_id.to_be_bytes()); - Some(&*output) - } else { - None - } - } -} - -pub fn extract_document_facets( - attributes_to_extract: &[&str], - obkv: &KvReaderFieldId, - field_id_map: &mut GlobalFieldsIdsMap, - facet_fn: &mut impl FnMut(FieldId, &Value) -> Result<()>, -) -> Result<()> { - let mut field_name = String::new(); - for (field_id, field_bytes) in obkv { - let Some(field_name) = field_id_map.name(field_id).map(|s| { - field_name.clear(); - field_name.push_str(s); - &field_name - }) else { - unreachable!("field id not found in field id map"); - }; - - let mut tokenize_field = |name: &str, value: &Value| match field_id_map.id_or_insert(name) { - Some(field_id) => facet_fn(field_id, value), - None => Err(UserError::AttributeLimitReached.into()), - }; - - // if the current field is searchable or contains a searchable attribute - if perm_json_p::select_field(field_name, Some(attributes_to_extract), &[]) { - // parse json. - match serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)? { - Value::Object(object) => perm_json_p::seek_leaf_values_in_object( - &object, - Some(attributes_to_extract), - &[], // skip no attributes - field_name, - &mut tokenize_field, - )?, - Value::Array(array) => perm_json_p::seek_leaf_values_in_array( - &array, - Some(attributes_to_extract), - &[], // skip no attributes - field_name, - &mut tokenize_field, - )?, - value => tokenize_field(field_name, &value)?, - } - } - } - - Ok(()) -} - -/// Truncates a string to the biggest valid LMDB key size. 
-fn truncate_str(s: &str) -> &str { - let index = s - .char_indices() - .map(|(idx, _)| idx) - .chain(std::iter::once(s.len())) - .take_while(|idx| idx <= &MAX_FACET_VALUE_LENGTH) - .last(); - - &s[..index.unwrap_or(0)] -} diff --git a/milli/src/update/new/extract/mod.rs b/milli/src/update/new/extract/mod.rs index 69081e251..e50e70c47 100644 --- a/milli/src/update/new/extract/mod.rs +++ b/milli/src/update/new/extract/mod.rs @@ -2,11 +2,12 @@ mod cache; mod faceted; mod searchable; -pub use faceted::{ - FacetedExtractor, FieldIdFacetExistsDocidsExtractor, FieldIdFacetIsEmptyDocidsExtractor, +pub use faceted::modname::{ + FieldIdFacetExistsDocidsExtractor, FieldIdFacetIsEmptyDocidsExtractor, FieldIdFacetIsNullDocidsExtractor, FieldIdFacetNumberDocidsExtractor, FieldIdFacetStringDocidsExtractor, }; +pub use faceted::FacetedExtractor; pub use searchable::{ ExactWordDocidsExtractor, SearchableExtractor, WordDocidsExtractor, WordFidDocidsExtractor, WordPositionDocidsExtractor, From 73ce67862df4f71bffa752292a607488f8a9bac1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 5 Sep 2024 10:56:22 +0200 Subject: [PATCH 043/247] Use the word pair proximity and fid word count docids extractors Co-authored-by: ManyTheFish --- milli/src/update/new/channel.rs | 58 ++++-- .../new/extract/faceted/facet_document.rs | 1 + milli/src/update/new/extract/faceted/mod.rs | 1 + milli/src/update/new/extract/mod.rs | 12 +- .../extract_fid_word_count_docids.rs | 34 ++-- .../extract/searchable/extract_word_docids.rs | 8 +- .../extract_word_pair_proximity_docids.rs | 29 ++- milli/src/update/new/indexer/mod.rs | 169 ++++++++++-------- milli/src/update/new/merger.rs | 50 ++++-- 9 files changed, 205 insertions(+), 157 deletions(-) diff --git a/milli/src/update/new/channel.rs b/milli/src/update/new/channel.rs index e9a795bf5..3eafb7754 100644 --- a/milli/src/update/new/channel.rs +++ b/milli/src/update/new/channel.rs @@ -112,23 +112,27 @@ pub struct WriterOperation { } pub enum Database { - WordDocids, - ExactWordDocids, - WordFidDocids, - WordPositionDocids, Documents, + ExactWordDocids, + FidWordCountDocids, Main, + WordDocids, + WordFidDocids, + WordPairProximityDocids, + WordPositionDocids, } impl WriterOperation { pub fn database(&self, index: &Index) -> heed::Database { match self.database { - Database::Main => index.main.remap_types(), Database::Documents => index.documents.remap_types(), - Database::WordDocids => index.word_docids.remap_types(), Database::ExactWordDocids => index.exact_word_docids.remap_types(), + Database::Main => index.main.remap_types(), + Database::WordDocids => index.word_docids.remap_types(), Database::WordFidDocids => index.word_fid_docids.remap_types(), Database::WordPositionDocids => index.word_position_docids.remap_types(), + Database::FidWordCountDocids => index.field_id_word_count_docids.remap_types(), + Database::WordPairProximityDocids => index.word_pair_proximity_docids.remap_types(), } } @@ -198,9 +202,11 @@ impl MainSender<'_> { } } -pub enum WordDocids {} pub enum ExactWordDocids {} +pub enum FidWordCountDocids {} +pub enum WordDocids {} pub enum WordFidDocids {} +pub enum WordPairProximityDocids {} pub enum WordPositionDocids {} pub trait DatabaseType { @@ -209,14 +215,6 @@ pub trait DatabaseType { fn new_merger_operation(merger: Merger) -> MergerOperation; } -impl DatabaseType for WordDocids { - const DATABASE: Database = Database::WordDocids; - - fn new_merger_operation(merger: Merger) -> MergerOperation { - MergerOperation::WordDocidsMerger(merger) - } 
-} - impl DatabaseType for ExactWordDocids { const DATABASE: Database = Database::ExactWordDocids; @@ -225,6 +223,22 @@ impl DatabaseType for ExactWordDocids { } } +impl DatabaseType for FidWordCountDocids { + const DATABASE: Database = Database::FidWordCountDocids; + + fn new_merger_operation(merger: Merger) -> MergerOperation { + MergerOperation::FidWordCountDocidsMerger(merger) + } +} + +impl DatabaseType for WordDocids { + const DATABASE: Database = Database::WordDocids; + + fn new_merger_operation(merger: Merger) -> MergerOperation { + MergerOperation::WordDocidsMerger(merger) + } +} + impl DatabaseType for WordFidDocids { const DATABASE: Database = Database::WordFidDocids; @@ -233,6 +247,14 @@ impl DatabaseType for WordFidDocids { } } +impl DatabaseType for WordPairProximityDocids { + const DATABASE: Database = Database::WordPairProximityDocids; + + fn new_merger_operation(merger: Merger) -> MergerOperation { + MergerOperation::WordPairProximityDocidsMerger(merger) + } +} + impl DatabaseType for WordPositionDocids { const DATABASE: Database = Database::WordPositionDocids; @@ -293,12 +315,14 @@ impl DocumentsSender<'_> { } pub enum MergerOperation { - WordDocidsMerger(Merger), ExactWordDocidsMerger(Merger), + FidWordCountDocidsMerger(Merger), + WordDocidsMerger(Merger), WordFidDocidsMerger(Merger), + WordPairProximityDocidsMerger(Merger), WordPositionDocidsMerger(Merger), - InsertDocument { docid: DocumentId, document: Box }, DeleteDocument { docid: DocumentId }, + InsertDocument { docid: DocumentId, document: Box }, } pub struct MergerReceiver(Receiver); diff --git a/milli/src/update/new/extract/faceted/facet_document.rs b/milli/src/update/new/extract/faceted/facet_document.rs index 849fa8f29..4525e866f 100644 --- a/milli/src/update/new/extract/faceted/facet_document.rs +++ b/milli/src/update/new/extract/faceted/facet_document.rs @@ -1,5 +1,6 @@ use serde_json::Value; +use crate::update::new::extract::perm_json_p; use crate::update::new::KvReaderFieldId; use crate::{FieldId, GlobalFieldsIdsMap, InternalError, Result, UserError}; diff --git a/milli/src/update/new/extract/faceted/mod.rs b/milli/src/update/new/extract/faceted/mod.rs index 19aeb031c..bd58c21b4 100644 --- a/milli/src/update/new/extract/faceted/mod.rs +++ b/milli/src/update/new/extract/faceted/mod.rs @@ -2,6 +2,7 @@ use std::collections::HashSet; use std::fmt::Debug; use std::fs::File; +pub use extract_facets::*; use grenad::{MergeFunction, Merger}; use heed::RoTxn; use rayon::iter::{IntoParallelIterator, ParallelIterator}; diff --git a/milli/src/update/new/extract/mod.rs b/milli/src/update/new/extract/mod.rs index e50e70c47..d6d5a3005 100644 --- a/milli/src/update/new/extract/mod.rs +++ b/milli/src/update/new/extract/mod.rs @@ -2,16 +2,8 @@ mod cache; mod faceted; mod searchable; -pub use faceted::modname::{ - FieldIdFacetExistsDocidsExtractor, FieldIdFacetIsEmptyDocidsExtractor, - FieldIdFacetIsNullDocidsExtractor, FieldIdFacetNumberDocidsExtractor, - FieldIdFacetStringDocidsExtractor, -}; -pub use faceted::FacetedExtractor; -pub use searchable::{ - ExactWordDocidsExtractor, SearchableExtractor, WordDocidsExtractor, WordFidDocidsExtractor, - WordPositionDocidsExtractor, -}; +pub use faceted::*; +pub use searchable::*; /// TODO move in permissive json pointer pub mod perm_json_p { diff --git a/milli/src/update/new/extract/searchable/extract_fid_word_count_docids.rs b/milli/src/update/new/extract/searchable/extract_fid_word_count_docids.rs index 08160155e..7cb11c11b 100644 --- 
a/milli/src/update/new/extract/searchable/extract_fid_word_count_docids.rs +++ b/milli/src/update/new/extract/searchable/extract_fid_word_count_docids.rs @@ -1,15 +1,14 @@ -use std::{borrow::Cow, collections::HashMap}; +use std::borrow::Cow; +use std::collections::HashMap; use heed::RoTxn; -use super::{tokenize_document::DocumentTokenizer, SearchableExtractor}; -use crate::{ - update::{ - new::{extract::cache::CboCachedSorter, DocumentChange}, - MergeDeladdCboRoaringBitmaps, - }, - FieldId, GlobalFieldsIdsMap, Index, Result, -}; +use super::tokenize_document::DocumentTokenizer; +use super::SearchableExtractor; +use crate::update::new::extract::cache::CboCachedSorter; +use crate::update::new::DocumentChange; +use crate::update::MergeDeladdCboRoaringBitmaps; +use crate::{FieldId, GlobalFieldsIdsMap, Index, Result}; const MAX_COUNTED_WORDS: usize = 30; @@ -22,12 +21,13 @@ impl SearchableExtractor for FidWordCountDocidsExtractor { index.user_defined_searchable_fields(rtxn).map_err(Into::into) } - fn attributes_to_skip<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result> { + fn attributes_to_skip<'a>(_rtxn: &'a RoTxn, _index: &'a Index) -> Result> { Ok(vec![]) } /// This case is unreachable because extract_document_change has been reimplemented to not call this function. - fn build_key<'a>(_field_id: FieldId, _position: u16, _word: &'a str) -> Cow<'a, [u8]> { + fn build_key(_field_id: FieldId, _position: u16, _word: &str) -> Cow<[u8]> { + /// TODO remove this unreachable!() } @@ -45,7 +45,7 @@ impl SearchableExtractor for FidWordCountDocidsExtractor { match document_change { DocumentChange::Deletion(inner) => { let mut fid_word_count = HashMap::new(); - let mut token_fn = |fid: FieldId, pos: u16, word: &str| { + let mut token_fn = |fid: FieldId, _pos: u16, _word: &str| { fid_word_count.entry(fid).and_modify(|count| *count += 1).or_insert(1); Ok(()) }; @@ -66,10 +66,10 @@ impl SearchableExtractor for FidWordCountDocidsExtractor { } DocumentChange::Update(inner) => { let mut fid_word_count = HashMap::new(); - let mut token_fn = |fid: FieldId, pos: u16, word: &str| { + let mut token_fn = |fid: FieldId, _pos: u16, _word: &str| { fid_word_count .entry(fid) - .and_modify(|(current_count, new_count)| *current_count += 1) + .and_modify(|(current_count, _new_count)| *current_count += 1) .or_insert((1, 0)); Ok(()) }; @@ -79,10 +79,10 @@ impl SearchableExtractor for FidWordCountDocidsExtractor { &mut token_fn, )?; - let mut token_fn = |fid: FieldId, pos: u16, word: &str| { + let mut token_fn = |fid: FieldId, _pos: u16, _word: &str| { fid_word_count .entry(fid) - .and_modify(|(current_count, new_count)| *new_count += 1) + .and_modify(|(_current_count, new_count)| *new_count += 1) .or_insert((0, 1)); Ok(()) }; @@ -106,7 +106,7 @@ impl SearchableExtractor for FidWordCountDocidsExtractor { } DocumentChange::Insertion(inner) => { let mut fid_word_count = HashMap::new(); - let mut token_fn = |fid: FieldId, pos: u16, word: &str| { + let mut token_fn = |fid: FieldId, _pos: u16, _word: &str| { fid_word_count.entry(fid).and_modify(|count| *count += 1).or_insert(1); Ok(()) }; diff --git a/milli/src/update/new/extract/searchable/extract_word_docids.rs b/milli/src/update/new/extract/searchable/extract_word_docids.rs index 70f9c4e47..db8bb7993 100644 --- a/milli/src/update/new/extract/searchable/extract_word_docids.rs +++ b/milli/src/update/new/extract/searchable/extract_word_docids.rs @@ -20,7 +20,7 @@ impl SearchableExtractor for WordDocidsExtractor { } /// TODO write in an external Vec buffer - fn 
build_key<'a>(_field_id: FieldId, _position: u16, word: &'a str) -> Cow<'a, [u8]> { + fn build_key(_field_id: FieldId, _position: u16, word: &str) -> Cow<[u8]> { Cow::Borrowed(word.as_bytes()) } } @@ -49,7 +49,7 @@ impl SearchableExtractor for ExactWordDocidsExtractor { Ok(vec![]) } - fn build_key<'a>(_field_id: FieldId, _position: u16, word: &'a str) -> Cow<'a, [u8]> { + fn build_key(_field_id: FieldId, _position: u16, word: &str) -> Cow<[u8]> { Cow::Borrowed(word.as_bytes()) } } @@ -67,7 +67,7 @@ impl SearchableExtractor for WordFidDocidsExtractor { Ok(vec![]) } - fn build_key<'a>(field_id: FieldId, _position: u16, word: &'a str) -> Cow<'a, [u8]> { + fn build_key(field_id: FieldId, _position: u16, word: &str) -> Cow<[u8]> { let mut key = Vec::new(); key.extend_from_slice(word.as_bytes()); key.push(0); @@ -89,7 +89,7 @@ impl SearchableExtractor for WordPositionDocidsExtractor { Ok(vec![]) } - fn build_key<'a>(_field_id: FieldId, position: u16, word: &'a str) -> Cow<'a, [u8]> { + fn build_key(_field_id: FieldId, position: u16, word: &str) -> Cow<[u8]> { // position must be bucketed to reduce the number of keys in the DB. let position = bucketed_position(position); let mut key = Vec::new(); diff --git a/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs b/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs index e170a6486..e9de6e9f2 100644 --- a/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs +++ b/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs @@ -1,21 +1,17 @@ -use std::{ - borrow::Cow, - collections::{BTreeMap, VecDeque}, -}; +use std::borrow::Cow; +use std::collections::{BTreeMap, VecDeque}; use heed::RoTxn; use itertools::merge_join_by; use obkv::KvReader; -use super::{tokenize_document::DocumentTokenizer, SearchableExtractor}; -use crate::{ - proximity::{index_proximity, MAX_DISTANCE}, - update::{ - new::{extract::cache::CboCachedSorter, DocumentChange}, - MergeDeladdCboRoaringBitmaps, - }, - FieldId, GlobalFieldsIdsMap, Index, Result, -}; +use super::tokenize_document::DocumentTokenizer; +use super::SearchableExtractor; +use crate::proximity::{index_proximity, MAX_DISTANCE}; +use crate::update::new::extract::cache::CboCachedSorter; +use crate::update::new::DocumentChange; +use crate::update::MergeDeladdCboRoaringBitmaps; +use crate::{FieldId, GlobalFieldsIdsMap, Index, Result}; pub struct WordPairProximityDocidsExtractor; impl SearchableExtractor for WordPairProximityDocidsExtractor { @@ -26,12 +22,13 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { index.user_defined_searchable_fields(rtxn).map_err(Into::into) } - fn attributes_to_skip<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result> { + fn attributes_to_skip<'a>(_rtxn: &'a RoTxn, _index: &'a Index) -> Result> { Ok(vec![]) } /// This case is unreachable because extract_document_change has been reimplemented to not call this function. 
- fn build_key<'a>(_field_id: FieldId, _position: u16, _word: &'a str) -> Cow<'a, [u8]> { + fn build_key(_field_id: FieldId, _position: u16, _word: &str) -> Cow<[u8]> { + /// TODO remove this unreachable!() } @@ -159,7 +156,7 @@ fn process_document_tokens( word_positions: &mut VecDeque<(String, u16)>, word_pair_proximity: &mut BTreeMap<(String, String), u8>, ) -> Result<()> { - let mut token_fn = |fid: FieldId, pos: u16, word: &str| { + let mut token_fn = |_fid: FieldId, pos: u16, word: &str| { // drain the proximity window until the head word is considered close to the word we are inserting. while word_positions .front() diff --git a/milli/src/update/new/indexer/mod.rs b/milli/src/update/new/indexer/mod.rs index ed42c03b1..d721a5511 100644 --- a/milli/src/update/new/indexer/mod.rs +++ b/milli/src/update/new/indexer/mod.rs @@ -11,15 +11,9 @@ use rayon::iter::{IntoParallelIterator, ParallelIterator}; use rayon::ThreadPool; pub use update_by_function::UpdateByFunction; -use super::channel::{ - extractors_merger_channels, merger_writer_channel, EntryOperation, ExactWordDocids, WordDocids, - WordFidDocids, WordPositionDocids, -}; +use super::channel::*; use super::document_change::DocumentChange; -use super::extract::{ - ExactWordDocidsExtractor, SearchableExtractor, WordDocidsExtractor, WordFidDocidsExtractor, - WordPositionDocidsExtractor, -}; +use super::extract::*; use super::merger::merge_grenad_entries; use super::StdResult; use crate::documents::{ @@ -71,79 +65,98 @@ where // TODO manage the errors correctly let handle = Builder::new().name(S("indexer-extractors")).spawn_scoped(s, move || { pool.in_place_scope(|_s| { - let document_changes = document_changes.into_par_iter(); + let document_changes = document_changes.into_par_iter(); - // document but we need to create a function that collects and compresses documents. - document_changes.clone().into_par_iter().try_for_each(|result| { - match result? { - DocumentChange::Deletion(deletion) => { - let docid = deletion.docid(); - extractor_sender.document_delete(docid).unwrap(); + // document but we need to create a function that collects and compresses documents. + document_changes.clone().into_par_iter().try_for_each(|result| { + match result? 
{ + DocumentChange::Deletion(deletion) => { + let docid = deletion.docid(); + extractor_sender.document_delete(docid).unwrap(); + } + DocumentChange::Update(update) => { + let docid = update.docid(); + let content = update.new(); + extractor_sender.document_insert(docid, content.boxed()).unwrap(); + } + DocumentChange::Insertion(insertion) => { + let docid = insertion.docid(); + let content = insertion.new(); + extractor_sender.document_insert(docid, content.boxed()).unwrap(); + // extracted_dictionary_sender.send(self, dictionary: &[u8]); + } } - DocumentChange::Update(update) => { - let docid = update.docid(); - let content = update.new(); - extractor_sender.document_insert(docid, content.boxed()).unwrap(); - } - DocumentChange::Insertion(insertion) => { - let docid = insertion.docid(); - let content = insertion.new(); - extractor_sender.document_insert(docid, content.boxed()).unwrap(); - // extracted_dictionary_sender.send(self, dictionary: &[u8]); - } - } + Ok(()) as Result<_> + })?; + + extract_and_send_docids::( + index, + &global_fields_ids_map, + GrenadParameters::default(), + document_changes.clone(), + &extractor_sender, + )?; + + extract_and_send_docids::( + index, + &global_fields_ids_map, + GrenadParameters::default(), + document_changes.clone(), + &extractor_sender, + )?; + + extract_and_send_docids::( + index, + &global_fields_ids_map, + GrenadParameters::default(), + document_changes.clone(), + &extractor_sender, + )?; + + extract_and_send_docids::( + index, + &global_fields_ids_map, + GrenadParameters::default(), + document_changes.clone(), + &extractor_sender, + )?; + + extract_and_send_docids::( + index, + &global_fields_ids_map, + GrenadParameters::default(), + document_changes.clone(), + &extractor_sender, + )?; + + extract_and_send_docids::< + WordPairProximityDocidsExtractor, + WordPairProximityDocids, + >( + index, + &global_fields_ids_map, + GrenadParameters::default(), + document_changes.clone(), + &extractor_sender, + )?; + + // TODO THIS IS TOO MUCH + // Extract fieldid docid facet number + // Extract fieldid docid facet string + // Extract facetid string fst + // Extract facetid normalized string strings + + // TODO Inverted Indexes again + // Extract fieldid facet isempty docids + // Extract fieldid facet isnull docids + // Extract fieldid facet exists docids + + // TODO This is the normal system + // Extract fieldid facet number docids + // Extract fieldid facet string docids + Ok(()) as Result<_> - })?; - - extract_and_send_docids::( - index, - &global_fields_ids_map, - GrenadParameters::default(), - document_changes.clone(), - &extractor_sender, - )?; - - extract_and_send_docids::( - index, - &global_fields_ids_map, - GrenadParameters::default(), - document_changes.clone(), - &extractor_sender, - )?; - - extract_and_send_docids::( - index, - &global_fields_ids_map, - GrenadParameters::default(), - document_changes.clone(), - &extractor_sender, - )?; - - extract_and_send_docids::( - index, - &global_fields_ids_map, - GrenadParameters::default(), - document_changes.clone(), - &extractor_sender, - )?; - - // TODO THIS IS TOO MUCH - // Extract fieldid docid facet number - // Extract fieldid docid facet string - // Extract facetid string fst - // Extract facetid normalized string strings - - // TODO Inverted Indexes again - // Extract fieldid facet isempty docids - // Extract fieldid facet isnull docids - // Extract fieldid facet exists docids - - // TODO This is the normal system - // Extract fieldid facet number docids - // Extract fieldid facet string docids 
- - Ok(()) as Result<_> - }) + }) })?; // TODO manage the errors correctly diff --git a/milli/src/update/new/merger.rs b/milli/src/update/new/merger.rs index 25f09441c..19f56a301 100644 --- a/milli/src/update/new/merger.rs +++ b/milli/src/update/new/merger.rs @@ -8,10 +8,7 @@ use memmap2::Mmap; use roaring::RoaringBitmap; use tempfile::tempfile; -use super::channel::{ - DatabaseType, DocidsSender, ExactWordDocids, MergerReceiver, MergerSender, WordDocids, - WordFidDocids, WordPositionDocids, -}; +use super::channel::*; use super::KvReaderDelAdd; use crate::update::del_add::DelAdd; use crate::update::new::channel::MergerOperation; @@ -30,6 +27,29 @@ pub fn merge_grenad_entries( for merger_operation in receiver { match merger_operation { + MergerOperation::ExactWordDocidsMerger(merger) => { + merge_and_send_docids( + merger, + /// TODO do a MergerOperation::database(&Index) -> Database. + index.exact_word_docids.remap_types(), + rtxn, + &mut buffer, + sender.docids::(), + |_key| Ok(()), + |_key| Ok(()), + )?; + } + MergerOperation::FidWordCountDocidsMerger(merger) => { + merge_and_send_docids( + merger, + index.field_id_word_count_docids.remap_types(), + rtxn, + &mut buffer, + sender.docids::(), + |_key| Ok(()), + |_key| Ok(()), + )?; + } MergerOperation::WordDocidsMerger(merger) => { let mut add_words_fst = SetBuilder::new(tempfile()?)?; let mut del_words_fst = SetBuilder::new(tempfile()?)?; @@ -49,17 +69,6 @@ pub fn merge_grenad_entries( let mmap = compute_new_words_fst(add_words_fst, del_words_fst, words_fst)?; sender.main().write_words_fst(mmap).unwrap(); } - MergerOperation::ExactWordDocidsMerger(merger) => { - merge_and_send_docids( - merger, - index.exact_word_docids.remap_types(), - rtxn, - &mut buffer, - sender.docids::(), - |_key| Ok(()), - |_key| Ok(()), - )?; - } MergerOperation::WordFidDocidsMerger(merger) => { merge_and_send_docids( merger, @@ -71,6 +80,17 @@ pub fn merge_grenad_entries( |_key| Ok(()), )?; } + MergerOperation::WordPairProximityDocidsMerger(merger) => { + merge_and_send_docids( + merger, + index.word_pair_proximity_docids.remap_types(), + rtxn, + &mut buffer, + sender.docids::(), + |_key| Ok(()), + |_key| Ok(()), + )?; + } MergerOperation::WordPositionDocidsMerger(merger) => { merge_and_send_docids( merger, From f6b3d1f9a5acf74bc88b32c1fb2bf9970cdd85d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 5 Sep 2024 15:12:07 +0200 Subject: [PATCH 044/247] Increase some channel sizes --- milli/src/update/new/extract/searchable/mod.rs | 2 +- milli/src/update/new/indexer/mod.rs | 18 ++++++++++++------ 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/milli/src/update/new/extract/searchable/mod.rs b/milli/src/update/new/extract/searchable/mod.rs index ba4731d73..a7498d0d9 100644 --- a/milli/src/update/new/extract/searchable/mod.rs +++ b/milli/src/update/new/extract/searchable/mod.rs @@ -66,7 +66,7 @@ pub trait SearchableExtractor { fields_ids_map.clone(), CboCachedSorter::new( // TODO use a better value - 100.try_into().unwrap(), + 1_000_000.try_into().unwrap(), create_sorter( grenad::SortAlgorithm::Stable, MergeDeladdCboRoaringBitmaps, diff --git a/milli/src/update/new/indexer/mod.rs b/milli/src/update/new/indexer/mod.rs index d721a5511..0865374c8 100644 --- a/milli/src/update/new/indexer/mod.rs +++ b/milli/src/update/new/indexer/mod.rs @@ -54,7 +54,7 @@ where PI: IntoParallelIterator> + Send, PI::Iter: Clone, { - let (merger_sender, writer_receiver) = merger_writer_channel(100); + let (merger_sender, writer_receiver) = 
merger_writer_channel(10_000); // This channel acts as a rendezvous point to ensure that we are one task ahead let (extractor_sender, merger_receiver) = extractors_merger_channels(0); @@ -89,10 +89,16 @@ where Ok(()) as Result<_> })?; + const TEN_GIB: usize = 10 * 1024 * 1024 * 1024; + let max_memory = TEN_GIB / dbg!(rayon::current_num_threads()); + let grenad_parameters = GrenadParameters { + max_memory: Some(max_memory), + ..GrenadParameters::default() + }; extract_and_send_docids::( index, &global_fields_ids_map, - GrenadParameters::default(), + grenad_parameters, document_changes.clone(), &extractor_sender, )?; @@ -100,7 +106,7 @@ where extract_and_send_docids::( index, &global_fields_ids_map, - GrenadParameters::default(), + grenad_parameters, document_changes.clone(), &extractor_sender, )?; @@ -108,7 +114,7 @@ where extract_and_send_docids::( index, &global_fields_ids_map, - GrenadParameters::default(), + grenad_parameters, document_changes.clone(), &extractor_sender, )?; @@ -116,7 +122,7 @@ where extract_and_send_docids::( index, &global_fields_ids_map, - GrenadParameters::default(), + grenad_parameters, document_changes.clone(), &extractor_sender, )?; @@ -135,7 +141,7 @@ where >( index, &global_fields_ids_map, - GrenadParameters::default(), + grenad_parameters, document_changes.clone(), &extractor_sender, )?; From 8fd99b111b587714092c9267891343bdb0bebbf7 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 5 Sep 2024 17:36:19 +0200 Subject: [PATCH 045/247] Add tracing timers logs --- milli/src/update/new/extract/faceted/mod.rs | 1 + milli/src/update/new/indexer/mod.rs | 126 +++++++++++++------- milli/src/update/new/merger.rs | 36 +++++- workloads/movies.json | 2 +- 4 files changed, 115 insertions(+), 50 deletions(-) diff --git a/milli/src/update/new/extract/faceted/mod.rs b/milli/src/update/new/extract/faceted/mod.rs index bd58c21b4..62aa7adb2 100644 --- a/milli/src/update/new/extract/faceted/mod.rs +++ b/milli/src/update/new/extract/faceted/mod.rs @@ -17,6 +17,7 @@ mod extract_facets; mod facet_document; pub trait FacetedExtractor { + #[tracing::instrument(level = "trace", skip_all, target = "indexing::extract::faceted")] fn run_extraction( index: &Index, fields_ids_map: &GlobalFieldsIdsMap, diff --git a/milli/src/update/new/indexer/mod.rs b/milli/src/update/new/indexer/mod.rs index 0865374c8..5e3104de8 100644 --- a/milli/src/update/new/indexer/mod.rs +++ b/milli/src/update/new/indexer/mod.rs @@ -63,8 +63,11 @@ where thread::scope(|s| { // TODO manage the errors correctly + let current_span = tracing::Span::current(); let handle = Builder::new().name(S("indexer-extractors")).spawn_scoped(s, move || { pool.in_place_scope(|_s| { + let span = tracing::trace_span!(target: "indexing::documents", parent: ¤t_span, "extract"); + let _entered = span.enter(); let document_changes = document_changes.into_par_iter(); // document but we need to create a function that collects and compresses documents. 
@@ -95,56 +98,85 @@ where max_memory: Some(max_memory), ..GrenadParameters::default() }; - extract_and_send_docids::( - index, - &global_fields_ids_map, - grenad_parameters, - document_changes.clone(), - &extractor_sender, - )?; + { + let span = tracing::trace_span!(target: "indexing::documents::extract", "word_docids"); + let _entered = span.enter(); + extract_and_send_docids::( + index, + &global_fields_ids_map, + grenad_parameters, + document_changes.clone(), + &extractor_sender, + )?; + } - extract_and_send_docids::( - index, - &global_fields_ids_map, - grenad_parameters, - document_changes.clone(), - &extractor_sender, - )?; + { + let span = tracing::trace_span!(target: "indexing::documents::extract", "word_fid_docids"); + let _entered = span.enter(); + extract_and_send_docids::( + index, + &global_fields_ids_map, + grenad_parameters, + document_changes.clone(), + &extractor_sender, + )?; + } + + { + let span = tracing::trace_span!(target: "indexing::documents::extract", "exact_word_docids"); + let _entered = span.enter(); + extract_and_send_docids::( + index, + &global_fields_ids_map, + grenad_parameters, + document_changes.clone(), + &extractor_sender, + )?; + } - extract_and_send_docids::( - index, - &global_fields_ids_map, - grenad_parameters, - document_changes.clone(), - &extractor_sender, - )?; + { + let span = tracing::trace_span!(target: "indexing::documents::extract", "word_position_docids"); + let _entered = span.enter(); + extract_and_send_docids::( + index, + &global_fields_ids_map, + grenad_parameters, + document_changes.clone(), + &extractor_sender, + )?; + } - extract_and_send_docids::( - index, - &global_fields_ids_map, - grenad_parameters, - document_changes.clone(), - &extractor_sender, - )?; + { + let span = tracing::trace_span!(target: "indexing::documents::extract", "fid_word_count_docids"); + let _entered = span.enter(); + extract_and_send_docids::( + index, + &global_fields_ids_map, + GrenadParameters::default(), + document_changes.clone(), + &extractor_sender, + )?; + } - extract_and_send_docids::( - index, - &global_fields_ids_map, - GrenadParameters::default(), - document_changes.clone(), - &extractor_sender, - )?; + { + let span = tracing::trace_span!(target: "indexing::documents::extract", "word_pair_proximity_docids"); + let _entered = span.enter(); + extract_and_send_docids::< + WordPairProximityDocidsExtractor, + WordPairProximityDocids, + >( + index, + &global_fields_ids_map, + grenad_parameters, + document_changes.clone(), + &extractor_sender, + )?; + } - extract_and_send_docids::< - WordPairProximityDocidsExtractor, - WordPairProximityDocids, - >( - index, - &global_fields_ids_map, - grenad_parameters, - document_changes.clone(), - &extractor_sender, - )?; + { + let span = tracing::trace_span!(target: "indexing::documents::extract", "FINISH"); + let _entered = span.enter(); + } // TODO THIS IS TOO MUCH // Extract fieldid docid facet number @@ -166,7 +198,11 @@ where })?; // TODO manage the errors correctly + let current_span = tracing::Span::current(); let handle2 = Builder::new().name(S("indexer-merger")).spawn_scoped(s, move || { + let span = + tracing::trace_span!(target: "indexing::documents", parent: ¤t_span, "merge"); + let _entered = span.enter(); let rtxn = index.read_txn().unwrap(); merge_grenad_entries(merger_receiver, merger_sender, &rtxn, index) })?; diff --git a/milli/src/update/new/merger.rs b/milli/src/update/new/merger.rs index 19f56a301..b38dc0865 100644 --- a/milli/src/update/new/merger.rs +++ b/milli/src/update/new/merger.rs @@ 
-16,6 +16,7 @@ use crate::update::MergeDeladdCboRoaringBitmaps; use crate::{CboRoaringBitmapCodec, Index, Result}; /// TODO We must return some infos/stats +#[tracing::instrument(level = "trace", skip_all, target = "indexing::documents", name = "merge")] pub fn merge_grenad_entries( receiver: MergerReceiver, sender: MergerSender, @@ -28,6 +29,9 @@ pub fn merge_grenad_entries( for merger_operation in receiver { match merger_operation { MergerOperation::ExactWordDocidsMerger(merger) => { + let span = + tracing::trace_span!(target: "indexing::documents::merge", "exact_word_docids"); + let _entered = span.enter(); merge_and_send_docids( merger, /// TODO do a MergerOperation::database(&Index) -> Database. @@ -40,6 +44,8 @@ pub fn merge_grenad_entries( )?; } MergerOperation::FidWordCountDocidsMerger(merger) => { + let span = tracing::trace_span!(target: "indexing::documents::merge", "fid_word_count_docids"); + let _entered = span.enter(); merge_and_send_docids( merger, index.field_id_word_count_docids.remap_types(), @@ -51,6 +57,9 @@ pub fn merge_grenad_entries( )?; } MergerOperation::WordDocidsMerger(merger) => { + let span = + tracing::trace_span!(target: "indexing::documents::merge", "word_docids"); + let _entered = span.enter(); let mut add_words_fst = SetBuilder::new(tempfile()?)?; let mut del_words_fst = SetBuilder::new(tempfile()?)?; @@ -70,6 +79,9 @@ pub fn merge_grenad_entries( sender.main().write_words_fst(mmap).unwrap(); } MergerOperation::WordFidDocidsMerger(merger) => { + let span = + tracing::trace_span!(target: "indexing::documents::merge", "word_fid_docids"); + let _entered = span.enter(); merge_and_send_docids( merger, index.word_fid_docids.remap_types(), @@ -81,6 +93,8 @@ pub fn merge_grenad_entries( )?; } MergerOperation::WordPairProximityDocidsMerger(merger) => { + let span = tracing::trace_span!(target: "indexing::documents::merge", "word_pair_proximity_docids"); + let _entered = span.enter(); merge_and_send_docids( merger, index.word_pair_proximity_docids.remap_types(), @@ -92,6 +106,8 @@ pub fn merge_grenad_entries( )?; } MergerOperation::WordPositionDocidsMerger(merger) => { + let span = tracing::trace_span!(target: "indexing::documents::merge", "word_position_docids"); + let _entered = span.enter(); merge_and_send_docids( merger, index.word_position_docids.remap_types(), @@ -103,10 +119,16 @@ pub fn merge_grenad_entries( )?; } MergerOperation::InsertDocument { docid, document } => { + let span = + tracing::trace_span!(target: "indexing::documents::merge", "insert_document"); + let _entered = span.enter(); documents_ids.insert(docid); sender.documents().uncompressed(docid, &document).unwrap(); } MergerOperation::DeleteDocument { docid } => { + let span = + tracing::trace_span!(target: "indexing::documents::merge", "delete_document"); + let _entered = span.enter(); if !documents_ids.remove(docid) { unreachable!("Tried deleting a document that we do not know about"); } @@ -115,10 +137,15 @@ pub fn merge_grenad_entries( } } - // Send the documents ids unionized with the current one - /// TODO return the slice of bytes directly - serialize_bitmap_into_vec(&documents_ids, &mut buffer); - sender.send_documents_ids(&buffer).unwrap(); + { + let span = tracing::trace_span!(target: "indexing::documents::merge", "documents_ids"); + let _entered = span.enter(); + + // Send the documents ids unionized with the current one + /// TODO return the slice of bytes directly + serialize_bitmap_into_vec(&documents_ids, &mut buffer); + sender.send_documents_ids(&buffer).unwrap(); + } // ... 
@@ -149,6 +176,7 @@ fn compute_new_words_fst( Ok(words_fst_mmap) } +#[tracing::instrument(level = "trace", skip_all, target = "indexing::merge")] fn merge_and_send_docids( merger: Merger, database: Database, diff --git a/workloads/movies.json b/workloads/movies.json index 445ff3aca..9ad3fb7eb 100644 --- a/workloads/movies.json +++ b/workloads/movies.json @@ -1,6 +1,6 @@ { "name": "movies.json", - "run_count": 10, + "run_count": 1, "extra_cli_args": [], "assets": { "movies.json": { From 10f09c531fada3d4a2734eb95f10e360c368eb92 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 5 Sep 2024 18:22:16 +0200 Subject: [PATCH 046/247] add some commented code to read from json with raw values --- .../update/new/indexer/document_operation.rs | 119 ++++++++++++++++++ 1 file changed, 119 insertions(+) diff --git a/milli/src/update/new/indexer/document_operation.rs b/milli/src/update/new/indexer/document_operation.rs index c54ffd140..29f36a82e 100644 --- a/milli/src/update/new/indexer/document_operation.rs +++ b/milli/src/update/new/indexer/document_operation.rs @@ -338,3 +338,122 @@ fn merge_document_for_updates( } } } + +/* + + +use std::{ + borrow::{Borrow, Cow}, + collections::BTreeMap, + ops::Deref, +}; + +use serde::Deserialize; +use serde_json::{value::RawValue, Value}; +/* +#[repr(transparent)] +pub struct my_str(str); + +impl ToOwned for my_str { + type Owned = Box; + + fn to_owned(&self) -> Self::Owned { + self.0.to_string().into_boxed_str() + } +} + +impl Borrow for Box { + fn borrow(&self) -> &my_str { + unsafe { std::mem::transmute(self.as_ref()) } + } +} +*/ + +#[derive(Deserialize, PartialEq, Eq, PartialOrd, Ord)] +pub struct CowKey<'doc>(#[serde(borrow)] Cow<'doc, str>); + +impl<'doc> Borrow for CowKey<'doc> { + fn borrow(&self) -> &str { + self.0.borrow() + } +} + +#[derive(Deserialize)] +pub struct TopLevelMap<'doc>(#[serde(borrow)] BTreeMap, &'doc RawValue>); + +#[derive(Deserialize)] +pub struct FlatDocs<'doc>(#[serde(borrow)] Vec<&'doc RawValue>); + +fn read_docs<'doc>( + ndjson: &'doc str, +) -> impl Iterator, serde_json::Error>> { + serde_json::Deserializer::from_str(ndjson).into_iter::() +} + +fn main() { + let ndjson_data = r#" + { + "id": { + "nested": "kefir" + }, + "name": "Alice", + "age": 30 + } + { + "id": { + "nested": "intel" + }, + "name\n": "Bob", + "age": 22 + } + "#; + + let primary_key: Vec<_> = "id.nested".split('.').collect(); // dynamic + + for doc in read_docs(ndjson_data) { + let doc = doc.unwrap(); + let docid = get_docid(&doc, &primary_key).unwrap().expect("missingno"); + println!("docid={docid}"); + } +} + +pub struct Document<'payload> { + fields: TopLevelMap<'payload>, + docid: String, +} + +/*impl<'payload> Document<'payload> { + pub fn get(name: &str) -> Option<&'payload RawValue> {} + + pub fn get_nested(name: &[&str]) {} +}*/ + +fn get_docid<'payload>( + map: &TopLevelMap<'payload>, + primary_key: &[&str], +) -> serde_json::Result>> { + match primary_key { + [] => unreachable!("arrrgh"), + [primary_key] => match map.0.get(*primary_key) { + Some(value) => { + let value = value.get(); + let value_number: Result = serde_json::from_str(value); + Ok(Some(match value_number { + Ok(value) => CowKey(Cow::Owned(value.to_string())), + Err(_) => serde_json::from_str(value)?, + })) + } + None => Ok(None), + }, + [head, tail @ ..] 
=> match map.0.get(*head) { + Some(value) => { + let map = serde_json::from_str(value.get())?; + get_docid(&map, tail) + } + None => Ok(None), + }, + } +} + + +*/ From 8412be4a7ddadd648c3dca4c8215306aa8c2b0ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 5 Sep 2024 18:32:55 +0200 Subject: [PATCH 047/247] Cleanup CowStr and TopLevelMap struct --- milli/Cargo.toml | 2 +- .../update/new/indexer/document_operation.rs | 113 +++--------------- 2 files changed, 17 insertions(+), 98 deletions(-) diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 7059ed7f5..1fa754069 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -49,7 +49,7 @@ rayon = "1.10.0" roaring = { version = "0.10.6", features = ["serde"] } rstar = { version = "0.12.0", features = ["serde"] } serde = { version = "1.0.204", features = ["derive"] } -serde_json = { version = "1.0.120", features = ["preserve_order"] } +serde_json = { version = "1.0.120", features = ["preserve_order", "raw_value"] } slice-group-by = "0.3.1" smallstr = { version = "0.3.0", features = ["serde"] } smallvec = "1.13.2" diff --git a/milli/src/update/new/indexer/document_operation.rs b/milli/src/update/new/indexer/document_operation.rs index 29f36a82e..c30665f17 100644 --- a/milli/src/update/new/indexer/document_operation.rs +++ b/milli/src/update/new/indexer/document_operation.rs @@ -339,121 +339,40 @@ fn merge_document_for_updates( } } -/* - - -use std::{ - borrow::{Borrow, Cow}, - collections::BTreeMap, - ops::Deref, -}; +use std::borrow::Borrow; use serde::Deserialize; -use serde_json::{value::RawValue, Value}; -/* -#[repr(transparent)] -pub struct my_str(str); +use serde_json::from_str; +use serde_json::value::RawValue; -impl ToOwned for my_str { - type Owned = Box; - - fn to_owned(&self) -> Self::Owned { - self.0.to_string().into_boxed_str() - } -} - -impl Borrow for Box { - fn borrow(&self) -> &my_str { - unsafe { std::mem::transmute(self.as_ref()) } - } -} -*/ +#[derive(Deserialize)] +pub struct TopLevelMap<'p>(#[serde(borrow)] BTreeMap, &'p RawValue>); #[derive(Deserialize, PartialEq, Eq, PartialOrd, Ord)] -pub struct CowKey<'doc>(#[serde(borrow)] Cow<'doc, str>); +pub struct CowStr<'p>(#[serde(borrow)] Cow<'p, str>); -impl<'doc> Borrow for CowKey<'doc> { +impl<'doc> Borrow for CowStr<'doc> { fn borrow(&self) -> &str { self.0.borrow() } } -#[derive(Deserialize)] -pub struct TopLevelMap<'doc>(#[serde(borrow)] BTreeMap, &'doc RawValue>); - -#[derive(Deserialize)] -pub struct FlatDocs<'doc>(#[serde(borrow)] Vec<&'doc RawValue>); - -fn read_docs<'doc>( - ndjson: &'doc str, -) -> impl Iterator, serde_json::Error>> { - serde_json::Deserializer::from_str(ndjson).into_iter::() -} - -fn main() { - let ndjson_data = r#" - { - "id": { - "nested": "kefir" - }, - "name": "Alice", - "age": 30 - } - { - "id": { - "nested": "intel" - }, - "name\n": "Bob", - "age": 22 - } - "#; - - let primary_key: Vec<_> = "id.nested".split('.').collect(); // dynamic - - for doc in read_docs(ndjson_data) { - let doc = doc.unwrap(); - let docid = get_docid(&doc, &primary_key).unwrap().expect("missingno"); - println!("docid={docid}"); - } -} - -pub struct Document<'payload> { - fields: TopLevelMap<'payload>, - docid: String, -} - -/*impl<'payload> Document<'payload> { - pub fn get(name: &str) -> Option<&'payload RawValue> {} - - pub fn get_nested(name: &[&str]) {} -}*/ - -fn get_docid<'payload>( - map: &TopLevelMap<'payload>, +fn get_docid<'p>( + map: &TopLevelMap<'p>, primary_key: &[&str], -) -> serde_json::Result>> { +) -> serde_json::Result>> { 
match primary_key { - [] => unreachable!("arrrgh"), + [] => unreachable!("arrrgh"), // would None be ok? [primary_key] => match map.0.get(*primary_key) { - Some(value) => { - let value = value.get(); - let value_number: Result = serde_json::from_str(value); - Ok(Some(match value_number { - Ok(value) => CowKey(Cow::Owned(value.to_string())), - Err(_) => serde_json::from_str(value)?, - })) - } + Some(value) => match from_str::(value.get()) { + Ok(value) => Ok(Some(CowStr(Cow::Owned(value.to_string())))), + Err(_) => Ok(Some(from_str(value.get())?)), + }, None => Ok(None), }, [head, tail @ ..] => match map.0.get(*head) { - Some(value) => { - let map = serde_json::from_str(value.get())?; - get_docid(&map, tail) - } + Some(value) => get_docid(&from_str(value.get())?, tail), None => Ok(None), }, } } - - -*/ From 72c6a21a30fb3db20be4cc6f10eec4c1f16bb68e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 5 Sep 2024 20:08:23 +0200 Subject: [PATCH 048/247] Use raw JSON to read the payloads --- Cargo.lock | 1 + index-scheduler/Cargo.toml | 1 + index-scheduler/src/batch.rs | 43 ++++- meilisearch/src/routes/indexes/documents.rs | 5 +- .../update/new/indexer/document_operation.rs | 175 ++++++++++-------- 5 files changed, 131 insertions(+), 94 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e169dbd52..6eb12d80f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2570,6 +2570,7 @@ dependencies = [ "meili-snap", "meilisearch-auth", "meilisearch-types", + "memmap2", "page_size", "rayon", "roaring", diff --git a/index-scheduler/Cargo.toml b/index-scheduler/Cargo.toml index 6f099a025..cb37c9151 100644 --- a/index-scheduler/Cargo.toml +++ b/index-scheduler/Cargo.toml @@ -29,6 +29,7 @@ serde_json = { version = "1.0.120", features = ["preserve_order"] } synchronoise = "1.0.1" tempfile = "3.10.1" thiserror = "1.0.61" +memmap2 = "0.9.4" time = { version = "0.3.36", features = [ "serde-well-known", "formatting", diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs index 129dbec10..ba99eb418 100644 --- a/index-scheduler/src/batch.rs +++ b/index-scheduler/src/batch.rs @@ -18,6 +18,7 @@ one indexing operation. 
*/ use std::collections::{BTreeSet, HashSet}; +use std::env::VarError; use std::ffi::OsStr; use std::fmt; use std::fs::{self, File}; @@ -26,7 +27,7 @@ use std::io::BufWriter; use dump::IndexMetadata; use meilisearch_types::error::Code; use meilisearch_types::heed::{RoTxn, RwTxn}; -use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader}; +use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader, PrimaryKey}; use meilisearch_types::milli::heed::CompactionOption; use meilisearch_types::milli::update::new::indexer::{self, guess_primary_key, DocumentChanges}; use meilisearch_types::milli::update::{ @@ -1294,19 +1295,30 @@ impl IndexScheduler { _ => None, }) .unwrap(); - let content_file = self.file_store.get_update(*first_addition_uuid)?; - let reader = - DocumentsBatchReader::from_reader(content_file).map_err(milli::Error::from)?; - let (cursor, documents_batch_index) = reader.into_cursor_and_fields_index(); - let primary_key = - guess_primary_key(&rtxn, index, cursor, &documents_batch_index)?.unwrap(); + // let content_file = self.file_store.get_update(*first_addition_uuid)?; + // let reader = + // DocumentsBatchReader::from_reader(content_file).map_err(milli::Error::from)?; + // let (cursor, documents_batch_index) = reader.into_cursor_and_fields_index(); + // let primary_key = + // guess_primary_key(&rtxn, index, cursor, &documents_batch_index)?.unwrap(); + + let mut content_files = Vec::new(); + for operation in &operations { + if let DocumentOperation::Add(content_uuid) = operation { + let content_file = self.file_store.get_update(*content_uuid)?; + let mmap = unsafe { memmap2::Mmap::map(&content_file)? }; + content_files.push(mmap); + } + } + + let mut content_files_iter = content_files.iter(); let mut indexer = indexer::DocumentOperation::new(method); for (operation, task) in operations.into_iter().zip(tasks.iter_mut()) { match operation { - DocumentOperation::Add(content_uuid) => { - let content_file = self.file_store.get_update(content_uuid)?; - let stats = indexer.add_documents(content_file)?; + DocumentOperation::Add(_content_uuid) => { + let mmap = content_files_iter.next().unwrap(); + let stats = indexer.add_documents(&mmap)?; // builder = builder.with_embedders(embedders.clone()); let received_documents = @@ -1357,6 +1369,17 @@ impl IndexScheduler { // let pool = indexer_config.thread_pool.unwrap(); let pool = rayon::ThreadPoolBuilder::new().build().unwrap(); // let fields_ids_map = RwLock::new(fields_ids_map); + + /// TODO correctly guess the primary key in a NDJSON + let pk = match std::env::var("MEILI_PRIMARY_KEY") { + Ok(pk) => pk, + Err(VarError::NotPresent) => "id".to_string(), + Err(e) => panic!("primary key error: {e}"), + }; + + fields_ids_map.insert(&pk); + let primary_key = PrimaryKey::new(&pk, &fields_ids_map).unwrap(); + let param = (index, &rtxn, &primary_key); let document_changes = indexer.document_changes(&mut fields_ids_map, param)?; /// TODO pass/write the FieldsIdsMap diff --git a/meilisearch/src/routes/indexes/documents.rs b/meilisearch/src/routes/indexes/documents.rs index 85cf33c54..029a125d0 100644 --- a/meilisearch/src/routes/indexes/documents.rs +++ b/meilisearch/src/routes/indexes/documents.rs @@ -459,12 +459,13 @@ async fn document_addition( return Err(MeilisearchHttpError::Payload(ReceivePayload(Box::new(e)))); } - let read_file = buffer.into_inner().into_std().await; + let mut read_file = buffer.into_inner().into_std().await; let documents_count = tokio::task::spawn_blocking(move || { let documents_count = 
match format { PayloadType::Json => read_json(&read_file, &mut update_file)?, PayloadType::Csv { delimiter } => read_csv(&read_file, &mut update_file, delimiter)?, - PayloadType::Ndjson => read_ndjson(&read_file, &mut update_file)?, + /// TODO do not copy all the content + PayloadType::Ndjson => std::io::copy(&mut read_file, &mut update_file).unwrap(), }; // we NEED to persist the file here because we moved the `udpate_file` in another task. update_file.persist()?; diff --git a/milli/src/update/new/indexer/document_operation.rs b/milli/src/update/new/indexer/document_operation.rs index c30665f17..93e051aa2 100644 --- a/milli/src/update/new/indexer/document_operation.rs +++ b/milli/src/update/new/indexer/document_operation.rs @@ -1,31 +1,26 @@ use std::borrow::Cow; use std::collections::{BTreeMap, HashMap}; -use std::fs::File; -use std::io::Cursor; use std::sync::Arc; use heed::types::Bytes; use heed::RoTxn; -use memmap2::Mmap; use rayon::iter::{IntoParallelIterator, ParallelIterator}; use super::super::document_change::DocumentChange; use super::super::items_pool::ItemsPool; use super::DocumentChanges; -use crate::documents::{ - obkv_to_object, DocumentIdExtractionError, DocumentsBatchReader, PrimaryKey, -}; +use crate::documents::PrimaryKey; use crate::update::new::{Deletion, Insertion, KvReaderFieldId, KvWriterFieldId, Update}; use crate::update::{AvailableIds, IndexDocumentsMethod}; use crate::{DocumentId, Error, FieldsIdsMap, Index, Result, UserError}; -pub struct DocumentOperation { - operations: Vec, +pub struct DocumentOperation<'pl> { + operations: Vec>, index_documents_method: IndexDocumentsMethod, } -pub enum Payload { - Addition(File), +pub enum Payload<'pl> { + Addition(&'pl [u8]), Deletion(Vec), } @@ -35,36 +30,30 @@ pub struct PayloadStats { } #[derive(Clone)] -enum InnerDocOp { - Addition(DocumentOffset), +enum InnerDocOp<'pl> { + Addition(DocumentOffset<'pl>), Deletion, } /// Represents an offset where a document lives /// in an mmapped grenad reader file. #[derive(Clone)] -pub struct DocumentOffset { - /// The mmapped grenad reader file. - pub content: Arc, // grenad::Reader - /// The offset of the document in the file. - pub offset: u32, +pub struct DocumentOffset<'pl> { + /// The mmapped payload files. 
+ pub content: &'pl [u8], } -impl DocumentOperation { +impl<'pl> DocumentOperation<'pl> { pub fn new(method: IndexDocumentsMethod) -> Self { Self { operations: Default::default(), index_documents_method: method } } /// TODO please give me a type /// The payload is expected to be in the grenad format - pub fn add_documents(&mut self, payload: File) -> Result { - let reader = DocumentsBatchReader::from_reader(&payload)?; - let bytes = payload.metadata()?.len(); - let document_count = reader.documents_count() as usize; - + pub fn add_documents(&mut self, payload: &'pl [u8]) -> Result { + let document_count = memchr::Memchr::new(b'\n', payload).count(); self.operations.push(Payload::Addition(payload)); - - Ok(PayloadStats { bytes, document_count }) + Ok(PayloadStats { bytes: payload.len() as u64, document_count }) } pub fn delete_documents(&mut self, to_delete: Vec) { @@ -72,7 +61,7 @@ impl DocumentOperation { } } -impl<'p> DocumentChanges<'p> for DocumentOperation { +impl<'p, 'pl: 'p> DocumentChanges<'p> for DocumentOperation<'pl> { type Parameter = (&'p Index, &'p RoTxn<'p>, &'p PrimaryKey<'p>); fn document_changes( @@ -84,48 +73,63 @@ impl<'p> DocumentChanges<'p> for DocumentOperation { let documents_ids = index.documents_ids(rtxn)?; let mut available_docids = AvailableIds::new(&documents_ids); - let mut docids_version_offsets = HashMap::::new(); + let mut docids_version_offsets = HashMap::, _>::new(); for operation in self.operations { match operation { Payload::Addition(payload) => { - let content = unsafe { Mmap::map(&payload).map(Arc::new)? }; - let cursor = Cursor::new(content.as_ref()); - let reader = DocumentsBatchReader::from_reader(cursor)?; + let mut iter = + serde_json::Deserializer::from_slice(payload).into_iter::(); - let (mut batch_cursor, batch_index) = reader.into_cursor_and_fields_index(); - // TODO Fetch all document fields to fill the fields ids map - batch_index.iter().for_each(|(_, name)| { - fields_ids_map.insert(name); - }); + /// TODO manage the error + let mut previous_offset = 0; + while let Some(document) = iter.next().transpose().unwrap() { + // TODO Fetch all document fields to fill the fields ids map + document.0.keys().for_each(|key| { + fields_ids_map.insert(key.as_ref()); + }); - let mut offset: u32 = 0; - while let Some(document) = batch_cursor.next_document()? { + // TODO we must manage the TooManyDocumentIds,InvalidDocumentId + // we must manage the unwrap let external_document_id = - match primary_key.document_id(document, &batch_index)? 
{ - Ok(document_id) => Ok(document_id), - Err(DocumentIdExtractionError::InvalidDocumentId(user_error)) => { - Err(user_error) - } - Err(DocumentIdExtractionError::MissingDocumentId) => { - Err(UserError::MissingDocumentId { + match get_docid(&document, &[primary_key.name()]).unwrap() { + Some(document_id) => document_id, + None => { + return Err(UserError::MissingDocumentId { primary_key: primary_key.name().to_string(), - document: obkv_to_object(document, &batch_index)?, - }) + document: todo!(), + // document: obkv_to_object(document, &batch_index)?, + } + .into()); } - Err(DocumentIdExtractionError::TooManyDocumentIds(_)) => { - Err(UserError::TooManyDocumentIds { - primary_key: primary_key.name().to_string(), - document: obkv_to_object(document, &batch_index)?, - }) - } - }?; + }; - let content = content.clone(); - let document_offset = DocumentOffset { content, offset }; - let document_operation = InnerDocOp::Addition(document_offset); + // let external_document_id = + // match primary_key.document_id(document, &batch_index)? { + // Ok(document_id) => Ok(document_id), + // Err(DocumentIdExtractionError::InvalidDocumentId(user_error)) => { + // Err(user_error) + // } + // Err(DocumentIdExtractionError::MissingDocumentId) => { + // Err(UserError::MissingDocumentId { + // primary_key: primary_key.name().to_string(), + // document: obkv_to_object(document, &batch_index)?, + // }) + // } + // Err(DocumentIdExtractionError::TooManyDocumentIds(_)) => { + // Err(UserError::TooManyDocumentIds { + // primary_key: primary_key.name().to_string(), + // document: obkv_to_object(document, &batch_index)?, + // }) + // } + // }?; - match docids_version_offsets.get_mut(&external_document_id) { + let current_offset = iter.byte_offset(); + let document_operation = InnerDocOp::Addition(DocumentOffset { + content: &payload[previous_offset..current_offset], + }); + + match docids_version_offsets.get_mut(external_document_id.as_ref()) { None => { let docid = match index .external_documents_ids() @@ -144,12 +148,13 @@ impl<'p> DocumentChanges<'p> for DocumentOperation { } Some((_, offsets)) => offsets.push(document_operation), } - offset += 1; + + previous_offset = iter.byte_offset(); } } Payload::Deletion(to_delete) => { for external_document_id in to_delete { - match docids_version_offsets.get_mut(&external_document_id) { + match docids_version_offsets.get_mut(external_document_id.as_str()) { None => { let docid = match index .external_documents_ids() @@ -162,7 +167,7 @@ impl<'p> DocumentChanges<'p> for DocumentOperation { }; docids_version_offsets.insert( - external_document_id, + CowStr(external_document_id.into()), (docid, vec![InnerDocOp::Deletion]), ); } @@ -196,7 +201,7 @@ impl<'p> DocumentChanges<'p> for DocumentOperation { index, &fields_ids_map, internal_docid, - external_docid, + external_docid.to_string(), // TODO do not clone &operations, ) }) @@ -221,22 +226,20 @@ fn merge_document_for_replacements( let current: Option<&KvReaderFieldId> = current.map(Into::into); match operations.last() { - Some(InnerDocOp::Addition(DocumentOffset { content, offset })) => { - let reader = DocumentsBatchReader::from_reader(Cursor::new(content.as_ref()))?; - let (mut cursor, batch_index) = reader.into_cursor_and_fields_index(); - let update = cursor.get(*offset)?.expect("must exists"); - + Some(InnerDocOp::Addition(DocumentOffset { content })) => { + let map: TopLevelMap = serde_json::from_slice(content).unwrap(); let mut document_entries = Vec::new(); - update.into_iter().for_each(|(k, v)| { - let field_name = 
batch_index.name(k).unwrap(); - let id = fields_ids_map.id(field_name).unwrap(); + for (key, v) in map.0 { + let id = fields_ids_map.id(key.as_ref()).unwrap(); document_entries.push((id, v)); - }); + } document_entries.sort_unstable_by_key(|(id, _)| *id); let mut writer = KvWriterFieldId::memory(); - document_entries.into_iter().for_each(|(id, value)| writer.insert(id, value).unwrap()); + document_entries + .into_iter() + .for_each(|(id, value)| writer.insert(id, value.get()).unwrap()); let new = writer.into_boxed(); match current { @@ -305,22 +308,18 @@ fn merge_document_for_updates( } for operation in operations { - let DocumentOffset { content, offset } = match operation { + let DocumentOffset { content } = match operation { InnerDocOp::Addition(offset) => offset, InnerDocOp::Deletion => { unreachable!("Deletion in document operations") } }; - let reader = DocumentsBatchReader::from_reader(Cursor::new(content.as_ref()))?; - let (mut cursor, batch_index) = reader.into_cursor_and_fields_index(); - let update = cursor.get(*offset)?.expect("must exists"); - - update.into_iter().for_each(|(k, v)| { - let field_name = batch_index.name(k).unwrap(); - let id = fields_ids_map.id(field_name).unwrap(); - document.insert(id, v.to_vec().into()); - }); + let map: TopLevelMap = serde_json::from_slice(content).unwrap(); + for (key, v) in map.0 { + let id = fields_ids_map.id(key.as_ref()).unwrap(); + document.insert(id, v.get().as_bytes().to_vec().into()); + } } let mut writer = KvWriterFieldId::memory(); @@ -348,9 +347,21 @@ use serde_json::value::RawValue; #[derive(Deserialize)] pub struct TopLevelMap<'p>(#[serde(borrow)] BTreeMap, &'p RawValue>); -#[derive(Deserialize, PartialEq, Eq, PartialOrd, Ord)] +#[derive(Deserialize, PartialEq, Eq, PartialOrd, Ord, Hash, Clone)] pub struct CowStr<'p>(#[serde(borrow)] Cow<'p, str>); +impl CowStr<'_> { + fn to_string(&self) -> String { + self.0.to_string() + } +} + +impl AsRef for CowStr<'_> { + fn as_ref(&self) -> &str { + self.0.as_ref() + } +} + impl<'doc> Borrow for CowStr<'doc> { fn borrow(&self) -> &str { self.0.borrow() From 8fd0afaaaa97cbce6ed108d2d2a289206a4ef864 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 5 Sep 2024 22:31:17 +0200 Subject: [PATCH 049/247] Make sure we iterate over the payload documents in order --- index-scheduler/src/batch.rs | 2 +- .../update/new/indexer/document_operation.rs | 21 +++++++++++++++---- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs index ba99eb418..506ba6581 100644 --- a/index-scheduler/src/batch.rs +++ b/index-scheduler/src/batch.rs @@ -1318,7 +1318,7 @@ impl IndexScheduler { match operation { DocumentOperation::Add(_content_uuid) => { let mmap = content_files_iter.next().unwrap(); - let stats = indexer.add_documents(&mmap)?; + let stats = indexer.add_documents(mmap)?; // builder = builder.with_embedders(embedders.clone()); let received_documents = diff --git a/milli/src/update/new/indexer/document_operation.rs b/milli/src/update/new/indexer/document_operation.rs index 93e051aa2..3cbaf836d 100644 --- a/milli/src/update/new/indexer/document_operation.rs +++ b/milli/src/update/new/indexer/document_operation.rs @@ -4,6 +4,7 @@ use std::sync::Arc; use heed::types::Bytes; use heed::RoTxn; +use memmap2::Mmap; use rayon::iter::{IntoParallelIterator, ParallelIterator}; use super::super::document_change::DocumentChange; @@ -50,9 +51,10 @@ impl<'pl> DocumentOperation<'pl> { /// TODO please give me a type /// The payload 
is expected to be in the grenad format - pub fn add_documents(&mut self, payload: &'pl [u8]) -> Result { - let document_count = memchr::Memchr::new(b'\n', payload).count(); - self.operations.push(Payload::Addition(payload)); + pub fn add_documents(&mut self, payload: &'pl Mmap) -> Result { + payload.advise(memmap2::Advice::Sequential)?; + let document_count = memchr::Memchr::new(b'\n', &payload[..]).count(); + self.operations.push(Payload::Addition(&payload[..])); Ok(PayloadStats { bytes: payload.len() as u64, document_count }) } @@ -181,7 +183,18 @@ impl<'p, 'pl: 'p> DocumentChanges<'p> for DocumentOperation<'pl> { /// TODO is it the best way to provide FieldsIdsMap to the parallel iterator? let fields_ids_map = fields_ids_map.clone(); // TODO We must drain the HashMap into a Vec because rayon::hash_map::IntoIter: !Clone - let docids_version_offsets: Vec<_> = docids_version_offsets.drain().collect(); + let mut docids_version_offsets: Vec<_> = docids_version_offsets.drain().collect(); + // Reorder the offsets to make sure we iterate on the file sequentially + docids_version_offsets.sort_unstable_by_key(|(_, (_, offsets))| { + offsets + .iter() + .rev() + .find_map(|ido| match ido { + InnerDocOp::Addition(add) => Some(add.content.as_ptr() as usize), + InnerDocOp::Deletion => None, + }) + .unwrap_or(0) + }); Ok(docids_version_offsets .into_par_iter() From f69688e8f7f23bb524cf1f4a9ff3e0ce1f21fb24 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 9 Sep 2024 14:52:50 +0200 Subject: [PATCH 050/247] Fix several warnings in extractors and remove unreachable macros --- milli/src/update/new/extract/faceted/mod.rs | 4 +- .../extract_fid_word_count_docids.rs | 19 +--- .../extract/searchable/extract_word_docids.rs | 94 +++++++++++++++++-- .../extract_word_pair_proximity_docids.rs | 15 +-- .../src/update/new/extract/searchable/mod.rs | 55 +---------- 5 files changed, 100 insertions(+), 87 deletions(-) diff --git a/milli/src/update/new/extract/faceted/mod.rs b/milli/src/update/new/extract/faceted/mod.rs index 62aa7adb2..b4d6b4131 100644 --- a/milli/src/update/new/extract/faceted/mod.rs +++ b/milli/src/update/new/extract/faceted/mod.rs @@ -87,11 +87,11 @@ pub trait FacetedExtractor { where MF: MergeFunction, MF::Error: Debug, + grenad::Error: Into, { buffer.clear(); match Self::build_key(fid, value, buffer) { - // TODO manage errors - Some(key) => Ok(cache_fn(cached_sorter, &key, docid).unwrap()), + Some(key) => cache_fn(cached_sorter, &key, docid).map_err(Into::into), None => Ok(()), } } diff --git a/milli/src/update/new/extract/searchable/extract_fid_word_count_docids.rs b/milli/src/update/new/extract/searchable/extract_fid_word_count_docids.rs index 7cb11c11b..4d90b46d4 100644 --- a/milli/src/update/new/extract/searchable/extract_fid_word_count_docids.rs +++ b/milli/src/update/new/extract/searchable/extract_fid_word_count_docids.rs @@ -1,4 +1,3 @@ -use std::borrow::Cow; use std::collections::HashMap; use heed::RoTxn; @@ -25,12 +24,6 @@ impl SearchableExtractor for FidWordCountDocidsExtractor { Ok(vec![]) } - /// This case is unreachable because extract_document_change has been reimplemented to not call this function. - fn build_key(_field_id: FieldId, _position: u16, _word: &str) -> Cow<[u8]> { - /// TODO remove this - unreachable!() - } - // This method is reimplemented to count the number of words in the document in each field // and to store the docids of the documents that have a number of words in a given field equal to or under than MAX_COUNTED_WORDS. 
fn extract_document_change( @@ -59,8 +52,7 @@ impl SearchableExtractor for FidWordCountDocidsExtractor { for (fid, count) in fid_word_count.iter() { if *count <= MAX_COUNTED_WORDS { let key = build_key(*fid, *count as u8, &mut key_buffer); - /// TODO manage the error - cached_sorter.insert_del_u32(key, inner.docid()).unwrap(); + cached_sorter.insert_del_u32(key, inner.docid())?; } } } @@ -93,13 +85,11 @@ impl SearchableExtractor for FidWordCountDocidsExtractor { if *current_count != *new_count { if *current_count <= MAX_COUNTED_WORDS { let key = build_key(*fid, *current_count as u8, &mut key_buffer); - /// TODO manage the error - cached_sorter.insert_del_u32(key, inner.docid()).unwrap(); + cached_sorter.insert_del_u32(key, inner.docid())?; } if *new_count <= MAX_COUNTED_WORDS { let key = build_key(*fid, *new_count as u8, &mut key_buffer); - /// TODO manage the error - cached_sorter.insert_add_u32(key, inner.docid()).unwrap(); + cached_sorter.insert_add_u32(key, inner.docid())?; } } } @@ -116,8 +106,7 @@ impl SearchableExtractor for FidWordCountDocidsExtractor { for (fid, count) in fid_word_count.iter() { if *count <= MAX_COUNTED_WORDS { let key = build_key(*fid, *count as u8, &mut key_buffer); - /// TODO manage the error - cached_sorter.insert_add_u32(key, inner.docid()).unwrap(); + cached_sorter.insert_add_u32(key, inner.docid())?; } } } diff --git a/milli/src/update/new/extract/searchable/extract_word_docids.rs b/milli/src/update/new/extract/searchable/extract_word_docids.rs index db8bb7993..0cf36cf00 100644 --- a/milli/src/update/new/extract/searchable/extract_word_docids.rs +++ b/milli/src/update/new/extract/searchable/extract_word_docids.rs @@ -2,11 +2,93 @@ use std::borrow::Cow; use heed::RoTxn; -use super::SearchableExtractor; -use crate::{bucketed_position, FieldId, Index, Result}; +use super::{tokenize_document::DocumentTokenizer, SearchableExtractor}; +use crate::{ + bucketed_position, + update::{ + new::{extract::cache::CboCachedSorter, DocumentChange}, + MergeDeladdCboRoaringBitmaps, + }, + FieldId, GlobalFieldsIdsMap, Index, Result, +}; + +trait ProtoWordDocidsExtractor { + fn build_key(field_id: FieldId, position: u16, word: &str) -> Cow<'_, [u8]>; + fn attributes_to_extract<'a>( + _rtxn: &'a RoTxn, + _index: &'a Index, + ) -> Result>>; + + fn attributes_to_skip<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result>; +} + +impl SearchableExtractor for T +where + T: ProtoWordDocidsExtractor, +{ + fn extract_document_change( + rtxn: &RoTxn, + index: &Index, + document_tokenizer: &DocumentTokenizer, + fields_ids_map: &mut GlobalFieldsIdsMap, + cached_sorter: &mut CboCachedSorter, + document_change: DocumentChange, + ) -> Result<()> { + match document_change { + DocumentChange::Deletion(inner) => { + let mut token_fn = |fid, pos: u16, word: &str| { + let key = Self::build_key(fid, pos, word); + cached_sorter.insert_del_u32(&key, inner.docid()).map_err(crate::Error::from) + }; + document_tokenizer.tokenize_document( + inner.current(rtxn, index)?.unwrap(), + fields_ids_map, + &mut token_fn, + )?; + } + DocumentChange::Update(inner) => { + let mut token_fn = |fid, pos, word: &str| { + let key = Self::build_key(fid, pos, word); + cached_sorter.insert_del_u32(&key, inner.docid()).map_err(crate::Error::from) + }; + document_tokenizer.tokenize_document( + inner.current(rtxn, index)?.unwrap(), + fields_ids_map, + &mut token_fn, + )?; + + let mut token_fn = |fid, pos, word: &str| { + let key = Self::build_key(fid, pos, word); + cached_sorter.insert_add_u32(&key, 
inner.docid()).map_err(crate::Error::from) + }; + document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?; + } + DocumentChange::Insertion(inner) => { + let mut token_fn = |fid, pos, word: &str| { + let key = Self::build_key(fid, pos, word); + cached_sorter.insert_add_u32(&key, inner.docid()).map_err(crate::Error::from) + }; + document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?; + } + } + + Ok(()) + } + + fn attributes_to_extract<'a>( + rtxn: &'a RoTxn, + index: &'a Index, + ) -> Result>> { + Self::attributes_to_extract(rtxn, index) + } + + fn attributes_to_skip<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result> { + Self::attributes_to_skip(rtxn, index) + } +} pub struct WordDocidsExtractor; -impl SearchableExtractor for WordDocidsExtractor { +impl ProtoWordDocidsExtractor for WordDocidsExtractor { fn attributes_to_extract<'a>( rtxn: &'a RoTxn, index: &'a Index, @@ -26,7 +108,7 @@ impl SearchableExtractor for WordDocidsExtractor { } pub struct ExactWordDocidsExtractor; -impl SearchableExtractor for ExactWordDocidsExtractor { +impl ProtoWordDocidsExtractor for ExactWordDocidsExtractor { fn attributes_to_extract<'a>( rtxn: &'a RoTxn, index: &'a Index, @@ -55,7 +137,7 @@ impl SearchableExtractor for ExactWordDocidsExtractor { } pub struct WordFidDocidsExtractor; -impl SearchableExtractor for WordFidDocidsExtractor { +impl ProtoWordDocidsExtractor for WordFidDocidsExtractor { fn attributes_to_extract<'a>( rtxn: &'a RoTxn, index: &'a Index, @@ -77,7 +159,7 @@ impl SearchableExtractor for WordFidDocidsExtractor { } pub struct WordPositionDocidsExtractor; -impl SearchableExtractor for WordPositionDocidsExtractor { +impl ProtoWordDocidsExtractor for WordPositionDocidsExtractor { fn attributes_to_extract<'a>( rtxn: &'a RoTxn, index: &'a Index, diff --git a/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs b/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs index e9de6e9f2..dbd08901b 100644 --- a/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs +++ b/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs @@ -1,4 +1,3 @@ -use std::borrow::Cow; use std::collections::{BTreeMap, VecDeque}; use heed::RoTxn; @@ -26,12 +25,6 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { Ok(vec![]) } - /// This case is unreachable because extract_document_change has been reimplemented to not call this function. - fn build_key(_field_id: FieldId, _position: u16, _word: &str) -> Cow<[u8]> { - /// TODO remove this - unreachable!() - } - // This method is reimplemented to count the number of words in the document in each field // and to store the docids of the documents that have a number of words in a given field equal to or under than MAX_COUNTED_WORDS. 
fn extract_document_change( @@ -100,18 +93,18 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { match eob { Left(((w1, w2), prox)) => { let key = build_key(*prox, w1, w2, &mut key_buffer); - cached_sorter.insert_del_u32(key, docid).unwrap(); + cached_sorter.insert_del_u32(key, docid)?; } Right(((w1, w2), prox)) => { let key = build_key(*prox, w1, w2, &mut key_buffer); - cached_sorter.insert_add_u32(key, docid).unwrap(); + cached_sorter.insert_add_u32(key, docid)?; } Both(((w1, w2), del_prox), (_, add_prox)) => { if del_prox != add_prox { let key = build_key(*del_prox, w1, w2, &mut key_buffer); - cached_sorter.insert_del_u32(key, docid).unwrap(); + cached_sorter.insert_del_u32(key, docid)?; let key = build_key(*add_prox, w1, w2, &mut key_buffer); - cached_sorter.insert_add_u32(key, docid).unwrap(); + cached_sorter.insert_add_u32(key, docid)?; } } }; diff --git a/milli/src/update/new/extract/searchable/mod.rs b/milli/src/update/new/extract/searchable/mod.rs index a7498d0d9..c3ac30b17 100644 --- a/milli/src/update/new/extract/searchable/mod.rs +++ b/milli/src/update/new/extract/searchable/mod.rs @@ -3,7 +3,6 @@ mod extract_word_docids; mod extract_word_pair_proximity_docids; mod tokenize_document; -use std::borrow::Cow; use std::fs::File; pub use extract_fid_word_count_docids::FidWordCountDocidsExtractor; @@ -20,7 +19,7 @@ use tokenize_document::{tokenizer_builder, DocumentTokenizer}; use super::cache::CboCachedSorter; use crate::update::new::{DocumentChange, ItemsPool}; use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps}; -use crate::{FieldId, GlobalFieldsIdsMap, Index, Result, MAX_POSITION_PER_ATTRIBUTE}; +use crate::{GlobalFieldsIdsMap, Index, Result, MAX_POSITION_PER_ATTRIBUTE}; pub trait SearchableExtractor { fn run_extraction( @@ -109,60 +108,10 @@ pub trait SearchableExtractor { fields_ids_map: &mut GlobalFieldsIdsMap, cached_sorter: &mut CboCachedSorter, document_change: DocumentChange, - ) -> Result<()> { - match document_change { - DocumentChange::Deletion(inner) => { - let mut token_fn = |fid, pos: u16, word: &str| { - let key = Self::build_key(fid, pos, word); - /// TODO manage the error - cached_sorter.insert_del_u32(&key, inner.docid()).unwrap(); - Ok(()) - }; - document_tokenizer.tokenize_document( - inner.current(rtxn, index)?.unwrap(), - fields_ids_map, - &mut token_fn, - )?; - } - DocumentChange::Update(inner) => { - let mut token_fn = |fid, pos, word: &str| { - let key = Self::build_key(fid, pos, word); - /// TODO manage the error - cached_sorter.insert_del_u32(&key, inner.docid()).unwrap(); - Ok(()) - }; - document_tokenizer.tokenize_document( - inner.current(rtxn, index)?.unwrap(), - fields_ids_map, - &mut token_fn, - )?; - - let mut token_fn = |fid, pos, word: &str| { - let key = Self::build_key(fid, pos, word); - /// TODO manage the error - cached_sorter.insert_add_u32(&key, inner.docid()).unwrap(); - Ok(()) - }; - document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?; - } - DocumentChange::Insertion(inner) => { - let mut token_fn = |fid, pos, word: &str| { - let key = Self::build_key(fid, pos, word); - /// TODO manage the error - cached_sorter.insert_add_u32(&key, inner.docid()).unwrap(); - Ok(()) - }; - document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?; - } - } - - Ok(()) - } + ) -> Result<()>; fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result>>; fn attributes_to_skip<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result>; - - fn 
build_key(field_id: FieldId, position: u16, word: &str) -> Cow<'_, [u8]>; } From 8d97b7b28cbab9444e023b5e3cf822151a8b0127 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 10 Sep 2024 17:09:49 +0100 Subject: [PATCH 051/247] Support JSON payloads again (not perfectly though) --- meilisearch-types/src/document_formats.rs | 91 +++++++++++---------- meilisearch/src/routes/indexes/documents.rs | 19 +++-- 2 files changed, 60 insertions(+), 50 deletions(-) diff --git a/meilisearch-types/src/document_formats.rs b/meilisearch-types/src/document_formats.rs index 50dc5bad4..942203b68 100644 --- a/meilisearch-types/src/document_formats.rs +++ b/meilisearch-types/src/document_formats.rs @@ -1,10 +1,9 @@ use std::fmt::{self, Debug, Display}; use std::fs::File; -use std::io::{self, BufWriter, Write}; +use std::io::{self, BufReader, BufWriter, Write}; use std::marker::PhantomData; -use memmap2::MmapOptions; -use milli::documents::{DocumentsBatchBuilder, Error}; +use milli::documents::Error; use milli::Object; use serde::de::{SeqAccess, Visitor}; use serde::{Deserialize, Deserializer}; @@ -104,29 +103,35 @@ impl ErrorCode for DocumentFormatError { } /// Reads CSV from input and write an obkv batch to writer. -pub fn read_csv(file: &File, writer: impl Write, delimiter: u8) -> Result { - let mut builder = DocumentsBatchBuilder::new(BufWriter::new(writer)); - let mmap = unsafe { MmapOptions::new().map(file)? }; - let csv = csv::ReaderBuilder::new().delimiter(delimiter).from_reader(mmap.as_ref()); - builder.append_csv(csv).map_err(|e| (PayloadType::Csv { delimiter }, e))?; +pub fn read_csv( + _input: BufReader, + _output: &mut BufWriter, + _delimiter: u8, +) -> Result { + todo!() + // let mut builder = DocumentsBatchBuilder::new(BufWriter::new(output)); + // let mmap = unsafe { MmapOptions::new().map(input)? }; + // let csv = csv::ReaderBuilder::new().delimiter(delimiter).from_reader(mmap.as_ref()); + // builder.append_csv(csv).map_err(|e| (PayloadType::Csv { delimiter }, e))?; - let count = builder.documents_count(); - let _ = builder.into_inner().map_err(DocumentFormatError::Io)?; + // let count = builder.documents_count(); + // let _ = builder.into_inner().map_err(DocumentFormatError::Io)?; - Ok(count as u64) + // Ok(count as u64) } /// Reads JSON from temporary file and write an obkv batch to writer. -pub fn read_json(file: &File, writer: impl Write) -> Result { - let mut builder = DocumentsBatchBuilder::new(BufWriter::new(writer)); - let mmap = unsafe { MmapOptions::new().map(file)? }; - let mut deserializer = serde_json::Deserializer::from_slice(&mmap); - - match array_each(&mut deserializer, |obj| builder.append_json_object(&obj)) { +pub fn read_json(input: BufReader, mut output: &mut BufWriter) -> Result { + let mut count = 0; + let mut deserializer = serde_json::Deserializer::from_reader(input); + match array_each(&mut deserializer, |obj: Object| { + count += 1; + serde_json::to_writer(&mut output, &obj) + }) { // The json data has been deserialized and does not need to be processed again. // The data has been transferred to the writer during the deserialization process. 
Ok(Ok(_)) => (), - Ok(Err(e)) => return Err(DocumentFormatError::Io(e)), + Ok(Err(e)) => return Err(DocumentFormatError::Io(e.into())), Err(e) => { // Attempt to deserialize a single json string when the cause of the exception is not Category.data // Other types of deserialisation exceptions are returned directly to the front-end @@ -137,33 +142,30 @@ pub fn read_json(file: &File, writer: impl Write) -> Result { )); } - let content: Object = serde_json::from_slice(&mmap) - .map_err(Error::Json) - .map_err(|e| (PayloadType::Json, e))?; - builder.append_json_object(&content).map_err(DocumentFormatError::Io)?; + todo!("single document/object update") + + // let content: Object = serde_json::from_slice(&mmap) + // .map_err(Error::Json) + // .map_err(|e| (PayloadType::Json, e))?; + // serde_json::to_writer(&mut output, &content).unwrap() } } - let count = builder.documents_count(); - let _ = builder.into_inner().map_err(DocumentFormatError::Io)?; - - Ok(count as u64) + Ok(count) } -/// Reads JSON from temporary file and write an obkv batch to writer. -pub fn read_ndjson(file: &File, writer: impl Write) -> Result { - let mut builder = DocumentsBatchBuilder::new(BufWriter::new(writer)); - let mmap = unsafe { MmapOptions::new().map(file)? }; - - for result in serde_json::Deserializer::from_slice(&mmap).into_iter() { - let object = result.map_err(Error::Json).map_err(|e| (PayloadType::Ndjson, e))?; - builder.append_json_object(&object).map_err(Into::into).map_err(DocumentFormatError::Io)?; +/// Reads JSON from temporary file and write it into the writer. +pub fn read_ndjson(input: BufReader, mut output: &mut BufWriter) -> Result { + let mut count = 0; + for result in serde_json::Deserializer::from_reader(input).into_iter() { + count += 1; + // TODO Correctly manage the errors + // Avoid copying the content: use CowStr from milli (move it elsewhere) + let map: Object = result.unwrap(); + serde_json::to_writer(&mut output, &map).unwrap(); } - let count = builder.documents_count(); - let _ = builder.into_inner().map_err(Into::into).map_err(DocumentFormatError::Io)?; - - Ok(count as u64) + Ok(count) } /// The actual handling of the deserialization process in serde @@ -172,20 +174,23 @@ pub fn read_ndjson(file: &File, writer: impl Write) -> Result { /// ## References /// /// -fn array_each<'de, D, T, F>(deserializer: D, f: F) -> std::result::Result, D::Error> +fn array_each<'de, D, T, F>( + deserializer: D, + f: F, +) -> std::result::Result, D::Error> where D: Deserializer<'de>, T: Deserialize<'de>, - F: FnMut(T) -> io::Result<()>, + F: FnMut(T) -> serde_json::Result<()>, { struct SeqVisitor(F, PhantomData); impl<'de, T, F> Visitor<'de> for SeqVisitor where T: Deserialize<'de>, - F: FnMut(T) -> io::Result<()>, + F: FnMut(T) -> serde_json::Result<()>, { - type Value = io::Result; + type Value = serde_json::Result; fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { formatter.write_str("a nonempty sequence") @@ -194,7 +199,7 @@ where fn visit_seq( mut self, mut seq: A, - ) -> std::result::Result, >::Error> + ) -> std::result::Result, >::Error> where A: SeqAccess<'de>, { diff --git a/meilisearch/src/routes/indexes/documents.rs b/meilisearch/src/routes/indexes/documents.rs index 029a125d0..87b448051 100644 --- a/meilisearch/src/routes/indexes/documents.rs +++ b/meilisearch/src/routes/indexes/documents.rs @@ -1,4 +1,4 @@ -use std::io::ErrorKind; +use std::io::{BufReader, ErrorKind}; use actix_web::http::header::CONTENT_TYPE; use actix_web::web::Data; @@ -423,7 +423,7 @@ async fn 
document_addition( } }; - let (uuid, mut update_file) = index_scheduler.create_update_file(dry_run)?; + let (uuid, update_file) = index_scheduler.create_update_file(dry_run)?; let temp_file = match tempfile() { Ok(file) => file, @@ -459,15 +459,20 @@ async fn document_addition( return Err(MeilisearchHttpError::Payload(ReceivePayload(Box::new(e)))); } - let mut read_file = buffer.into_inner().into_std().await; + let read_file = BufReader::new(buffer.into_inner().into_std().await); let documents_count = tokio::task::spawn_blocking(move || { + let mut update_file = std::io::BufWriter::new(update_file); let documents_count = match format { - PayloadType::Json => read_json(&read_file, &mut update_file)?, - PayloadType::Csv { delimiter } => read_csv(&read_file, &mut update_file, delimiter)?, - /// TODO do not copy all the content - PayloadType::Ndjson => std::io::copy(&mut read_file, &mut update_file).unwrap(), + PayloadType::Json => read_json(read_file, &mut update_file)?, + PayloadType::Csv { delimiter } => read_csv(read_file, &mut update_file, delimiter)?, + PayloadType::Ndjson => read_ndjson(read_file, &mut update_file)?, }; // we NEED to persist the file here because we moved the `udpate_file` in another task. + // TODO better support of errors + let update_file = match update_file.into_inner() { + Ok(update_file) => update_file, + Err(_) => todo!("handle errors"), + }; update_file.persist()?; Ok(documents_count) }) From 24cb5839adf23bff7386e86d12ea55c2ae7e4820 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 10 Sep 2024 17:37:52 +0100 Subject: [PATCH 052/247] Move the document changes sorting logic to a new trait --- .../update/new/indexer/document_operation.rs | 282 ++++++++++-------- 1 file changed, 163 insertions(+), 119 deletions(-) diff --git a/milli/src/update/new/indexer/document_operation.rs b/milli/src/update/new/indexer/document_operation.rs index 3cbaf836d..799079b0a 100644 --- a/milli/src/update/new/indexer/document_operation.rs +++ b/milli/src/update/new/indexer/document_operation.rs @@ -6,6 +6,7 @@ use heed::types::Bytes; use heed::RoTxn; use memmap2::Mmap; use rayon::iter::{IntoParallelIterator, ParallelIterator}; +use IndexDocumentsMethod as Idm; use super::super::document_change::DocumentChange; use super::super::items_pool::ItemsPool; @@ -148,6 +149,8 @@ impl<'p, 'pl: 'p> DocumentChanges<'p> for DocumentOperation<'pl> { (docid, vec![document_operation]), ); } + // TODO clean the code to make sure we clean the useless operations + // add a method to the MergeChanges trait Some((_, offsets)) => offsets.push(document_operation), } @@ -185,16 +188,10 @@ impl<'p, 'pl: 'p> DocumentChanges<'p> for DocumentOperation<'pl> { // TODO We must drain the HashMap into a Vec because rayon::hash_map::IntoIter: !Clone let mut docids_version_offsets: Vec<_> = docids_version_offsets.drain().collect(); // Reorder the offsets to make sure we iterate on the file sequentially - docids_version_offsets.sort_unstable_by_key(|(_, (_, offsets))| { - offsets - .iter() - .rev() - .find_map(|ido| match ido { - InnerDocOp::Addition(add) => Some(add.content.as_ptr() as usize), - InnerDocOp::Deletion => None, - }) - .unwrap_or(0) - }); + match self.index_documents_method { + Idm::ReplaceDocuments => MergeDocumentForReplacement::sort(&mut docids_version_offsets), + Idm::UpdateDocuments => MergeDocumentForUpdates::sort(&mut docids_version_offsets), + } Ok(docids_version_offsets .into_par_iter() @@ -202,11 +199,9 @@ impl<'p, 'pl: 'p> DocumentChanges<'p> for DocumentOperation<'pl> { 
Arc::new(ItemsPool::new(|| index.read_txn().map_err(crate::Error::from))), move |context_pool, (external_docid, (internal_docid, operations))| { context_pool.with(|rtxn| { - use IndexDocumentsMethod as Idm; - let document_merge_function = match self.index_documents_method { - Idm::ReplaceDocuments => merge_document_for_replacements, - Idm::UpdateDocuments => merge_document_for_updates, + Idm::ReplaceDocuments => MergeDocumentForReplacement::merge, + Idm::UpdateDocuments => MergeDocumentForUpdates::merge, }; document_merge_function( @@ -224,129 +219,178 @@ impl<'p, 'pl: 'p> DocumentChanges<'p> for DocumentOperation<'pl> { } } -/// Returns only the most recent version of a document based on the updates from the payloads. -/// -/// This function is only meant to be used when doing a replacement and not an update. -fn merge_document_for_replacements( - rtxn: &RoTxn, - index: &Index, - fields_ids_map: &FieldsIdsMap, - docid: DocumentId, - external_docid: String, - operations: &[InnerDocOp], -) -> Result> { - let current = index.documents.remap_data_type::().get(rtxn, &docid)?; - let current: Option<&KvReaderFieldId> = current.map(Into::into); +trait MergeChanges { + /// Reorders the offsets to make sure we iterate on the file sequentially. + fn sort(changes_offsets: &mut [(CowStr, (DocumentId, Vec))]); - match operations.last() { - Some(InnerDocOp::Addition(DocumentOffset { content })) => { - let map: TopLevelMap = serde_json::from_slice(content).unwrap(); - let mut document_entries = Vec::new(); - for (key, v) in map.0 { - let id = fields_ids_map.id(key.as_ref()).unwrap(); - document_entries.push((id, v)); + fn merge( + rtxn: &RoTxn, + index: &Index, + fields_ids_map: &FieldsIdsMap, + docid: DocumentId, + external_docid: String, + operations: &[InnerDocOp], + ) -> Result>; +} + +struct MergeDocumentForReplacement; + +impl MergeChanges for MergeDocumentForReplacement { + /// Reorders to read only the last change. + fn sort(changes_offsets: &mut [(CowStr, (DocumentId, Vec))]) { + changes_offsets.sort_unstable_by_key(|(_, (_, offsets))| { + offsets + .iter() + .rev() + .find_map(|ido| match ido { + InnerDocOp::Addition(add) => Some(add.content.as_ptr() as usize), + InnerDocOp::Deletion => None, + }) + .unwrap_or(0) + }); + } + + /// Returns only the most recent version of a document based on the updates from the payloads. + /// + /// This function is only meant to be used when doing a replacement and not an update. 
+ fn merge( + rtxn: &RoTxn, + index: &Index, + fields_ids_map: &FieldsIdsMap, + docid: DocumentId, + external_docid: String, + operations: &[InnerDocOp], + ) -> Result> { + let current = index.documents.remap_data_type::().get(rtxn, &docid)?; + let current: Option<&KvReaderFieldId> = current.map(Into::into); + + match operations.last() { + Some(InnerDocOp::Addition(DocumentOffset { content })) => { + let map: TopLevelMap = serde_json::from_slice(content).unwrap(); + let mut document_entries = Vec::new(); + for (key, v) in map.0 { + let id = fields_ids_map.id(key.as_ref()).unwrap(); + document_entries.push((id, v)); + } + + document_entries.sort_unstable_by_key(|(id, _)| *id); + + let mut writer = KvWriterFieldId::memory(); + document_entries + .into_iter() + .for_each(|(id, value)| writer.insert(id, value.get()).unwrap()); + let new = writer.into_boxed(); + + match current { + Some(current) => { + let update = Update::create(docid, external_docid, current.boxed(), new); + Ok(Some(DocumentChange::Update(update))) + } + None => { + let insertion = Insertion::create(docid, external_docid, new); + Ok(Some(DocumentChange::Insertion(insertion))) + } + } } - - document_entries.sort_unstable_by_key(|(id, _)| *id); - - let mut writer = KvWriterFieldId::memory(); - document_entries - .into_iter() - .for_each(|(id, value)| writer.insert(id, value.get()).unwrap()); - let new = writer.into_boxed(); - - match current { + Some(InnerDocOp::Deletion) => match current { Some(current) => { - let update = Update::create(docid, external_docid, current.boxed(), new); - Ok(Some(DocumentChange::Update(update))) + let deletion = Deletion::create(docid, external_docid, current.boxed()); + Ok(Some(DocumentChange::Deletion(deletion))) } - None => { - let insertion = Insertion::create(docid, external_docid, new); - Ok(Some(DocumentChange::Insertion(insertion))) - } - } + None => Ok(None), + }, + None => Ok(None), // but it's strange } - Some(InnerDocOp::Deletion) => match current { - Some(current) => { - let deletion = Deletion::create(docid, external_docid, current.boxed()); - Ok(Some(DocumentChange::Deletion(deletion))) - } - None => Ok(None), - }, - None => Ok(None), // but it's strange } } -/// Reads the previous version of a document from the database, the new versions -/// in the grenad update files and merges them to generate a new boxed obkv. -/// -/// This function is only meant to be used when doing an update and not a replacement. -fn merge_document_for_updates( - rtxn: &RoTxn, - index: &Index, - fields_ids_map: &FieldsIdsMap, - docid: DocumentId, - external_docid: String, - operations: &[InnerDocOp], -) -> Result> { - let mut document = BTreeMap::<_, Cow<_>>::new(); - let current = index.documents.remap_data_type::().get(rtxn, &docid)?; - let current: Option<&KvReaderFieldId> = current.map(Into::into); +struct MergeDocumentForUpdates; - if operations.is_empty() { - return Ok(None); // but it's strange +impl MergeChanges for MergeDocumentForUpdates { + /// Reorders to read the first changes first so that it's faster to read the first one and then the rest. 
+ fn sort(changes_offsets: &mut [(CowStr, (DocumentId, Vec))]) { + changes_offsets.sort_unstable_by_key(|(_, (_, offsets))| { + offsets + .iter() + .find_map(|ido| match ido { + InnerDocOp::Addition(add) => Some(add.content.as_ptr() as usize), + InnerDocOp::Deletion => None, + }) + .unwrap_or(0) + }); } - let last_deletion = operations.iter().rposition(|op| matches!(op, InnerDocOp::Deletion)); - let operations = &operations[last_deletion.map_or(0, |i| i + 1)..]; + /// Reads the previous version of a document from the database, the new versions + /// in the grenad update files and merges them to generate a new boxed obkv. + /// + /// This function is only meant to be used when doing an update and not a replacement. + fn merge( + rtxn: &RoTxn, + index: &Index, + fields_ids_map: &FieldsIdsMap, + docid: DocumentId, + external_docid: String, + operations: &[InnerDocOp], + ) -> Result> { + let mut document = BTreeMap::<_, Cow<_>>::new(); + let current = index.documents.remap_data_type::().get(rtxn, &docid)?; + let current: Option<&KvReaderFieldId> = current.map(Into::into); - // If there was a deletion we must not start - // from the original document but from scratch. - if last_deletion.is_none() { - if let Some(current) = current { - current.into_iter().for_each(|(k, v)| { - document.insert(k, v.into()); - }); + if operations.is_empty() { + return Ok(None); // but it's strange } - } - if operations.is_empty() { + let last_deletion = operations.iter().rposition(|op| matches!(op, InnerDocOp::Deletion)); + let operations = &operations[last_deletion.map_or(0, |i| i + 1)..]; + + // If there was a deletion we must not start + // from the original document but from scratch. + if last_deletion.is_none() { + if let Some(current) = current { + current.into_iter().for_each(|(k, v)| { + document.insert(k, v.into()); + }); + } + } + + if operations.is_empty() { + match current { + Some(current) => { + let deletion = Deletion::create(docid, external_docid, current.boxed()); + return Ok(Some(DocumentChange::Deletion(deletion))); + } + None => return Ok(None), + } + } + + for operation in operations { + let DocumentOffset { content } = match operation { + InnerDocOp::Addition(offset) => offset, + InnerDocOp::Deletion => { + unreachable!("Deletion in document operations") + } + }; + + let map: TopLevelMap = serde_json::from_slice(content).unwrap(); + for (key, v) in map.0 { + let id = fields_ids_map.id(key.as_ref()).unwrap(); + document.insert(id, v.get().as_bytes().to_vec().into()); + } + } + + let mut writer = KvWriterFieldId::memory(); + document.into_iter().for_each(|(id, value)| writer.insert(id, value).unwrap()); + let new = writer.into_boxed(); + match current { Some(current) => { - let deletion = Deletion::create(docid, external_docid, current.boxed()); - return Ok(Some(DocumentChange::Deletion(deletion))); + let update = Update::create(docid, external_docid, current.boxed(), new); + Ok(Some(DocumentChange::Update(update))) } - None => return Ok(None), - } - } - - for operation in operations { - let DocumentOffset { content } = match operation { - InnerDocOp::Addition(offset) => offset, - InnerDocOp::Deletion => { - unreachable!("Deletion in document operations") + None => { + let insertion = Insertion::create(docid, external_docid, new); + Ok(Some(DocumentChange::Insertion(insertion))) } - }; - - let map: TopLevelMap = serde_json::from_slice(content).unwrap(); - for (key, v) in map.0 { - let id = fields_ids_map.id(key.as_ref()).unwrap(); - document.insert(id, v.get().as_bytes().to_vec().into()); - } - 
} - - let mut writer = KvWriterFieldId::memory(); - document.into_iter().for_each(|(id, value)| writer.insert(id, value).unwrap()); - let new = writer.into_boxed(); - - match current { - Some(current) => { - let update = Update::create(docid, external_docid, current.boxed(), new); - Ok(Some(DocumentChange::Update(update))) - } - None => { - let insertion = Insertion::create(docid, external_docid, new); - Ok(Some(DocumentChange::Insertion(insertion))) } } } From 04596f36169fb427b4ac8b2279a7ea6b61c7c134 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 10 Sep 2024 18:01:17 +0100 Subject: [PATCH 053/247] Move the TopLevelMap into a dedicated module --- .../update/new/indexer/document_operation.rs | 37 +++---------------- milli/src/update/new/indexer/mod.rs | 3 +- milli/src/update/new/indexer/top_level_map.rs | 30 +++++++++++++++ 3 files changed, 38 insertions(+), 32 deletions(-) create mode 100644 milli/src/update/new/indexer/top_level_map.rs diff --git a/milli/src/update/new/indexer/document_operation.rs b/milli/src/update/new/indexer/document_operation.rs index 799079b0a..0521d43f9 100644 --- a/milli/src/update/new/indexer/document_operation.rs +++ b/milli/src/update/new/indexer/document_operation.rs @@ -1,16 +1,19 @@ use std::borrow::Cow; use std::collections::{BTreeMap, HashMap}; +use std::fmt; use std::sync::Arc; use heed::types::Bytes; use heed::RoTxn; use memmap2::Mmap; use rayon::iter::{IntoParallelIterator, ParallelIterator}; +use serde_json::from_str; use IndexDocumentsMethod as Idm; use super::super::document_change::DocumentChange; use super::super::items_pool::ItemsPool; -use super::DocumentChanges; +use super::top_level_map::{CowStr, TopLevelMap}; +use super::{top_level_map, DocumentChanges}; use crate::documents::PrimaryKey; use crate::update::new::{Deletion, Insertion, KvReaderFieldId, KvWriterFieldId, Update}; use crate::update::{AvailableIds, IndexDocumentsMethod}; @@ -395,36 +398,8 @@ impl MergeChanges for MergeDocumentForUpdates { } } -use std::borrow::Borrow; - -use serde::Deserialize; -use serde_json::from_str; -use serde_json::value::RawValue; - -#[derive(Deserialize)] -pub struct TopLevelMap<'p>(#[serde(borrow)] BTreeMap, &'p RawValue>); - -#[derive(Deserialize, PartialEq, Eq, PartialOrd, Ord, Hash, Clone)] -pub struct CowStr<'p>(#[serde(borrow)] Cow<'p, str>); - -impl CowStr<'_> { - fn to_string(&self) -> String { - self.0.to_string() - } -} - -impl AsRef for CowStr<'_> { - fn as_ref(&self) -> &str { - self.0.as_ref() - } -} - -impl<'doc> Borrow for CowStr<'doc> { - fn borrow(&self) -> &str { - self.0.borrow() - } -} - +/// Returns the document ID based on the primary and +/// search for it recursively in zero-copy-deserialized documents. 
fn get_docid<'p>( map: &TopLevelMap<'p>, primary_key: &[&str], diff --git a/milli/src/update/new/indexer/mod.rs b/milli/src/update/new/indexer/mod.rs index 5e3104de8..4d7e2aa47 100644 --- a/milli/src/update/new/indexer/mod.rs +++ b/milli/src/update/new/indexer/mod.rs @@ -26,6 +26,7 @@ use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, Result, UserError}; mod document_deletion; mod document_operation; mod partial_dump; +mod top_level_map; mod update_by_function; pub trait DocumentChanges<'p> { @@ -121,7 +122,7 @@ where &extractor_sender, )?; } - + { let span = tracing::trace_span!(target: "indexing::documents::extract", "exact_word_docids"); let _entered = span.enter(); diff --git a/milli/src/update/new/indexer/top_level_map.rs b/milli/src/update/new/indexer/top_level_map.rs new file mode 100644 index 000000000..9e1481b5e --- /dev/null +++ b/milli/src/update/new/indexer/top_level_map.rs @@ -0,0 +1,30 @@ +use std::borrow::{Borrow, Cow}; +use std::collections::BTreeMap; +use std::fmt; + +use serde::Deserialize; +use serde_json::value::RawValue; + +#[derive(Deserialize)] +pub struct TopLevelMap<'p>(#[serde(borrow)] pub BTreeMap, &'p RawValue>); + +#[derive(Deserialize, PartialEq, Eq, PartialOrd, Ord, Hash, Clone)] +pub struct CowStr<'p>(#[serde(borrow)] pub Cow<'p, str>); + +impl fmt::Display for CowStr<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Display::fmt(&self.0, f) + } +} + +impl AsRef for CowStr<'_> { + fn as_ref(&self) -> &str { + self.0.as_ref() + } +} + +impl<'doc> Borrow for CowStr<'doc> { + fn borrow(&self) -> &str { + self.0.borrow() + } +} From c1c44a0b81c4ee422141f2e67034371565943d22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 10 Sep 2024 19:32:03 +0100 Subject: [PATCH 054/247] Impl serialize on TopLevelMap --- milli/src/update/new/indexer/top_level_map.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/milli/src/update/new/indexer/top_level_map.rs b/milli/src/update/new/indexer/top_level_map.rs index 9e1481b5e..d82e42dca 100644 --- a/milli/src/update/new/indexer/top_level_map.rs +++ b/milli/src/update/new/indexer/top_level_map.rs @@ -2,13 +2,13 @@ use std::borrow::{Borrow, Cow}; use std::collections::BTreeMap; use std::fmt; -use serde::Deserialize; +use serde::{Deserialize, Serialize}; use serde_json::value::RawValue; -#[derive(Deserialize)] +#[derive(Deserialize, Serialize)] pub struct TopLevelMap<'p>(#[serde(borrow)] pub BTreeMap, &'p RawValue>); -#[derive(Deserialize, PartialEq, Eq, PartialOrd, Ord, Hash, Clone)] +#[derive(Deserialize, Serialize, PartialEq, Eq, PartialOrd, Ord, Hash, Clone)] pub struct CowStr<'p>(#[serde(borrow)] pub Cow<'p, str>); impl fmt::Display for CowStr<'_> { From 8287c2644fdf850957cd5f190705c966359e4bdd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 10 Sep 2024 21:10:28 +0100 Subject: [PATCH 055/247] Support CSV again --- meilisearch-types/src/document_formats.rs | 177 ++++++++++++++++---- meilisearch/src/routes/indexes/documents.rs | 16 +- milli/src/update/new/indexer/mod.rs | 1 + milli/src/update/new/mod.rs | 1 + 4 files changed, 147 insertions(+), 48 deletions(-) diff --git a/meilisearch-types/src/document_formats.rs b/meilisearch-types/src/document_formats.rs index 942203b68..0b78e4a94 100644 --- a/meilisearch-types/src/document_formats.rs +++ b/meilisearch-types/src/document_formats.rs @@ -1,14 +1,18 @@ use std::fmt::{self, Debug, Display}; use std::fs::File; -use std::io::{self, BufReader, BufWriter, Write}; +use 
std::io::{self, BufReader, BufWriter, Seek, Write}; use std::marker::PhantomData; +use csv::StringRecord; +use memmap2::Mmap; use milli::documents::Error; +use milli::update::new::TopLevelMap; use milli::Object; use serde::de::{SeqAccess, Visitor}; use serde::{Deserialize, Deserializer}; use serde_json::error::Category; +use crate::error::deserr_codes::MalformedPayload; use crate::error::{Code, ErrorCode}; type Result = std::result::Result; @@ -87,6 +91,16 @@ impl From<(PayloadType, Error)> for DocumentFormatError { } } +impl From<(PayloadType, serde_json::Error)> for DocumentFormatError { + fn from((ty, error): (PayloadType, serde_json::Error)) -> Self { + if error.classify() == Category::Data { + Self::Io(error.into()) + } else { + Self::MalformedPayload(Error::Json(error), ty) + } + } +} + impl From for DocumentFormatError { fn from(error: io::Error) -> Self { Self::Io(error) @@ -102,67 +116,156 @@ impl ErrorCode for DocumentFormatError { } } +// TODO remove that from the place I've borrowed it +#[derive(Debug)] +enum AllowedType { + String, + Boolean, + Number, +} + +fn parse_csv_header(header: &str) -> (&str, AllowedType) { + // if there are several separators we only split on the last one. + match header.rsplit_once(':') { + Some((field_name, field_type)) => match field_type { + "string" => (field_name, AllowedType::String), + "boolean" => (field_name, AllowedType::Boolean), + "number" => (field_name, AllowedType::Number), + // if the pattern isn't recognized, we keep the whole field. + _otherwise => (header, AllowedType::String), + }, + None => (header, AllowedType::String), + } +} + /// Reads CSV from input and write an obkv batch to writer. -pub fn read_csv( - _input: BufReader, - _output: &mut BufWriter, - _delimiter: u8, -) -> Result { - todo!() - // let mut builder = DocumentsBatchBuilder::new(BufWriter::new(output)); - // let mmap = unsafe { MmapOptions::new().map(input)? }; - // let csv = csv::ReaderBuilder::new().delimiter(delimiter).from_reader(mmap.as_ref()); - // builder.append_csv(csv).map_err(|e| (PayloadType::Csv { delimiter }, e))?; +pub fn read_csv(input: &File, output: impl io::Write, delimiter: u8) -> Result { + use serde_json::{Map, Value}; - // let count = builder.documents_count(); - // let _ = builder.into_inner().map_err(DocumentFormatError::Io)?; + let mut output = BufWriter::new(output); + let mut reader = csv::ReaderBuilder::new().delimiter(delimiter).from_reader(input); - // Ok(count as u64) + // TODO manage error correctly + // Make sure that we insert the fields ids in order as the obkv writer has this requirement. + let mut typed_fields: Vec<_> = reader + .headers() + .unwrap() + .into_iter() + .map(parse_csv_header) + .map(|(f, t)| (f.to_string(), t)) + .collect(); + + let mut object: Map<_, _> = + reader.headers().unwrap().iter().map(|k| (k.to_string(), Value::Null)).collect(); + + let mut line: usize = 0; + let mut record = csv::StringRecord::new(); + while reader.read_record(&mut record).unwrap() { + // We increment here and not at the end of the while loop to take + // the header offset into account. 
+ line += 1; + + // Reset the document to write + object.iter_mut().for_each(|(_, v)| *v = Value::Null); + + for (i, (name, type_)) in typed_fields.iter().enumerate() { + let value = &record[i]; + let trimmed_value = value.trim(); + let value = match type_ { + AllowedType::Number if trimmed_value.is_empty() => Value::Null, + AllowedType::Number => match trimmed_value.parse::() { + Ok(integer) => Value::from(integer), + Err(_) => { + match trimmed_value.parse::() { + Ok(float) => Value::from(float), + Err(error) => { + panic!("bad float") + // return Err(Error::ParseFloat { + // error, + // line, + // value: value.to_string(), + // }); + } + } + } + }, + AllowedType::Boolean if trimmed_value.is_empty() => Value::Null, + AllowedType::Boolean => match trimmed_value.parse::() { + Ok(bool) => Value::from(bool), + Err(error) => { + panic!("bad bool") + // return Err(Error::ParseBool { + // error, + // line, + // value: value.to_string(), + // }); + } + }, + AllowedType::String if value.is_empty() => Value::Null, + AllowedType::String => Value::from(value), + }; + + *object.get_mut(name).unwrap() = value; + } + + serde_json::to_writer(&mut output, &object).unwrap(); + } + + Ok(line.saturating_sub(1) as u64) } /// Reads JSON from temporary file and write an obkv batch to writer. -pub fn read_json(input: BufReader, mut output: &mut BufWriter) -> Result { +pub fn read_json(input: &File, output: impl io::Write) -> Result { + // We memory map to be able to deserailize into a TopLevelMap<'pl> that + // does not allocate when possible and only materialize the first/top level. + let input = unsafe { Mmap::map(input).map_err(DocumentFormatError::Io)? }; + + let mut deserializer = serde_json::Deserializer::from_slice(&input); + let mut output = BufWriter::new(output); let mut count = 0; - let mut deserializer = serde_json::Deserializer::from_reader(input); - match array_each(&mut deserializer, |obj: Object| { + + let count_and_write = |obj: TopLevelMap| { count += 1; serde_json::to_writer(&mut output, &obj) - }) { + }; + + match array_each(&mut deserializer, count_and_write) { // The json data has been deserialized and does not need to be processed again. // The data has been transferred to the writer during the deserialization process. Ok(Ok(_)) => (), - Ok(Err(e)) => return Err(DocumentFormatError::Io(e.into())), + Ok(Err(e)) => return Err(DocumentFormatError::from((PayloadType::Json, e))), Err(e) => { // Attempt to deserialize a single json string when the cause of the exception is not Category.data // Other types of deserialisation exceptions are returned directly to the front-end - if e.classify() != serde_json::error::Category::Data { - return Err(DocumentFormatError::MalformedPayload( - Error::Json(e), - PayloadType::Json, - )); + if e.classify() != Category::Data { + return Err(DocumentFormatError::from((PayloadType::Json, e))); } - todo!("single document/object update") - - // let content: Object = serde_json::from_slice(&mmap) - // .map_err(Error::Json) - // .map_err(|e| (PayloadType::Json, e))?; - // serde_json::to_writer(&mut output, &content).unwrap() + let content: Object = serde_json::from_slice(&input) + .map_err(Error::Json) + .map_err(|e| (PayloadType::Json, e))?; + serde_json::to_writer(&mut output, &content).unwrap() } } - Ok(count) + match output.into_inner() { + Ok(_) => Ok(count), + Err(ie) => Err(DocumentFormatError::Io(ie.into_error())), + } } /// Reads JSON from temporary file and write it into the writer. 
-pub fn read_ndjson(input: BufReader, mut output: &mut BufWriter) -> Result { +pub fn read_ndjson(input: &File, mut output: impl io::Write) -> Result { + // We memory map to be able to deserailize into a TopLevelMap<'pl> that + // does not allocate when possible and only materialize the first/top level. + let input = unsafe { Mmap::map(input).map_err(DocumentFormatError::Io)? }; + let mut count = 0; - for result in serde_json::Deserializer::from_reader(input).into_iter() { + for result in serde_json::Deserializer::from_slice(&input).into_iter() { count += 1; - // TODO Correctly manage the errors - // Avoid copying the content: use CowStr from milli (move it elsewhere) - let map: Object = result.unwrap(); - serde_json::to_writer(&mut output, &map).unwrap(); + result + .and_then(|map: TopLevelMap| serde_json::to_writer(&mut output, &map)) + .map_err(|e| DocumentFormatError::from((PayloadType::Ndjson, e)))?; } Ok(count) diff --git a/meilisearch/src/routes/indexes/documents.rs b/meilisearch/src/routes/indexes/documents.rs index 87b448051..055685151 100644 --- a/meilisearch/src/routes/indexes/documents.rs +++ b/meilisearch/src/routes/indexes/documents.rs @@ -423,7 +423,7 @@ async fn document_addition( } }; - let (uuid, update_file) = index_scheduler.create_update_file(dry_run)?; + let (uuid, mut update_file) = index_scheduler.create_update_file(dry_run)?; let temp_file = match tempfile() { Ok(file) => file, @@ -459,20 +459,14 @@ async fn document_addition( return Err(MeilisearchHttpError::Payload(ReceivePayload(Box::new(e)))); } - let read_file = BufReader::new(buffer.into_inner().into_std().await); + let read_file = buffer.into_inner().into_std().await; let documents_count = tokio::task::spawn_blocking(move || { - let mut update_file = std::io::BufWriter::new(update_file); let documents_count = match format { - PayloadType::Json => read_json(read_file, &mut update_file)?, - PayloadType::Csv { delimiter } => read_csv(read_file, &mut update_file, delimiter)?, - PayloadType::Ndjson => read_ndjson(read_file, &mut update_file)?, + PayloadType::Json => read_json(&read_file, &mut update_file)?, + PayloadType::Csv { delimiter } => read_csv(&read_file, &mut update_file, delimiter)?, + PayloadType::Ndjson => read_ndjson(&read_file, &mut update_file)?, }; // we NEED to persist the file here because we moved the `udpate_file` in another task. 
- // TODO better support of errors - let update_file = match update_file.into_inner() { - Ok(update_file) => update_file, - Err(_) => todo!("handle errors"), - }; update_file.persist()?; Ok(documents_count) }) diff --git a/milli/src/update/new/indexer/mod.rs b/milli/src/update/new/indexer/mod.rs index 4d7e2aa47..e80b07671 100644 --- a/milli/src/update/new/indexer/mod.rs +++ b/milli/src/update/new/indexer/mod.rs @@ -9,6 +9,7 @@ use heed::{RoTxn, RwTxn}; pub use partial_dump::PartialDump; use rayon::iter::{IntoParallelIterator, ParallelIterator}; use rayon::ThreadPool; +pub use top_level_map::{CowStr, TopLevelMap}; pub use update_by_function::UpdateByFunction; use super::channel::*; diff --git a/milli/src/update/new/mod.rs b/milli/src/update/new/mod.rs index 31a017c12..3f5c4b3c9 100644 --- a/milli/src/update/new/mod.rs +++ b/milli/src/update/new/mod.rs @@ -1,4 +1,5 @@ pub use document_change::{Deletion, DocumentChange, Insertion, Update}; +pub use indexer::{CowStr, TopLevelMap}; pub use items_pool::ItemsPool; use super::del_add::DelAdd; From b4de06259e265ddd45f07a87c23188a0243dfcee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 11 Sep 2024 10:02:00 +0200 Subject: [PATCH 056/247] Better CSV support --- meilisearch-types/src/document_formats.rs | 86 +++++++++++------------ 1 file changed, 40 insertions(+), 46 deletions(-) diff --git a/meilisearch-types/src/document_formats.rs b/meilisearch-types/src/document_formats.rs index 0b78e4a94..b40a6981d 100644 --- a/meilisearch-types/src/document_formats.rs +++ b/meilisearch-types/src/document_formats.rs @@ -1,9 +1,8 @@ use std::fmt::{self, Debug, Display}; use std::fs::File; -use std::io::{self, BufReader, BufWriter, Seek, Write}; +use std::io::{self, BufWriter}; use std::marker::PhantomData; -use csv::StringRecord; use memmap2::Mmap; use milli::documents::Error; use milli::update::new::TopLevelMap; @@ -11,13 +10,13 @@ use milli::Object; use serde::de::{SeqAccess, Visitor}; use serde::{Deserialize, Deserializer}; use serde_json::error::Category; +use serde_json::{Map, Value}; -use crate::error::deserr_codes::MalformedPayload; use crate::error::{Code, ErrorCode}; type Result = std::result::Result; -#[derive(Debug)] +#[derive(Debug, Clone, Copy)] pub enum PayloadType { Ndjson, Json, @@ -101,6 +100,16 @@ impl From<(PayloadType, serde_json::Error)> for DocumentFormatError { } } +impl From<(PayloadType, csv::Error)> for DocumentFormatError { + fn from((ty, error): (PayloadType, csv::Error)) -> Self { + if error.is_io_error() { + Self::Io(error.into()) + } else { + Self::MalformedPayload(Error::Csv(error), ty) + } + } +} + impl From for DocumentFormatError { fn from(error: io::Error) -> Self { Self::Io(error) @@ -140,78 +149,63 @@ fn parse_csv_header(header: &str) -> (&str, AllowedType) { /// Reads CSV from input and write an obkv batch to writer. pub fn read_csv(input: &File, output: impl io::Write, delimiter: u8) -> Result { - use serde_json::{Map, Value}; - + let ptype = PayloadType::Csv { delimiter }; let mut output = BufWriter::new(output); let mut reader = csv::ReaderBuilder::new().delimiter(delimiter).from_reader(input); - // TODO manage error correctly - // Make sure that we insert the fields ids in order as the obkv writer has this requirement. 
- let mut typed_fields: Vec<_> = reader - .headers() - .unwrap() - .into_iter() - .map(parse_csv_header) - .map(|(f, t)| (f.to_string(), t)) - .collect(); + let headers = reader.headers().map_err(|e| DocumentFormatError::from((ptype, e)))?.clone(); + let typed_fields: Vec<_> = headers.iter().map(parse_csv_header).collect(); + let mut object: Map<_, _> = headers.iter().map(|k| (k.to_string(), Value::Null)).collect(); - let mut object: Map<_, _> = - reader.headers().unwrap().iter().map(|k| (k.to_string(), Value::Null)).collect(); - - let mut line: usize = 0; + let mut line = 0; let mut record = csv::StringRecord::new(); - while reader.read_record(&mut record).unwrap() { - // We increment here and not at the end of the while loop to take - // the header offset into account. + while reader.read_record(&mut record).map_err(|e| DocumentFormatError::from((ptype, e)))? { + // We increment here and not at the end of the loop + // to take the header offset into account. line += 1; - // Reset the document to write + // Reset the document values object.iter_mut().for_each(|(_, v)| *v = Value::Null); - for (i, (name, type_)) in typed_fields.iter().enumerate() { + for (i, (name, atype)) in typed_fields.iter().enumerate() { let value = &record[i]; let trimmed_value = value.trim(); - let value = match type_ { + let value = match atype { AllowedType::Number if trimmed_value.is_empty() => Value::Null, AllowedType::Number => match trimmed_value.parse::() { Ok(integer) => Value::from(integer), - Err(_) => { - match trimmed_value.parse::() { - Ok(float) => Value::from(float), - Err(error) => { - panic!("bad float") - // return Err(Error::ParseFloat { - // error, - // line, - // value: value.to_string(), - // }); - } + Err(_) => match trimmed_value.parse::() { + Ok(float) => Value::from(float), + Err(error) => { + return Err(DocumentFormatError::MalformedPayload( + Error::ParseFloat { error, line, value: value.to_string() }, + ptype, + )) } - } + }, }, AllowedType::Boolean if trimmed_value.is_empty() => Value::Null, AllowedType::Boolean => match trimmed_value.parse::() { Ok(bool) => Value::from(bool), Err(error) => { - panic!("bad bool") - // return Err(Error::ParseBool { - // error, - // line, - // value: value.to_string(), - // }); + return Err(DocumentFormatError::MalformedPayload( + Error::ParseBool { error, line, value: value.to_string() }, + ptype, + )) } }, AllowedType::String if value.is_empty() => Value::Null, AllowedType::String => Value::from(value), }; - *object.get_mut(name).unwrap() = value; + *object.get_mut(*name).expect("encountered an unknown field") = value; } - serde_json::to_writer(&mut output, &object).unwrap(); + serde_json::to_writer(&mut output, &object) + .map_err(|e| DocumentFormatError::from((ptype, e)))?; } - Ok(line.saturating_sub(1) as u64) + Ok(line as u64) } /// Reads JSON from temporary file and write an obkv batch to writer. 
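Note on the CSV rework in the patch above: the typed-header convention it relies on is `field_name:type`, split on the last `:`. The following is a minimal standalone sketch of that behavior for reference only — it mirrors the `parse_csv_header` function added in `document_formats.rs`, but the `PartialEq` derive, the sample values, and the `main` harness are illustrative assumptions, not part of any patch in this series.

// Standalone sketch (not part of the patch series) of the `name:type` CSV header convention.
// `PartialEq` is derived here only so the assertions compile; the real enum only derives `Debug`.
#[derive(Debug, PartialEq)]
enum AllowedType {
    String,
    Boolean,
    Number,
}

fn parse_csv_header(header: &str) -> (&str, AllowedType) {
    // If there are several separators we only split on the last one.
    match header.rsplit_once(':') {
        Some((field_name, "string")) => (field_name, AllowedType::String),
        Some((field_name, "boolean")) => (field_name, AllowedType::Boolean),
        Some((field_name, "number")) => (field_name, AllowedType::Number),
        // If the type annotation isn't recognized, the whole header is kept as the field name.
        _otherwise => (header, AllowedType::String),
    }
}

fn main() {
    // Hypothetical header values, chosen only to illustrate the mapping.
    assert_eq!(parse_csv_header("price:number"), ("price", AllowedType::Number));
    assert_eq!(parse_csv_header("in_stock:boolean"), ("in_stock", AllowedType::Boolean));
    assert_eq!(parse_csv_header("title"), ("title", AllowedType::String));
    // Only the last `:` is significant, so earlier colons stay in the field name.
    assert_eq!(parse_csv_header("movie:title:string"), ("movie:title", AllowedType::String));
}
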
From 3848adf5a2d2159e2a14b1c791602382a616c5c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 11 Sep 2024 10:10:51 +0200 Subject: [PATCH 057/247] Improve error management and simplify JSON read --- meilisearch-types/src/document_formats.rs | 36 ++++++++++------------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/meilisearch-types/src/document_formats.rs b/meilisearch-types/src/document_formats.rs index b40a6981d..b0411250e 100644 --- a/meilisearch-types/src/document_formats.rs +++ b/meilisearch-types/src/document_formats.rs @@ -10,7 +10,7 @@ use milli::Object; use serde::de::{SeqAccess, Visitor}; use serde::{Deserialize, Deserializer}; use serde_json::error::Category; -use serde_json::{Map, Value}; +use serde_json::{to_writer, Map, Value}; use crate::error::{Code, ErrorCode}; @@ -147,7 +147,7 @@ fn parse_csv_header(header: &str) -> (&str, AllowedType) { } } -/// Reads CSV from input and write an obkv batch to writer. +/// Reads CSV from file and write it in NDJSON in a file checking it along the way. pub fn read_csv(input: &File, output: impl io::Write, delimiter: u8) -> Result { let ptype = PayloadType::Csv { delimiter }; let mut output = BufWriter::new(output); @@ -201,32 +201,24 @@ pub fn read_csv(input: &File, output: impl io::Write, delimiter: u8) -> Result Result { // We memory map to be able to deserailize into a TopLevelMap<'pl> that // does not allocate when possible and only materialize the first/top level. let input = unsafe { Mmap::map(input).map_err(DocumentFormatError::Io)? }; + let mut out = BufWriter::new(output); let mut deserializer = serde_json::Deserializer::from_slice(&input); - let mut output = BufWriter::new(output); - let mut count = 0; - - let count_and_write = |obj: TopLevelMap| { - count += 1; - serde_json::to_writer(&mut output, &obj) - }; - - match array_each(&mut deserializer, count_and_write) { + let count = match array_each(&mut deserializer, |obj: TopLevelMap| to_writer(&mut out, &obj)) { // The json data has been deserialized and does not need to be processed again. // The data has been transferred to the writer during the deserialization process. - Ok(Ok(_)) => (), + Ok(Ok(count)) => count, Ok(Err(e)) => return Err(DocumentFormatError::from((PayloadType::Json, e))), Err(e) => { // Attempt to deserialize a single json string when the cause of the exception is not Category.data @@ -238,17 +230,19 @@ pub fn read_json(input: &File, output: impl io::Write) -> Result { let content: Object = serde_json::from_slice(&input) .map_err(Error::Json) .map_err(|e| (PayloadType::Json, e))?; - serde_json::to_writer(&mut output, &content).unwrap() + to_writer(&mut out, &content) + .map(|_| 1) + .map_err(|e| DocumentFormatError::from((PayloadType::Json, e)))? } - } + }; - match output.into_inner() { + match out.into_inner() { Ok(_) => Ok(count), Err(ie) => Err(DocumentFormatError::Io(ie.into_error())), } } -/// Reads JSON from temporary file and write it into the writer. +/// Reads NDJSON from file and write it in NDJSON in a file checking it along the way. pub fn read_ndjson(input: &File, mut output: impl io::Write) -> Result { // We memory map to be able to deserailize into a TopLevelMap<'pl> that // does not allocate when possible and only materialize the first/top level. 
@@ -258,7 +252,7 @@ pub fn read_ndjson(input: &File, mut output: impl io::Write) -> Result { for result in serde_json::Deserializer::from_slice(&input).into_iter() { count += 1; result - .and_then(|map: TopLevelMap| serde_json::to_writer(&mut output, &map)) + .and_then(|map: TopLevelMap| to_writer(&mut output, &map)) .map_err(|e| DocumentFormatError::from((PayloadType::Ndjson, e)))?; } @@ -305,7 +299,7 @@ where match self.0(value) { Ok(()) => max += 1, Err(e) => return Ok(Err(e)), - }; + } } Ok(Ok(max)) } From 39b5990f64c1bdbf2abf46a02868832cadf00989 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 11 Sep 2024 10:20:23 +0200 Subject: [PATCH 058/247] Mutualize tokenization --- .../extract_fid_word_count_docids.rs | 8 +- .../extract/searchable/extract_word_docids.rs | 463 +++++++++++++++++- .../extract_word_pair_proximity_docids.rs | 2 +- .../src/update/new/extract/searchable/mod.rs | 4 +- .../extract/searchable/tokenize_document.rs | 8 +- milli/src/update/new/indexer/mod.rs | 100 ++-- 6 files changed, 512 insertions(+), 73 deletions(-) diff --git a/milli/src/update/new/extract/searchable/extract_fid_word_count_docids.rs b/milli/src/update/new/extract/searchable/extract_fid_word_count_docids.rs index 4d90b46d4..b4cf50190 100644 --- a/milli/src/update/new/extract/searchable/extract_fid_word_count_docids.rs +++ b/milli/src/update/new/extract/searchable/extract_fid_word_count_docids.rs @@ -38,7 +38,7 @@ impl SearchableExtractor for FidWordCountDocidsExtractor { match document_change { DocumentChange::Deletion(inner) => { let mut fid_word_count = HashMap::new(); - let mut token_fn = |fid: FieldId, _pos: u16, _word: &str| { + let mut token_fn = |_fname: &str, fid: FieldId, _pos: u16, _word: &str| { fid_word_count.entry(fid).and_modify(|count| *count += 1).or_insert(1); Ok(()) }; @@ -58,7 +58,7 @@ impl SearchableExtractor for FidWordCountDocidsExtractor { } DocumentChange::Update(inner) => { let mut fid_word_count = HashMap::new(); - let mut token_fn = |fid: FieldId, _pos: u16, _word: &str| { + let mut token_fn = |_fname: &str, fid: FieldId, _pos: u16, _word: &str| { fid_word_count .entry(fid) .and_modify(|(current_count, _new_count)| *current_count += 1) @@ -71,7 +71,7 @@ impl SearchableExtractor for FidWordCountDocidsExtractor { &mut token_fn, )?; - let mut token_fn = |fid: FieldId, _pos: u16, _word: &str| { + let mut token_fn = |_fname: &str, fid: FieldId, _pos: u16, _word: &str| { fid_word_count .entry(fid) .and_modify(|(_current_count, new_count)| *new_count += 1) @@ -96,7 +96,7 @@ impl SearchableExtractor for FidWordCountDocidsExtractor { } DocumentChange::Insertion(inner) => { let mut fid_word_count = HashMap::new(); - let mut token_fn = |fid: FieldId, _pos: u16, _word: &str| { + let mut token_fn = |_fname: &str, fid: FieldId, _pos: u16, _word: &str| { fid_word_count.entry(fid).and_modify(|count| *count += 1).or_insert(1); Ok(()) }; diff --git a/milli/src/update/new/extract/searchable/extract_word_docids.rs b/milli/src/update/new/extract/searchable/extract_word_docids.rs index 0cf36cf00..feba205bf 100644 --- a/milli/src/update/new/extract/searchable/extract_word_docids.rs +++ b/milli/src/update/new/extract/searchable/extract_word_docids.rs @@ -1,17 +1,30 @@ -use std::borrow::Cow; +use std::collections::HashMap; +use std::{borrow::Cow, fs::File, num::NonZero}; +use grenad::Merger; +use grenad::MergerBuilder; use heed::RoTxn; +use rayon::iter::IntoParallelIterator; +use rayon::iter::ParallelIterator; -use super::{tokenize_document::DocumentTokenizer, SearchableExtractor}; +use super::{ + 
tokenize_document::{tokenizer_builder, DocumentTokenizer}, + SearchableExtractor, +}; +use crate::update::new::extract::perm_json_p::contained_in; +use crate::DocumentId; use crate::{ bucketed_position, update::{ - new::{extract::cache::CboCachedSorter, DocumentChange}, - MergeDeladdCboRoaringBitmaps, + create_sorter, + new::{extract::cache::CboCachedSorter, DocumentChange, ItemsPool}, + GrenadParameters, MergeDeladdCboRoaringBitmaps, }, - FieldId, GlobalFieldsIdsMap, Index, Result, + FieldId, GlobalFieldsIdsMap, Index, Result, MAX_POSITION_PER_ATTRIBUTE, }; +const MAX_COUNTED_WORDS: usize = 30; + trait ProtoWordDocidsExtractor { fn build_key(field_id: FieldId, position: u16, word: &str) -> Cow<'_, [u8]>; fn attributes_to_extract<'a>( @@ -36,7 +49,7 @@ where ) -> Result<()> { match document_change { DocumentChange::Deletion(inner) => { - let mut token_fn = |fid, pos: u16, word: &str| { + let mut token_fn = |_fname: &str, fid, pos, word: &str| { let key = Self::build_key(fid, pos, word); cached_sorter.insert_del_u32(&key, inner.docid()).map_err(crate::Error::from) }; @@ -47,7 +60,7 @@ where )?; } DocumentChange::Update(inner) => { - let mut token_fn = |fid, pos, word: &str| { + let mut token_fn = |_fname: &str, fid, pos, word: &str| { let key = Self::build_key(fid, pos, word); cached_sorter.insert_del_u32(&key, inner.docid()).map_err(crate::Error::from) }; @@ -57,14 +70,14 @@ where &mut token_fn, )?; - let mut token_fn = |fid, pos, word: &str| { + let mut token_fn = |_fname: &str, fid, pos, word: &str| { let key = Self::build_key(fid, pos, word); cached_sorter.insert_add_u32(&key, inner.docid()).map_err(crate::Error::from) }; document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?; } DocumentChange::Insertion(inner) => { - let mut token_fn = |fid, pos, word: &str| { + let mut token_fn = |_fname: &str, fid, pos, word: &str| { let key = Self::build_key(fid, pos, word); cached_sorter.insert_add_u32(&key, inner.docid()).map_err(crate::Error::from) }; @@ -181,3 +194,435 @@ impl ProtoWordDocidsExtractor for WordPositionDocidsExtractor { Cow::Owned(key) } } + +// V2 + +struct WordDocidsCachedSorters { + word_fid_docids: CboCachedSorter, + word_docids: CboCachedSorter, + exact_word_docids: CboCachedSorter, + word_position_docids: CboCachedSorter, + fid_word_count_docids: CboCachedSorter, + fid_word_count: HashMap, + current_docid: Option, +} + +impl WordDocidsCachedSorters { + pub fn new( + indexer: GrenadParameters, + max_memory: Option, + capacity: NonZero, + ) -> Self { + let max_memory = max_memory.map(|max_memory| max_memory / 4); + + let word_fid_docids = CboCachedSorter::new( + capacity, + create_sorter( + grenad::SortAlgorithm::Stable, + MergeDeladdCboRoaringBitmaps, + indexer.chunk_compression_type, + indexer.chunk_compression_level, + indexer.max_nb_chunks, + max_memory, + ), + ); + let word_docids = CboCachedSorter::new( + capacity, + create_sorter( + grenad::SortAlgorithm::Stable, + MergeDeladdCboRoaringBitmaps, + indexer.chunk_compression_type, + indexer.chunk_compression_level, + indexer.max_nb_chunks, + max_memory, + ), + ); + let exact_word_docids = CboCachedSorter::new( + capacity, + create_sorter( + grenad::SortAlgorithm::Stable, + MergeDeladdCboRoaringBitmaps, + indexer.chunk_compression_type, + indexer.chunk_compression_level, + indexer.max_nb_chunks, + max_memory, + ), + ); + let word_position_docids = CboCachedSorter::new( + capacity, + create_sorter( + grenad::SortAlgorithm::Stable, + MergeDeladdCboRoaringBitmaps, + indexer.chunk_compression_type, + 
indexer.chunk_compression_level, + indexer.max_nb_chunks, + max_memory, + ), + ); + let fid_word_count_docids = CboCachedSorter::new( + capacity, + create_sorter( + grenad::SortAlgorithm::Stable, + MergeDeladdCboRoaringBitmaps, + indexer.chunk_compression_type, + indexer.chunk_compression_level, + indexer.max_nb_chunks, + max_memory, + ), + ); + + Self { + word_fid_docids, + word_docids, + exact_word_docids, + word_position_docids, + fid_word_count_docids, + fid_word_count: HashMap::new(), + current_docid: None, + } + } + + fn insert_add_u32( + &mut self, + field_id: FieldId, + position: u16, + word: &str, + exact: bool, + docid: u32, + buffer: &mut Vec, + ) -> Result<()> { + let key = word.as_bytes(); + if exact { + self.exact_word_docids.insert_add_u32(key, docid)?; + } else { + self.word_docids.insert_add_u32(key, docid)?; + } + + buffer.clear(); + buffer.extend_from_slice(word.as_bytes()); + buffer.push(0); + buffer.extend_from_slice(&position.to_be_bytes()); + self.word_fid_docids.insert_add_u32(key, docid)?; + + buffer.clear(); + buffer.extend_from_slice(word.as_bytes()); + buffer.push(0); + buffer.extend_from_slice(&field_id.to_be_bytes()); + self.word_position_docids.insert_add_u32(buffer, docid)?; + + if self.current_docid.map_or(false, |id| docid != id) { + self.flush_fid_word_count(buffer)?; + } + + self.fid_word_count + .entry(field_id) + .and_modify(|(_current_count, new_count)| *new_count += 1) + .or_insert((0, 1)); + self.current_docid = Some(docid); + + Ok(()) + } + + fn insert_del_u32( + &mut self, + field_id: FieldId, + position: u16, + word: &str, + exact: bool, + docid: u32, + buffer: &mut Vec, + ) -> Result<()> { + let key = word.as_bytes(); + if exact { + self.exact_word_docids.insert_del_u32(key, docid)?; + } else { + self.word_docids.insert_del_u32(key, docid)?; + } + + buffer.clear(); + buffer.extend_from_slice(word.as_bytes()); + buffer.push(0); + buffer.extend_from_slice(&position.to_be_bytes()); + self.word_fid_docids.insert_del_u32(key, docid)?; + + buffer.clear(); + buffer.extend_from_slice(word.as_bytes()); + buffer.push(0); + buffer.extend_from_slice(&field_id.to_be_bytes()); + self.word_position_docids.insert_del_u32(buffer, docid)?; + + if self.current_docid.map_or(false, |id| docid != id) { + self.flush_fid_word_count(buffer)?; + } + + self.fid_word_count + .entry(field_id) + .and_modify(|(current_count, _new_count)| *current_count += 1) + .or_insert((1, 0)); + self.current_docid = Some(docid); + + Ok(()) + } + + fn flush_fid_word_count(&mut self, buffer: &mut Vec) -> Result<()> { + for (fid, (current_count, new_count)) in self.fid_word_count.drain() { + if current_count != new_count { + if current_count <= MAX_COUNTED_WORDS { + buffer.clear(); + buffer.extend_from_slice(&fid.to_be_bytes()); + buffer.push(current_count as u8); + self.fid_word_count_docids + .insert_del_u32(buffer, self.current_docid.unwrap())?; + } + if new_count <= MAX_COUNTED_WORDS { + buffer.clear(); + buffer.extend_from_slice(&fid.to_be_bytes()); + buffer.push(new_count as u8); + self.fid_word_count_docids + .insert_add_u32(buffer, self.current_docid.unwrap())?; + } + } + } + + Ok(()) + } +} + +struct WordDocidsMergerBuilders { + word_fid_docids: MergerBuilder, + word_docids: MergerBuilder, + exact_word_docids: MergerBuilder, + word_position_docids: MergerBuilder, + fid_word_count_docids: MergerBuilder, +} + +pub struct WordDocidsMergers { + pub word_fid_docids: Merger, + pub word_docids: Merger, + pub exact_word_docids: Merger, + pub word_position_docids: Merger, + pub 
fid_word_count_docids: Merger, +} + +impl WordDocidsMergerBuilders { + fn new() -> Self { + Self { + word_fid_docids: MergerBuilder::new(MergeDeladdCboRoaringBitmaps), + word_docids: MergerBuilder::new(MergeDeladdCboRoaringBitmaps), + exact_word_docids: MergerBuilder::new(MergeDeladdCboRoaringBitmaps), + word_position_docids: MergerBuilder::new(MergeDeladdCboRoaringBitmaps), + fid_word_count_docids: MergerBuilder::new(MergeDeladdCboRoaringBitmaps), + } + } + + fn add_sorters(&mut self, other: WordDocidsCachedSorters) -> Result<()> { + let WordDocidsCachedSorters { + word_fid_docids, + word_docids, + exact_word_docids, + word_position_docids, + fid_word_count_docids, + fid_word_count: _, + current_docid: _, + } = other; + + let sorter = word_fid_docids.into_sorter()?; + let readers = sorter.into_reader_cursors()?; + self.word_fid_docids.extend(readers); + let sorter = word_docids.into_sorter()?; + let readers = sorter.into_reader_cursors()?; + self.word_docids.extend(readers); + let sorter = exact_word_docids.into_sorter()?; + let readers = sorter.into_reader_cursors()?; + self.exact_word_docids.extend(readers); + let sorter = word_position_docids.into_sorter()?; + let readers = sorter.into_reader_cursors()?; + self.word_position_docids.extend(readers); + let sorter = fid_word_count_docids.into_sorter()?; + let readers = sorter.into_reader_cursors()?; + self.fid_word_count_docids.extend(readers); + + Ok(()) + } + + fn build(self) -> WordDocidsMergers { + WordDocidsMergers { + word_fid_docids: self.word_fid_docids.build(), + word_docids: self.word_docids.build(), + exact_word_docids: self.exact_word_docids.build(), + word_position_docids: self.word_position_docids.build(), + fid_word_count_docids: self.fid_word_count_docids.build(), + } + } +} + +pub struct WordDocidsExtractors; + +impl WordDocidsExtractors { + pub fn run_extraction( + index: &Index, + fields_ids_map: &GlobalFieldsIdsMap, + indexer: GrenadParameters, + document_changes: impl IntoParallelIterator>, + ) -> Result { + let max_memory = indexer.max_memory_by_thread(); + + let rtxn = index.read_txn()?; + let stop_words = index.stop_words(&rtxn)?; + let allowed_separators = index.allowed_separators(&rtxn)?; + let allowed_separators: Option> = + allowed_separators.as_ref().map(|s| s.iter().map(String::as_str).collect()); + let dictionary = index.dictionary(&rtxn)?; + let dictionary: Option> = + dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect()); + let builder = tokenizer_builder( + stop_words.as_ref(), + allowed_separators.as_deref(), + dictionary.as_deref(), + ); + let tokenizer = builder.into_tokenizer(); + + let attributes_to_extract = Self::attributes_to_extract(&rtxn, index)?; + let attributes_to_skip = Self::attributes_to_skip(&rtxn, index)?; + let localized_attributes_rules = + index.localized_attributes_rules(&rtxn)?.unwrap_or_default(); + + let document_tokenizer = DocumentTokenizer { + tokenizer: &tokenizer, + attribute_to_extract: attributes_to_extract.as_deref(), + attribute_to_skip: attributes_to_skip.as_slice(), + localized_attributes_rules: &localized_attributes_rules, + max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE, + }; + + let context_pool = ItemsPool::new(|| { + Ok(( + index.read_txn()?, + &document_tokenizer, + fields_ids_map.clone(), + WordDocidsCachedSorters::new( + indexer, + max_memory, + // TODO use a better value + 200_000.try_into().unwrap(), + ), + )) + }); + + document_changes.into_par_iter().try_for_each(|document_change| { + context_pool.with(|(rtxn, document_tokenizer, 
fields_ids_map, cached_sorter)| { + Self::extract_document_change( + &*rtxn, + index, + document_tokenizer, + fields_ids_map, + cached_sorter, + document_change?, + ) + }) + })?; + + let mut builder = WordDocidsMergerBuilders::new(); + for (_rtxn, _tokenizer, _fields_ids_map, cache) in context_pool.into_items() { + builder.add_sorters(cache)?; + } + + Ok(builder.build()) + } + + fn extract_document_change( + rtxn: &RoTxn, + index: &Index, + document_tokenizer: &DocumentTokenizer, + fields_ids_map: &mut GlobalFieldsIdsMap, + cached_sorter: &mut WordDocidsCachedSorters, + document_change: DocumentChange, + ) -> Result<()> { + let exact_attributes = index.exact_attributes(&rtxn)?; + let is_exact_attribute = + |fname: &str| exact_attributes.iter().any(|attr| contained_in(fname, attr)); + let mut buffer = Vec::new(); + match document_change { + DocumentChange::Deletion(inner) => { + let mut token_fn = |fname: &str, fid, pos, word: &str| { + cached_sorter + .insert_del_u32( + fid, + pos, + word, + is_exact_attribute(fname), + inner.docid(), + &mut buffer, + ) + .map_err(crate::Error::from) + }; + document_tokenizer.tokenize_document( + inner.current(rtxn, index)?.unwrap(), + fields_ids_map, + &mut token_fn, + )?; + } + DocumentChange::Update(inner) => { + let mut token_fn = |fname: &str, fid, pos, word: &str| { + cached_sorter + .insert_del_u32( + fid, + pos, + word, + is_exact_attribute(fname), + inner.docid(), + &mut buffer, + ) + .map_err(crate::Error::from) + }; + document_tokenizer.tokenize_document( + inner.current(rtxn, index)?.unwrap(), + fields_ids_map, + &mut token_fn, + )?; + + let mut token_fn = |fname: &str, fid, pos, word: &str| { + cached_sorter + .insert_add_u32( + fid, + pos, + word, + is_exact_attribute(fname), + inner.docid(), + &mut buffer, + ) + .map_err(crate::Error::from) + }; + document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?; + } + DocumentChange::Insertion(inner) => { + let mut token_fn = |fname: &str, fid, pos, word: &str| { + cached_sorter + .insert_add_u32( + fid, + pos, + word, + is_exact_attribute(fname), + inner.docid(), + &mut buffer, + ) + .map_err(crate::Error::from) + }; + document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?; + } + } + + cached_sorter.flush_fid_word_count(&mut buffer) + } + + fn attributes_to_extract<'a>( + rtxn: &'a RoTxn, + index: &'a Index, + ) -> Result>> { + index.user_defined_searchable_fields(rtxn).map_err(Into::into) + } + + fn attributes_to_skip<'a>(_rtxn: &'a RoTxn, _index: &'a Index) -> Result> { + Ok(vec![]) + } +} diff --git a/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs b/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs index dbd08901b..f0d53833b 100644 --- a/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs +++ b/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs @@ -149,7 +149,7 @@ fn process_document_tokens( word_positions: &mut VecDeque<(String, u16)>, word_pair_proximity: &mut BTreeMap<(String, String), u8>, ) -> Result<()> { - let mut token_fn = |_fid: FieldId, pos: u16, word: &str| { + let mut token_fn = |_fname: &str, _fid: FieldId, pos: u16, word: &str| { // drain the proximity window until the head word is considered close to the word we are inserting. 
while word_positions .front() diff --git a/milli/src/update/new/extract/searchable/mod.rs b/milli/src/update/new/extract/searchable/mod.rs index c3ac30b17..468fded9a 100644 --- a/milli/src/update/new/extract/searchable/mod.rs +++ b/milli/src/update/new/extract/searchable/mod.rs @@ -7,8 +7,8 @@ use std::fs::File; pub use extract_fid_word_count_docids::FidWordCountDocidsExtractor; pub use extract_word_docids::{ - ExactWordDocidsExtractor, WordDocidsExtractor, WordFidDocidsExtractor, - WordPositionDocidsExtractor, + ExactWordDocidsExtractor, WordDocidsExtractor, WordDocidsExtractors, WordDocidsMergers, + WordFidDocidsExtractor, WordPositionDocidsExtractor, }; pub use extract_word_pair_proximity_docids::WordPairProximityDocidsExtractor; use grenad::Merger; diff --git a/milli/src/update/new/extract/searchable/tokenize_document.rs b/milli/src/update/new/extract/searchable/tokenize_document.rs index 7e23c9301..829bf8a49 100644 --- a/milli/src/update/new/extract/searchable/tokenize_document.rs +++ b/milli/src/update/new/extract/searchable/tokenize_document.rs @@ -26,7 +26,7 @@ impl<'a> DocumentTokenizer<'a> { &self, obkv: &KvReaderFieldId, field_id_map: &mut GlobalFieldsIdsMap, - token_fn: &mut impl FnMut(FieldId, u16, &str) -> Result<()>, + token_fn: &mut impl FnMut(&str, FieldId, u16, &str) -> Result<()>, ) -> Result<()> { let mut field_position = HashMap::new(); let mut field_name = String::new(); @@ -56,7 +56,7 @@ impl<'a> DocumentTokenizer<'a> { Value::Number(n) => { let token = n.to_string(); if let Ok(position) = (*position).try_into() { - token_fn(field_id, position, token.as_str())?; + token_fn(name, field_id, position, token.as_str())?; } Ok(()) @@ -80,7 +80,7 @@ impl<'a> DocumentTokenizer<'a> { if !token.is_empty() && token.len() <= MAX_WORD_LENGTH { *position = index; if let Ok(position) = (*position).try_into() { - token_fn(field_id, position, token)?; + token_fn(name, field_id, position, token)?; } } } @@ -235,7 +235,7 @@ mod test { let mut words = std::collections::BTreeMap::new(); document_tokenizer - .tokenize_document(obkv, &mut global_fields_ids_map, &mut |fid, pos, word| { + .tokenize_document(obkv, &mut global_fields_ids_map, &mut |_fname, fid, pos, word| { words.insert([fid, pos], word.to_string()); Ok(()) }) diff --git a/milli/src/update/new/indexer/mod.rs b/milli/src/update/new/indexer/mod.rs index e80b07671..5ef3439cc 100644 --- a/milli/src/update/new/indexer/mod.rs +++ b/milli/src/update/new/indexer/mod.rs @@ -58,7 +58,7 @@ where { let (merger_sender, writer_receiver) = merger_writer_channel(10_000); // This channel acts as a rendezvous point to ensure that we are one task ahead - let (extractor_sender, merger_receiver) = extractors_merger_channels(0); + let (extractor_sender, merger_receiver) = extractors_merger_channels(4); let fields_ids_map_lock = RwLock::new(fields_ids_map); let global_fields_ids_map = GlobalFieldsIdsMap::new(&fields_ids_map_lock); @@ -103,62 +103,56 @@ where { let span = tracing::trace_span!(target: "indexing::documents::extract", "word_docids"); let _entered = span.enter(); - extract_and_send_docids::( - index, - &global_fields_ids_map, - grenad_parameters, - document_changes.clone(), - &extractor_sender, - )?; + + let WordDocidsMergers { + word_fid_docids, + word_docids, + exact_word_docids, + word_position_docids, + fid_word_count_docids, + } = WordDocidsExtractors::run_extraction(index, &global_fields_ids_map, grenad_parameters, document_changes.clone())?; + extractor_sender.send_searchable::(word_docids).unwrap(); + 
extractor_sender.send_searchable::(word_fid_docids).unwrap(); + extractor_sender.send_searchable::(exact_word_docids).unwrap(); + extractor_sender.send_searchable::(word_position_docids).unwrap(); + extractor_sender.send_searchable::(fid_word_count_docids).unwrap(); } - { - let span = tracing::trace_span!(target: "indexing::documents::extract", "word_fid_docids"); - let _entered = span.enter(); - extract_and_send_docids::( - index, - &global_fields_ids_map, - grenad_parameters, - document_changes.clone(), - &extractor_sender, - )?; - } + // { + // let span = tracing::trace_span!(target: "indexing::documents::extract", "exact_word_docids"); + // let _entered = span.enter(); + // extract_and_send_docids::( + // index, + // &global_fields_ids_map, + // grenad_parameters, + // document_changes.clone(), + // &extractor_sender, + // )?; + // } - { - let span = tracing::trace_span!(target: "indexing::documents::extract", "exact_word_docids"); - let _entered = span.enter(); - extract_and_send_docids::( - index, - &global_fields_ids_map, - grenad_parameters, - document_changes.clone(), - &extractor_sender, - )?; - } + // { + // let span = tracing::trace_span!(target: "indexing::documents::extract", "word_position_docids"); + // let _entered = span.enter(); + // extract_and_send_docids::( + // index, + // &global_fields_ids_map, + // grenad_parameters, + // document_changes.clone(), + // &extractor_sender, + // )?; + // } - { - let span = tracing::trace_span!(target: "indexing::documents::extract", "word_position_docids"); - let _entered = span.enter(); - extract_and_send_docids::( - index, - &global_fields_ids_map, - grenad_parameters, - document_changes.clone(), - &extractor_sender, - )?; - } - - { - let span = tracing::trace_span!(target: "indexing::documents::extract", "fid_word_count_docids"); - let _entered = span.enter(); - extract_and_send_docids::( - index, - &global_fields_ids_map, - GrenadParameters::default(), - document_changes.clone(), - &extractor_sender, - )?; - } + // { + // let span = tracing::trace_span!(target: "indexing::documents::extract", "fid_word_count_docids"); + // let _entered = span.enter(); + // extract_and_send_docids::( + // index, + // &global_fields_ids_map, + // GrenadParameters::default(), + // document_changes.clone(), + // &extractor_sender, + // )?; + // } { let span = tracing::trace_span!(target: "indexing::documents::extract", "word_pair_proximity_docids"); From 2b317c681b4735bd8240ba6be7551b55266dcd97 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 11 Sep 2024 11:49:26 +0200 Subject: [PATCH 059/247] Build mergers in parallel --- .../extract/searchable/extract_word_docids.rs | 91 ++++++++++++------- .../extract_word_pair_proximity_docids.rs | 2 +- .../src/update/new/extract/searchable/mod.rs | 55 +++++++---- 3 files changed, 95 insertions(+), 53 deletions(-) diff --git a/milli/src/update/new/extract/searchable/extract_word_docids.rs b/milli/src/update/new/extract/searchable/extract_word_docids.rs index feba205bf..f454269f6 100644 --- a/milli/src/update/new/extract/searchable/extract_word_docids.rs +++ b/milli/src/update/new/extract/searchable/extract_word_docids.rs @@ -426,21 +426,38 @@ impl WordDocidsMergerBuilders { current_docid: _, } = other; - let sorter = word_fid_docids.into_sorter()?; - let readers = sorter.into_reader_cursors()?; - self.word_fid_docids.extend(readers); - let sorter = word_docids.into_sorter()?; - let readers = sorter.into_reader_cursors()?; - self.word_docids.extend(readers); - let sorter = exact_word_docids.into_sorter()?; - 
let readers = sorter.into_reader_cursors()?; - self.exact_word_docids.extend(readers); - let sorter = word_position_docids.into_sorter()?; - let readers = sorter.into_reader_cursors()?; - self.word_position_docids.extend(readers); - let sorter = fid_word_count_docids.into_sorter()?; - let readers = sorter.into_reader_cursors()?; - self.fid_word_count_docids.extend(readers); + let mut word_fid_docids_readers = Ok(vec![]); + let mut word_docids_readers = Ok(vec![]); + let mut exact_word_docids_readers = Ok(vec![]); + let mut word_position_docids_readers = Ok(vec![]); + let mut fid_word_count_docids_readers = Ok(vec![]); + rayon::scope(|s| { + s.spawn(|_| { + word_fid_docids_readers = + word_fid_docids.into_sorter().and_then(|s| s.into_reader_cursors()); + }); + s.spawn(|_| { + word_docids_readers = + word_docids.into_sorter().and_then(|s| s.into_reader_cursors()); + }); + s.spawn(|_| { + exact_word_docids_readers = + exact_word_docids.into_sorter().and_then(|s| s.into_reader_cursors()); + }); + s.spawn(|_| { + word_position_docids_readers = + word_position_docids.into_sorter().and_then(|s| s.into_reader_cursors()); + }); + s.spawn(|_| { + fid_word_count_docids_readers = + fid_word_count_docids.into_sorter().and_then(|s| s.into_reader_cursors()); + }); + }); + self.word_fid_docids.extend(word_fid_docids_readers?); + self.word_docids.extend(word_docids_readers?); + self.exact_word_docids.extend(exact_word_docids_readers?); + self.word_position_docids.extend(word_position_docids_readers?); + self.fid_word_count_docids.extend(fid_word_count_docids_readers?); Ok(()) } @@ -509,25 +526,35 @@ impl WordDocidsExtractors { )) }); - document_changes.into_par_iter().try_for_each(|document_change| { - context_pool.with(|(rtxn, document_tokenizer, fields_ids_map, cached_sorter)| { - Self::extract_document_change( - &*rtxn, - index, - document_tokenizer, - fields_ids_map, - cached_sorter, - document_change?, - ) - }) - })?; - - let mut builder = WordDocidsMergerBuilders::new(); - for (_rtxn, _tokenizer, _fields_ids_map, cache) in context_pool.into_items() { - builder.add_sorters(cache)?; + { + let span = + tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction"); + let _entered = span.enter(); + document_changes.into_par_iter().try_for_each(|document_change| { + context_pool.with(|(rtxn, document_tokenizer, fields_ids_map, cached_sorter)| { + Self::extract_document_change( + &*rtxn, + index, + document_tokenizer, + fields_ids_map, + cached_sorter, + document_change?, + ) + }) + })?; } - Ok(builder.build()) + { + let span = + tracing::trace_span!(target: "indexing::documents::extract", "merger_building"); + let _entered = span.enter(); + let mut builder = WordDocidsMergerBuilders::new(); + for (_rtxn, _tokenizer, _fields_ids_map, cache) in context_pool.into_items() { + builder.add_sorters(cache)?; + } + + Ok(builder.build()) + } } fn extract_document_change( diff --git a/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs b/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs index f0d53833b..7b3706424 100644 --- a/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs +++ b/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs @@ -107,7 +107,7 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { cached_sorter.insert_add_u32(key, docid)?; } } - }; + } } Ok(()) diff --git a/milli/src/update/new/extract/searchable/mod.rs b/milli/src/update/new/extract/searchable/mod.rs index 
468fded9a..7e096591e 100644 --- a/milli/src/update/new/extract/searchable/mod.rs +++ b/milli/src/update/new/extract/searchable/mod.rs @@ -13,7 +13,7 @@ pub use extract_word_docids::{ pub use extract_word_pair_proximity_docids::WordPairProximityDocidsExtractor; use grenad::Merger; use heed::RoTxn; -use rayon::iter::{IntoParallelIterator, ParallelIterator}; +use rayon::iter::{IntoParallelIterator, ParallelBridge, ParallelIterator}; use tokenize_document::{tokenizer_builder, DocumentTokenizer}; use super::cache::CboCachedSorter; @@ -78,27 +78,42 @@ pub trait SearchableExtractor { )) }); - document_changes.into_par_iter().try_for_each(|document_change| { - context_pool.with(|(rtxn, document_tokenizer, fields_ids_map, cached_sorter)| { - Self::extract_document_change( - &*rtxn, - index, - document_tokenizer, - fields_ids_map, - cached_sorter, - document_change?, - ) - }) - })?; - - let mut builder = grenad::MergerBuilder::new(MergeDeladdCboRoaringBitmaps); - for (_rtxn, _tokenizer, _fields_ids_map, cache) in context_pool.into_items() { - let sorter = cache.into_sorter()?; - let readers = sorter.into_reader_cursors()?; - builder.extend(readers); + { + let span = + tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction"); + let _entered = span.enter(); + document_changes.into_par_iter().try_for_each(|document_change| { + context_pool.with(|(rtxn, document_tokenizer, fields_ids_map, cached_sorter)| { + Self::extract_document_change( + &*rtxn, + index, + document_tokenizer, + fields_ids_map, + cached_sorter, + document_change?, + ) + }) + })?; } + { + let mut builder = grenad::MergerBuilder::new(MergeDeladdCboRoaringBitmaps); + let span = + tracing::trace_span!(target: "indexing::documents::extract", "merger_building"); + let _entered = span.enter(); - Ok(builder.build()) + let readers: Vec<_> = context_pool + .into_items() + .par_bridge() + .map(|(_rtxn, _tokenizer, _fields_ids_map, cached_sorter)| { + let sorter = cached_sorter.into_sorter()?; + sorter.into_reader_cursors() + }) + .collect(); + for reader in readers { + builder.extend(reader?); + } + Ok(builder.build()) + } } fn extract_document_change( From 2a0ad0982fbefb3d453951873814c4df7c094f74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 11 Sep 2024 10:20:26 +0200 Subject: [PATCH 060/247] Fix the document counter --- milli/src/update/new/indexer/document_operation.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/milli/src/update/new/indexer/document_operation.rs b/milli/src/update/new/indexer/document_operation.rs index 0521d43f9..935c130e5 100644 --- a/milli/src/update/new/indexer/document_operation.rs +++ b/milli/src/update/new/indexer/document_operation.rs @@ -1,6 +1,5 @@ use std::borrow::Cow; use std::collections::{BTreeMap, HashMap}; -use std::fmt; use std::sync::Arc; use heed::types::Bytes; @@ -13,7 +12,7 @@ use IndexDocumentsMethod as Idm; use super::super::document_change::DocumentChange; use super::super::items_pool::ItemsPool; use super::top_level_map::{CowStr, TopLevelMap}; -use super::{top_level_map, DocumentChanges}; +use super::DocumentChanges; use crate::documents::PrimaryKey; use crate::update::new::{Deletion, Insertion, KvReaderFieldId, KvWriterFieldId, Update}; use crate::update::{AvailableIds, IndexDocumentsMethod}; @@ -57,7 +56,8 @@ impl<'pl> DocumentOperation<'pl> { /// The payload is expected to be in the grenad format pub fn add_documents(&mut self, payload: &'pl Mmap) -> Result { payload.advise(memmap2::Advice::Sequential)?; - let 
document_count = memchr::Memchr::new(b'\n', &payload[..]).count(); + let document_count = + memchr::memmem::find_iter(&payload[..], "}{").count().saturating_add(1); self.operations.push(Payload::Addition(&payload[..])); Ok(PayloadStats { bytes: payload.len() as u64, document_count }) } From 3e9198ebaa04fe0865aaed582a88f55be3388fd5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 11 Sep 2024 15:59:30 +0200 Subject: [PATCH 061/247] Support guessing primary key again --- index-scheduler/src/batch.rs | 47 ++++++++------- milli/src/update/new/indexer/mod.rs | 57 ++++++++----------- milli/src/update/new/indexer/top_level_map.rs | 30 +++++++++- 3 files changed, 79 insertions(+), 55 deletions(-) diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs index 506ba6581..f9463a137 100644 --- a/index-scheduler/src/batch.rs +++ b/index-scheduler/src/batch.rs @@ -29,14 +29,17 @@ use meilisearch_types::error::Code; use meilisearch_types::heed::{RoTxn, RwTxn}; use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader, PrimaryKey}; use meilisearch_types::milli::heed::CompactionOption; -use meilisearch_types::milli::update::new::indexer::{self, guess_primary_key, DocumentChanges}; +use meilisearch_types::milli::update::new::indexer::{ + self, retrieve_or_guess_primary_key, DocumentChanges, +}; +use meilisearch_types::milli::update::new::TopLevelMap; use meilisearch_types::milli::update::{ IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings as MilliSettings, }; use meilisearch_types::milli::vector::parsed_vectors::{ ExplicitVectors, VectorOrArrayOfVectors, RESERVED_VECTORS_FIELD_NAME, }; -use meilisearch_types::milli::{self, Filter, Object}; +use meilisearch_types::milli::{self, Filter, InternalError, Object}; use meilisearch_types::settings::{apply_settings_to_builder, Settings, Unchecked}; use meilisearch_types::tasks::{Details, IndexSwap, Kind, KindWithContent, Status, Task}; use meilisearch_types::{compression, Index, VERSION_FILE_NAME}; @@ -1296,22 +1299,34 @@ impl IndexScheduler { }) .unwrap(); - // let content_file = self.file_store.get_update(*first_addition_uuid)?; - // let reader = - // DocumentsBatchReader::from_reader(content_file).map_err(milli::Error::from)?; - // let (cursor, documents_batch_index) = reader.into_cursor_and_fields_index(); - // let primary_key = - // guess_primary_key(&rtxn, index, cursor, &documents_batch_index)?.unwrap(); - let mut content_files = Vec::new(); for operation in &operations { if let DocumentOperation::Add(content_uuid) = operation { let content_file = self.file_store.get_update(*content_uuid)?; let mmap = unsafe { memmap2::Mmap::map(&content_file)? }; - content_files.push(mmap); + if !mmap.is_empty() { + content_files.push(mmap); + } } } + let mut fields_ids_map = index.fields_ids_map(&rtxn)?; + let first_document = match content_files.first() { + Some(mmap) => { + let mut iter = serde_json::Deserializer::from_slice(mmap).into_iter(); + iter.next().transpose().map_err(|e| e.into()).map_err(Error::IoError)? + } + None => None, + }; + + let primary_key = retrieve_or_guess_primary_key( + &rtxn, + index, + &mut fields_ids_map, + first_document.as_ref(), + )? 
+ .unwrap(); + let mut content_files_iter = content_files.iter(); let mut indexer = indexer::DocumentOperation::new(method); for (operation, task) in operations.into_iter().zip(tasks.iter_mut()) { @@ -1364,21 +1379,9 @@ impl IndexScheduler { } if !tasks.iter().all(|res| res.error.is_some()) { - let mut fields_ids_map = index.fields_ids_map(&rtxn)?; /// TODO create a pool if needed // let pool = indexer_config.thread_pool.unwrap(); let pool = rayon::ThreadPoolBuilder::new().build().unwrap(); - // let fields_ids_map = RwLock::new(fields_ids_map); - - /// TODO correctly guess the primary key in a NDJSON - let pk = match std::env::var("MEILI_PRIMARY_KEY") { - Ok(pk) => pk, - Err(VarError::NotPresent) => "id".to_string(), - Err(e) => panic!("primary key error: {e}"), - }; - - fields_ids_map.insert(&pk); - let primary_key = PrimaryKey::new(&pk, &fields_ids_map).unwrap(); let param = (index, &rtxn, &primary_key); let document_changes = indexer.document_changes(&mut fields_ids_map, param)?; diff --git a/milli/src/update/new/indexer/mod.rs b/milli/src/update/new/indexer/mod.rs index 5ef3439cc..0273d4fe2 100644 --- a/milli/src/update/new/indexer/mod.rs +++ b/milli/src/update/new/indexer/mod.rs @@ -22,7 +22,7 @@ use crate::documents::{ }; use crate::update::new::channel::{DatabaseType, ExtractorSender}; use crate::update::GrenadParameters; -use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, Result, UserError}; +use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, Result, UserError}; mod document_deletion; mod document_operation; @@ -242,53 +242,46 @@ fn extract_and_send_docids( Ok(sender.send_searchable::(merger).unwrap()) } +/// Returns the primary key *field id* that has already been set for this index or the +/// one we will guess by searching for the first key that contains "id" as a substring. /// TODO move this elsewhere -pub fn guess_primary_key<'a>( +pub fn retrieve_or_guess_primary_key<'a>( rtxn: &'a RoTxn<'a>, index: &Index, - mut cursor: DocumentsBatchCursor, - documents_batch_index: &'a DocumentsBatchIndex, + fields_ids_map: &mut FieldsIdsMap, + first_document: Option<&'a TopLevelMap<'_>>, ) -> Result, UserError>> { - // The primary key *field id* that has already been set for this index or the one - // we will guess by searching for the first key that contains "id" as a substring. match index.primary_key(rtxn)? { - Some(primary_key) => match PrimaryKey::new(primary_key, documents_batch_index) { + Some(primary_key) => match PrimaryKey::new(primary_key, fields_ids_map) { Some(primary_key) => Ok(Ok(primary_key)), - None => match cursor.next_document()? { - Some(first_document) => Ok(Err(UserError::MissingDocumentId { - primary_key: primary_key.to_string(), - document: obkv_to_object(first_document, documents_batch_index)?, - })), - None => unreachable!("Called with reader.is_empty()"), - }, + None => unreachable!("Why is the primary key not in the fidmap?"), }, None => { - let mut guesses: Vec<(u16, &str)> = documents_batch_index - .iter() - .filter(|(_, name)| name.to_lowercase().ends_with(DEFAULT_PRIMARY_KEY)) - .map(|(field_id, name)| (*field_id, name.as_str())) + let first_document = match first_document { + Some(document) => document, + None => return Ok(Err(UserError::NoPrimaryKeyCandidateFound)), + }; + + let mut guesses: Vec<&str> = first_document + .keys() + .map(AsRef::as_ref) + .filter(|name| name.to_lowercase().ends_with(DEFAULT_PRIMARY_KEY)) .collect(); - // sort the keys in a deterministic, obvious way, so that fields are always in the same order. 
- guesses.sort_by(|(_, left_name), (_, right_name)| { - // shortest name first - left_name.len().cmp(&right_name.len()).then_with( - // then alphabetical order - || left_name.cmp(right_name), - ) - }); + // sort the keys in lexicographical order, so that fields are always in the same order. + guesses.sort_unstable(); match guesses.as_slice() { [] => Ok(Err(UserError::NoPrimaryKeyCandidateFound)), - [(field_id, name)] => { + [name] => { tracing::info!("Primary key was not specified in index. Inferred to '{name}'"); - Ok(Ok(PrimaryKey::Flat { name, field_id: *field_id })) + match fields_ids_map.insert(name) { + Some(field_id) => Ok(Ok(PrimaryKey::Flat { name, field_id })), + None => Ok(Err(UserError::AttributeLimitReached)), + } } multiple => Ok(Err(UserError::MultiplePrimaryKeyCandidatesFound { - candidates: multiple - .iter() - .map(|(_, candidate)| candidate.to_string()) - .collect(), + candidates: multiple.iter().map(|candidate| candidate.to_string()).collect(), })), } } diff --git a/milli/src/update/new/indexer/top_level_map.rs b/milli/src/update/new/indexer/top_level_map.rs index d82e42dca..f79b6e9ee 100644 --- a/milli/src/update/new/indexer/top_level_map.rs +++ b/milli/src/update/new/indexer/top_level_map.rs @@ -1,13 +1,41 @@ use std::borrow::{Borrow, Cow}; use std::collections::BTreeMap; -use std::fmt; +use std::{fmt, ops}; use serde::{Deserialize, Serialize}; use serde_json::value::RawValue; +use serde_json::{Map, Value}; #[derive(Deserialize, Serialize)] pub struct TopLevelMap<'p>(#[serde(borrow)] pub BTreeMap, &'p RawValue>); +impl TryFrom<&'_ TopLevelMap<'_>> for Map { + type Error = serde_json::Error; + + fn try_from(tlmap: &TopLevelMap<'_>) -> Result { + let mut object = Map::new(); + for (k, v) in &tlmap.0 { + let value = serde_json::from_str(v.get())?; + object.insert(k.to_string(), value); + } + Ok(object) + } +} + +impl<'p> ops::Deref for TopLevelMap<'p> { + type Target = BTreeMap, &'p RawValue>; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl ops::DerefMut for TopLevelMap<'_> { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + #[derive(Deserialize, Serialize, PartialEq, Eq, PartialOrd, Ord, Hash, Clone)] pub struct CowStr<'p>(#[serde(borrow)] pub Cow<'p, str>); From e7d9db078fbbfecf2f1c393fac7ddde490994c4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 11 Sep 2024 18:27:00 +0200 Subject: [PATCH 062/247] Use the right key name when convertir from CSV to NDJSON --- meilisearch-types/src/document_formats.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/meilisearch-types/src/document_formats.rs b/meilisearch-types/src/document_formats.rs index b0411250e..4b9d59462 100644 --- a/meilisearch-types/src/document_formats.rs +++ b/meilisearch-types/src/document_formats.rs @@ -155,7 +155,8 @@ pub fn read_csv(input: &File, output: impl io::Write, delimiter: u8) -> Result = headers.iter().map(parse_csv_header).collect(); - let mut object: Map<_, _> = headers.iter().map(|k| (k.to_string(), Value::Null)).collect(); + let mut object: Map<_, _> = + typed_fields.iter().map(|(k, _)| (k.to_string(), Value::Null)).collect(); let mut line = 0; let mut record = csv::StringRecord::new(); From 0d868f36d74dea24748d507056eb4e20285461e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 11 Sep 2024 18:38:04 +0200 Subject: [PATCH 063/247] Make sure we always use a BufWriter to write the update files --- meilisearch-types/src/document_formats.rs | 3 ++- 1 file changed, 2 insertions(+), 1 
deletion(-) diff --git a/meilisearch-types/src/document_formats.rs b/meilisearch-types/src/document_formats.rs index 4b9d59462..b40c4d0b6 100644 --- a/meilisearch-types/src/document_formats.rs +++ b/meilisearch-types/src/document_formats.rs @@ -244,10 +244,11 @@ pub fn read_json(input: &File, output: impl io::Write) -> Result { } /// Reads NDJSON from file and write it in NDJSON in a file checking it along the way. -pub fn read_ndjson(input: &File, mut output: impl io::Write) -> Result { +pub fn read_ndjson(input: &File, output: impl io::Write) -> Result { // We memory map to be able to deserailize into a TopLevelMap<'pl> that // does not allocate when possible and only materialize the first/top level. let input = unsafe { Mmap::map(input).map_err(DocumentFormatError::Io)? }; + let mut output = BufWriter::new(output); let mut count = 0; for result in serde_json::Deserializer::from_slice(&input).into_iter() { From eb9a20ff0b9cd3f6c49e89bf4541909fde0a6299 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 12 Sep 2024 11:08:18 +0200 Subject: [PATCH 064/247] Fix fid_word_docids extraction --- .../src/update/new/extract/searchable/extract_word_docids.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/milli/src/update/new/extract/searchable/extract_word_docids.rs b/milli/src/update/new/extract/searchable/extract_word_docids.rs index f454269f6..652964b11 100644 --- a/milli/src/update/new/extract/searchable/extract_word_docids.rs +++ b/milli/src/update/new/extract/searchable/extract_word_docids.rs @@ -302,7 +302,7 @@ impl WordDocidsCachedSorters { buffer.extend_from_slice(word.as_bytes()); buffer.push(0); buffer.extend_from_slice(&position.to_be_bytes()); - self.word_fid_docids.insert_add_u32(key, docid)?; + self.word_fid_docids.insert_add_u32(buffer, docid)?; buffer.clear(); buffer.extend_from_slice(word.as_bytes()); @@ -343,7 +343,7 @@ impl WordDocidsCachedSorters { buffer.extend_from_slice(word.as_bytes()); buffer.push(0); buffer.extend_from_slice(&position.to_be_bytes()); - self.word_fid_docids.insert_del_u32(key, docid)?; + self.word_fid_docids.insert_del_u32(buffer, docid)?; buffer.clear(); buffer.extend_from_slice(word.as_bytes()); From aa69308e4555ede4f96f9068881524ff6b4988bb Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 12 Sep 2024 11:48:00 +0200 Subject: [PATCH 065/247] Use a bufWriter to build word FSTs --- milli/src/update/new/merger.rs | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/milli/src/update/new/merger.rs b/milli/src/update/new/merger.rs index b38dc0865..22c4baf26 100644 --- a/milli/src/update/new/merger.rs +++ b/milli/src/update/new/merger.rs @@ -6,6 +6,7 @@ use heed::types::Bytes; use heed::{Database, RoTxn}; use memmap2::Mmap; use roaring::RoaringBitmap; +use std::io::BufWriter; use tempfile::tempfile; use super::channel::*; @@ -60,8 +61,8 @@ pub fn merge_grenad_entries( let span = tracing::trace_span!(target: "indexing::documents::merge", "word_docids"); let _entered = span.enter(); - let mut add_words_fst = SetBuilder::new(tempfile()?)?; - let mut del_words_fst = SetBuilder::new(tempfile()?)?; + let mut add_words_fst = SetBuilder::new(BufWriter::new(tempfile()?))?; + let mut del_words_fst = SetBuilder::new(BufWriter::new(tempfile()?))?; merge_and_send_docids( merger, @@ -153,16 +154,16 @@ pub fn merge_grenad_entries( } fn compute_new_words_fst( - add_words_fst: SetBuilder, - del_words_fst: SetBuilder, + add_words_fst: SetBuilder>, + del_words_fst: SetBuilder>, words_fst: Set>, ) -> Result { let 
add_words_fst_file = add_words_fst.into_inner()?; - let add_words_fst_mmap = unsafe { Mmap::map(&add_words_fst_file)? }; + let add_words_fst_mmap = unsafe { Mmap::map(&add_words_fst_file.into_inner().unwrap())? }; let add_words_fst = Set::new(&add_words_fst_mmap)?; let del_words_fst_file = del_words_fst.into_inner()?; - let del_words_fst_mmap = unsafe { Mmap::map(&del_words_fst_file)? }; + let del_words_fst_mmap = unsafe { Mmap::map(&del_words_fst_file.into_inner().unwrap())? }; let del_words_fst = Set::new(&del_words_fst_mmap)?; let diff = words_fst.op().add(&del_words_fst).difference(); From ff5d3b59f5067b000ba3a0e321002c8723da01f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 12 Sep 2024 12:01:31 +0200 Subject: [PATCH 066/247] Move the document id extraction to the primary key code --- milli/src/documents/primary_key.rs | 43 +++++++++++- .../update/new/indexer/document_operation.rs | 67 +++++-------------- milli/src/update/new/indexer/top_level_map.rs | 8 +++ 3 files changed, 65 insertions(+), 53 deletions(-) diff --git a/milli/src/documents/primary_key.rs b/milli/src/documents/primary_key.rs index 22918f8fc..b6a236623 100644 --- a/milli/src/documents/primary_key.rs +++ b/milli/src/documents/primary_key.rs @@ -1,8 +1,10 @@ +use std::borrow::Cow; use std::iter; use std::result::Result as StdResult; -use serde_json::Value; +use serde_json::{from_str, Value}; +use crate::update::new::{CowStr, TopLevelMap}; use crate::{FieldId, InternalError, Object, Result, UserError}; /// The symbol used to define levels in a nested primary key. @@ -100,6 +102,45 @@ impl<'a> PrimaryKey<'a> { } } + /// Returns the document ID based on the primary and + /// search for it recursively in zero-copy-deserialized documents. + pub fn document_id_from_top_level_map<'p>( + &self, + document: &TopLevelMap<'p>, + ) -> Result, DocumentIdExtractionError>> { + fn get_docid<'p>( + document: &TopLevelMap<'p>, + primary_key: &[&str], + ) -> Result, DocumentIdExtractionError>> { + match primary_key { + [] => unreachable!("arrrgh"), // would None be ok? + [primary_key] => match document.0.get(*primary_key) { + Some(value) => match from_str::(value.get()) { + Ok(value) => Ok(Ok(CowStr(Cow::Owned(value.to_string())))), + Err(_) => match from_str(value.get()) { + Ok(document_id) => Ok(Ok(document_id)), + Err(e) => Ok(Err(DocumentIdExtractionError::InvalidDocumentId( + UserError::SerdeJson(e), + ))), + }, + }, + None => Ok(Err(DocumentIdExtractionError::MissingDocumentId)), + }, + [head, tail @ ..] => match document.0.get(*head) { + Some(value) => { + let document = from_str(value.get()).map_err(InternalError::SerdeJson)?; + get_docid(&document, tail) + } + None => Ok(Err(DocumentIdExtractionError::MissingDocumentId)), + }, + } + } + + /// TODO do not allocate a vec everytime here + let primary_key: Vec<_> = self.name().split(PRIMARY_KEY_SPLIT_SYMBOL).collect(); + get_docid(document, &primary_key) + } + /// Returns an `Iterator` that gives all the possible fields names the primary key /// can have depending of the first level name and depth of the objects. 
pub fn possible_level_names(&self) -> impl Iterator + '_ { diff --git a/milli/src/update/new/indexer/document_operation.rs b/milli/src/update/new/indexer/document_operation.rs index 935c130e5..ed8f1c93f 100644 --- a/milli/src/update/new/indexer/document_operation.rs +++ b/milli/src/update/new/indexer/document_operation.rs @@ -13,7 +13,7 @@ use super::super::document_change::DocumentChange; use super::super::items_pool::ItemsPool; use super::top_level_map::{CowStr, TopLevelMap}; use super::DocumentChanges; -use crate::documents::PrimaryKey; +use crate::documents::{DocumentIdExtractionError, PrimaryKey}; use crate::update::new::{Deletion, Insertion, KvReaderFieldId, KvWriterFieldId, Update}; use crate::update::{AvailableIds, IndexDocumentsMethod}; use crate::{DocumentId, Error, FieldsIdsMap, Index, Result, UserError}; @@ -98,37 +98,22 @@ impl<'p, 'pl: 'p> DocumentChanges<'p> for DocumentOperation<'pl> { // TODO we must manage the TooManyDocumentIds,InvalidDocumentId // we must manage the unwrap let external_document_id = - match get_docid(&document, &[primary_key.name()]).unwrap() { - Some(document_id) => document_id, - None => { - return Err(UserError::MissingDocumentId { + match primary_key.document_id_from_top_level_map(&document)? { + Ok(document_id) => Ok(document_id), + Err(DocumentIdExtractionError::InvalidDocumentId(e)) => Err(e), + Err(DocumentIdExtractionError::MissingDocumentId) => { + Err(UserError::MissingDocumentId { primary_key: primary_key.name().to_string(), - document: todo!(), - // document: obkv_to_object(document, &batch_index)?, - } - .into()); + document: document.try_into().unwrap(), + }) } - }; - - // let external_document_id = - // match primary_key.document_id(document, &batch_index)? { - // Ok(document_id) => Ok(document_id), - // Err(DocumentIdExtractionError::InvalidDocumentId(user_error)) => { - // Err(user_error) - // } - // Err(DocumentIdExtractionError::MissingDocumentId) => { - // Err(UserError::MissingDocumentId { - // primary_key: primary_key.name().to_string(), - // document: obkv_to_object(document, &batch_index)?, - // }) - // } - // Err(DocumentIdExtractionError::TooManyDocumentIds(_)) => { - // Err(UserError::TooManyDocumentIds { - // primary_key: primary_key.name().to_string(), - // document: obkv_to_object(document, &batch_index)?, - // }) - // } - // }?; + Err(DocumentIdExtractionError::TooManyDocumentIds(_)) => { + Err(UserError::TooManyDocumentIds { + primary_key: primary_key.name().to_string(), + document: document.try_into().unwrap(), + }) + } + }?; let current_offset = iter.byte_offset(); let document_operation = InnerDocOp::Addition(DocumentOffset { @@ -397,25 +382,3 @@ impl MergeChanges for MergeDocumentForUpdates { } } } - -/// Returns the document ID based on the primary and -/// search for it recursively in zero-copy-deserialized documents. -fn get_docid<'p>( - map: &TopLevelMap<'p>, - primary_key: &[&str], -) -> serde_json::Result>> { - match primary_key { - [] => unreachable!("arrrgh"), // would None be ok? - [primary_key] => match map.0.get(*primary_key) { - Some(value) => match from_str::(value.get()) { - Ok(value) => Ok(Some(CowStr(Cow::Owned(value.to_string())))), - Err(_) => Ok(Some(from_str(value.get())?)), - }, - None => Ok(None), - }, - [head, tail @ ..] 
=> match map.0.get(*head) { - Some(value) => get_docid(&from_str(value.get())?, tail), - None => Ok(None), - }, - } -} diff --git a/milli/src/update/new/indexer/top_level_map.rs b/milli/src/update/new/indexer/top_level_map.rs index f79b6e9ee..aebb64bc9 100644 --- a/milli/src/update/new/indexer/top_level_map.rs +++ b/milli/src/update/new/indexer/top_level_map.rs @@ -22,6 +22,14 @@ impl TryFrom<&'_ TopLevelMap<'_>> for Map { } } +impl TryFrom> for Map { + type Error = serde_json::Error; + + fn try_from(tlmap: TopLevelMap<'_>) -> Result { + TryFrom::try_from(&tlmap) + } +} + impl<'p> ops::Deref for TopLevelMap<'p> { type Target = BTreeMap, &'p RawValue>; From b2f4e67c9abf34373dfdc1b1881666957f0ddf8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 12 Sep 2024 15:38:31 +0200 Subject: [PATCH 067/247] Do not store useless updates --- .../update/new/indexer/document_operation.rs | 79 +++++++++++-------- milli/src/update/new/indexer/mod.rs | 7 +- 2 files changed, 50 insertions(+), 36 deletions(-) diff --git a/milli/src/update/new/indexer/document_operation.rs b/milli/src/update/new/indexer/document_operation.rs index ed8f1c93f..1670c8145 100644 --- a/milli/src/update/new/indexer/document_operation.rs +++ b/milli/src/update/new/indexer/document_operation.rs @@ -137,9 +137,22 @@ impl<'p, 'pl: 'p> DocumentChanges<'p> for DocumentOperation<'pl> { (docid, vec![document_operation]), ); } - // TODO clean the code to make sure we clean the useless operations - // add a method to the MergeChanges trait - Some((_, offsets)) => offsets.push(document_operation), + Some((_, offsets)) => { + let useless_previous_addition = match self.index_documents_method { + IndexDocumentsMethod::ReplaceDocuments => { + MergeDocumentForReplacement::USELESS_PREVIOUS_CHANGES + } + IndexDocumentsMethod::UpdateDocuments => { + MergeDocumentForUpdates::USELESS_PREVIOUS_CHANGES + } + }; + + if useless_previous_addition { + offsets.clear(); + } + + offsets.push(document_operation); + } } previous_offset = iter.byte_offset(); @@ -164,7 +177,10 @@ impl<'p, 'pl: 'p> DocumentChanges<'p> for DocumentOperation<'pl> { (docid, vec![InnerDocOp::Deletion]), ); } - Some((_, offsets)) => offsets.push(InnerDocOp::Deletion), + Some((_, offsets)) => { + offsets.clear(); + offsets.push(InnerDocOp::Deletion); + } } } } @@ -176,10 +192,13 @@ impl<'p, 'pl: 'p> DocumentChanges<'p> for DocumentOperation<'pl> { // TODO We must drain the HashMap into a Vec because rayon::hash_map::IntoIter: !Clone let mut docids_version_offsets: Vec<_> = docids_version_offsets.drain().collect(); // Reorder the offsets to make sure we iterate on the file sequentially - match self.index_documents_method { - Idm::ReplaceDocuments => MergeDocumentForReplacement::sort(&mut docids_version_offsets), - Idm::UpdateDocuments => MergeDocumentForUpdates::sort(&mut docids_version_offsets), - } + let sort_function_key = match self.index_documents_method { + Idm::ReplaceDocuments => MergeDocumentForReplacement::sort_key, + Idm::UpdateDocuments => MergeDocumentForUpdates::sort_key, + }; + + // And finally sort them + docids_version_offsets.sort_unstable_by_key(|(_, (_, docops))| sort_function_key(docops)); Ok(docids_version_offsets .into_par_iter() @@ -208,8 +227,11 @@ impl<'p, 'pl: 'p> DocumentChanges<'p> for DocumentOperation<'pl> { } trait MergeChanges { - /// Reorders the offsets to make sure we iterate on the file sequentially. 
- fn sort(changes_offsets: &mut [(CowStr, (DocumentId, Vec))]); + /// Wether the payloads in the list of operations are useless or not. + const USELESS_PREVIOUS_CHANGES: bool; + + /// Returns a key that is used to order the payloads the right way. + fn sort_key(docops: &[InnerDocOp]) -> usize; fn merge( rtxn: &RoTxn, @@ -224,18 +246,15 @@ trait MergeChanges { struct MergeDocumentForReplacement; impl MergeChanges for MergeDocumentForReplacement { + const USELESS_PREVIOUS_CHANGES: bool = true; + /// Reorders to read only the last change. - fn sort(changes_offsets: &mut [(CowStr, (DocumentId, Vec))]) { - changes_offsets.sort_unstable_by_key(|(_, (_, offsets))| { - offsets - .iter() - .rev() - .find_map(|ido| match ido { - InnerDocOp::Addition(add) => Some(add.content.as_ptr() as usize), - InnerDocOp::Deletion => None, - }) - .unwrap_or(0) - }); + fn sort_key(docops: &[InnerDocOp]) -> usize { + let f = |ido: &_| match ido { + InnerDocOp::Addition(add) => Some(add.content.as_ptr() as usize), + InnerDocOp::Deletion => None, + }; + docops.iter().rev().find_map(f).unwrap_or(0) } /// Returns only the most recent version of a document based on the updates from the payloads. @@ -295,17 +314,15 @@ impl MergeChanges for MergeDocumentForReplacement { struct MergeDocumentForUpdates; impl MergeChanges for MergeDocumentForUpdates { + const USELESS_PREVIOUS_CHANGES: bool = false; + /// Reorders to read the first changes first so that it's faster to read the first one and then the rest. - fn sort(changes_offsets: &mut [(CowStr, (DocumentId, Vec))]) { - changes_offsets.sort_unstable_by_key(|(_, (_, offsets))| { - offsets - .iter() - .find_map(|ido| match ido { - InnerDocOp::Addition(add) => Some(add.content.as_ptr() as usize), - InnerDocOp::Deletion => None, - }) - .unwrap_or(0) - }); + fn sort_key(docops: &[InnerDocOp]) -> usize { + let f = |ido: &_| match ido { + InnerDocOp::Addition(add) => Some(add.content.as_ptr() as usize), + InnerDocOp::Deletion => None, + }; + docops.iter().find_map(f).unwrap_or(0) } /// Reads the previous version of a document from the database, the new versions diff --git a/milli/src/update/new/indexer/mod.rs b/milli/src/update/new/indexer/mod.rs index 0273d4fe2..c1bcd20cf 100644 --- a/milli/src/update/new/indexer/mod.rs +++ b/milli/src/update/new/indexer/mod.rs @@ -1,4 +1,3 @@ -use std::fs::File; use std::sync::RwLock; use std::thread::{self, Builder}; @@ -17,12 +16,10 @@ use super::document_change::DocumentChange; use super::extract::*; use super::merger::merge_grenad_entries; use super::StdResult; -use crate::documents::{ - obkv_to_object, DocumentsBatchCursor, DocumentsBatchIndex, PrimaryKey, DEFAULT_PRIMARY_KEY, -}; +use crate::documents::{PrimaryKey, DEFAULT_PRIMARY_KEY}; use crate::update::new::channel::{DatabaseType, ExtractorSender}; use crate::update::GrenadParameters; -use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, Result, UserError}; +use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, Result, UserError}; mod document_deletion; mod document_operation; From f7652186e171fdbabe06714e1a6f5d629ca97194 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 12 Sep 2024 18:01:02 +0200 Subject: [PATCH 068/247] WIP geo fields --- milli/src/update/new/channel.rs | 54 ++++++++---- milli/src/update/new/document_change.rs | 22 ++--- .../update/new/indexer/document_deletion.rs | 21 +---- .../update/new/indexer/document_operation.rs | 24 +++--- milli/src/update/new/indexer/mod.rs | 18 +++- milli/src/update/new/indexer/partial_dump.rs | 2 +- 
milli/src/update/new/merger.rs | 82 ++++++++++++++++++- 7 files changed, 152 insertions(+), 71 deletions(-) diff --git a/milli/src/update/new/channel.rs b/milli/src/update/new/channel.rs index 3eafb7754..9b05c7ce4 100644 --- a/milli/src/update/new/channel.rs +++ b/milli/src/update/new/channel.rs @@ -323,6 +323,7 @@ pub enum MergerOperation { WordPositionDocidsMerger(Merger), DeleteDocument { docid: DocumentId }, InsertDocument { docid: DocumentId, document: Box }, + FinishedDocument, } pub struct MergerReceiver(Receiver); @@ -339,22 +340,8 @@ impl IntoIterator for MergerReceiver { pub struct ExtractorSender(Sender); impl ExtractorSender { - pub fn document_insert( - &self, - docid: DocumentId, - document: Box, - ) -> StdResult<(), SendError<()>> { - match self.0.send(MergerOperation::InsertDocument { docid, document }) { - Ok(()) => Ok(()), - Err(SendError(_)) => Err(SendError(())), - } - } - - pub fn document_delete(&self, docid: DocumentId) -> StdResult<(), SendError<()>> { - match self.0.send(MergerOperation::DeleteDocument { docid }) { - Ok(()) => Ok(()), - Err(SendError(_)) => Err(SendError(())), - } + pub fn document_sender(&self) -> DocumentSender<'_> { + DocumentSender(&self.0) } pub fn send_searchable( @@ -367,3 +354,38 @@ impl ExtractorSender { } } } + +pub struct DocumentSender<'a>(&'a Sender); + +impl DocumentSender<'_> { + pub fn insert( + &self, + docid: DocumentId, + document: Box, + ) -> StdResult<(), SendError<()>> { + match self.0.send(MergerOperation::InsertDocument { docid, document }) { + Ok(()) => Ok(()), + Err(SendError(_)) => Err(SendError(())), + } + } + + pub fn delete(&self, docid: DocumentId) -> StdResult<(), SendError<()>> { + match self.0.send(MergerOperation::DeleteDocument { docid }) { + Ok(()) => Ok(()), + Err(SendError(_)) => Err(SendError(())), + } + } + + pub fn finish(self) -> StdResult<(), SendError<()>> { + match self.0.send(MergerOperation::FinishedDocument) { + Ok(()) => Ok(()), + Err(SendError(_)) => Err(SendError(())), + } + } +} + +impl Drop for DocumentSender<'_> { + fn drop(&mut self) { + self.0.send(MergerOperation::FinishedDocument); + } +} diff --git a/milli/src/update/new/document_change.rs b/milli/src/update/new/document_change.rs index aa37593c9..3e6473e77 100644 --- a/milli/src/update/new/document_change.rs +++ b/milli/src/update/new/document_change.rs @@ -12,20 +12,17 @@ pub enum DocumentChange { pub struct Deletion { docid: DocumentId, - external_docid: String, // ? - current: Box, // ? + current: Box, } pub struct Update { docid: DocumentId, - external_docid: String, // ? - current: Box, // ? + current: Box, new: Box, } pub struct Insertion { docid: DocumentId, - external_docid: String, // ? 
new: Box, } @@ -40,12 +37,8 @@ impl DocumentChange { } impl Deletion { - pub fn create( - docid: DocumentId, - external_docid: String, - current: Box, - ) -> Self { - Self { docid, external_docid, current } + pub fn create(docid: DocumentId, current: Box) -> Self { + Self { docid, current } } pub fn docid(&self) -> DocumentId { @@ -63,8 +56,8 @@ impl Deletion { } impl Insertion { - pub fn create(docid: DocumentId, external_docid: String, new: Box) -> Self { - Insertion { docid, external_docid, new } + pub fn create(docid: DocumentId, new: Box) -> Self { + Insertion { docid, new } } pub fn docid(&self) -> DocumentId { @@ -79,11 +72,10 @@ impl Insertion { impl Update { pub fn create( docid: DocumentId, - external_docid: String, current: Box, new: Box, ) -> Self { - Update { docid, external_docid, current, new } + Update { docid, current, new } } pub fn docid(&self) -> DocumentId { diff --git a/milli/src/update/new/indexer/document_deletion.rs b/milli/src/update/new/indexer/document_deletion.rs index b4336c14a..b744ec65e 100644 --- a/milli/src/update/new/indexer/document_deletion.rs +++ b/milli/src/update/new/indexer/document_deletion.rs @@ -4,9 +4,8 @@ use rayon::iter::{ParallelBridge, ParallelIterator}; use roaring::RoaringBitmap; use super::DocumentChanges; -use crate::documents::PrimaryKey; use crate::update::new::{Deletion, DocumentChange, ItemsPool}; -use crate::{FieldsIdsMap, Index, InternalError, Result}; +use crate::{FieldsIdsMap, Index, Result}; pub struct DocumentDeletion { pub to_delete: RoaringBitmap, @@ -23,31 +22,19 @@ impl DocumentDeletion { } impl<'p> DocumentChanges<'p> for DocumentDeletion { - type Parameter = (&'p Index, &'p FieldsIdsMap, &'p PrimaryKey<'p>); + type Parameter = &'p Index; fn document_changes( self, _fields_ids_map: &mut FieldsIdsMap, param: Self::Parameter, ) -> Result> + Clone + 'p> { - let (index, fields, primary_key) = param; + let index = param; let items = Arc::new(ItemsPool::new(|| index.read_txn().map_err(crate::Error::from))); Ok(self.to_delete.into_iter().par_bridge().map_with(items, |items, docid| { items.with(|rtxn| { let current = index.document(rtxn, docid)?; - let external_docid = match primary_key.document_id(current, fields)? 
{ - Ok(document_id) => Ok(document_id) as Result<_>, - Err(_) => Err(InternalError::DocumentsError( - crate::documents::Error::InvalidDocumentFormat, - ) - .into()), - }?; - - Ok(DocumentChange::Deletion(Deletion::create( - docid, - external_docid, - current.boxed(), - ))) + Ok(DocumentChange::Deletion(Deletion::create(docid, current.boxed()))) }) })) } diff --git a/milli/src/update/new/indexer/document_operation.rs b/milli/src/update/new/indexer/document_operation.rs index 1670c8145..b299124bd 100644 --- a/milli/src/update/new/indexer/document_operation.rs +++ b/milli/src/update/new/indexer/document_operation.rs @@ -6,7 +6,6 @@ use heed::types::Bytes; use heed::RoTxn; use memmap2::Mmap; use rayon::iter::{IntoParallelIterator, ParallelIterator}; -use serde_json::from_str; use IndexDocumentsMethod as Idm; use super::super::document_change::DocumentChange; @@ -289,20 +288,17 @@ impl MergeChanges for MergeDocumentForReplacement { let new = writer.into_boxed(); match current { - Some(current) => { - let update = Update::create(docid, external_docid, current.boxed(), new); - Ok(Some(DocumentChange::Update(update))) - } - None => { - let insertion = Insertion::create(docid, external_docid, new); - Ok(Some(DocumentChange::Insertion(insertion))) - } + Some(current) => Ok(Some(DocumentChange::Update(Update::create( + docid, + current.boxed(), + new, + )))), + None => Ok(Some(DocumentChange::Insertion(Insertion::create(docid, new)))), } } Some(InnerDocOp::Deletion) => match current { Some(current) => { - let deletion = Deletion::create(docid, external_docid, current.boxed()); - Ok(Some(DocumentChange::Deletion(deletion))) + Ok(Some(DocumentChange::Deletion(Deletion::create(docid, current.boxed())))) } None => Ok(None), }, @@ -361,7 +357,7 @@ impl MergeChanges for MergeDocumentForUpdates { if operations.is_empty() { match current { Some(current) => { - let deletion = Deletion::create(docid, external_docid, current.boxed()); + let deletion = Deletion::create(docid, current.boxed()); return Ok(Some(DocumentChange::Deletion(deletion))); } None => return Ok(None), @@ -389,11 +385,11 @@ impl MergeChanges for MergeDocumentForUpdates { match current { Some(current) => { - let update = Update::create(docid, external_docid, current.boxed(), new); + let update = Update::create(docid, current.boxed(), new); Ok(Some(DocumentChange::Update(update))) } None => { - let insertion = Insertion::create(docid, external_docid, new); + let insertion = Insertion::create(docid, new); Ok(Some(DocumentChange::Insertion(insertion))) } } diff --git a/milli/src/update/new/indexer/mod.rs b/milli/src/update/new/indexer/mod.rs index c1bcd20cf..7350d9499 100644 --- a/milli/src/update/new/indexer/mod.rs +++ b/milli/src/update/new/indexer/mod.rs @@ -59,6 +59,7 @@ where let fields_ids_map_lock = RwLock::new(fields_ids_map); let global_fields_ids_map = GlobalFieldsIdsMap::new(&fields_ids_map_lock); + let global_fields_ids_map_clone = global_fields_ids_map.clone(); thread::scope(|s| { // TODO manage the errors correctly @@ -70,27 +71,30 @@ where let document_changes = document_changes.into_par_iter(); // document but we need to create a function that collects and compresses documents. + let document_sender = extractor_sender.document_sender(); document_changes.clone().into_par_iter().try_for_each(|result| { match result? 
{ DocumentChange::Deletion(deletion) => { let docid = deletion.docid(); - extractor_sender.document_delete(docid).unwrap(); + document_sender.delete(docid).unwrap(); } DocumentChange::Update(update) => { let docid = update.docid(); let content = update.new(); - extractor_sender.document_insert(docid, content.boxed()).unwrap(); + document_sender.insert(docid, content.boxed()).unwrap(); } DocumentChange::Insertion(insertion) => { let docid = insertion.docid(); let content = insertion.new(); - extractor_sender.document_insert(docid, content.boxed()).unwrap(); + document_sender.insert(docid, content.boxed()).unwrap(); // extracted_dictionary_sender.send(self, dictionary: &[u8]); } } Ok(()) as Result<_> })?; + document_sender.finish().unwrap(); + const TEN_GIB: usize = 10 * 1024 * 1024 * 1024; let max_memory = TEN_GIB / dbg!(rayon::current_num_threads()); let grenad_parameters = GrenadParameters { @@ -197,7 +201,13 @@ where tracing::trace_span!(target: "indexing::documents", parent: ¤t_span, "merge"); let _entered = span.enter(); let rtxn = index.read_txn().unwrap(); - merge_grenad_entries(merger_receiver, merger_sender, &rtxn, index) + merge_grenad_entries( + merger_receiver, + merger_sender, + &rtxn, + index, + global_fields_ids_map_clone, + ) })?; for operation in writer_receiver { diff --git a/milli/src/update/new/indexer/partial_dump.rs b/milli/src/update/new/indexer/partial_dump.rs index fe49ffdd7..50768ba82 100644 --- a/milli/src/update/new/indexer/partial_dump.rs +++ b/milli/src/update/new/indexer/partial_dump.rs @@ -68,7 +68,7 @@ where } }?; - let insertion = Insertion::create(docid, external_docid, document); + let insertion = Insertion::create(docid, document); Ok(DocumentChange::Insertion(insertion)) })) } diff --git a/milli/src/update/new/merger.rs b/milli/src/update/new/merger.rs index 22c4baf26..291f79216 100644 --- a/milli/src/update/new/merger.rs +++ b/milli/src/update/new/merger.rs @@ -1,20 +1,23 @@ use std::fs::File; +use std::io::{self, BufWriter}; +use bincode::ErrorKind; use fst::{Set, SetBuilder}; use grenad::Merger; use heed::types::Bytes; -use heed::{Database, RoTxn}; +use heed::{BoxedError, Database, RoTxn}; use memmap2::Mmap; use roaring::RoaringBitmap; -use std::io::BufWriter; use tempfile::tempfile; use super::channel::*; -use super::KvReaderDelAdd; +use super::{Deletion, DocumentChange, Insertion, KvReaderDelAdd, KvReaderFieldId, Update}; use crate::update::del_add::DelAdd; use crate::update::new::channel::MergerOperation; use crate::update::MergeDeladdCboRoaringBitmaps; -use crate::{CboRoaringBitmapCodec, Index, Result}; +use crate::{ + CboRoaringBitmapCodec, Error, GeoPoint, GlobalFieldsIdsMap, Index, InternalError, Result, +}; /// TODO We must return some infos/stats #[tracing::instrument(level = "trace", skip_all, target = "indexing::documents", name = "merge")] @@ -23,9 +26,11 @@ pub fn merge_grenad_entries( sender: MergerSender, rtxn: &RoTxn, index: &Index, + mut global_fields_ids_map: GlobalFieldsIdsMap<'_>, ) -> Result<()> { let mut buffer = Vec::new(); let mut documents_ids = index.documents_ids(rtxn)?; + let mut geo_extractor = GeoExtractor::new(rtxn, index)?; for merger_operation in receiver { match merger_operation { @@ -125,6 +130,18 @@ pub fn merge_grenad_entries( let _entered = span.enter(); documents_ids.insert(docid); sender.documents().uncompressed(docid, &document).unwrap(); + + if let Some(geo_extractor) = geo_extractor.as_mut() { + let current = index.documents.remap_data_type::().get(rtxn, &docid)?; + let current: Option<&KvReaderFieldId> = 
current.map(Into::into); + let change = match current { + Some(current) => { + DocumentChange::Update(Update::create(docid, current.boxed(), document)) + } + None => DocumentChange::Insertion(Insertion::create(docid, document)), + }; + geo_extractor.manage_change(&mut global_fields_ids_map, &change)?; + } } MergerOperation::DeleteDocument { docid } => { let span = @@ -134,6 +151,15 @@ pub fn merge_grenad_entries( unreachable!("Tried deleting a document that we do not know about"); } sender.documents().delete(docid).unwrap(); + + if let Some(geo_extractor) = geo_extractor.as_mut() { + let current = index.document(rtxn, docid)?; + let change = DocumentChange::Deletion(Deletion::create(docid, current.boxed())); + geo_extractor.manage_change(&mut global_fields_ids_map, &change)?; + } + } + MergerOperation::FinishedDocument => { + // send the rtree } } } @@ -153,6 +179,54 @@ pub fn merge_grenad_entries( Ok(()) } +pub struct GeoExtractor { + rtree: Option>, +} + +impl GeoExtractor { + pub fn new(rtxn: &RoTxn, index: &Index) -> Result> { + let is_sortable = index.sortable_fields(rtxn)?.contains("_geo"); + let is_filterable = index.filterable_fields(rtxn)?.contains("_geo"); + if is_sortable || is_filterable { + Ok(Some(GeoExtractor { rtree: index.geo_rtree(rtxn)? })) + } else { + Ok(None) + } + } + + pub fn manage_change( + &mut self, + fidmap: &mut GlobalFieldsIdsMap, + change: &DocumentChange, + ) -> Result<()> { + match change { + DocumentChange::Deletion(_) => todo!(), + DocumentChange::Update(_) => todo!(), + DocumentChange::Insertion(_) => todo!(), + } + } + + pub fn serialize_rtree(self, writer: &mut W) -> Result { + match self.rtree { + Some(rtree) => { + // TODO What should I do? + bincode::serialize_into(writer, &rtree).map(|_| true).map_err(|e| match *e { + ErrorKind::Io(e) => Error::IoError(e), + ErrorKind::InvalidUtf8Encoding(_) => todo!(), + ErrorKind::InvalidBoolEncoding(_) => todo!(), + ErrorKind::InvalidCharEncoding => todo!(), + ErrorKind::InvalidTagEncoding(_) => todo!(), + ErrorKind::DeserializeAnyNotSupported => todo!(), + ErrorKind::SizeLimit => todo!(), + ErrorKind::SequenceMustHaveLength => todo!(), + ErrorKind::Custom(_) => todo!(), + }) + } + None => Ok(false), + } + } +} + fn compute_new_words_fst( add_words_fst: SetBuilder>, del_words_fst: SetBuilder>, From 7ba49b849e05da1ec1210a288b458d6fe288e8f6 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 16 Sep 2024 09:34:10 +0200 Subject: [PATCH 069/247] Extract and write facet databases --- milli/src/update/new/channel.rs | 103 +++++- .../new/extract/faceted/extract_facets.rs | 342 ++++++++++++------ milli/src/update/new/extract/faceted/mod.rs | 196 ++-------- milli/src/update/new/extract/mod.rs | 28 ++ .../src/update/new/extract/searchable/mod.rs | 12 + milli/src/update/new/indexer/mod.rs | 36 +- milli/src/update/new/merger.rs | 130 ++++++- 7 files changed, 526 insertions(+), 321 deletions(-) diff --git a/milli/src/update/new/channel.rs b/milli/src/update/new/channel.rs index 9b05c7ce4..98538ea9e 100644 --- a/milli/src/update/new/channel.rs +++ b/milli/src/update/new/channel.rs @@ -6,6 +6,7 @@ use grenad::Merger; use heed::types::Bytes; use memmap2::Mmap; +use super::extract::FacetKind; use super::StdResult; use crate::index::main_key::{DOCUMENTS_IDS_KEY, WORDS_FST_KEY}; use crate::update::new::KvReaderFieldId; @@ -120,11 +121,16 @@ pub enum Database { WordFidDocids, WordPairProximityDocids, WordPositionDocids, + FacetIdIsNullDocids, + FacetIdIsEmptyDocids, + FacetIdExistsDocids, + FacetIdF64NumberDocids, + 
FacetIdStringDocids, } -impl WriterOperation { +impl Database { pub fn database(&self, index: &Index) -> heed::Database { - match self.database { + match self { Database::Documents => index.documents.remap_types(), Database::ExactWordDocids => index.exact_word_docids.remap_types(), Database::Main => index.main.remap_types(), @@ -133,8 +139,19 @@ impl WriterOperation { Database::WordPositionDocids => index.word_position_docids.remap_types(), Database::FidWordCountDocids => index.field_id_word_count_docids.remap_types(), Database::WordPairProximityDocids => index.word_pair_proximity_docids.remap_types(), + Database::FacetIdIsNullDocids => index.facet_id_is_null_docids.remap_types(), + Database::FacetIdIsEmptyDocids => index.facet_id_is_empty_docids.remap_types(), + Database::FacetIdExistsDocids => index.facet_id_exists_docids.remap_types(), + Database::FacetIdF64NumberDocids => index.facet_id_f64_docids.remap_types(), + Database::FacetIdStringDocids => index.facet_id_string_docids.remap_types(), } } +} + +impl WriterOperation { + pub fn database(&self, index: &Index) -> heed::Database { + self.database.database(index) + } pub fn entry(self) -> EntryOperation { self.entry @@ -159,8 +176,12 @@ impl MergerSender { MainSender(&self.0) } - pub fn docids(&self) -> DocidsSender<'_, D> { - DocidsSender { sender: &self.0, _marker: PhantomData } + pub fn docids(&self) -> WordDocidsSender<'_, D> { + WordDocidsSender { sender: &self.0, _marker: PhantomData } + } + + pub fn facet_docids(&self) -> FacetDocidsSender<'_> { + FacetDocidsSender { sender: &self.0 } } pub fn documents(&self) -> DocumentsSender<'_> { @@ -208,16 +229,21 @@ pub enum WordDocids {} pub enum WordFidDocids {} pub enum WordPairProximityDocids {} pub enum WordPositionDocids {} +pub enum FacetDocids {} pub trait DatabaseType { const DATABASE: Database; +} +pub trait MergerOperationType { fn new_merger_operation(merger: Merger) -> MergerOperation; } impl DatabaseType for ExactWordDocids { const DATABASE: Database = Database::ExactWordDocids; +} +impl MergerOperationType for ExactWordDocids { fn new_merger_operation(merger: Merger) -> MergerOperation { MergerOperation::ExactWordDocidsMerger(merger) } @@ -225,7 +251,9 @@ impl DatabaseType for ExactWordDocids { impl DatabaseType for FidWordCountDocids { const DATABASE: Database = Database::FidWordCountDocids; +} +impl MergerOperationType for FidWordCountDocids { fn new_merger_operation(merger: Merger) -> MergerOperation { MergerOperation::FidWordCountDocidsMerger(merger) } @@ -233,7 +261,9 @@ impl DatabaseType for FidWordCountDocids { impl DatabaseType for WordDocids { const DATABASE: Database = Database::WordDocids; +} +impl MergerOperationType for WordDocids { fn new_merger_operation(merger: Merger) -> MergerOperation { MergerOperation::WordDocidsMerger(merger) } @@ -241,7 +271,9 @@ impl DatabaseType for WordDocids { impl DatabaseType for WordFidDocids { const DATABASE: Database = Database::WordFidDocids; +} +impl MergerOperationType for WordFidDocids { fn new_merger_operation(merger: Merger) -> MergerOperation { MergerOperation::WordFidDocidsMerger(merger) } @@ -249,7 +281,9 @@ impl DatabaseType for WordFidDocids { impl DatabaseType for WordPairProximityDocids { const DATABASE: Database = Database::WordPairProximityDocids; +} +impl MergerOperationType for WordPairProximityDocids { fn new_merger_operation(merger: Merger) -> MergerOperation { MergerOperation::WordPairProximityDocidsMerger(merger) } @@ -257,19 +291,32 @@ impl DatabaseType for WordPairProximityDocids { impl DatabaseType for 
WordPositionDocids { const DATABASE: Database = Database::WordPositionDocids; +} +impl MergerOperationType for WordPositionDocids { fn new_merger_operation(merger: Merger) -> MergerOperation { MergerOperation::WordPositionDocidsMerger(merger) } } -pub struct DocidsSender<'a, D> { +impl MergerOperationType for FacetDocids { + fn new_merger_operation(merger: Merger) -> MergerOperation { + MergerOperation::FacetDocidsMerger(merger) + } +} + +pub trait DocidsSender { + fn write(&self, key: &[u8], value: &[u8]) -> StdResult<(), SendError<()>>; + fn delete(&self, key: &[u8]) -> StdResult<(), SendError<()>>; +} + +pub struct WordDocidsSender<'a, D> { sender: &'a Sender, _marker: PhantomData, } -impl DocidsSender<'_, D> { - pub fn write(&self, key: &[u8], value: &[u8]) -> StdResult<(), SendError<()>> { +impl DocidsSender for WordDocidsSender<'_, D> { + fn write(&self, key: &[u8], value: &[u8]) -> StdResult<(), SendError<()>> { let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value(key, value)); match self.sender.send(WriterOperation { database: D::DATABASE, entry }) { Ok(()) => Ok(()), @@ -277,7 +324,7 @@ impl DocidsSender<'_, D> { } } - pub fn delete(&self, key: &[u8]) -> StdResult<(), SendError<()>> { + fn delete(&self, key: &[u8]) -> StdResult<(), SendError<()>> { let entry = EntryOperation::Delete(KeyEntry::from_key(key)); match self.sender.send(WriterOperation { database: D::DATABASE, entry }) { Ok(()) => Ok(()), @@ -286,6 +333,43 @@ impl DocidsSender<'_, D> { } } +pub struct FacetDocidsSender<'a> { + sender: &'a Sender, +} + +impl DocidsSender for FacetDocidsSender<'_> { + fn write(&self, key: &[u8], value: &[u8]) -> StdResult<(), SendError<()>> { + let (database, key) = self.extract_database(key); + let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value(key, value)); + match self.sender.send(WriterOperation { database, entry }) { + Ok(()) => Ok(()), + Err(SendError(_)) => Err(SendError(())), + } + } + + fn delete(&self, key: &[u8]) -> StdResult<(), SendError<()>> { + let (database, key) = self.extract_database(key); + let entry = EntryOperation::Delete(KeyEntry::from_key(key)); + match self.sender.send(WriterOperation { database, entry }) { + Ok(()) => Ok(()), + Err(SendError(_)) => Err(SendError(())), + } + } +} + +impl FacetDocidsSender<'_> { + fn extract_database<'a>(&self, key: &'a [u8]) -> (Database, &'a [u8]) { + let database = match FacetKind::from(key[0]) { + FacetKind::Number => Database::FacetIdF64NumberDocids, + FacetKind::String => Database::FacetIdStringDocids, + FacetKind::Null => Database::FacetIdIsNullDocids, + FacetKind::Empty => Database::FacetIdIsEmptyDocids, + FacetKind::Exists => Database::FacetIdExistsDocids, + }; + (database, &key[1..]) + } +} + pub struct DocumentsSender<'a>(&'a Sender); impl DocumentsSender<'_> { @@ -321,6 +405,7 @@ pub enum MergerOperation { WordFidDocidsMerger(Merger), WordPairProximityDocidsMerger(Merger), WordPositionDocidsMerger(Merger), + FacetDocidsMerger(Merger), DeleteDocument { docid: DocumentId }, InsertDocument { docid: DocumentId, document: Box }, FinishedDocument, @@ -344,7 +429,7 @@ impl ExtractorSender { DocumentSender(&self.0) } - pub fn send_searchable( + pub fn send_searchable( &self, merger: Merger, ) -> StdResult<(), SendError<()>> { diff --git a/milli/src/update/new/extract/faceted/extract_facets.rs b/milli/src/update/new/extract/faceted/extract_facets.rs index 9471c753b..41bce2215 100644 --- a/milli/src/update/new/extract/faceted/extract_facets.rs +++ 
b/milli/src/update/new/extract/faceted/extract_facets.rs @@ -1,61 +1,180 @@ use std::collections::HashSet; +use std::fmt::Debug; +use std::fs::File; +use grenad::{MergeFunction, Merger}; use heed::RoTxn; +use rayon::iter::{IntoParallelIterator, ParallelBridge, ParallelIterator}; use serde_json::Value; -use super::FacetedExtractor; +use super::super::cache::CboCachedSorter; +use super::facet_document::extract_document_facets; +use super::FacetKind; use crate::facet::value_encoding::f64_into_bytes; -use crate::{normalize_facet, FieldId, Index, Result, MAX_FACET_VALUE_LENGTH}; +use crate::update::new::extract::DocidsExtractor; +use crate::update::new::{DocumentChange, ItemsPool}; +use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps}; +use crate::{DocumentId, FieldId, GlobalFieldsIdsMap, Index, Result, MAX_FACET_VALUE_LENGTH}; +pub struct FacetedDocidsExtractor; -pub struct FieldIdFacetNumberDocidsExtractor; +impl FacetedDocidsExtractor { + fn extract_document_change( + rtxn: &RoTxn, + index: &Index, + buffer: &mut Vec, + fields_ids_map: &mut GlobalFieldsIdsMap, + attributes_to_extract: &[&str], + cached_sorter: &mut CboCachedSorter, + document_change: DocumentChange, + ) -> Result<()> { + match document_change { + DocumentChange::Deletion(inner) => extract_document_facets( + attributes_to_extract, + inner.current(rtxn, index)?.unwrap(), + fields_ids_map, + &mut |fid, value| { + Self::facet_fn_with_options( + buffer, + cached_sorter, + CboCachedSorter::insert_del_u32, + inner.docid(), + fid, + value, + ) + }, + ), + DocumentChange::Update(inner) => { + extract_document_facets( + attributes_to_extract, + inner.current(rtxn, index)?.unwrap(), + fields_ids_map, + &mut |fid, value| { + Self::facet_fn_with_options( + buffer, + cached_sorter, + CboCachedSorter::insert_del_u32, + inner.docid(), + fid, + value, + ) + }, + )?; + + extract_document_facets( + attributes_to_extract, + inner.new(), + fields_ids_map, + &mut |fid, value| { + Self::facet_fn_with_options( + buffer, + cached_sorter, + CboCachedSorter::insert_add_u32, + inner.docid(), + fid, + value, + ) + }, + ) + } + DocumentChange::Insertion(inner) => extract_document_facets( + attributes_to_extract, + inner.new(), + fields_ids_map, + &mut |fid, value| { + Self::facet_fn_with_options( + buffer, + cached_sorter, + CboCachedSorter::insert_add_u32, + inner.docid(), + fid, + value, + ) + }, + ), + } + } + + fn facet_fn_with_options( + buffer: &mut Vec, + cached_sorter: &mut CboCachedSorter, + cache_fn: impl Fn(&mut CboCachedSorter, &[u8], u32) -> grenad::Result<(), MF::Error>, + docid: DocumentId, + fid: FieldId, + value: &Value, + ) -> Result<()> + where + MF: MergeFunction, + MF::Error: Debug, + grenad::Error: Into, + { + // Exists + // key: fid + buffer.clear(); + buffer.push(FacetKind::Exists as u8); + buffer.extend_from_slice(&fid.to_be_bytes()); + cache_fn(cached_sorter, &*buffer, docid).map_err(Into::into)?; + + match value { + // Number + // key: fid - level - orderedf64 - orignalf64 + Value::Number(number) => { + if let Some((n, ordered)) = + number.as_f64().and_then(|n| f64_into_bytes(n).map(|ordered| (n, ordered))) + { + buffer.clear(); + buffer.push(FacetKind::Number as u8); + buffer.extend_from_slice(&fid.to_be_bytes()); + buffer.push(1); // level 0 + buffer.extend_from_slice(&ordered); + buffer.extend_from_slice(&n.to_be_bytes()); + + cache_fn(cached_sorter, &*buffer, docid).map_err(Into::into) + } else { + Ok(()) + } + } + // String + // key: fid - level - truncated_string + Value::String(s) => { + let 
truncated = truncate_str(s); + buffer.clear(); + buffer.push(FacetKind::String as u8); + buffer.extend_from_slice(&fid.to_be_bytes()); + buffer.push(1); // level 0 + buffer.extend_from_slice(truncated.as_bytes()); + cache_fn(cached_sorter, &*buffer, docid).map_err(Into::into) + } + // Null + // key: fid + Value::Null => { + buffer.clear(); + buffer.push(FacetKind::Null as u8); + buffer.extend_from_slice(&fid.to_be_bytes()); + cache_fn(cached_sorter, &*buffer, docid).map_err(Into::into) + } + // Empty + // key: fid + Value::Array(a) if a.is_empty() => { + buffer.clear(); + buffer.push(FacetKind::Empty as u8); + buffer.extend_from_slice(&fid.to_be_bytes()); + cache_fn(cached_sorter, &*buffer, docid).map_err(Into::into) + } + Value::Object(o) if o.is_empty() => { + buffer.clear(); + buffer.push(FacetKind::Empty as u8); + buffer.extend_from_slice(&fid.to_be_bytes()); + cache_fn(cached_sorter, &*buffer, docid).map_err(Into::into) + } + // Otherwise, do nothing + // TODO: What about Value::Bool? + _ => Ok(()), + } + } -impl FacetedExtractor for FieldIdFacetNumberDocidsExtractor { fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result> { index.user_defined_faceted_fields(rtxn) } - - fn build_key<'b>( - field_id: FieldId, - value: &Value, - output: &'b mut Vec, - ) -> Option<&'b [u8]> { - let number = value.as_number()?; - let n = number.as_f64()?; - let ordered = f64_into_bytes(n)?; - - // fid - level - orderedf64 - orignalf64 - output.extend_from_slice(&field_id.to_be_bytes()); - output.push(1); // level 0 - output.extend_from_slice(&ordered); - output.extend_from_slice(&n.to_be_bytes()); - - Some(&*output) - } -} - -pub struct FieldIdFacetStringDocidsExtractor; - -impl FacetedExtractor for FieldIdFacetStringDocidsExtractor { - fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result> { - index.user_defined_faceted_fields(rtxn) - } - - fn build_key<'b>( - field_id: FieldId, - value: &Value, - output: &'b mut Vec, - ) -> Option<&'b [u8]> { - let string = value.as_str()?; - let normalize = normalize_facet(string); - let truncated = truncate_str(&normalize); - - // fid - level - normalized string - output.extend_from_slice(&field_id.to_be_bytes()); - output.push(1); // level 0 - output.extend_from_slice(truncated.as_bytes()); - - Some(&*output) - } } /// Truncates a string to the biggest valid LMDB key size. 
@@ -70,68 +189,77 @@ fn truncate_str(s: &str) -> &str { &s[..index.unwrap_or(0)] } -pub struct FieldIdFacetIsNullDocidsExtractor; +impl DocidsExtractor for FacetedDocidsExtractor { + #[tracing::instrument(level = "trace", skip_all, target = "indexing::extract::faceted")] + fn run_extraction( + index: &Index, + fields_ids_map: &GlobalFieldsIdsMap, + indexer: GrenadParameters, + document_changes: impl IntoParallelIterator>, + ) -> Result> { + let max_memory = indexer.max_memory_by_thread(); -impl FacetedExtractor for FieldIdFacetIsNullDocidsExtractor { - fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result> { - index.user_defined_faceted_fields(rtxn) - } + let rtxn = index.read_txn()?; + let attributes_to_extract = Self::attributes_to_extract(&rtxn, index)?; + let attributes_to_extract: Vec<_> = + attributes_to_extract.iter().map(|s| s.as_ref()).collect(); - fn build_key<'b>( - field_id: FieldId, - value: &Value, - output: &'b mut Vec, - ) -> Option<&'b [u8]> { - if value.is_null() { - output.extend_from_slice(&field_id.to_be_bytes()); - Some(&*output) - } else { - None - } - } -} - -pub struct FieldIdFacetExistsDocidsExtractor; - -impl FacetedExtractor for FieldIdFacetExistsDocidsExtractor { - fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result> { - index.user_defined_faceted_fields(rtxn) - } - - fn build_key<'b>( - field_id: FieldId, - _value: &Value, - output: &'b mut Vec, - ) -> Option<&'b [u8]> { - output.extend_from_slice(&field_id.to_be_bytes()); - Some(&*output) - } -} - -pub struct FieldIdFacetIsEmptyDocidsExtractor; - -impl FacetedExtractor for FieldIdFacetIsEmptyDocidsExtractor { - fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result> { - index.user_defined_faceted_fields(rtxn) - } - - fn build_key<'b>( - field_id: FieldId, - value: &Value, - output: &'b mut Vec, - ) -> Option<&'b [u8]> { - let is_empty = match value { - Value::Null | Value::Bool(_) | Value::Number(_) => false, - Value::String(s) => s.is_empty(), - Value::Array(a) => a.is_empty(), - Value::Object(o) => o.is_empty(), - }; - - if is_empty { - output.extend_from_slice(&field_id.to_be_bytes()); - Some(&*output) - } else { - None + let context_pool = ItemsPool::new(|| { + Ok(( + index.read_txn()?, + fields_ids_map.clone(), + Vec::new(), + CboCachedSorter::new( + // TODO use a better value + 100.try_into().unwrap(), + create_sorter( + grenad::SortAlgorithm::Stable, + MergeDeladdCboRoaringBitmaps, + indexer.chunk_compression_type, + indexer.chunk_compression_level, + indexer.max_nb_chunks, + max_memory, + ), + ), + )) + }); + + { + let span = + tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction"); + let _entered = span.enter(); + document_changes.into_par_iter().try_for_each(|document_change| { + context_pool.with(|(rtxn, fields_ids_map, buffer, cached_sorter)| { + Self::extract_document_change( + &*rtxn, + index, + buffer, + fields_ids_map, + &attributes_to_extract, + cached_sorter, + document_change?, + ) + }) + })?; + } + { + let mut builder = grenad::MergerBuilder::new(MergeDeladdCboRoaringBitmaps); + let span = + tracing::trace_span!(target: "indexing::documents::extract", "merger_building"); + let _entered = span.enter(); + + let readers: Vec<_> = context_pool + .into_items() + .par_bridge() + .map(|(_rtxn, _tokenizer, _fields_ids_map, cached_sorter)| { + let sorter = cached_sorter.into_sorter()?; + sorter.into_reader_cursors() + }) + .collect(); + for reader in readers { + builder.extend(reader?); + } + Ok(builder.build()) 
} } } diff --git a/milli/src/update/new/extract/faceted/mod.rs b/milli/src/update/new/extract/faceted/mod.rs index b4d6b4131..a59c64d9a 100644 --- a/milli/src/update/new/extract/faceted/mod.rs +++ b/milli/src/update/new/extract/faceted/mod.rs @@ -1,180 +1,26 @@ -use std::collections::HashSet; -use std::fmt::Debug; -use std::fs::File; - -pub use extract_facets::*; -use grenad::{MergeFunction, Merger}; -use heed::RoTxn; -use rayon::iter::{IntoParallelIterator, ParallelIterator}; -use serde_json::Value; - -use super::cache::CboCachedSorter; -use crate::update::new::{DocumentChange, ItemsPool}; -use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps}; -use crate::{DocumentId, FieldId, GlobalFieldsIdsMap, Index, Result}; - mod extract_facets; mod facet_document; -pub trait FacetedExtractor { - #[tracing::instrument(level = "trace", skip_all, target = "indexing::extract::faceted")] - fn run_extraction( - index: &Index, - fields_ids_map: &GlobalFieldsIdsMap, - indexer: GrenadParameters, - document_changes: impl IntoParallelIterator>, - ) -> Result> { - let max_memory = indexer.max_memory_by_thread(); +pub use extract_facets::FacetedDocidsExtractor; - let rtxn = index.read_txn()?; - let attributes_to_extract = Self::attributes_to_extract(&rtxn, index)?; - let attributes_to_extract: Vec<_> = - attributes_to_extract.iter().map(|s| s.as_ref()).collect(); - - let context_pool = ItemsPool::new(|| { - Ok(( - index.read_txn()?, - fields_ids_map.clone(), - Vec::new(), - CboCachedSorter::new( - // TODO use a better value - 100.try_into().unwrap(), - create_sorter( - grenad::SortAlgorithm::Stable, - MergeDeladdCboRoaringBitmaps, - indexer.chunk_compression_type, - indexer.chunk_compression_level, - indexer.max_nb_chunks, - max_memory, - ), - ), - )) - }); - - document_changes.into_par_iter().try_for_each(|document_change| { - context_pool.with(|(rtxn, fields_ids_map, buffer, cached_sorter)| { - Self::extract_document_change( - &*rtxn, - index, - buffer, - fields_ids_map, - &attributes_to_extract, - cached_sorter, - document_change?, - ) - }) - })?; - - let mut builder = grenad::MergerBuilder::new(MergeDeladdCboRoaringBitmaps); - for (_rtxn, _fields_ids_map, _buffer, cache) in context_pool.into_items() { - let sorter = cache.into_sorter()?; - let readers = sorter.into_reader_cursors()?; - builder.extend(readers); - } - - Ok(builder.build()) - } - - // TODO Shorten this - fn facet_fn_with_options( - buffer: &mut Vec, - cached_sorter: &mut CboCachedSorter, - cache_fn: impl Fn(&mut CboCachedSorter, &[u8], u32) -> grenad::Result<(), MF::Error>, - docid: DocumentId, - fid: FieldId, - value: &Value, - ) -> Result<()> - where - MF: MergeFunction, - MF::Error: Debug, - grenad::Error: Into, - { - buffer.clear(); - match Self::build_key(fid, value, buffer) { - Some(key) => cache_fn(cached_sorter, &key, docid).map_err(Into::into), - None => Ok(()), - } - } - - fn extract_document_change( - rtxn: &RoTxn, - index: &Index, - buffer: &mut Vec, - fields_ids_map: &mut GlobalFieldsIdsMap, - attributes_to_extract: &[&str], - cached_sorter: &mut CboCachedSorter, - document_change: DocumentChange, - ) -> Result<()> { - match document_change { - DocumentChange::Deletion(inner) => facet_document::extract_document_facets( - attributes_to_extract, - inner.current(rtxn, index)?.unwrap(), - fields_ids_map, - &mut |fid, value| { - Self::facet_fn_with_options( - buffer, - cached_sorter, - CboCachedSorter::insert_del_u32, - inner.docid(), - fid, - value, - ) - }, - ), - DocumentChange::Update(inner) => { - 
facet_document::extract_document_facets( - attributes_to_extract, - inner.current(rtxn, index)?.unwrap(), - fields_ids_map, - &mut |fid, value| { - Self::facet_fn_with_options( - buffer, - cached_sorter, - CboCachedSorter::insert_del_u32, - inner.docid(), - fid, - value, - ) - }, - )?; - - facet_document::extract_document_facets( - attributes_to_extract, - inner.new(), - fields_ids_map, - &mut |fid, value| { - Self::facet_fn_with_options( - buffer, - cached_sorter, - CboCachedSorter::insert_add_u32, - inner.docid(), - fid, - value, - ) - }, - ) - } - DocumentChange::Insertion(inner) => facet_document::extract_document_facets( - attributes_to_extract, - inner.new(), - fields_ids_map, - &mut |fid, value| { - Self::facet_fn_with_options( - buffer, - cached_sorter, - CboCachedSorter::insert_add_u32, - inner.docid(), - fid, - value, - ) - }, - ), - } - } - - // TODO avoid owning the strings here. - fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result>; - - fn build_key<'b>(field_id: FieldId, value: &Value, output: &'b mut Vec) - -> Option<&'b [u8]>; +#[repr(u8)] +pub enum FacetKind { + Number = 0, + String = 1, + Null = 2, + Empty = 3, + Exists, +} + +impl From for FacetKind { + fn from(value: u8) -> Self { + match value { + 0 => Self::Number, + 1 => Self::String, + 2 => Self::Null, + 3 => Self::Empty, + 4 => Self::Exists, + _ => unreachable!(), + } + } } diff --git a/milli/src/update/new/extract/mod.rs b/milli/src/update/new/extract/mod.rs index d6d5a3005..3836f9957 100644 --- a/milli/src/update/new/extract/mod.rs +++ b/milli/src/update/new/extract/mod.rs @@ -2,9 +2,29 @@ mod cache; mod faceted; mod searchable; +use std::fs::File; + pub use faceted::*; +use grenad::Merger; +use rayon::iter::IntoParallelIterator; pub use searchable::*; +use crate::{ + update::{GrenadParameters, MergeDeladdCboRoaringBitmaps}, + GlobalFieldsIdsMap, Index, Result, +}; + +use super::DocumentChange; + +pub trait DocidsExtractor { + fn run_extraction( + index: &Index, + fields_ids_map: &GlobalFieldsIdsMap, + indexer: GrenadParameters, + document_changes: impl IntoParallelIterator>, + ) -> Result>; +} + /// TODO move in permissive json pointer pub mod perm_json_p { use serde_json::{Map, Value}; @@ -39,6 +59,10 @@ pub mod perm_json_p { base_key: &str, seeker: &mut impl FnMut(&str, &Value) -> Result<()>, ) -> Result<()> { + if value.is_empty() { + seeker(&base_key, &Value::Object(Map::with_capacity(0)))?; + } + for (key, value) in value.iter() { let base_key = if base_key.is_empty() { key.to_string() @@ -80,6 +104,10 @@ pub mod perm_json_p { base_key: &str, seeker: &mut impl FnMut(&str, &Value) -> Result<()>, ) -> Result<()> { + if values.is_empty() { + seeker(&base_key, &Value::Array(vec![]))?; + } + for value in values { match value { Value::Object(object) => { diff --git a/milli/src/update/new/extract/searchable/mod.rs b/milli/src/update/new/extract/searchable/mod.rs index 7e096591e..48d373598 100644 --- a/milli/src/update/new/extract/searchable/mod.rs +++ b/milli/src/update/new/extract/searchable/mod.rs @@ -17,6 +17,7 @@ use rayon::iter::{IntoParallelIterator, ParallelBridge, ParallelIterator}; use tokenize_document::{tokenizer_builder, DocumentTokenizer}; use super::cache::CboCachedSorter; +use super::DocidsExtractor; use crate::update::new::{DocumentChange, ItemsPool}; use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps}; use crate::{GlobalFieldsIdsMap, Index, Result, MAX_POSITION_PER_ATTRIBUTE}; @@ -130,3 +131,14 @@ pub trait SearchableExtractor { fn 
attributes_to_skip<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result>; } + +impl DocidsExtractor for T { + fn run_extraction( + index: &Index, + fields_ids_map: &GlobalFieldsIdsMap, + indexer: GrenadParameters, + document_changes: impl IntoParallelIterator>, + ) -> Result> { + Self::run_extraction(index, fields_ids_map, indexer, document_changes) + } +} diff --git a/milli/src/update/new/indexer/mod.rs b/milli/src/update/new/indexer/mod.rs index 7350d9499..b40ddbc4d 100644 --- a/milli/src/update/new/indexer/mod.rs +++ b/milli/src/update/new/indexer/mod.rs @@ -101,6 +101,22 @@ where max_memory: Some(max_memory), ..GrenadParameters::default() }; + + { + let span = tracing::trace_span!(target: "indexing::documents::extract", "faceted"); + let _entered = span.enter(); + extract_and_send_docids::< + FacetedDocidsExtractor, + FacetDocids, + >( + index, + &global_fields_ids_map, + grenad_parameters, + document_changes.clone(), + &extractor_sender, + )?; + } + { let span = tracing::trace_span!(target: "indexing::documents::extract", "word_docids"); let _entered = span.enter(); @@ -176,19 +192,19 @@ where } // TODO THIS IS TOO MUCH - // Extract fieldid docid facet number - // Extract fieldid docid facet string - // Extract facetid string fst - // Extract facetid normalized string strings + // - [ ] Extract fieldid docid facet number + // - [ ] Extract fieldid docid facet string + // - [ ] Extract facetid string fst + // - [ ] Extract facetid normalized string strings // TODO Inverted Indexes again - // Extract fieldid facet isempty docids - // Extract fieldid facet isnull docids - // Extract fieldid facet exists docids + // - [x] Extract fieldid facet isempty docids + // - [x] Extract fieldid facet isnull docids + // - [x] Extract fieldid facet exists docids // TODO This is the normal system - // Extract fieldid facet number docids - // Extract fieldid facet string docids + // - [x] Extract fieldid facet number docids + // - [x] Extract fieldid facet string docids Ok(()) as Result<_> }) @@ -238,7 +254,7 @@ where /// TODO: GrenadParameters::default() should be removed in favor a passed parameter /// TODO: manage the errors correctly /// TODO: we must have a single trait that also gives the extractor type -fn extract_and_send_docids( +fn extract_and_send_docids( index: &Index, fields_ids_map: &GlobalFieldsIdsMap, indexer: GrenadParameters, diff --git a/milli/src/update/new/merger.rs b/milli/src/update/new/merger.rs index 291f79216..9ba81fb11 100644 --- a/milli/src/update/new/merger.rs +++ b/milli/src/update/new/merger.rs @@ -12,6 +12,7 @@ use tempfile::tempfile; use super::channel::*; use super::{Deletion, DocumentChange, Insertion, KvReaderDelAdd, KvReaderFieldId, Update}; +use super::extract::FacetKind; use crate::update::del_add::DelAdd; use crate::update::new::channel::MergerOperation; use crate::update::MergeDeladdCboRoaringBitmaps; @@ -63,26 +64,33 @@ pub fn merge_grenad_entries( )?; } MergerOperation::WordDocidsMerger(merger) => { - let span = - tracing::trace_span!(target: "indexing::documents::merge", "word_docids"); - let _entered = span.enter(); let mut add_words_fst = SetBuilder::new(BufWriter::new(tempfile()?))?; let mut del_words_fst = SetBuilder::new(BufWriter::new(tempfile()?))?; + { + let span = + tracing::trace_span!(target: "indexing::documents::merge", "word_docids"); + let _entered = span.enter(); - merge_and_send_docids( - merger, - index.word_docids.remap_types(), - rtxn, - &mut buffer, - sender.docids::(), - |key| add_words_fst.insert(key), - |key| del_words_fst.insert(key), - 
)?; + merge_and_send_docids( + merger, + index.word_docids.remap_types(), + rtxn, + &mut buffer, + sender.docids::(), + |key| add_words_fst.insert(key), + |key| del_words_fst.insert(key), + )?; + } - // Move that into a dedicated function - let words_fst = index.words_fst(rtxn)?; - let mmap = compute_new_words_fst(add_words_fst, del_words_fst, words_fst)?; - sender.main().write_words_fst(mmap).unwrap(); + { + let span = + tracing::trace_span!(target: "indexing::documents::merge", "words_fst"); + let _entered = span.enter(); + // Move that into a dedicated function + let words_fst = index.words_fst(rtxn)?; + let mmap = compute_new_words_fst(add_words_fst, del_words_fst, words_fst)?; + sender.main().write_words_fst(mmap).unwrap(); + } } MergerOperation::WordFidDocidsMerger(merger) => { let span = @@ -161,6 +169,18 @@ pub fn merge_grenad_entries( MergerOperation::FinishedDocument => { // send the rtree } + MergerOperation::FacetDocidsMerger(merger) => { + let span = + tracing::trace_span!(target: "indexing::documents::merge", "facet_docids"); + let _entered = span.enter(); + merge_and_send_facet_docids( + merger, + FacetDatabases::new(index), + rtxn, + &mut buffer, + sender.facet_docids(), + )?; + } } } @@ -252,12 +272,12 @@ fn compute_new_words_fst( } #[tracing::instrument(level = "trace", skip_all, target = "indexing::merge")] -fn merge_and_send_docids( +fn merge_and_send_docids( merger: Merger, database: Database, rtxn: &RoTxn<'_>, buffer: &mut Vec, - word_docids_sender: DocidsSender<'_, D>, + docids_sender: impl DocidsSender, mut add_key: impl FnMut(&[u8]) -> fst::Result<()>, mut del_key: impl FnMut(&[u8]) -> fst::Result<()>, ) -> Result<()> { @@ -271,11 +291,11 @@ fn merge_and_send_docids( match merge_cbo_bitmaps(current, del, add)? { Operation::Write(bitmap) => { let value = cbo_bitmap_serialize_into_vec(&bitmap, buffer); - word_docids_sender.write(key, value).unwrap(); + docids_sender.write(key, value).unwrap(); add_key(key)?; } Operation::Delete => { - word_docids_sender.delete(key).unwrap(); + docids_sender.delete(key).unwrap(); del_key(key)?; } Operation::Ignore => (), @@ -285,6 +305,76 @@ fn merge_and_send_docids( Ok(()) } +#[tracing::instrument(level = "trace", skip_all, target = "indexing::merge")] +fn merge_and_send_facet_docids( + merger: Merger, + database: FacetDatabases, + rtxn: &RoTxn<'_>, + buffer: &mut Vec, + docids_sender: impl DocidsSender, +) -> Result<()> { + let mut merger_iter = merger.into_stream_merger_iter().unwrap(); + while let Some((key, deladd)) = merger_iter.next().unwrap() { + let current = database.get(rtxn, key)?; + let deladd: &KvReaderDelAdd = deladd.into(); + let del = deladd.get(DelAdd::Deletion); + let add = deladd.get(DelAdd::Addition); + + match merge_cbo_bitmaps(current, del, add)? { + Operation::Write(bitmap) => { + let value = cbo_bitmap_serialize_into_vec(&bitmap, buffer); + docids_sender.write(key, value).unwrap(); + } + Operation::Delete => { + docids_sender.delete(key).unwrap(); + } + Operation::Ignore => (), + } + } + + Ok(()) +} + +struct FacetDatabases { + /// Maps the facet field id and the docids for which this field exists + facet_id_exists_docids: Database, + /// Maps the facet field id and the docids for which this field is set as null + facet_id_is_null_docids: Database, + /// Maps the facet field id and the docids for which this field is considered empty + facet_id_is_empty_docids: Database, + /// Maps the facet field id and ranges of numbers with the docids that corresponds to them. 
+ facet_id_f64_docids: Database, + /// Maps the facet field id and ranges of strings with the docids that corresponds to them. + facet_id_string_docids: Database, +} + +impl FacetDatabases { + fn new(index: &Index) -> Self { + Self { + facet_id_exists_docids: index.facet_id_exists_docids.remap_types(), + facet_id_is_null_docids: index.facet_id_is_null_docids.remap_types(), + facet_id_is_empty_docids: index.facet_id_is_empty_docids.remap_types(), + facet_id_f64_docids: index.facet_id_f64_docids.remap_types(), + facet_id_string_docids: index.facet_id_string_docids.remap_types(), + } + } + + fn get<'a>(&self, rtxn: &'a RoTxn<'_>, key: &[u8]) -> heed::Result> { + let (facet_kind, key) = self.extract_facet_kind(key); + match facet_kind { + FacetKind::Exists => self.facet_id_exists_docids.get(rtxn, key), + FacetKind::Null => self.facet_id_is_null_docids.get(rtxn, key), + FacetKind::Empty => self.facet_id_is_empty_docids.get(rtxn, key), + FacetKind::Number => self.facet_id_f64_docids.get(rtxn, key), + FacetKind::String => self.facet_id_string_docids.get(rtxn, key), + } + } + + fn extract_facet_kind<'a>(&self, key: &'a [u8]) -> (FacetKind, &'a [u8]) { + (FacetKind::from(key[0]), &key[1..]) + } +} + enum Operation { Write(RoaringBitmap), Delete, From f13e076b8ab0e23013b020e173bf5ffd5c7b628f Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 16 Sep 2024 14:40:40 +0200 Subject: [PATCH 070/247] Use hashmap instead of Btree in wpp extractor --- .../extract_word_pair_proximity_docids.rs | 88 ++++++++++--------- 1 file changed, 48 insertions(+), 40 deletions(-) diff --git a/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs b/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs index 7b3706424..82007f9ba 100644 --- a/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs +++ b/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs @@ -1,4 +1,4 @@ -use std::collections::{BTreeMap, VecDeque}; +use std::collections::{HashMap, VecDeque}; use heed::RoTxn; use itertools::merge_join_by; @@ -35,10 +35,8 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { cached_sorter: &mut CboCachedSorter, document_change: DocumentChange, ) -> Result<()> { - /// TODO: mutualize those buffers let mut key_buffer = Vec::new(); - let mut add_word_pair_proximity = BTreeMap::new(); - let mut del_word_pair_proximity = BTreeMap::new(); + let mut word_pair_proximity = HashMap::new(); let mut word_positions: VecDeque<(String, u16)> = VecDeque::with_capacity(MAX_DISTANCE as usize); @@ -51,7 +49,14 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { document_tokenizer, fields_ids_map, &mut word_positions, - &mut del_word_pair_proximity, + &mut |(w1, w2), prox| { + word_pair_proximity + .entry((w1, w2)) + .and_modify(|(del_p, _add_p)| { + *del_p = std::cmp::min(*del_p, prox); + }) + .or_insert((prox, 0)); + }, )?; } DocumentChange::Update(inner) => { @@ -61,7 +66,14 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { document_tokenizer, fields_ids_map, &mut word_positions, - &mut del_word_pair_proximity, + &mut |(w1, w2), prox| { + word_pair_proximity + .entry((w1, w2)) + .and_modify(|(del_p, _add_p)| { + *del_p = std::cmp::min(*del_p, prox); + }) + .or_insert((prox, 0)); + }, )?; let document = inner.new(); process_document_tokens( @@ -69,7 +81,14 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { document_tokenizer, fields_ids_map, &mut word_positions, - &mut 
add_word_pair_proximity, + &mut |(w1, w2), prox| { + word_pair_proximity + .entry((w1, w2)) + .and_modify(|(_del_p, add_p)| { + *add_p = std::cmp::min(*add_p, prox); + }) + .or_insert((0, prox)); + }, )?; } DocumentChange::Insertion(inner) => { @@ -79,35 +98,23 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { document_tokenizer, fields_ids_map, &mut word_positions, - &mut add_word_pair_proximity, + &mut |(w1, w2), prox| { + word_pair_proximity + .entry((w1, w2)) + .and_modify(|(_del_p, add_p)| { + *add_p = std::cmp::min(*add_p, prox); + }) + .or_insert((0, prox)); + }, )?; } } - use itertools::EitherOrBoth::*; - for eob in - merge_join_by(del_word_pair_proximity.iter(), add_word_pair_proximity.iter(), |d, a| { - d.cmp(a) - }) - { - match eob { - Left(((w1, w2), prox)) => { - let key = build_key(*prox, w1, w2, &mut key_buffer); - cached_sorter.insert_del_u32(key, docid)?; - } - Right(((w1, w2), prox)) => { - let key = build_key(*prox, w1, w2, &mut key_buffer); - cached_sorter.insert_add_u32(key, docid)?; - } - Both(((w1, w2), del_prox), (_, add_prox)) => { - if del_prox != add_prox { - let key = build_key(*del_prox, w1, w2, &mut key_buffer); - cached_sorter.insert_del_u32(key, docid)?; - let key = build_key(*add_prox, w1, w2, &mut key_buffer); - cached_sorter.insert_add_u32(key, docid)?; - } - } - } + for ((w1, w2), (del_p, add_p)) in word_pair_proximity.iter() { + let key = build_key(*del_p, w1, w2, &mut key_buffer); + cached_sorter.insert_del_u32(key, docid)?; + let key = build_key(*add_p, w1, w2, &mut key_buffer); + cached_sorter.insert_add_u32(key, docid)?; } Ok(()) @@ -125,18 +132,19 @@ fn build_key<'a>(prox: u8, w1: &str, w2: &str, key_buffer: &'a mut Vec) -> & fn word_positions_into_word_pair_proximity( word_positions: &mut VecDeque<(String, u16)>, - word_pair_proximity: &mut BTreeMap<(String, String), u8>, + word_pair_proximity: &mut dyn FnMut((String, String), u8), ) -> Result<()> { let (head_word, head_position) = word_positions.pop_front().unwrap(); for (word, position) in word_positions.iter() { let prox = index_proximity(head_position as u32, *position as u32) as u8; if prox > 0 && prox < MAX_DISTANCE as u8 { - word_pair_proximity - .entry((head_word.clone(), word.clone())) - .and_modify(|p| { - *p = std::cmp::min(*p, prox); - }) - .or_insert(prox); + word_pair_proximity((head_word.clone(), word.clone()), prox); + // word_pair_proximity + // .entry((head_word.clone(), word.clone())) + // .and_modify(|p| { + // *p = std::cmp::min(*p, prox); + // }) + // .or_insert(prox); } } Ok(()) @@ -147,7 +155,7 @@ fn process_document_tokens( document_tokenizer: &DocumentTokenizer, fields_ids_map: &mut GlobalFieldsIdsMap, word_positions: &mut VecDeque<(String, u16)>, - word_pair_proximity: &mut BTreeMap<(String, String), u8>, + word_pair_proximity: &mut dyn FnMut((String, String), u8), ) -> Result<()> { let mut token_fn = |_fname: &str, _fid: FieldId, pos: u16, word: &str| { // drain the proximity window until the head word is considered close to the word we are inserting. 
From 1a0e96229925a2eed72c1215f84e61454c6b4ca6 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 16 Sep 2024 15:01:20 +0200 Subject: [PATCH 071/247] Replace hashmap by vectors in wpp --- .../extract_word_pair_proximity_docids.rs | 55 ++++++------------- 1 file changed, 18 insertions(+), 37 deletions(-) diff --git a/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs b/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs index 82007f9ba..0386297d1 100644 --- a/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs +++ b/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs @@ -36,7 +36,8 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { document_change: DocumentChange, ) -> Result<()> { let mut key_buffer = Vec::new(); - let mut word_pair_proximity = HashMap::new(); + let mut del_word_pair_proximity = Vec::new(); + let mut add_word_pair_proximity = Vec::new(); let mut word_positions: VecDeque<(String, u16)> = VecDeque::with_capacity(MAX_DISTANCE as usize); @@ -50,12 +51,7 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { fields_ids_map, &mut word_positions, &mut |(w1, w2), prox| { - word_pair_proximity - .entry((w1, w2)) - .and_modify(|(del_p, _add_p)| { - *del_p = std::cmp::min(*del_p, prox); - }) - .or_insert((prox, 0)); + del_word_pair_proximity.push(((w1, w2), prox)); }, )?; } @@ -67,12 +63,7 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { fields_ids_map, &mut word_positions, &mut |(w1, w2), prox| { - word_pair_proximity - .entry((w1, w2)) - .and_modify(|(del_p, _add_p)| { - *del_p = std::cmp::min(*del_p, prox); - }) - .or_insert((prox, 0)); + del_word_pair_proximity.push(((w1, w2), prox)); }, )?; let document = inner.new(); @@ -82,12 +73,7 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { fields_ids_map, &mut word_positions, &mut |(w1, w2), prox| { - word_pair_proximity - .entry((w1, w2)) - .and_modify(|(_del_p, add_p)| { - *add_p = std::cmp::min(*add_p, prox); - }) - .or_insert((0, prox)); + add_word_pair_proximity.push(((w1, w2), prox)); }, )?; } @@ -99,24 +85,25 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { fields_ids_map, &mut word_positions, &mut |(w1, w2), prox| { - word_pair_proximity - .entry((w1, w2)) - .and_modify(|(_del_p, add_p)| { - *add_p = std::cmp::min(*add_p, prox); - }) - .or_insert((0, prox)); + add_word_pair_proximity.push(((w1, w2), prox)); }, )?; } } - for ((w1, w2), (del_p, add_p)) in word_pair_proximity.iter() { - let key = build_key(*del_p, w1, w2, &mut key_buffer); + del_word_pair_proximity.sort_unstable(); + del_word_pair_proximity.dedup_by(|(k1, _), (k2, _)| k1 == k2); + for ((w1, w2), prox) in del_word_pair_proximity.iter() { + let key = build_key(*prox, w1, w2, &mut key_buffer); cached_sorter.insert_del_u32(key, docid)?; - let key = build_key(*add_p, w1, w2, &mut key_buffer); - cached_sorter.insert_add_u32(key, docid)?; } + add_word_pair_proximity.sort_unstable(); + add_word_pair_proximity.dedup_by(|(k1, _), (k2, _)| k1 == k2); + for ((w1, w2), prox) in add_word_pair_proximity.iter() { + let key = build_key(*prox, w1, w2, &mut key_buffer); + cached_sorter.insert_add_u32(key, docid)?; + } Ok(()) } } @@ -132,19 +119,13 @@ fn build_key<'a>(prox: u8, w1: &str, w2: &str, key_buffer: &'a mut Vec) -> & fn word_positions_into_word_pair_proximity( word_positions: &mut VecDeque<(String, u16)>, - word_pair_proximity: &mut dyn FnMut((String, String), u8), + word_pair_proximity: 
&mut impl FnMut((String, String), u8), ) -> Result<()> { let (head_word, head_position) = word_positions.pop_front().unwrap(); for (word, position) in word_positions.iter() { let prox = index_proximity(head_position as u32, *position as u32) as u8; if prox > 0 && prox < MAX_DISTANCE as u8 { word_pair_proximity((head_word.clone(), word.clone()), prox); - // word_pair_proximity - // .entry((head_word.clone(), word.clone())) - // .and_modify(|p| { - // *p = std::cmp::min(*p, prox); - // }) - // .or_insert(prox); } } Ok(()) @@ -155,7 +136,7 @@ fn process_document_tokens( document_tokenizer: &DocumentTokenizer, fields_ids_map: &mut GlobalFieldsIdsMap, word_positions: &mut VecDeque<(String, u16)>, - word_pair_proximity: &mut dyn FnMut((String, String), u8), + word_pair_proximity: &mut impl FnMut((String, String), u8), ) -> Result<()> { let mut token_fn = |_fname: &str, _fid: FieldId, pos: u16, word: &str| { // drain the proximity window until the head word is considered close to the word we are inserting. From f4ab1f168eb622a0b1381012d2e98da4e2c977b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 16 Sep 2024 15:41:29 +0200 Subject: [PATCH 072/247] Prefer using Rc than String when cloning a lot --- .../extract_word_pair_proximity_docids.rs | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs b/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs index 0386297d1..3a6bb7894 100644 --- a/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs +++ b/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs @@ -1,4 +1,5 @@ use std::collections::{HashMap, VecDeque}; +use std::rc::Rc; use heed::RoTxn; use itertools::merge_join_by; @@ -38,7 +39,7 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { let mut key_buffer = Vec::new(); let mut del_word_pair_proximity = Vec::new(); let mut add_word_pair_proximity = Vec::new(); - let mut word_positions: VecDeque<(String, u16)> = + let mut word_positions: VecDeque<(Rc, u16)> = VecDeque::with_capacity(MAX_DISTANCE as usize); let docid = document_change.docid(); @@ -118,8 +119,8 @@ fn build_key<'a>(prox: u8, w1: &str, w2: &str, key_buffer: &'a mut Vec) -> & } fn word_positions_into_word_pair_proximity( - word_positions: &mut VecDeque<(String, u16)>, - word_pair_proximity: &mut impl FnMut((String, String), u8), + word_positions: &mut VecDeque<(Rc, u16)>, + word_pair_proximity: &mut impl FnMut((Rc, Rc), u8), ) -> Result<()> { let (head_word, head_position) = word_positions.pop_front().unwrap(); for (word, position) in word_positions.iter() { @@ -135,8 +136,8 @@ fn process_document_tokens( document: &KvReader, document_tokenizer: &DocumentTokenizer, fields_ids_map: &mut GlobalFieldsIdsMap, - word_positions: &mut VecDeque<(String, u16)>, - word_pair_proximity: &mut impl FnMut((String, String), u8), + word_positions: &mut VecDeque<(Rc, u16)>, + word_pair_proximity: &mut impl FnMut((Rc, Rc), u8), ) -> Result<()> { let mut token_fn = |_fname: &str, _fid: FieldId, pos: u16, word: &str| { // drain the proximity window until the head word is considered close to the word we are inserting. @@ -148,7 +149,7 @@ fn process_document_tokens( } // insert the new word. 
- word_positions.push_back((word.to_string(), pos)); + word_positions.push_back((Rc::from(word), pos)); Ok(()) }; document_tokenizer.tokenize_document(document, fields_ids_map, &mut token_fn)?; From 013acb3d936b6777124870ada0b07b235f0669a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 23 Sep 2024 11:07:59 +0200 Subject: [PATCH 073/247] Measure merger writer channel contention --- milli/src/update/new/channel.rs | 76 ++++++++++++++++++++++------- milli/src/update/new/indexer/mod.rs | 2 +- 2 files changed, 59 insertions(+), 19 deletions(-) diff --git a/milli/src/update/new/channel.rs b/milli/src/update/new/channel.rs index 98538ea9e..6a47dc606 100644 --- a/milli/src/update/new/channel.rs +++ b/milli/src/update/new/channel.rs @@ -16,7 +16,14 @@ use crate::{DocumentId, Index}; /// The capacity of the channel is currently in number of messages. pub fn merger_writer_channel(cap: usize) -> (MergerSender, WriterReceiver) { let (sender, receiver) = crossbeam_channel::bounded(cap); - (MergerSender(sender), WriterReceiver(receiver)) + ( + MergerSender { + sender, + send_count: Default::default(), + contentious_count: Default::default(), + }, + WriterReceiver(receiver), + ) } /// The capacity of the channel is currently in number of messages. @@ -169,23 +176,40 @@ impl IntoIterator for WriterReceiver { } } -pub struct MergerSender(Sender); +pub struct MergerSender { + sender: Sender, + /// The number of message we send in total in the channel. + send_count: std::cell::Cell, + /// The number of times we sent something in a channel that was full. + contentious_count: std::cell::Cell, +} + +impl Drop for MergerSender { + fn drop(&mut self) { + tracing::info!( + "Merger channel stats: {} sends, {} were contentious (ratio {})", + self.send_count.get(), + self.contentious_count.get(), + self.contentious_count.get() as f64 / self.send_count.get() as f64 + ) + } +} impl MergerSender { pub fn main(&self) -> MainSender<'_> { - MainSender(&self.0) + MainSender(self) } pub fn docids(&self) -> WordDocidsSender<'_, D> { - WordDocidsSender { sender: &self.0, _marker: PhantomData } + WordDocidsSender { sender: self, _marker: PhantomData } } pub fn facet_docids(&self) -> FacetDocidsSender<'_> { - FacetDocidsSender { sender: &self.0 } + FacetDocidsSender { sender: self } } pub fn documents(&self) -> DocumentsSender<'_> { - DocumentsSender(&self.0) + DocumentsSender(self) } pub fn send_documents_ids(&self, bitmap: &[u8]) -> StdResult<(), SendError<()>> { @@ -193,14 +217,25 @@ impl MergerSender { DOCUMENTS_IDS_KEY.as_bytes(), bitmap, )); - match self.0.send(WriterOperation { database: Database::Main, entry }) { + match self.send(WriterOperation { database: Database::Main, entry }) { + Ok(()) => Ok(()), + Err(SendError(_)) => Err(SendError(())), + } + } + + fn send(&self, op: WriterOperation) -> StdResult<(), SendError<()>> { + if self.sender.is_full() { + self.contentious_count.set(self.contentious_count.get() + 1); + } + self.send_count.set(self.send_count.get() + 1); + match self.sender.send(op) { Ok(()) => Ok(()), Err(SendError(_)) => Err(SendError(())), } } } -pub struct MainSender<'a>(&'a Sender); +pub struct MainSender<'a>(&'a MergerSender); impl MainSender<'_> { pub fn write_words_fst(&self, value: Mmap) -> StdResult<(), SendError<()>> { @@ -311,7 +346,7 @@ pub trait DocidsSender { } pub struct WordDocidsSender<'a, D> { - sender: &'a Sender, + sender: &'a MergerSender, _marker: PhantomData, } @@ -334,7 +369,7 @@ impl DocidsSender for WordDocidsSender<'_, D> { } pub struct 
FacetDocidsSender<'a> { - sender: &'a Sender, + sender: &'a MergerSender, } impl DocidsSender for FacetDocidsSender<'_> { @@ -370,7 +405,7 @@ impl FacetDocidsSender<'_> { } } -pub struct DocumentsSender<'a>(&'a Sender); +pub struct DocumentsSender<'a>(&'a MergerSender); impl DocumentsSender<'_> { /// TODO do that efficiently @@ -426,7 +461,7 @@ pub struct ExtractorSender(Sender); impl ExtractorSender { pub fn document_sender(&self) -> DocumentSender<'_> { - DocumentSender(&self.0) + DocumentSender(Some(&self.0)) } pub fn send_searchable( @@ -440,7 +475,7 @@ impl ExtractorSender { } } -pub struct DocumentSender<'a>(&'a Sender); +pub struct DocumentSender<'a>(Option<&'a Sender>); impl DocumentSender<'_> { pub fn insert( @@ -448,21 +483,24 @@ impl DocumentSender<'_> { docid: DocumentId, document: Box, ) -> StdResult<(), SendError<()>> { - match self.0.send(MergerOperation::InsertDocument { docid, document }) { + let sender = self.0.unwrap(); + match sender.send(MergerOperation::InsertDocument { docid, document }) { Ok(()) => Ok(()), Err(SendError(_)) => Err(SendError(())), } } pub fn delete(&self, docid: DocumentId) -> StdResult<(), SendError<()>> { - match self.0.send(MergerOperation::DeleteDocument { docid }) { + let sender = self.0.unwrap(); + match sender.send(MergerOperation::DeleteDocument { docid }) { Ok(()) => Ok(()), Err(SendError(_)) => Err(SendError(())), } } - pub fn finish(self) -> StdResult<(), SendError<()>> { - match self.0.send(MergerOperation::FinishedDocument) { + pub fn finish(mut self) -> StdResult<(), SendError<()>> { + let sender = self.0.take().unwrap(); + match sender.send(MergerOperation::FinishedDocument) { Ok(()) => Ok(()), Err(SendError(_)) => Err(SendError(())), } @@ -471,6 +509,8 @@ impl DocumentSender<'_> { impl Drop for DocumentSender<'_> { fn drop(&mut self) { - self.0.send(MergerOperation::FinishedDocument); + if let Some(sender) = self.0.take() { + sender.send(MergerOperation::FinishedDocument); + } } } diff --git a/milli/src/update/new/indexer/mod.rs b/milli/src/update/new/indexer/mod.rs index b40ddbc4d..3a9db79b6 100644 --- a/milli/src/update/new/indexer/mod.rs +++ b/milli/src/update/new/indexer/mod.rs @@ -17,7 +17,7 @@ use super::extract::*; use super::merger::merge_grenad_entries; use super::StdResult; use crate::documents::{PrimaryKey, DEFAULT_PRIMARY_KEY}; -use crate::update::new::channel::{DatabaseType, ExtractorSender}; +use crate::update::new::channel::ExtractorSender; use crate::update::GrenadParameters; use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, Result, UserError}; From 193d7f5d34820c1935d7730f3d3ec81cac91d8b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 23 Sep 2024 14:24:25 +0200 Subject: [PATCH 074/247] Add the mutualized charabia normalization --- Cargo.lock | 8 +++----- Cargo.toml | 1 + milli/Cargo.toml | 3 ++- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6eb12d80f..d830a7a4e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -933,9 +933,8 @@ dependencies = [ [[package]] name = "charabia" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03cd8f290cae94934cdd0103c14c2de9faf2d7d85be0d24d511af2bf1b14119d" +version = "0.9.1" +source = "git+https://github.com/meilisearch/charabia?branch=mutualize-char-normalizer#9eea14faf27ad46bd5eed49d2654cbdc4a1068dd" dependencies = [ "aho-corasick", "csv", @@ -2651,8 +2650,7 @@ checksum = "28b29a3cd74f0f4598934efe3aeba42bae0eb4680554128851ebbecb02af14e6" [[package]] name = 
"irg-kvariants" version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef2af7c331f2536964a32b78a7d2e0963d78b42f4a76323b16cc7d94b1ddce26" +source = "git+https://github.com/meilisearch/charabia?branch=mutualize-char-normalizer#9eea14faf27ad46bd5eed49d2654cbdc4a1068dd" dependencies = [ "csv", "once_cell", diff --git a/Cargo.toml b/Cargo.toml index 3b9219ebc..3d4d20aff 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -35,6 +35,7 @@ license = "MIT" [profile.release] codegen-units = 1 +debug = true [profile.dev.package.flate2] opt-level = 3 diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 1fa754069..3f2bb09fa 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -18,7 +18,8 @@ bincode = "1.3.3" bstr = "1.9.1" bytemuck = { version = "1.16.1", features = ["extern_crate_alloc"] } byteorder = "1.5.0" -charabia = { version = "0.9.0", default-features = false } +# charabia = { version = "0.9.0", default-features = false } +charabia = { git = "https://github.com/meilisearch/charabia", branch = "mutualize-char-normalizer", default-features = false } concat-arrays = "0.1.2" crossbeam-channel = "0.5.13" deserr = "0.6.2" From 4551abf6d4e72e79da31f388a44b68f5b79d86d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 23 Sep 2024 14:35:33 +0200 Subject: [PATCH 075/247] Update roaring to the latest version --- Cargo.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index d830a7a4e..e13279989 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4656,7 +4656,7 @@ dependencies = [ [[package]] name = "roaring" version = "0.10.6" -source = "git+https://github.com/RoaringBitmap/roaring-rs?branch=clone-iter-slice#348e58c2312fc37c0f351373cc7338cea86cf828" +source = "git+https://github.com/RoaringBitmap/roaring-rs?branch=clone-iter-slice#1b32c1efb345ea979da091aba320493447703ce9" dependencies = [ "bytemuck", "byteorder", From 3c63d4a1e5a804983409099a63dfc1b4ad45cbce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 23 Sep 2024 14:50:17 +0200 Subject: [PATCH 076/247] Fix charabia Zho --- meilisearch-types/src/locales.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/meilisearch-types/src/locales.rs b/meilisearch-types/src/locales.rs index c6902dd71..309ec6ae4 100644 --- a/meilisearch-types/src/locales.rs +++ b/meilisearch-types/src/locales.rs @@ -115,7 +115,8 @@ make_locale! { Slk, Cat, Tgl, - Hye + Hye, + Zho } impl std::error::Error for LocaleFormatError {} From f00664247d298a185cb84074630fa13e417d5dbd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 23 Sep 2024 15:13:52 +0200 Subject: [PATCH 077/247] Add more stats about the channel message sent --- milli/src/update/new/channel.rs | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/milli/src/update/new/channel.rs b/milli/src/update/new/channel.rs index 6a47dc606..c2acd5310 100644 --- a/milli/src/update/new/channel.rs +++ b/milli/src/update/new/channel.rs @@ -20,7 +20,8 @@ pub fn merger_writer_channel(cap: usize) -> (MergerSender, WriterReceiver) { MergerSender { sender, send_count: Default::default(), - contentious_count: Default::default(), + writer_contentious_count: Default::default(), + merger_contentious_count: Default::default(), }, WriterReceiver(receiver), ) @@ -181,16 +182,20 @@ pub struct MergerSender { /// The number of message we send in total in the channel. 
send_count: std::cell::Cell, /// The number of times we sent something in a channel that was full. - contentious_count: std::cell::Cell, + writer_contentious_count: std::cell::Cell, + /// The number of times we sent something in a channel that was empty. + merger_contentious_count: std::cell::Cell, } impl Drop for MergerSender { fn drop(&mut self) { tracing::info!( - "Merger channel stats: {} sends, {} were contentious (ratio {})", + "Merger channel stats: {} sends, {} writer contentions ({}%), {} merger contentions ({}%)", self.send_count.get(), - self.contentious_count.get(), - self.contentious_count.get() as f64 / self.send_count.get() as f64 + self.writer_contentious_count.get(), + (self.writer_contentious_count.get() as f32 / self.send_count.get() as f32) * 100.0, + self.merger_contentious_count.get(), + (self.merger_contentious_count.get() as f32 / self.send_count.get() as f32) * 100.0 ) } } @@ -225,7 +230,10 @@ impl MergerSender { fn send(&self, op: WriterOperation) -> StdResult<(), SendError<()>> { if self.sender.is_full() { - self.contentious_count.set(self.contentious_count.get() + 1); + self.writer_contentious_count.set(self.writer_contentious_count.get() + 1); + } + if self.sender.is_empty() { + self.merger_contentious_count.set(self.merger_contentious_count.get() + 1); } self.send_count.set(self.send_count.get() + 1); match self.sender.send(op) { From 835c5f98f9c61670fc863f20b91ceaa1ba56e420 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 23 Sep 2024 15:49:24 +0200 Subject: [PATCH 078/247] Remove the debug symbols --- Cargo.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 3d4d20aff..3b9219ebc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -35,7 +35,6 @@ license = "MIT" [profile.release] codegen-units = 1 -debug = true [profile.dev.package.flate2] opt-level = 3 From 42b093687d82362432e6f2913e22f252495e8766 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 23 Sep 2024 16:38:21 +0200 Subject: [PATCH 079/247] Introduce the new PushOptimizedBitmap --- milli/src/update/new/extract/cache.rs | 66 +++++++++++++++++++-------- 1 file changed, 48 insertions(+), 18 deletions(-) diff --git a/milli/src/update/new/extract/cache.rs b/milli/src/update/new/extract/cache.rs index 5c3c4a735..5e90b1c79 100644 --- a/milli/src/update/new/extract/cache.rs +++ b/milli/src/update/new/extract/cache.rs @@ -32,7 +32,7 @@ impl CboCachedSorter { pub fn insert_del_u32(&mut self, key: &[u8], n: u32) -> grenad::Result<(), MF::Error> { match self.cache.get_mut(key) { Some(DelAddRoaringBitmap { del, add: _ }) => { - del.get_or_insert_with(RoaringBitmap::new).insert(n); + del.get_or_insert_with(PushOptimizedBitmap::default).insert(n); } None => { let value = DelAddRoaringBitmap::new_del_u32(n); @@ -52,7 +52,7 @@ impl CboCachedSorter { ) -> grenad::Result<(), MF::Error> { match self.cache.get_mut(key) { Some(DelAddRoaringBitmap { del, add: _ }) => { - *del.get_or_insert_with(RoaringBitmap::new) |= bitmap; + del.get_or_insert_with(PushOptimizedBitmap::default).union_with_bitmap(bitmap); } None => { let value = DelAddRoaringBitmap::new_del(bitmap); @@ -68,7 +68,7 @@ impl CboCachedSorter { pub fn insert_add_u32(&mut self, key: &[u8], n: u32) -> grenad::Result<(), MF::Error> { match self.cache.get_mut(key) { Some(DelAddRoaringBitmap { del: _, add }) => { - add.get_or_insert_with(RoaringBitmap::new).insert(n); + add.get_or_insert_with(PushOptimizedBitmap::default).insert(n); } None => { let value = 
DelAddRoaringBitmap::new_add_u32(n); @@ -88,7 +88,7 @@ impl CboCachedSorter { ) -> grenad::Result<(), MF::Error> { match self.cache.get_mut(key) { Some(DelAddRoaringBitmap { del: _, add }) => { - *add.get_or_insert_with(RoaringBitmap::new) |= bitmap; + add.get_or_insert_with(PushOptimizedBitmap::default).union_with_bitmap(bitmap); } None => { let value = DelAddRoaringBitmap::new_add(bitmap); @@ -104,8 +104,8 @@ impl CboCachedSorter { pub fn insert_del_add_u32(&mut self, key: &[u8], n: u32) -> grenad::Result<(), MF::Error> { match self.cache.get_mut(key) { Some(DelAddRoaringBitmap { del, add }) => { - del.get_or_insert_with(RoaringBitmap::new).insert(n); - add.get_or_insert_with(RoaringBitmap::new).insert(n); + del.get_or_insert_with(PushOptimizedBitmap::default).insert(n); + add.get_or_insert_with(PushOptimizedBitmap::default).insert(n); } None => { let value = DelAddRoaringBitmap::new_del_add_u32(n); @@ -129,21 +129,21 @@ impl CboCachedSorter { match deladd { DelAddRoaringBitmap { del: Some(del), add: None } => { self.cbo_buffer.clear(); - CboRoaringBitmapCodec::serialize_into(&del, &mut self.cbo_buffer); + CboRoaringBitmapCodec::serialize_into(&del.bitmap, &mut self.cbo_buffer); value_writer.insert(DelAdd::Deletion, &self.cbo_buffer)?; } DelAddRoaringBitmap { del: None, add: Some(add) } => { self.cbo_buffer.clear(); - CboRoaringBitmapCodec::serialize_into(&add, &mut self.cbo_buffer); + CboRoaringBitmapCodec::serialize_into(&add.bitmap, &mut self.cbo_buffer); value_writer.insert(DelAdd::Addition, &self.cbo_buffer)?; } DelAddRoaringBitmap { del: Some(del), add: Some(add) } => { self.cbo_buffer.clear(); - CboRoaringBitmapCodec::serialize_into(&del, &mut self.cbo_buffer); + CboRoaringBitmapCodec::serialize_into(&del.bitmap, &mut self.cbo_buffer); value_writer.insert(DelAdd::Deletion, &self.cbo_buffer)?; self.cbo_buffer.clear(); - CboRoaringBitmapCodec::serialize_into(&add, &mut self.cbo_buffer); + CboRoaringBitmapCodec::serialize_into(&add.bitmap, &mut self.cbo_buffer); value_writer.insert(DelAdd::Addition, &self.cbo_buffer)?; } DelAddRoaringBitmap { del: None, add: None } => return Ok(()), @@ -167,31 +167,61 @@ impl CboCachedSorter { #[derive(Debug, Clone)] pub struct DelAddRoaringBitmap { - pub del: Option, - pub add: Option, + pub del: Option, + pub add: Option, } impl DelAddRoaringBitmap { fn new_del_add_u32(n: u32) -> Self { DelAddRoaringBitmap { - del: Some(RoaringBitmap::from([n])), - add: Some(RoaringBitmap::from([n])), + del: Some(PushOptimizedBitmap::from_single(n)), + add: Some(PushOptimizedBitmap::from_single(n)), } } fn new_del(bitmap: RoaringBitmap) -> Self { - DelAddRoaringBitmap { del: Some(bitmap), add: None } + DelAddRoaringBitmap { del: Some(PushOptimizedBitmap::from_bitmap(bitmap)), add: None } } fn new_del_u32(n: u32) -> Self { - DelAddRoaringBitmap { del: Some(RoaringBitmap::from([n])), add: None } + DelAddRoaringBitmap { del: Some(PushOptimizedBitmap::from_single(n)), add: None } } fn new_add(bitmap: RoaringBitmap) -> Self { - DelAddRoaringBitmap { del: None, add: Some(bitmap) } + DelAddRoaringBitmap { del: None, add: Some(PushOptimizedBitmap::from_bitmap(bitmap)) } } fn new_add_u32(n: u32) -> Self { - DelAddRoaringBitmap { del: None, add: Some(RoaringBitmap::from([n])) } + DelAddRoaringBitmap { del: None, add: Some(PushOptimizedBitmap::from_single(n)) } + } +} + +#[derive(Debug, Clone, Default)] +struct PushOptimizedBitmap { + max: Option, + bitmap: RoaringBitmap, +} + +impl PushOptimizedBitmap { + fn from_bitmap(bitmap: RoaringBitmap) -> PushOptimizedBitmap { + 
PushOptimizedBitmap { max: bitmap.max(), bitmap } + } + + fn from_single(single: u32) -> PushOptimizedBitmap { + PushOptimizedBitmap { max: Some(single), bitmap: RoaringBitmap::from([single]) } + } + + fn insert(&mut self, n: u32) { + if self.max.map_or(true, |max| n > max) { + self.max = Some(n); + self.bitmap.push(n); + } else { + self.bitmap.insert(n); + } + } + + fn union_with_bitmap(&mut self, bitmap: RoaringBitmap) { + self.bitmap |= bitmap; + self.max = self.bitmap.max(); } } From ff931edb5573b7c5aea6306e7e1371c9746cc96f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 23 Sep 2024 16:53:42 +0200 Subject: [PATCH 080/247] Update roaring to inline max calls --- Cargo.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index e13279989..6c1d2f345 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4656,7 +4656,7 @@ dependencies = [ [[package]] name = "roaring" version = "0.10.6" -source = "git+https://github.com/RoaringBitmap/roaring-rs?branch=clone-iter-slice#1b32c1efb345ea979da091aba320493447703ce9" +source = "git+https://github.com/RoaringBitmap/roaring-rs?branch=clone-iter-slice#6bba84b1a47da1d6e52d5c4dc0ce8593ae4646a5" dependencies = [ "bytemuck", "byteorder", From 4ce5d3d66d202559d489483790375a7b05291eac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 24 Sep 2024 09:42:21 +0200 Subject: [PATCH 081/247] Do not check before pushing in bitmaps --- milli/src/update/new/extract/cache.rs | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/milli/src/update/new/extract/cache.rs b/milli/src/update/new/extract/cache.rs index 5e90b1c79..00b4dc46d 100644 --- a/milli/src/update/new/extract/cache.rs +++ b/milli/src/update/new/extract/cache.rs @@ -198,30 +198,29 @@ impl DelAddRoaringBitmap { #[derive(Debug, Clone, Default)] struct PushOptimizedBitmap { - max: Option, bitmap: RoaringBitmap, } impl PushOptimizedBitmap { + #[inline] fn from_bitmap(bitmap: RoaringBitmap) -> PushOptimizedBitmap { - PushOptimizedBitmap { max: bitmap.max(), bitmap } + PushOptimizedBitmap { bitmap } } + #[inline] fn from_single(single: u32) -> PushOptimizedBitmap { - PushOptimizedBitmap { max: Some(single), bitmap: RoaringBitmap::from([single]) } + PushOptimizedBitmap { bitmap: RoaringBitmap::from([single]) } } + #[inline] fn insert(&mut self, n: u32) { - if self.max.map_or(true, |max| n > max) { - self.max = Some(n); - self.bitmap.push(n); - } else { + if !self.bitmap.push(n) { self.bitmap.insert(n); } } + #[inline] fn union_with_bitmap(&mut self, bitmap: RoaringBitmap) { self.bitmap |= bitmap; - self.max = self.bitmap.max(); } } From 7f148c127c5d502b8d78e78b11bbda443e30eaf0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 24 Sep 2024 15:32:15 +0200 Subject: [PATCH 082/247] Measure the SmallVec efficacity --- milli/src/update/new/extract/cache.rs | 26 ++++++++++++++-- .../extract/searchable/extract_word_docids.rs | 31 ++++++++----------- .../extract_word_pair_proximity_docids.rs | 2 +- .../extract/searchable/tokenize_document.rs | 8 ++--- 4 files changed, 42 insertions(+), 25 deletions(-) diff --git a/milli/src/update/new/extract/cache.rs b/milli/src/update/new/extract/cache.rs index 00b4dc46d..19d5ba04a 100644 --- a/milli/src/update/new/extract/cache.rs +++ b/milli/src/update/new/extract/cache.rs @@ -15,6 +15,8 @@ pub struct CboCachedSorter { sorter: Sorter, deladd_buffer: Vec, cbo_buffer: Vec, + total_insertions: usize, + fitted_in_key: usize, } impl CboCachedSorter { 
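The total_insertions and fitted_in_key counters added here track, for every key newly inserted into the cache, whether that key is at most 20 bytes long (as the hunks below show). Given the commit title, the resulting ratio is presumably meant to estimate how often an inline small-buffer key such as a SmallVec<[u8; 20]> would avoid a heap allocation. A rough sketch of that idea, assuming the smallvec crate, which this patch does not add:

    use smallvec::SmallVec;

    // Hypothetical key type: keys up to 20 bytes are stored inline,
    // longer keys spill to a heap allocation.
    type CacheKey = SmallVec<[u8; 20]>;

    fn to_cache_key(key: &[u8]) -> CacheKey {
        // from_slice copies the bytes, inline or heap-allocated depending on length.
        SmallVec::from_slice(key)
    }
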
@@ -24,6 +26,8 @@ impl CboCachedSorter { sorter, deladd_buffer: Vec::new(), cbo_buffer: Vec::new(), + total_insertions: 0, + fitted_in_key: 0, } } } @@ -35,6 +39,8 @@ impl CboCachedSorter { del.get_or_insert_with(PushOptimizedBitmap::default).insert(n); } None => { + self.total_insertions += 1; + self.fitted_in_key += (key.len() <= 20) as usize; let value = DelAddRoaringBitmap::new_del_u32(n); if let Some((key, deladd)) = self.cache.push(key.into(), value) { self.write_entry(key, deladd)?; @@ -55,6 +61,8 @@ impl CboCachedSorter { del.get_or_insert_with(PushOptimizedBitmap::default).union_with_bitmap(bitmap); } None => { + self.total_insertions += 1; + self.fitted_in_key += (key.len() <= 20) as usize; let value = DelAddRoaringBitmap::new_del(bitmap); if let Some((key, deladd)) = self.cache.push(key.into(), value) { self.write_entry(key, deladd)?; @@ -71,6 +79,8 @@ impl CboCachedSorter { add.get_or_insert_with(PushOptimizedBitmap::default).insert(n); } None => { + self.total_insertions += 1; + self.fitted_in_key += (key.len() <= 20) as usize; let value = DelAddRoaringBitmap::new_add_u32(n); if let Some((key, deladd)) = self.cache.push(key.into(), value) { self.write_entry(key, deladd)?; @@ -91,6 +101,8 @@ impl CboCachedSorter { add.get_or_insert_with(PushOptimizedBitmap::default).union_with_bitmap(bitmap); } None => { + self.total_insertions += 1; + self.fitted_in_key += (key.len() <= 20) as usize; let value = DelAddRoaringBitmap::new_add(bitmap); if let Some((key, deladd)) = self.cache.push(key.into(), value) { self.write_entry(key, deladd)?; @@ -108,6 +120,8 @@ impl CboCachedSorter { add.get_or_insert_with(PushOptimizedBitmap::default).insert(n); } None => { + self.total_insertions += 1; + self.fitted_in_key += (key.len() <= 20) as usize; let value = DelAddRoaringBitmap::new_del_add_u32(n); if let Some((key, deladd)) = self.cache.push(key.into(), value) { self.write_entry(key, deladd)?; @@ -161,14 +175,22 @@ impl CboCachedSorter { for (key, deladd) in mem::replace(&mut self.cache, default_arc) { self.write_entry(key, deladd)?; } + + tracing::info!( + "LruCache stats: {} <= 20 bytes ({}%) on a total of {} insertions", + self.fitted_in_key, + (self.fitted_in_key as f32 / self.total_insertions as f32) * 100.0, + self.total_insertions, + ); + Ok(self.sorter) } } #[derive(Debug, Clone)] pub struct DelAddRoaringBitmap { - pub del: Option, - pub add: Option, + pub(crate) del: Option, + pub(crate) add: Option, } impl DelAddRoaringBitmap { diff --git a/milli/src/update/new/extract/searchable/extract_word_docids.rs b/milli/src/update/new/extract/searchable/extract_word_docids.rs index 652964b11..c5c3cd2a2 100644 --- a/milli/src/update/new/extract/searchable/extract_word_docids.rs +++ b/milli/src/update/new/extract/searchable/extract_word_docids.rs @@ -1,26 +1,21 @@ +use std::borrow::Cow; use std::collections::HashMap; -use std::{borrow::Cow, fs::File, num::NonZero}; +use std::fs::File; +use std::num::NonZero; -use grenad::Merger; -use grenad::MergerBuilder; +use grenad::{Merger, MergerBuilder}; use heed::RoTxn; -use rayon::iter::IntoParallelIterator; -use rayon::iter::ParallelIterator; +use rayon::iter::{IntoParallelIterator, ParallelIterator}; -use super::{ - tokenize_document::{tokenizer_builder, DocumentTokenizer}, - SearchableExtractor, -}; +use super::tokenize_document::{tokenizer_builder, DocumentTokenizer}; +use super::SearchableExtractor; +use crate::update::new::extract::cache::CboCachedSorter; use crate::update::new::extract::perm_json_p::contained_in; -use crate::DocumentId; +use 
crate::update::new::{DocumentChange, ItemsPool}; +use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps}; use crate::{ - bucketed_position, - update::{ - create_sorter, - new::{extract::cache::CboCachedSorter, DocumentChange, ItemsPool}, - GrenadParameters, MergeDeladdCboRoaringBitmaps, - }, - FieldId, GlobalFieldsIdsMap, Index, Result, MAX_POSITION_PER_ATTRIBUTE, + bucketed_position, DocumentId, FieldId, GlobalFieldsIdsMap, Index, Result, + MAX_POSITION_PER_ATTRIBUTE, }; const MAX_COUNTED_WORDS: usize = 30; @@ -565,7 +560,7 @@ impl WordDocidsExtractors { cached_sorter: &mut WordDocidsCachedSorters, document_change: DocumentChange, ) -> Result<()> { - let exact_attributes = index.exact_attributes(&rtxn)?; + let exact_attributes = index.exact_attributes(rtxn)?; let is_exact_attribute = |fname: &str| exact_attributes.iter().any(|attr| contained_in(fname, attr)); let mut buffer = Vec::new(); diff --git a/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs b/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs index 3a6bb7894..ce8136260 100644 --- a/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs +++ b/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs @@ -59,7 +59,7 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { DocumentChange::Update(inner) => { let document = inner.current(rtxn, index)?.unwrap(); process_document_tokens( - &document, + document, document_tokenizer, fields_ids_map, &mut word_positions, diff --git a/milli/src/update/new/extract/searchable/tokenize_document.rs b/milli/src/update/new/extract/searchable/tokenize_document.rs index 829bf8a49..d2795114e 100644 --- a/milli/src/update/new/extract/searchable/tokenize_document.rs +++ b/milli/src/update/new/extract/searchable/tokenize_document.rs @@ -92,24 +92,24 @@ impl<'a> DocumentTokenizer<'a> { }; // if the current field is searchable or contains a searchable attribute - if select_field(&field_name, self.attribute_to_extract, self.attribute_to_skip) { + if select_field(field_name, self.attribute_to_extract, self.attribute_to_skip) { // parse json. match serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)? 
{ Value::Object(object) => seek_leaf_values_in_object( &object, self.attribute_to_extract, self.attribute_to_skip, - &field_name, + field_name, &mut tokenize_field, )?, Value::Array(array) => seek_leaf_values_in_array( &array, self.attribute_to_extract, self.attribute_to_skip, - &field_name, + field_name, &mut tokenize_field, )?, - value => tokenize_field(&field_name, &value)?, + value => tokenize_field(field_name, &value)?, } } } From 92678383d6179e417570ba2618dcd79393e4ecc2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 24 Sep 2024 15:37:56 +0200 Subject: [PATCH 083/247] Update charabia --- Cargo.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6c1d2f345..5127bce0f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -934,7 +934,7 @@ dependencies = [ [[package]] name = "charabia" version = "0.9.1" -source = "git+https://github.com/meilisearch/charabia?branch=mutualize-char-normalizer#9eea14faf27ad46bd5eed49d2654cbdc4a1068dd" +source = "git+https://github.com/meilisearch/charabia?branch=mutualize-char-normalizer#f8d8308cdb8db80819be7eeed5652cc4a995cc71" dependencies = [ "aho-corasick", "csv", @@ -2650,7 +2650,7 @@ checksum = "28b29a3cd74f0f4598934efe3aeba42bae0eb4680554128851ebbecb02af14e6" [[package]] name = "irg-kvariants" version = "0.1.1" -source = "git+https://github.com/meilisearch/charabia?branch=mutualize-char-normalizer#9eea14faf27ad46bd5eed49d2654cbdc4a1068dd" +source = "git+https://github.com/meilisearch/charabia?branch=mutualize-char-normalizer#f8d8308cdb8db80819be7eeed5652cc4a995cc71" dependencies = [ "csv", "once_cell", @@ -2837,7 +2837,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e310b3a6b5907f99202fcdb4960ff45b93735d7c7d96b760fcff8db2dc0e103d" dependencies = [ "cfg-if", - "windows-targets 0.48.1", + "windows-targets 0.52.4", ] [[package]] From 2d1caf27dfc9086f5fc492ee1e3083ad81b48314 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 24 Sep 2024 15:59:50 +0200 Subject: [PATCH 084/247] Use eprintln to log --- milli/src/update/new/extract/cache.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/update/new/extract/cache.rs b/milli/src/update/new/extract/cache.rs index 19d5ba04a..572c81a55 100644 --- a/milli/src/update/new/extract/cache.rs +++ b/milli/src/update/new/extract/cache.rs @@ -176,7 +176,7 @@ impl CboCachedSorter { self.write_entry(key, deladd)?; } - tracing::info!( + eprintln!( "LruCache stats: {} <= 20 bytes ({}%) on a total of {} insertions", self.fitted_in_key, (self.fitted_in_key as f32 / self.total_insertions as f32) * 100.0, From 6e87332410bc0a792a7257c3d02eee92011cacb4 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 24 Sep 2024 15:58:18 +0200 Subject: [PATCH 085/247] Change the way the FST is built --- milli/src/update/new/merger.rs | 200 +++++++++++++++++++++++++-------- 1 file changed, 152 insertions(+), 48 deletions(-) diff --git a/milli/src/update/new/merger.rs b/milli/src/update/new/merger.rs index 9ba81fb11..1c7f04974 100644 --- a/milli/src/update/new/merger.rs +++ b/milli/src/update/new/merger.rs @@ -2,7 +2,7 @@ use std::fs::File; use std::io::{self, BufWriter}; use bincode::ErrorKind; -use fst::{Set, SetBuilder}; +use fst::{Set, SetBuilder, Streamer}; use grenad::Merger; use heed::types::Bytes; use heed::{BoxedError, Database, RoTxn}; @@ -11,8 +11,8 @@ use roaring::RoaringBitmap; use tempfile::tempfile; use super::channel::*; -use super::{Deletion, DocumentChange, Insertion, 
KvReaderDelAdd, KvReaderFieldId, Update}; use super::extract::FacetKind; +use super::{Deletion, DocumentChange, Insertion, KvReaderDelAdd, KvReaderFieldId, Update}; use crate::update::del_add::DelAdd; use crate::update::new::channel::MergerOperation; use crate::update::MergeDeladdCboRoaringBitmaps; @@ -29,7 +29,7 @@ pub fn merge_grenad_entries( index: &Index, mut global_fields_ids_map: GlobalFieldsIdsMap<'_>, ) -> Result<()> { - let mut buffer = Vec::new(); + let mut buffer: Vec = Vec::new(); let mut documents_ids = index.documents_ids(rtxn)?; let mut geo_extractor = GeoExtractor::new(rtxn, index)?; @@ -46,8 +46,7 @@ pub fn merge_grenad_entries( rtxn, &mut buffer, sender.docids::(), - |_key| Ok(()), - |_key| Ok(()), + |_, _key| Ok(()), )?; } MergerOperation::FidWordCountDocidsMerger(merger) => { @@ -59,13 +58,12 @@ pub fn merge_grenad_entries( rtxn, &mut buffer, sender.docids::(), - |_key| Ok(()), - |_key| Ok(()), + |_, _key| Ok(()), )?; } MergerOperation::WordDocidsMerger(merger) => { - let mut add_words_fst = SetBuilder::new(BufWriter::new(tempfile()?))?; - let mut del_words_fst = SetBuilder::new(BufWriter::new(tempfile()?))?; + let words_fst = index.words_fst(rtxn)?; + let mut word_fst_builder = WordFstBuilder::new(&words_fst, 4)?; { let span = tracing::trace_span!(target: "indexing::documents::merge", "word_docids"); @@ -77,8 +75,7 @@ pub fn merge_grenad_entries( rtxn, &mut buffer, sender.docids::(), - |key| add_words_fst.insert(key), - |key| del_words_fst.insert(key), + |deladd, key| word_fst_builder.register_word(deladd, key), )?; } @@ -86,9 +83,8 @@ pub fn merge_grenad_entries( let span = tracing::trace_span!(target: "indexing::documents::merge", "words_fst"); let _entered = span.enter(); - // Move that into a dedicated function - let words_fst = index.words_fst(rtxn)?; - let mmap = compute_new_words_fst(add_words_fst, del_words_fst, words_fst)?; + + let mmap = word_fst_builder.build()?; sender.main().write_words_fst(mmap).unwrap(); } } @@ -102,8 +98,7 @@ pub fn merge_grenad_entries( rtxn, &mut buffer, sender.docids::(), - |_key| Ok(()), - |_key| Ok(()), + |_, _key| Ok(()), )?; } MergerOperation::WordPairProximityDocidsMerger(merger) => { @@ -115,8 +110,7 @@ pub fn merge_grenad_entries( rtxn, &mut buffer, sender.docids::(), - |_key| Ok(()), - |_key| Ok(()), + |_, _key| Ok(()), )?; } MergerOperation::WordPositionDocidsMerger(merger) => { @@ -128,8 +122,7 @@ pub fn merge_grenad_entries( rtxn, &mut buffer, sender.docids::(), - |_key| Ok(()), - |_key| Ok(()), + |_, _key| Ok(()), )?; } MergerOperation::InsertDocument { docid, document } => { @@ -199,6 +192,142 @@ pub fn merge_grenad_entries( Ok(()) } +struct WordFstBuilder<'a> { + stream: fst::set::Stream<'a>, + word_fst_builder: SetBuilder>, + prefix_fst_builders: Vec>>, + max_prefix_length: usize, + last_word: Vec, +} + +impl<'a> WordFstBuilder<'a> { + pub fn new( + words_fst: &'a Set>, + max_prefix_length: usize, + ) -> Result { + let mut prefix_fst_builders = Vec::new(); + for _ in 0..max_prefix_length { + prefix_fst_builders.push(SetBuilder::new(BufWriter::new(tempfile()?))?); + } + + Ok(Self { + stream: words_fst.stream(), + word_fst_builder: SetBuilder::new(BufWriter::new(tempfile()?))?, + prefix_fst_builders, + max_prefix_length, + last_word: Vec::new(), + }) + } + + pub fn register_word(&mut self, deladd: DelAdd, key: &[u8]) -> Result<()> { + match deladd { + DelAdd::Addition => self.add_word(key), + DelAdd::Deletion => self.del_word(key), + } + } + + pub fn add_word(&mut self, word: &[u8]) -> Result<()> { + if 
!self.last_word.is_empty() { + let next = self.last_word.as_slice(); + match next.cmp(word) { + std::cmp::Ordering::Less => { + // We need to insert the last word from the current fst + self.word_fst_builder.insert(next)?; + self.last_word.clear(); + } + std::cmp::Ordering::Equal => { + // We insert the word and drop the last word + self.word_fst_builder.insert(next)?; + self.last_word.clear(); + return Ok(()); + } + std::cmp::Ordering::Greater => { + // We insert the word and keep the last word + self.word_fst_builder.insert(word)?; + + return Ok(()); + } + } + } + + while let Some(next) = self.stream.next() { + match next.cmp(word) { + std::cmp::Ordering::Less => { + // We need to insert the last word from the current fst + self.word_fst_builder.insert(next)?; + } + std::cmp::Ordering::Equal => { + // We insert the word + self.word_fst_builder.insert(next)?; + + return Ok(()); + } + std::cmp::Ordering::Greater => { + // We insert the word and keep the last word + self.word_fst_builder.insert(word)?; + self.last_word.clear(); + self.last_word.extend_from_slice(next); + + return Ok(()); + } + } + } + + Ok(()) + } + + pub fn del_word(&mut self, word: &[u8]) -> Result<()> { + if !self.last_word.is_empty() { + let next = self.last_word.as_slice(); + match next.cmp(word) { + std::cmp::Ordering::Less => { + // We insert the word from the current fst because the next word to delete is greater + self.word_fst_builder.insert(next)?; + self.last_word.clear(); + } + std::cmp::Ordering::Equal => { + // We delete the word by not inserting it in the new fst and drop the last word + self.last_word.clear(); + return Ok(()); + } + std::cmp::Ordering::Greater => { + // keep the current word until the next word to delete is greater or equal + return Ok(()); + } + } + } + + while let Some(next) = self.stream.next() { + match next.cmp(word) { + std::cmp::Ordering::Less => { + // We insert the word from the current fst because the next word to delete is greater + self.word_fst_builder.insert(next)?; + } + std::cmp::Ordering::Equal => { + // We delete the word by not inserting it in the new fst and drop the last word + return Ok(()); + } + std::cmp::Ordering::Greater => { + // keep the current word until the next word to delete is greater or equal + self.last_word.clear(); + self.last_word.extend_from_slice(next); + + return Ok(()); + } + } + } + + Ok(()) + } + + pub fn build(mut self) -> Result { + let words_fst_file = self.word_fst_builder.into_inner()?.into_inner().unwrap(); + let words_fst_mmap = unsafe { Mmap::map(&words_fst_file)? }; + + Ok(words_fst_mmap) + } +} + pub struct GeoExtractor { rtree: Option>, } @@ -247,30 +376,6 @@ impl GeoExtractor { } } -fn compute_new_words_fst( - add_words_fst: SetBuilder>, - del_words_fst: SetBuilder>, - words_fst: Set>, -) -> Result { - let add_words_fst_file = add_words_fst.into_inner()?; - let add_words_fst_mmap = unsafe { Mmap::map(&add_words_fst_file.into_inner().unwrap())? }; - let add_words_fst = Set::new(&add_words_fst_mmap)?; - - let del_words_fst_file = del_words_fst.into_inner()?; - let del_words_fst_mmap = unsafe { Mmap::map(&del_words_fst_file.into_inner().unwrap())? }; - let del_words_fst = Set::new(&del_words_fst_mmap)?; - - let diff = words_fst.op().add(&del_words_fst).difference(); - let stream = add_words_fst.op().add(diff).union(); - - let mut words_fst = SetBuilder::new(tempfile()?)?; - words_fst.extend_stream(stream)?; - let words_fst_file = words_fst.into_inner()?; - let words_fst_mmap = unsafe { Mmap::map(&words_fst_file)? 
}; - - Ok(words_fst_mmap) -} - #[tracing::instrument(level = "trace", skip_all, target = "indexing::merge")] fn merge_and_send_docids( merger: Merger, @@ -278,8 +383,7 @@ fn merge_and_send_docids( rtxn: &RoTxn<'_>, buffer: &mut Vec, docids_sender: impl DocidsSender, - mut add_key: impl FnMut(&[u8]) -> fst::Result<()>, - mut del_key: impl FnMut(&[u8]) -> fst::Result<()>, + mut register_key: impl FnMut(DelAdd, &[u8]) -> Result<()>, ) -> Result<()> { let mut merger_iter = merger.into_stream_merger_iter().unwrap(); while let Some((key, deladd)) = merger_iter.next().unwrap() { @@ -292,11 +396,11 @@ fn merge_and_send_docids( Operation::Write(bitmap) => { let value = cbo_bitmap_serialize_into_vec(&bitmap, buffer); docids_sender.write(key, value).unwrap(); - add_key(key)?; + register_key(DelAdd::Addition, key)?; } Operation::Delete => { docids_sender.delete(key).unwrap(); - del_key(key)?; + register_key(DelAdd::Deletion, key)?; } Operation::Ignore => (), } From e0c7067355be95183ff168fe013f66b56b4f0ec7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 24 Sep 2024 17:24:50 +0200 Subject: [PATCH 086/247] Expose an IndexedParallelIterator to the index function --- index-scheduler/src/batch.rs | 6 +- meilisearch/src/routes/indexes/documents.rs | 2 +- meilisearch/src/search/mod.rs | 2 +- milli/src/documents/builder.rs | 20 ++-- milli/src/update/index_documents/transform.rs | 8 +- .../extract_word_pair_proximity_docids.rs | 2 +- .../src/update/new/extract/searchable/mod.rs | 6 +- .../update/new/indexer/document_deletion.rs | 7 +- .../update/new/indexer/document_operation.rs | 99 +++++++++---------- milli/src/update/new/indexer/mod.rs | 11 +-- milli/src/update/new/indexer/partial_dump.rs | 10 +- .../update/new/indexer/update_by_function.rs | 4 +- milli/src/update/new/merger.rs | 6 +- milli/src/update/new/mod.rs | 3 +- .../update/new/{indexer => }/top_level_map.rs | 0 15 files changed, 85 insertions(+), 101 deletions(-) rename milli/src/update/new/{indexer => }/top_level_map.rs (100%) diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs index f9463a137..5cffb92a8 100644 --- a/index-scheduler/src/batch.rs +++ b/index-scheduler/src/batch.rs @@ -18,7 +18,6 @@ one indexing operation. 
*/ use std::collections::{BTreeSet, HashSet}; -use std::env::VarError; use std::ffi::OsStr; use std::fmt; use std::fs::{self, File}; @@ -27,19 +26,18 @@ use std::io::BufWriter; use dump::IndexMetadata; use meilisearch_types::error::Code; use meilisearch_types::heed::{RoTxn, RwTxn}; -use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader, PrimaryKey}; +use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader}; use meilisearch_types::milli::heed::CompactionOption; use meilisearch_types::milli::update::new::indexer::{ self, retrieve_or_guess_primary_key, DocumentChanges, }; -use meilisearch_types::milli::update::new::TopLevelMap; use meilisearch_types::milli::update::{ IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings as MilliSettings, }; use meilisearch_types::milli::vector::parsed_vectors::{ ExplicitVectors, VectorOrArrayOfVectors, RESERVED_VECTORS_FIELD_NAME, }; -use meilisearch_types::milli::{self, Filter, InternalError, Object}; +use meilisearch_types::milli::{self, Filter, Object}; use meilisearch_types::settings::{apply_settings_to_builder, Settings, Unchecked}; use meilisearch_types::tasks::{Details, IndexSwap, Kind, KindWithContent, Status, Task}; use meilisearch_types::{compression, Index, VERSION_FILE_NAME}; diff --git a/meilisearch/src/routes/indexes/documents.rs b/meilisearch/src/routes/indexes/documents.rs index 055685151..85cf33c54 100644 --- a/meilisearch/src/routes/indexes/documents.rs +++ b/meilisearch/src/routes/indexes/documents.rs @@ -1,4 +1,4 @@ -use std::io::{BufReader, ErrorKind}; +use std::io::ErrorKind; use actix_web::http::header::CONTENT_TYPE; use actix_web::web::Data; diff --git a/meilisearch/src/search/mod.rs b/meilisearch/src/search/mod.rs index 54d0c4823..8cdbb31ee 100644 --- a/meilisearch/src/search/mod.rs +++ b/meilisearch/src/search/mod.rs @@ -1247,7 +1247,7 @@ impl<'a> HitMaker<'a> { self.index.iter_documents(self.rtxn, std::iter::once(id))?.next().unwrap()?; // First generate a document with all the displayed fields - let displayed_document = make_document(&self.displayed_ids, &self.fields_ids_map, &obkv)?; + let displayed_document = make_document(&self.displayed_ids, &self.fields_ids_map, obkv)?; let add_vectors_fid = self.vectors_fid.filter(|_fid| self.retrieve_vectors == RetrieveVectors::Retrieve); diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index ec4d634aa..1cf90447e 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -292,7 +292,7 @@ mod test { .unwrap() .into_cursor_and_fields_index(); let doc = cursor.next_document().unwrap().unwrap(); - let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); + let val = obkv_to_object(doc, &index).map(Value::from).unwrap(); assert_eq!( val, @@ -321,7 +321,7 @@ mod test { .into_cursor_and_fields_index(); let doc = cursor.next_document().unwrap().unwrap(); - let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); + let val = obkv_to_object(doc, &index).map(Value::from).unwrap(); assert_eq!( val, @@ -348,7 +348,7 @@ mod test { .into_cursor_and_fields_index(); let doc = cursor.next_document().unwrap().unwrap(); - let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); + let val = obkv_to_object(doc, &index).map(Value::from).unwrap(); assert_eq!( val, @@ -375,7 +375,7 @@ mod test { .into_cursor_and_fields_index(); let doc = cursor.next_document().unwrap().unwrap(); - let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); + let val = obkv_to_object(doc, 
&index).map(Value::from).unwrap(); assert_eq!( val, @@ -402,7 +402,7 @@ mod test { .into_cursor_and_fields_index(); let doc = cursor.next_document().unwrap().unwrap(); - let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); + let val = obkv_to_object(doc, &index).map(Value::from).unwrap(); assert_eq!( val, @@ -429,7 +429,7 @@ mod test { .into_cursor_and_fields_index(); let doc = cursor.next_document().unwrap().unwrap(); - let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); + let val = obkv_to_object(doc, &index).map(Value::from).unwrap(); assert_eq!( val, @@ -456,7 +456,7 @@ mod test { .into_cursor_and_fields_index(); let doc = cursor.next_document().unwrap().unwrap(); - let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); + let val = obkv_to_object(doc, &index).map(Value::from).unwrap(); assert_eq!( val, @@ -483,7 +483,7 @@ mod test { .into_cursor_and_fields_index(); let doc = cursor.next_document().unwrap().unwrap(); - let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); + let val = obkv_to_object(doc, &index).map(Value::from).unwrap(); assert_eq!( val, @@ -510,7 +510,7 @@ mod test { .into_cursor_and_fields_index(); let doc = cursor.next_document().unwrap().unwrap(); - let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); + let val = obkv_to_object(doc, &index).map(Value::from).unwrap(); assert_eq!( val, @@ -555,7 +555,7 @@ mod test { .into_cursor_and_fields_index(); let doc = cursor.next_document().unwrap().unwrap(); - let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); + let val = obkv_to_object(doc, &index).map(Value::from).unwrap(); assert_eq!( val, diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 49bada8e7..aea5680a1 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -289,7 +289,7 @@ impl<'a, 'i> Transform<'a, 'i> { .insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?; let base_obkv = KvReader::from_slice(base_obkv); if let Some(flattened_obkv) = - Self::flatten_from_fields_ids_map(&base_obkv, &mut self.fields_ids_map)? + Self::flatten_from_fields_ids_map(base_obkv, &mut self.fields_ids_map)? { // we recreate our buffer with the flattened documents document_sorter_value_buffer.clear(); @@ -324,7 +324,7 @@ impl<'a, 'i> Transform<'a, 'i> { let flattened_obkv = KvReader::from_slice(&obkv_buffer); if let Some(obkv) = - Self::flatten_from_fields_ids_map(&flattened_obkv, &mut self.fields_ids_map)? + Self::flatten_from_fields_ids_map(flattened_obkv, &mut self.fields_ids_map)? { document_sorter_value_buffer.clear(); document_sorter_value_buffer.push(Operation::Addition as u8); @@ -531,7 +531,7 @@ impl<'a, 'i> Transform<'a, 'i> { // flatten it and push it as to delete in the flattened_sorter let flattened_obkv = KvReader::from_slice(base_obkv); if let Some(obkv) = - Self::flatten_from_fields_ids_map(&flattened_obkv, &mut self.fields_ids_map)? + Self::flatten_from_fields_ids_map(flattened_obkv, &mut self.fields_ids_map)? { // we recreate our buffer with the flattened documents document_sorter_value_buffer.clear(); @@ -938,7 +938,7 @@ impl<'a, 'i> Transform<'a, 'i> { if let Some(flattened_obkv_buffer) = flattened_obkv_buffer { // take the non-flattened version if flatten_from_fields_ids_map returns None. 
let mut fields_ids_map = settings_diff.new.fields_ids_map.clone(); - let flattened = Self::flatten_from_fields_ids_map(&obkv, &mut fields_ids_map)?; + let flattened = Self::flatten_from_fields_ids_map(obkv, &mut fields_ids_map)?; let flattened = flattened.as_deref().map_or(obkv, KvReader::from_slice); flattened_obkv_buffer.clear(); diff --git a/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs b/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs index ce8136260..5736fc1d4 100644 --- a/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs +++ b/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs @@ -1,4 +1,4 @@ -use std::collections::{HashMap, VecDeque}; +use std::collections::VecDeque; use std::rc::Rc; use heed::RoTxn; diff --git a/milli/src/update/new/extract/searchable/mod.rs b/milli/src/update/new/extract/searchable/mod.rs index 48d373598..fe7480fa3 100644 --- a/milli/src/update/new/extract/searchable/mod.rs +++ b/milli/src/update/new/extract/searchable/mod.rs @@ -5,11 +5,7 @@ mod tokenize_document; use std::fs::File; -pub use extract_fid_word_count_docids::FidWordCountDocidsExtractor; -pub use extract_word_docids::{ - ExactWordDocidsExtractor, WordDocidsExtractor, WordDocidsExtractors, WordDocidsMergers, - WordFidDocidsExtractor, WordPositionDocidsExtractor, -}; +pub use extract_word_docids::{WordDocidsExtractors, WordDocidsMergers}; pub use extract_word_pair_proximity_docids::WordPairProximityDocidsExtractor; use grenad::Merger; use heed::RoTxn; diff --git a/milli/src/update/new/indexer/document_deletion.rs b/milli/src/update/new/indexer/document_deletion.rs index b744ec65e..bad72d3b2 100644 --- a/milli/src/update/new/indexer/document_deletion.rs +++ b/milli/src/update/new/indexer/document_deletion.rs @@ -1,6 +1,6 @@ use std::sync::Arc; -use rayon::iter::{ParallelBridge, ParallelIterator}; +use rayon::iter::{IndexedParallelIterator, IntoParallelIterator, ParallelIterator}; use roaring::RoaringBitmap; use super::DocumentChanges; @@ -28,10 +28,11 @@ impl<'p> DocumentChanges<'p> for DocumentDeletion { self, _fields_ids_map: &mut FieldsIdsMap, param: Self::Parameter, - ) -> Result> + Clone + 'p> { + ) -> Result> + Clone + 'p> { let index = param; let items = Arc::new(ItemsPool::new(|| index.read_txn().map_err(crate::Error::from))); - Ok(self.to_delete.into_iter().par_bridge().map_with(items, |items, docid| { + let to_delete: Vec<_> = self.to_delete.into_iter().collect(); + Ok(to_delete.into_par_iter().map_with(items, |items, docid| { items.with(|rtxn| { let current = index.document(rtxn, docid)?; Ok(DocumentChange::Deletion(Deletion::create(docid, current.boxed()))) diff --git a/milli/src/update/new/indexer/document_operation.rs b/milli/src/update/new/indexer/document_operation.rs index b299124bd..f088370fb 100644 --- a/milli/src/update/new/indexer/document_operation.rs +++ b/milli/src/update/new/indexer/document_operation.rs @@ -2,15 +2,15 @@ use std::borrow::Cow; use std::collections::{BTreeMap, HashMap}; use std::sync::Arc; -use heed::types::Bytes; +use heed::types::{Bytes, DecodeIgnore}; use heed::RoTxn; use memmap2::Mmap; -use rayon::iter::{IntoParallelIterator, ParallelIterator}; +use rayon::iter::{IndexedParallelIterator, IntoParallelIterator, ParallelIterator}; use IndexDocumentsMethod as Idm; use super::super::document_change::DocumentChange; use super::super::items_pool::ItemsPool; -use super::top_level_map::{CowStr, TopLevelMap}; +use super::super::{CowStr, TopLevelMap}; use 
super::DocumentChanges; use crate::documents::{DocumentIdExtractionError, PrimaryKey}; use crate::update::new::{Deletion, Insertion, KvReaderFieldId, KvWriterFieldId, Update}; @@ -73,7 +73,7 @@ impl<'p, 'pl: 'p> DocumentChanges<'p> for DocumentOperation<'pl> { self, fields_ids_map: &mut FieldsIdsMap, param: Self::Parameter, - ) -> Result> + Clone + 'p> { + ) -> Result> + Clone + 'p> { let (index, rtxn, primary_key) = param; let documents_ids = index.documents_ids(rtxn)?; @@ -199,29 +199,26 @@ impl<'p, 'pl: 'p> DocumentChanges<'p> for DocumentOperation<'pl> { // And finally sort them docids_version_offsets.sort_unstable_by_key(|(_, (_, docops))| sort_function_key(docops)); - Ok(docids_version_offsets - .into_par_iter() - .map_with( - Arc::new(ItemsPool::new(|| index.read_txn().map_err(crate::Error::from))), - move |context_pool, (external_docid, (internal_docid, operations))| { - context_pool.with(|rtxn| { - let document_merge_function = match self.index_documents_method { - Idm::ReplaceDocuments => MergeDocumentForReplacement::merge, - Idm::UpdateDocuments => MergeDocumentForUpdates::merge, - }; + Ok(docids_version_offsets.into_par_iter().map_with( + Arc::new(ItemsPool::new(|| index.read_txn().map_err(crate::Error::from))), + move |context_pool, (external_docid, (internal_docid, operations))| { + context_pool.with(|rtxn| { + let document_merge_function = match self.index_documents_method { + Idm::ReplaceDocuments => MergeDocumentForReplacement::merge, + Idm::UpdateDocuments => MergeDocumentForUpdates::merge, + }; - document_merge_function( - rtxn, - index, - &fields_ids_map, - internal_docid, - external_docid.to_string(), // TODO do not clone - &operations, - ) - }) - }, - ) - .filter_map(Result::transpose)) + document_merge_function( + rtxn, + index, + &fields_ids_map, + internal_docid, + external_docid.to_string(), // TODO do not clone + &operations, + ) + }) + }, + )) } } @@ -239,7 +236,7 @@ trait MergeChanges { docid: DocumentId, external_docid: String, operations: &[InnerDocOp], - ) -> Result>; + ) -> Result; } struct MergeDocumentForReplacement; @@ -266,7 +263,7 @@ impl MergeChanges for MergeDocumentForReplacement { docid: DocumentId, external_docid: String, operations: &[InnerDocOp], - ) -> Result> { + ) -> Result { let current = index.documents.remap_data_type::().get(rtxn, &docid)?; let current: Option<&KvReaderFieldId> = current.map(Into::into); @@ -288,21 +285,21 @@ impl MergeChanges for MergeDocumentForReplacement { let new = writer.into_boxed(); match current { - Some(current) => Ok(Some(DocumentChange::Update(Update::create( - docid, - current.boxed(), - new, - )))), - None => Ok(Some(DocumentChange::Insertion(Insertion::create(docid, new)))), + Some(current) => { + let update = Update::create(docid, current.boxed(), new); + Ok(DocumentChange::Update(update)) + } + None => Ok(DocumentChange::Insertion(Insertion::create(docid, new))), } } - Some(InnerDocOp::Deletion) => match current { - Some(current) => { - Ok(Some(DocumentChange::Deletion(Deletion::create(docid, current.boxed())))) - } - None => Ok(None), - }, - None => Ok(None), // but it's strange + Some(InnerDocOp::Deletion) => { + let deletion = match current { + Some(current) => Deletion::create(docid, current.boxed()), + None => todo!("Do that with Louis"), + }; + Ok(DocumentChange::Deletion(deletion)) + } + None => unreachable!("We must not have empty set of operations on a document"), } } } @@ -332,13 +329,13 @@ impl MergeChanges for MergeDocumentForUpdates { docid: DocumentId, external_docid: String, operations: 
&[InnerDocOp], - ) -> Result> { + ) -> Result { let mut document = BTreeMap::<_, Cow<_>>::new(); let current = index.documents.remap_data_type::().get(rtxn, &docid)?; let current: Option<&KvReaderFieldId> = current.map(Into::into); if operations.is_empty() { - return Ok(None); // but it's strange + unreachable!("We must not have empty set of operations on a document"); } let last_deletion = operations.iter().rposition(|op| matches!(op, InnerDocOp::Deletion)); @@ -355,13 +352,11 @@ impl MergeChanges for MergeDocumentForUpdates { } if operations.is_empty() { - match current { - Some(current) => { - let deletion = Deletion::create(docid, current.boxed()); - return Ok(Some(DocumentChange::Deletion(deletion))); - } - None => return Ok(None), - } + let deletion = match current { + Some(current) => Deletion::create(docid, current.boxed()), + None => todo!("Do that with Louis"), + }; + return Ok(DocumentChange::Deletion(deletion)); } for operation in operations { @@ -386,11 +381,11 @@ impl MergeChanges for MergeDocumentForUpdates { match current { Some(current) => { let update = Update::create(docid, current.boxed(), new); - Ok(Some(DocumentChange::Update(update))) + Ok(DocumentChange::Update(update)) } None => { let insertion = Insertion::create(docid, new); - Ok(Some(DocumentChange::Insertion(insertion))) + Ok(DocumentChange::Insertion(insertion)) } } } diff --git a/milli/src/update/new/indexer/mod.rs b/milli/src/update/new/indexer/mod.rs index 3a9db79b6..b317aefca 100644 --- a/milli/src/update/new/indexer/mod.rs +++ b/milli/src/update/new/indexer/mod.rs @@ -6,16 +6,15 @@ pub use document_deletion::DocumentDeletion; pub use document_operation::DocumentOperation; use heed::{RoTxn, RwTxn}; pub use partial_dump::PartialDump; -use rayon::iter::{IntoParallelIterator, ParallelIterator}; +use rayon::iter::{IndexedParallelIterator, IntoParallelIterator, ParallelIterator}; use rayon::ThreadPool; -pub use top_level_map::{CowStr, TopLevelMap}; pub use update_by_function::UpdateByFunction; use super::channel::*; use super::document_change::DocumentChange; use super::extract::*; use super::merger::merge_grenad_entries; -use super::StdResult; +use super::{StdResult, TopLevelMap}; use crate::documents::{PrimaryKey, DEFAULT_PRIMARY_KEY}; use crate::update::new::channel::ExtractorSender; use crate::update::GrenadParameters; @@ -24,7 +23,6 @@ use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, Result, UserError}; mod document_deletion; mod document_operation; mod partial_dump; -mod top_level_map; mod update_by_function; pub trait DocumentChanges<'p> { @@ -34,7 +32,7 @@ pub trait DocumentChanges<'p> { self, fields_ids_map: &mut FieldsIdsMap, param: Self::Parameter, - ) -> Result> + Clone + 'p>; + ) -> Result> + Clone + 'p>; } /// This is the main function of this crate. 
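The trait now hands back an IndexedParallelIterator rather than an IntoParallelIterator: rayon only provides indexed parallel iterators when the number of items is known up front, which is why DocumentDeletion above collects its set of docids into a Vec before calling into_par_iter instead of going through par_bridge. A minimal sketch of the distinction, using a hypothetical helper outside the trait:

    use rayon::iter::{IndexedParallelIterator, IntoParallelIterator};

    // A Vec knows its length, so into_par_iter() yields an *indexed* parallel
    // iterator; bridging a plain Iterator with par_bridge() would not.
    fn changes_from_ids(ids: Vec<u32>) -> impl IndexedParallelIterator<Item = u32> + Clone {
        ids.into_par_iter()
    }
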
@@ -50,8 +48,7 @@ pub fn index( document_changes: PI, ) -> Result<()> where - PI: IntoParallelIterator> + Send, - PI::Iter: Clone, + PI: IndexedParallelIterator> + Send + Clone, { let (merger_sender, writer_receiver) = merger_writer_channel(10_000); // This channel acts as a rendezvous point to ensure that we are one task ahead diff --git a/milli/src/update/new/indexer/partial_dump.rs b/milli/src/update/new/indexer/partial_dump.rs index 50768ba82..5f8743e31 100644 --- a/milli/src/update/new/indexer/partial_dump.rs +++ b/milli/src/update/new/indexer/partial_dump.rs @@ -1,4 +1,4 @@ -use rayon::iter::{ParallelBridge, ParallelIterator}; +use rayon::iter::{IndexedParallelIterator, ParallelBridge, ParallelIterator}; use super::DocumentChanges; use crate::documents::{DocumentIdExtractionError, PrimaryKey}; @@ -18,9 +18,7 @@ impl PartialDump { impl<'p, I> DocumentChanges<'p> for PartialDump where - I: IntoIterator, - I::IntoIter: Send + Clone + 'p, - I::Item: Send, + I: IndexedParallelIterator + Clone + 'p, { type Parameter = (&'p FieldsIdsMap, &'p ConcurrentAvailableIds, &'p PrimaryKey<'p>); @@ -32,10 +30,10 @@ where self, _fields_ids_map: &mut FieldsIdsMap, param: Self::Parameter, - ) -> Result> + Clone + 'p> { + ) -> Result> + Clone + 'p> { let (fields_ids_map, concurrent_available_ids, primary_key) = param; - Ok(self.iter.into_iter().par_bridge().map(|object| { + Ok(self.iter.map(|object| { let docid = match concurrent_available_ids.next() { Some(id) => id, None => return Err(Error::UserError(UserError::DocumentLimitReached)), diff --git a/milli/src/update/new/indexer/update_by_function.rs b/milli/src/update/new/indexer/update_by_function.rs index 36ff432f8..d4c0f837b 100644 --- a/milli/src/update/new/indexer/update_by_function.rs +++ b/milli/src/update/new/indexer/update_by_function.rs @@ -1,4 +1,4 @@ -use rayon::iter::{IntoParallelIterator, ParallelIterator}; +use rayon::iter::{IndexedParallelIterator, IntoParallelIterator, ParallelIterator}; use super::DocumentChanges; use crate::update::new::DocumentChange; @@ -13,7 +13,7 @@ impl<'p> DocumentChanges<'p> for UpdateByFunction { self, _fields_ids_map: &mut FieldsIdsMap, _param: Self::Parameter, - ) -> Result> + Clone + 'p> { + ) -> Result> + Clone + 'p> { Ok((0..100).into_par_iter().map(|_| todo!())) } } diff --git a/milli/src/update/new/merger.rs b/milli/src/update/new/merger.rs index 1c7f04974..ca6b213c1 100644 --- a/milli/src/update/new/merger.rs +++ b/milli/src/update/new/merger.rs @@ -5,7 +5,7 @@ use bincode::ErrorKind; use fst::{Set, SetBuilder, Streamer}; use grenad::Merger; use heed::types::Bytes; -use heed::{BoxedError, Database, RoTxn}; +use heed::{Database, RoTxn}; use memmap2::Mmap; use roaring::RoaringBitmap; use tempfile::tempfile; @@ -16,9 +16,7 @@ use super::{Deletion, DocumentChange, Insertion, KvReaderDelAdd, KvReaderFieldId use crate::update::del_add::DelAdd; use crate::update::new::channel::MergerOperation; use crate::update::MergeDeladdCboRoaringBitmaps; -use crate::{ - CboRoaringBitmapCodec, Error, GeoPoint, GlobalFieldsIdsMap, Index, InternalError, Result, -}; +use crate::{CboRoaringBitmapCodec, Error, GeoPoint, GlobalFieldsIdsMap, Index, Result}; /// TODO We must return some infos/stats #[tracing::instrument(level = "trace", skip_all, target = "indexing::documents", name = "merge")] diff --git a/milli/src/update/new/mod.rs b/milli/src/update/new/mod.rs index 3f5c4b3c9..6389a53c4 100644 --- a/milli/src/update/new/mod.rs +++ b/milli/src/update/new/mod.rs @@ -1,6 +1,6 @@ pub use document_change::{Deletion, DocumentChange, 
Insertion, Update}; -pub use indexer::{CowStr, TopLevelMap}; pub use items_pool::ItemsPool; +pub use top_level_map::{CowStr, TopLevelMap}; use super::del_add::DelAdd; use crate::FieldId; @@ -11,6 +11,7 @@ mod extract; pub mod indexer; mod items_pool; mod merger; +mod top_level_map; /// TODO move them elsewhere pub type StdResult = std::result::Result; diff --git a/milli/src/update/new/indexer/top_level_map.rs b/milli/src/update/new/top_level_map.rs similarity index 100% rename from milli/src/update/new/indexer/top_level_map.rs rename to milli/src/update/new/top_level_map.rs From 7ad037841f9bca9cccefb1b48043ca094dc94446 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 24 Sep 2024 18:21:58 +0200 Subject: [PATCH 087/247] Move the tracing info to eprintln --- milli/src/update/new/channel.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/update/new/channel.rs b/milli/src/update/new/channel.rs index c2acd5310..237581cb3 100644 --- a/milli/src/update/new/channel.rs +++ b/milli/src/update/new/channel.rs @@ -189,7 +189,7 @@ pub struct MergerSender { impl Drop for MergerSender { fn drop(&mut self) { - tracing::info!( + eprintln!( "Merger channel stats: {} sends, {} writer contentions ({}%), {} merger contentions ({}%)", self.send_count.get(), self.writer_contentious_count.get(), From 3f7a500f3b108f2e2b34688e9b6f16ff1bc5b2cb Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 25 Sep 2024 14:15:18 +0200 Subject: [PATCH 088/247] Build prefix fst --- milli/src/update/new/merger.rs | 146 +----------------- milli/src/update/new/mod.rs | 1 + milli/src/update/new/word_fst_builder.rs | 187 +++++++++++++++++++++++ 3 files changed, 192 insertions(+), 142 deletions(-) create mode 100644 milli/src/update/new/word_fst_builder.rs diff --git a/milli/src/update/new/merger.rs b/milli/src/update/new/merger.rs index ca6b213c1..7e1a80888 100644 --- a/milli/src/update/new/merger.rs +++ b/milli/src/update/new/merger.rs @@ -1,20 +1,18 @@ use std::fs::File; -use std::io::{self, BufWriter}; +use std::io::{self}; use bincode::ErrorKind; -use fst::{Set, SetBuilder, Streamer}; use grenad::Merger; use heed::types::Bytes; use heed::{Database, RoTxn}; -use memmap2::Mmap; use roaring::RoaringBitmap; -use tempfile::tempfile; use super::channel::*; use super::extract::FacetKind; use super::{Deletion, DocumentChange, Insertion, KvReaderDelAdd, KvReaderFieldId, Update}; use crate::update::del_add::DelAdd; use crate::update::new::channel::MergerOperation; +use crate::update::new::word_fst_builder::WordFstBuilder; use crate::update::MergeDeladdCboRoaringBitmaps; use crate::{CboRoaringBitmapCodec, Error, GeoPoint, GlobalFieldsIdsMap, Index, Result}; @@ -82,8 +80,8 @@ pub fn merge_grenad_entries( tracing::trace_span!(target: "indexing::documents::merge", "words_fst"); let _entered = span.enter(); - let mmap = word_fst_builder.build()?; - sender.main().write_words_fst(mmap).unwrap(); + let (word_fst_mmap, prefix_fst_mmap) = word_fst_builder.build()?; + sender.main().write_words_fst(word_fst_mmap).unwrap(); } } MergerOperation::WordFidDocidsMerger(merger) => { @@ -190,142 +188,6 @@ pub fn merge_grenad_entries( Ok(()) } -struct WordFstBuilder<'a> { - stream: fst::set::Stream<'a>, - word_fst_builder: SetBuilder>, - prefix_fst_builders: Vec>>, - max_prefix_length: usize, - last_word: Vec, -} - -impl<'a> WordFstBuilder<'a> { - pub fn new( - words_fst: &'a Set>, - max_prefix_length: usize, - ) -> Result { - let mut prefix_fst_builders = Vec::new(); - for _ in 0..max_prefix_length { - 
prefix_fst_builders.push(SetBuilder::new(BufWriter::new(tempfile()?))?); - } - - Ok(Self { - stream: words_fst.stream(), - word_fst_builder: SetBuilder::new(BufWriter::new(tempfile()?))?, - prefix_fst_builders, - max_prefix_length, - last_word: Vec::new(), - }) - } - - pub fn register_word(&mut self, deladd: DelAdd, key: &[u8]) -> Result<()> { - match deladd { - DelAdd::Addition => self.add_word(key), - DelAdd::Deletion => self.del_word(key), - } - } - - pub fn add_word(&mut self, word: &[u8]) -> Result<()> { - if !self.last_word.is_empty() { - let next = self.last_word.as_slice(); - match next.cmp(word) { - std::cmp::Ordering::Less => { - // We need to insert the last word from the current fst - self.word_fst_builder.insert(next)?; - self.last_word.clear(); - } - std::cmp::Ordering::Equal => { - // We insert the word and drop the last word - self.word_fst_builder.insert(next)?; - self.last_word.clear(); - return Ok(()); - } - std::cmp::Ordering::Greater => { - // We insert the word and keep the last word - self.word_fst_builder.insert(word)?; - - return Ok(()); - } - } - } - - while let Some(next) = self.stream.next() { - match next.cmp(word) { - std::cmp::Ordering::Less => { - // We need to insert the last word from the current fst - self.word_fst_builder.insert(next)?; - } - std::cmp::Ordering::Equal => { - // We insert the word - self.word_fst_builder.insert(next)?; - - return Ok(()); - } - std::cmp::Ordering::Greater => { - // We insert the word and keep the last word - self.word_fst_builder.insert(word)?; - self.last_word.clear(); - self.last_word.extend_from_slice(next); - - return Ok(()); - } - } - } - - Ok(()) - } - - pub fn del_word(&mut self, word: &[u8]) -> Result<()> { - if !self.last_word.is_empty() { - let next = self.last_word.as_slice(); - match next.cmp(word) { - std::cmp::Ordering::Less => { - // We insert the word from the current fst because the next word to delete is greater - self.word_fst_builder.insert(next)?; - self.last_word.clear(); - } - std::cmp::Ordering::Equal => { - // We delete the word by not inserting it in the new fst and drop the last word - self.last_word.clear(); - return Ok(()); - } - std::cmp::Ordering::Greater => { - // keep the current word until the next word to delete is greater or equal - return Ok(()); - } - } - } - - while let Some(next) = self.stream.next() { - match next.cmp(word) { - std::cmp::Ordering::Less => { - // We insert the word from the current fst because the next word to delete is greater - self.word_fst_builder.insert(next)?; - } - std::cmp::Ordering::Equal => { - // We delete the word by not inserting it in the new fst and drop the last word - return Ok(()); - } - std::cmp::Ordering::Greater => { - // keep the current word until the next word to delete is greater or equal - self.last_word.clear(); - self.last_word.extend_from_slice(next); - - return Ok(()); - } - } - } - - Ok(()) - } - - pub fn build(mut self) -> Result { - let words_fst_file = self.word_fst_builder.into_inner()?.into_inner().unwrap(); - let words_fst_mmap = unsafe { Mmap::map(&words_fst_file)? 
}; - - Ok(words_fst_mmap) - } -} - pub struct GeoExtractor { rtree: Option>, } diff --git a/milli/src/update/new/mod.rs b/milli/src/update/new/mod.rs index 6389a53c4..dedd89497 100644 --- a/milli/src/update/new/mod.rs +++ b/milli/src/update/new/mod.rs @@ -12,6 +12,7 @@ pub mod indexer; mod items_pool; mod merger; mod top_level_map; +mod word_fst_builder; /// TODO move them elsewhere pub type StdResult = std::result::Result; diff --git a/milli/src/update/new/word_fst_builder.rs b/milli/src/update/new/word_fst_builder.rs new file mode 100644 index 000000000..227a81d9d --- /dev/null +++ b/milli/src/update/new/word_fst_builder.rs @@ -0,0 +1,187 @@ +use std::{fs::File, io::BufWriter}; + +use fst::{Set, SetBuilder, Streamer}; +use memmap2::Mmap; +use tempfile::tempfile; + +use crate::{update::del_add::DelAdd, Result, SmallString32}; + +pub struct WordFstBuilder<'a> { + stream: Option>, + word_fst_builder: SetBuilder>, + /// TODO: Replace the full memory allocation + prefix_fst_builders: Vec>>, + max_prefix_length: usize, + last_word: Option>, + current_prefix: Vec, + current_prefix_count: Vec, + prefix_count_threshold: u64, +} + +impl<'a> WordFstBuilder<'a> { + pub fn new( + words_fst: &'a Set>, + max_prefix_length: usize, + ) -> Result { + let mut prefix_fst_builders = Vec::new(); + for _ in 0..max_prefix_length { + prefix_fst_builders.push(SetBuilder::memory()); + } + + Ok(Self { + stream: Some(words_fst.stream()), + word_fst_builder: SetBuilder::new(BufWriter::new(tempfile()?))?, + prefix_fst_builders, + max_prefix_length, + last_word: None, + current_prefix: vec![SmallString32::new(); max_prefix_length], + current_prefix_count: vec![0; max_prefix_length], + prefix_count_threshold: 100, + }) + } + + pub fn register_word(&mut self, deladd: DelAdd, right: &[u8]) -> Result<()> { + if let Some(left) = self.last_word.take() { + let (left_inserted, right_inserted) = + self.compare_and_insert(deladd, left.as_slice(), right)?; + + // left was not inserted, so we keep it for the next iteration + if !left_inserted { + self.last_word = Some(left); + } + + // right was inserted, so we can stop + if right_inserted { + return Ok(()); + } + } + + if let Some(mut stream) = self.stream.take() { + while let Some(left) = stream.next() { + let (left_inserted, right_inserted) = + self.compare_and_insert(deladd, left, right)?; + + // left was not inserted, so we keep it for the next iteration + if !left_inserted { + self.last_word = Some(left.to_vec()); + } + + // right was inserted, so we can stop + if right_inserted { + break; + } + } + + self.stream = Some(stream); + } + + Ok(()) + } + + pub fn compare_and_insert( + &mut self, + deladd: DelAdd, + left: &[u8], + right: &[u8], + ) -> Result<(bool, bool)> { + let mut left_inserted = false; + let mut right_inserted = false; + match left.cmp(right) { + std::cmp::Ordering::Less => { + // We need to insert the last word from the current fst + self.insert_word(left)?; + + left_inserted = true; + } + std::cmp::Ordering::Equal => { + // Addition: We insert the word + // Deletion: We delete the word by not inserting it + if deladd == DelAdd::Addition { + self.insert_word(right)?; + } + + left_inserted = true; + right_inserted = true; + } + std::cmp::Ordering::Greater => { + // Addition: We insert the word and keep the last word + // Deletion: We keep the current word until the left word to delete is greater or equal + if deladd == DelAdd::Addition { + self.insert_word(right)?; + } + + right_inserted = true; + } + } + + Ok((left_inserted, right_inserted)) + } + + fn 
insert_word(&mut self, bytes: &[u8]) -> Result<()> { + self.word_fst_builder.insert(bytes)?; + + for n in 0..self.max_prefix_length { + let current_prefix = &mut self.current_prefix[n]; + let current_prefix_count = &mut self.current_prefix_count[n]; + let builder = &mut self.prefix_fst_builders[n]; + + // We try to get the first n bytes out of this string but we only want + // to split at valid characters bounds. If we try to split in the middle of + // a character we ignore this word and go to the next one. + let word = std::str::from_utf8(bytes)?; + let prefix = match word.get(..=n) { + Some(prefix) => prefix, + None => continue, + }; + + // This is the first iteration of the loop, + // or the current word doesn't starts with the current prefix. + if *current_prefix_count == 0 || prefix != current_prefix.as_str() { + *current_prefix = SmallString32::from(prefix); + *current_prefix_count = 0; + } + + *current_prefix_count += 1; + + // There is enough words corresponding to this prefix to add it to the cache. + /// TODO: (LEGACY) Replace this by `==` to avoid inserting several times the same prefix? + if *current_prefix_count >= self.prefix_count_threshold { + builder.insert(prefix)?; + } + } + + Ok(()) + } + + fn drain_stream(&mut self) -> Result<()> { + if let Some(mut stream) = self.stream.take() { + while let Some(current) = stream.next() { + self.insert_word(current)?; + } + } + + Ok(()) + } + + pub fn build(mut self) -> Result<(Mmap, Mmap)> { + self.drain_stream()?; + + /// TODO: ugly unwrap + let words_fst_file = self.word_fst_builder.into_inner()?.into_inner().unwrap(); + let words_fst_mmap = unsafe { Mmap::map(&words_fst_file)? }; + + // We merge all of the previously computed prefixes into on final set. + let mut prefix_fsts = Vec::new(); + for builder in self.prefix_fst_builders { + prefix_fsts.push(builder.into_set()); + } + let op = fst::set::OpBuilder::from_iter(prefix_fsts.iter()); + let mut builder = SetBuilder::new(BufWriter::new(tempfile()?))?; + builder.extend_stream(op.r#union())?; + /// TODO: ugly unwrap + let prefix_fst_file = builder.into_inner()?.into_inner().unwrap(); + let prefix_fst_mmap = unsafe { Mmap::map(&prefix_fst_file)? 
}; + + Ok((words_fst_mmap, prefix_fst_mmap)) + } +} From 759b9b15465d0ce80393de803ecab872147ce854 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 25 Sep 2024 14:49:03 +0200 Subject: [PATCH 089/247] Introduce a new custom Lru --- Cargo.lock | 11 +- milli/Cargo.toml | 1 + milli/src/update/new/lru.rs | 230 ++++++++++++++++++++++++++++++++++++ milli/src/update/new/mod.rs | 1 + 4 files changed, 238 insertions(+), 5 deletions(-) create mode 100644 milli/src/update/new/lru.rs diff --git a/Cargo.lock b/Cargo.lock index a4a677e73..7b3de4a6a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2307,9 +2307,9 @@ dependencies = [ [[package]] name = "hashbrown" -version = "0.14.3" +version = "0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" dependencies = [ "ahash 0.8.11", "allocator-api2", @@ -2591,7 +2591,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "168fb715dda47215e360912c096649d23d58bf392ac62f73919e831745e40f26" dependencies = [ "equivalent", - "hashbrown 0.14.3", + "hashbrown 0.14.5", "serde", ] @@ -3318,7 +3318,7 @@ version = "0.12.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37ee39891760e7d94734f6f63fedc29a2e4a152f836120753a72503f09fcf904" dependencies = [ - "hashbrown 0.14.3", + "hashbrown 0.14.5", ] [[package]] @@ -3575,6 +3575,7 @@ dependencies = [ "fxhash", "geoutils", "grenad", + "hashbrown 0.14.5", "heed", "hf-hub", "indexmap", @@ -6049,7 +6050,7 @@ version = "0.16.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "471d1c1645d361eb782a1650b1786a8fb58dd625e681a04c09f5ff7c8764a7b0" dependencies = [ - "hashbrown 0.14.3", + "hashbrown 0.14.5", "once_cell", ] diff --git a/milli/Cargo.toml b/milli/Cargo.toml index aed966758..19986de01 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -89,6 +89,7 @@ tracing = "0.1.40" ureq = { version = "2.10.0", features = ["json"] } url = "2.5.2" rayon-par-bridge = "0.1.0" +hashbrown = "0.14.5" [dev-dependencies] mimalloc = { version = "0.1.43", default-features = false } diff --git a/milli/src/update/new/lru.rs b/milli/src/update/new/lru.rs new file mode 100644 index 000000000..fef108753 --- /dev/null +++ b/milli/src/update/new/lru.rs @@ -0,0 +1,230 @@ +use std::borrow::Borrow; +use std::hash::{BuildHasher, Hash}; +use std::iter::repeat_with; +use std::mem; +use std::num::NonZeroUsize; + +use hashbrown::hash_map::{DefaultHashBuilder, Entry}; +use hashbrown::HashMap; + +pub struct Lru { + lookup: HashMap, + storage: FixedSizeList>, +} + +impl Lru { + /// Creates a new LRU cache that holds at most `capacity` elements. + pub fn new(capacity: NonZeroUsize) -> Self { + Self { lookup: HashMap::new(), storage: FixedSizeList::new(capacity.get()) } + } +} + +impl Lru { + /// Creates a new LRU cache that holds at most `capacity` elements + /// and uses the provided hash builder to hash keys. + pub fn with_hasher(capacity: NonZeroUsize, hash_builder: S) -> Lru { + Self { + lookup: HashMap::with_hasher(hash_builder), + storage: FixedSizeList::new(capacity.get()), + } + } +} + +impl Lru { + /// Returns a mutable reference to the value of the key in the cache or `None` if it is not present in the cache. + /// + /// Moves the key to the head of the LRU list if it exists. 
+ pub fn get_mut(&mut self, key: &Q) -> Option<&mut V> + where + K: Borrow, + Q: Hash + Eq + ?Sized, + { + let idx = *self.lookup.get(key)?; + self.storage.move_front(idx).map(|node| &mut node.value) + } +} + +impl Lru { + pub fn push(&mut self, key: K, value: V) -> Option<(K, V)> { + match self.lookup.entry(key) { + Entry::Occupied(occ) => { + // It's fine to unwrap here because: + // * the entry already exists + let node = self.storage.move_front(*occ.get()).unwrap(); + let old_value = mem::replace(&mut node.value, value); + let old_key = occ.replace_key(); + Some((old_key, old_value)) + } + Entry::Vacant(vac) => { + let key = vac.key().clone(); + if self.storage.is_full() { + let idx = self.storage.back_idx(); + // It's fine to unwrap here because: + // * the cache capacity is non zero + // * the cache is full + let node = self.storage.move_front(idx).unwrap(); + let LruNode { key, value } = mem::replace(node, LruNode { key, value }); + vac.insert(idx); + self.lookup.remove(&key); + Some((key, value)) + } else { + // It's fine to unwrap here because: + // * the cache capacity is non zero + // * the cache is not full + let (idx, _) = self.storage.push_front(LruNode { key, value }).unwrap(); + vac.insert(idx); + None + } + } + } + } +} + +impl IntoIterator for Lru { + type Item = (K, V); + type IntoIter = IntoIter; + + fn into_iter(self) -> Self::IntoIter { + IntoIter { lookup_iter: self.lookup.into_iter(), nodes: self.storage.nodes } + } +} + +pub struct IntoIter { + lookup_iter: hashbrown::hash_map::IntoIter, + nodes: Box<[Option>>]>, +} + +impl Iterator for IntoIter { + type Item = (K, V); + + fn next(&mut self) -> Option { + let (_key, idx) = self.lookup_iter.next()?; + let LruNode { key, value } = self.nodes.get_mut(idx)?.take()?.data; + Some((key, value)) + } +} + +struct LruNode { + key: K, + value: V, +} + +struct FixedSizeListNode { + prev: usize, + next: usize, + data: T, +} + +struct FixedSizeList { + nodes: Box<[Option>]>, + // An un-ordered set of indices that are not in use in `nodes`. + // All `None` entries in `nodes` _must_ be listed in `free`. + // A `Vec` was choosen in order to have O(1) complexity + // for pop and avoid having to go through `nodes` in order to + // to find a free place. + // TODO remove the free list as it is always growing: + // we cannot remove entries from the map. + // Also, we probably do not need one of the front and back cursors. 
+ free: Vec, + front: usize, + back: usize, +} + +impl FixedSizeList { + fn new(capacity: usize) -> Self { + Self { + nodes: repeat_with(|| None).take(capacity).collect::>().into_boxed_slice(), + free: (0..capacity).collect(), + front: usize::MAX, + back: usize::MAX, + } + } + + #[inline] + fn capacity(&self) -> usize { + self.nodes.len() + } + + #[inline] + fn len(&self) -> usize { + self.nodes.len() - self.free.len() + } + + #[inline] + fn is_empty(&self) -> bool { + self.len() == 0 + } + + #[inline] + fn is_full(&self) -> bool { + self.len() == self.capacity() + } + + #[inline] + fn back_idx(&self) -> usize { + self.back + } + + #[inline] + fn next(&mut self) -> Option { + self.free.pop() + } + + #[inline] + fn node_mut(&mut self, idx: usize) -> Option<&mut FixedSizeListNode> { + self.nodes.get_mut(idx).and_then(|node| node.as_mut()) + } + + #[inline] + fn node_ref(&self, idx: usize) -> Option<&FixedSizeListNode> { + self.nodes.get(idx).and_then(|node| node.as_ref()) + } + + #[inline] + fn move_front(&mut self, idx: usize) -> Option<&mut T> { + let node = self.nodes.get_mut(idx)?.take()?; + if let Some(prev) = self.node_mut(node.prev) { + prev.next = node.next; + } else { + self.front = node.next; + } + if let Some(next) = self.node_mut(node.next) { + next.prev = node.prev; + } else { + self.back = node.prev; + } + + if let Some(front) = self.node_mut(self.front) { + front.prev = idx; + } + if self.node_ref(self.back).is_none() { + self.back = idx; + } + + let node = self.nodes.get_mut(idx).unwrap().insert(FixedSizeListNode { + prev: usize::MAX, + next: self.front, + data: node.data, + }); + self.front = idx; + Some(&mut node.data) + } + + #[inline] + fn push_front(&mut self, data: T) -> Option<(usize, &mut T)> { + let idx = self.next()?; + if let Some(front) = self.node_mut(self.front) { + front.prev = idx; + } + if self.node_ref(self.back).is_none() { + self.back = idx; + } + let node = self.nodes.get_mut(idx).unwrap().insert(FixedSizeListNode { + prev: usize::MAX, + next: self.front, + data, + }); + self.front = idx; + Some((idx, &mut node.data)) + } +} diff --git a/milli/src/update/new/mod.rs b/milli/src/update/new/mod.rs index dedd89497..b4878a8fe 100644 --- a/milli/src/update/new/mod.rs +++ b/milli/src/update/new/mod.rs @@ -10,6 +10,7 @@ mod document_change; mod extract; pub mod indexer; mod items_pool; +mod lru; mod merger; mod top_level_map; mod word_fst_builder; From 86d5e6d9ff22969980b553e37bd1baed052dc7d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 25 Sep 2024 14:54:56 +0200 Subject: [PATCH 090/247] Use the new Lru --- Cargo.lock | 10 ---------- Cargo.toml | 21 --------------------- milli/Cargo.toml | 1 - milli/src/update/new/extract/cache.rs | 8 ++++---- milli/src/update/new/{ => extract}/lru.rs | 4 ++++ milli/src/update/new/extract/mod.rs | 8 +++----- milli/src/update/new/mod.rs | 1 - 7 files changed, 11 insertions(+), 42 deletions(-) rename milli/src/update/new/{ => extract}/lru.rs (99%) diff --git a/Cargo.lock b/Cargo.lock index 7b3de4a6a..06bd9c234 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3312,15 +3312,6 @@ version = "0.4.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c" -[[package]] -name = "lru" -version = "0.12.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37ee39891760e7d94734f6f63fedc29a2e4a152f836120753a72503f09fcf904" -dependencies = [ - "hashbrown 0.14.5", -] - [[package]] name = "lzma-rs" version 
= "0.3.0" @@ -3584,7 +3575,6 @@ dependencies = [ "json-depth-checker", "levenshtein_automata", "liquid", - "lru", "maplit", "md5", "meili-snap", diff --git a/Cargo.toml b/Cargo.toml index a73ac67ef..1d25b9795 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -44,26 +44,5 @@ opt-level = 3 [profile.dev.package.roaring] opt-level = 3 -[profile.dev.package.lindera-ipadic-builder] -opt-level = 3 -[profile.dev.package.encoding] -opt-level = 3 -[profile.dev.package.yada] -opt-level = 3 - -[profile.release.package.lindera-ipadic-builder] -opt-level = 3 -[profile.release.package.encoding] -opt-level = 3 -[profile.release.package.yada] -opt-level = 3 - -[profile.bench.package.lindera-ipadic-builder] -opt-level = 3 -[profile.bench.package.encoding] -opt-level = 3 -[profile.bench.package.yada] -opt-level = 3 - [patch.crates-io] roaring = { git = "https://github.com/RoaringBitmap/roaring-rs", branch = "clone-iter-slice" } diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 19986de01..bae3dd64b 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -40,7 +40,6 @@ heed = { version = "0.20.3", default-features = false, features = [ indexmap = { version = "2.2.6", features = ["serde"] } json-depth-checker = { path = "../json-depth-checker" } levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] } -lru = "0.12.3" memchr = "2.5.0" memmap2 = "0.9.4" obkv = { git = "https://github.com/kerollmops/obkv", branch = "unsized-kvreader" } diff --git a/milli/src/update/new/extract/cache.rs b/milli/src/update/new/extract/cache.rs index 572c81a55..4f6f30e70 100644 --- a/milli/src/update/new/extract/cache.rs +++ b/milli/src/update/new/extract/cache.rs @@ -2,16 +2,16 @@ use std::mem; use std::num::NonZeroUsize; use grenad::{MergeFunction, Sorter}; -use lru::LruCache; use roaring::RoaringBitmap; use smallvec::SmallVec; +use super::lru::Lru; use crate::update::del_add::{DelAdd, KvWriterDelAdd}; use crate::CboRoaringBitmapCodec; #[derive(Debug)] pub struct CboCachedSorter { - cache: lru::LruCache, DelAddRoaringBitmap>, + cache: Lru, DelAddRoaringBitmap>, sorter: Sorter, deladd_buffer: Vec, cbo_buffer: Vec, @@ -22,7 +22,7 @@ pub struct CboCachedSorter { impl CboCachedSorter { pub fn new(cap: NonZeroUsize, sorter: Sorter) -> Self { CboCachedSorter { - cache: lru::LruCache::new(cap), + cache: Lru::new(cap), sorter, deladd_buffer: Vec::new(), cbo_buffer: Vec::new(), @@ -171,7 +171,7 @@ impl CboCachedSorter { } pub fn into_sorter(mut self) -> grenad::Result, MF::Error> { - let default_arc = LruCache::new(NonZeroUsize::MIN); + let default_arc = Lru::new(NonZeroUsize::MIN); for (key, deladd) in mem::replace(&mut self.cache, default_arc) { self.write_entry(key, deladd)?; } diff --git a/milli/src/update/new/lru.rs b/milli/src/update/new/extract/lru.rs similarity index 99% rename from milli/src/update/new/lru.rs rename to milli/src/update/new/extract/lru.rs index fef108753..7c13d9350 100644 --- a/milli/src/update/new/lru.rs +++ b/milli/src/update/new/extract/lru.rs @@ -7,6 +7,7 @@ use std::num::NonZeroUsize; use hashbrown::hash_map::{DefaultHashBuilder, Entry}; use hashbrown::HashMap; +#[derive(Debug)] pub struct Lru { lookup: HashMap, storage: FixedSizeList>, @@ -104,17 +105,20 @@ impl Iterator for IntoIter { } } +#[derive(Debug)] struct LruNode { key: K, value: V, } +#[derive(Debug)] struct FixedSizeListNode { prev: usize, next: usize, data: T, } +#[derive(Debug)] struct FixedSizeList { nodes: Box<[Option>]>, // An un-ordered set of indices that are not in use in `nodes`. 
diff --git a/milli/src/update/new/extract/mod.rs b/milli/src/update/new/extract/mod.rs index 3836f9957..d1f6bb787 100644 --- a/milli/src/update/new/extract/mod.rs +++ b/milli/src/update/new/extract/mod.rs @@ -1,5 +1,6 @@ mod cache; mod faceted; +mod lru; mod searchable; use std::fs::File; @@ -9,12 +10,9 @@ use grenad::Merger; use rayon::iter::IntoParallelIterator; pub use searchable::*; -use crate::{ - update::{GrenadParameters, MergeDeladdCboRoaringBitmaps}, - GlobalFieldsIdsMap, Index, Result, -}; - use super::DocumentChange; +use crate::update::{GrenadParameters, MergeDeladdCboRoaringBitmaps}; +use crate::{GlobalFieldsIdsMap, Index, Result}; pub trait DocidsExtractor { fn run_extraction( diff --git a/milli/src/update/new/mod.rs b/milli/src/update/new/mod.rs index b4878a8fe..dedd89497 100644 --- a/milli/src/update/new/mod.rs +++ b/milli/src/update/new/mod.rs @@ -10,7 +10,6 @@ mod document_change; mod extract; pub mod indexer; mod items_pool; -mod lru; mod merger; mod top_level_map; mod word_fst_builder; From 52d7f3ed1ce0f85f3e927460f64356e9910bc2ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 25 Sep 2024 15:37:13 +0200 Subject: [PATCH 091/247] Reduce the lru key size from 20 to 8 bytes --- milli/src/update/new/extract/cache.rs | 2 +- milli/src/update/new/extract/lru.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/milli/src/update/new/extract/cache.rs b/milli/src/update/new/extract/cache.rs index 4f6f30e70..1c7db0473 100644 --- a/milli/src/update/new/extract/cache.rs +++ b/milli/src/update/new/extract/cache.rs @@ -11,7 +11,7 @@ use crate::CboRoaringBitmapCodec; #[derive(Debug)] pub struct CboCachedSorter { - cache: Lru, DelAddRoaringBitmap>, + cache: Lru, DelAddRoaringBitmap>, sorter: Sorter, deladd_buffer: Vec, cbo_buffer: Vec, diff --git a/milli/src/update/new/extract/lru.rs b/milli/src/update/new/extract/lru.rs index 7c13d9350..cfec6e157 100644 --- a/milli/src/update/new/extract/lru.rs +++ b/milli/src/update/new/extract/lru.rs @@ -59,10 +59,10 @@ impl Lru { Entry::Vacant(vac) => { let key = vac.key().clone(); if self.storage.is_full() { - let idx = self.storage.back_idx(); // It's fine to unwrap here because: // * the cache capacity is non zero // * the cache is full + let idx = self.storage.back_idx(); let node = self.storage.move_front(idx).unwrap(); let LruNode { key, value } = mem::replace(node, LruNode { key, value }); vac.insert(idx); @@ -128,8 +128,8 @@ struct FixedSizeList { // to find a free place. // TODO remove the free list as it is always growing: // we cannot remove entries from the map. - // Also, we probably do not need one of the front and back cursors. free: Vec, + // TODO Also, we probably do not need one of the front and back cursors. front: usize, back: usize, } From e97041f7d0e89dc352552265bf39f9b68963c631 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 25 Sep 2024 15:55:52 +0200 Subject: [PATCH 092/247] Replace the Lru free list by a simple increment --- milli/src/update/new/extract/lru.rs | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/milli/src/update/new/extract/lru.rs b/milli/src/update/new/extract/lru.rs index cfec6e157..8a35cc440 100644 --- a/milli/src/update/new/extract/lru.rs +++ b/milli/src/update/new/extract/lru.rs @@ -121,14 +121,8 @@ struct FixedSizeListNode { #[derive(Debug)] struct FixedSizeList { nodes: Box<[Option>]>, - // An un-ordered set of indices that are not in use in `nodes`. 
- // All `None` entries in `nodes` _must_ be listed in `free`. - // A `Vec` was choosen in order to have O(1) complexity - // for pop and avoid having to go through `nodes` in order to - // to find a free place. - // TODO remove the free list as it is always growing: - // we cannot remove entries from the map. - free: Vec, + /// The next None in the nodes. + next_free: usize, // TODO Also, we probably do not need one of the front and back cursors. front: usize, back: usize, @@ -138,7 +132,7 @@ impl FixedSizeList { fn new(capacity: usize) -> Self { Self { nodes: repeat_with(|| None).take(capacity).collect::>().into_boxed_slice(), - free: (0..capacity).collect(), + next_free: 0, front: usize::MAX, back: usize::MAX, } @@ -151,7 +145,7 @@ impl FixedSizeList { #[inline] fn len(&self) -> usize { - self.nodes.len() - self.free.len() + self.nodes.len() - self.next_free } #[inline] @@ -171,7 +165,12 @@ impl FixedSizeList { #[inline] fn next(&mut self) -> Option { - self.free.pop() + if self.is_full() { + None + } else { + self.next_free += 1; + Some(self.next_free) + } } #[inline] From 29a7623c3f4cfed9e3b7e0cefa4bf74c4fdb2b0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 25 Sep 2024 15:57:50 +0200 Subject: [PATCH 093/247] Fxi some logs --- milli/src/update/new/extract/cache.rs | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/milli/src/update/new/extract/cache.rs b/milli/src/update/new/extract/cache.rs index 1c7db0473..1e95d9cda 100644 --- a/milli/src/update/new/extract/cache.rs +++ b/milli/src/update/new/extract/cache.rs @@ -9,9 +9,11 @@ use super::lru::Lru; use crate::update::del_add::{DelAdd, KvWriterDelAdd}; use crate::CboRoaringBitmapCodec; +const KEY_SIZE: usize = 8; + #[derive(Debug)] pub struct CboCachedSorter { - cache: Lru, DelAddRoaringBitmap>, + cache: Lru, DelAddRoaringBitmap>, sorter: Sorter, deladd_buffer: Vec, cbo_buffer: Vec, @@ -40,7 +42,7 @@ impl CboCachedSorter { } None => { self.total_insertions += 1; - self.fitted_in_key += (key.len() <= 20) as usize; + self.fitted_in_key += (key.len() <= KEY_SIZE) as usize; let value = DelAddRoaringBitmap::new_del_u32(n); if let Some((key, deladd)) = self.cache.push(key.into(), value) { self.write_entry(key, deladd)?; @@ -62,7 +64,7 @@ impl CboCachedSorter { } None => { self.total_insertions += 1; - self.fitted_in_key += (key.len() <= 20) as usize; + self.fitted_in_key += (key.len() <= KEY_SIZE) as usize; let value = DelAddRoaringBitmap::new_del(bitmap); if let Some((key, deladd)) = self.cache.push(key.into(), value) { self.write_entry(key, deladd)?; @@ -80,7 +82,7 @@ impl CboCachedSorter { } None => { self.total_insertions += 1; - self.fitted_in_key += (key.len() <= 20) as usize; + self.fitted_in_key += (key.len() <= KEY_SIZE) as usize; let value = DelAddRoaringBitmap::new_add_u32(n); if let Some((key, deladd)) = self.cache.push(key.into(), value) { self.write_entry(key, deladd)?; @@ -102,7 +104,7 @@ impl CboCachedSorter { } None => { self.total_insertions += 1; - self.fitted_in_key += (key.len() <= 20) as usize; + self.fitted_in_key += (key.len() <= KEY_SIZE) as usize; let value = DelAddRoaringBitmap::new_add(bitmap); if let Some((key, deladd)) = self.cache.push(key.into(), value) { self.write_entry(key, deladd)?; @@ -121,7 +123,7 @@ impl CboCachedSorter { } None => { self.total_insertions += 1; - self.fitted_in_key += (key.len() <= 20) as usize; + self.fitted_in_key += (key.len() <= KEY_SIZE) as usize; let value = DelAddRoaringBitmap::new_del_add_u32(n); if let Some((key, 
deladd)) = self.cache.push(key.into(), value) { self.write_entry(key, deladd)?; @@ -177,7 +179,7 @@ impl CboCachedSorter { } eprintln!( - "LruCache stats: {} <= 20 bytes ({}%) on a total of {} insertions", + "LruCache stats: {} <= {KEY_SIZE} bytes ({}%) on a total of {} insertions", self.fitted_in_key, (self.fitted_in_key as f32 / self.total_insertions as f32) * 100.0, self.total_insertions, From 5f53935c8aa06558d23200f68419fe26eb4c392e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 25 Sep 2024 16:01:08 +0200 Subject: [PATCH 094/247] Fix a bug in the Lru --- milli/src/update/new/extract/lru.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/milli/src/update/new/extract/lru.rs b/milli/src/update/new/extract/lru.rs index 8a35cc440..7346c38af 100644 --- a/milli/src/update/new/extract/lru.rs +++ b/milli/src/update/new/extract/lru.rs @@ -121,7 +121,7 @@ struct FixedSizeListNode { #[derive(Debug)] struct FixedSizeList { nodes: Box<[Option>]>, - /// The next None in the nodes. + /// The first `None` in the nodes. next_free: usize, // TODO Also, we probably do not need one of the front and back cursors. front: usize, @@ -145,7 +145,7 @@ impl FixedSizeList { #[inline] fn len(&self) -> usize { - self.nodes.len() - self.next_free + self.next_free } #[inline] @@ -168,8 +168,9 @@ impl FixedSizeList { if self.is_full() { None } else { + let current_free = self.next_free; self.next_free += 1; - Some(self.next_free) + Some(current_free) } } From 3d244451dfd5903694f0c59ed34f334cf91fdf96 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 25 Sep 2024 16:14:13 +0200 Subject: [PATCH 095/247] Reduce the lru key size from 8 to 12 bytes --- milli/src/update/new/extract/cache.rs | 2 +- milli/src/update/new/extract/lru.rs | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/milli/src/update/new/extract/cache.rs b/milli/src/update/new/extract/cache.rs index 1e95d9cda..1b7a58472 100644 --- a/milli/src/update/new/extract/cache.rs +++ b/milli/src/update/new/extract/cache.rs @@ -9,7 +9,7 @@ use super::lru::Lru; use crate::update::del_add::{DelAdd, KvWriterDelAdd}; use crate::CboRoaringBitmapCodec; -const KEY_SIZE: usize = 8; +const KEY_SIZE: usize = 12; #[derive(Debug)] pub struct CboCachedSorter { diff --git a/milli/src/update/new/extract/lru.rs b/milli/src/update/new/extract/lru.rs index 7346c38af..3eca47cb2 100644 --- a/milli/src/update/new/extract/lru.rs +++ b/milli/src/update/new/extract/lru.rs @@ -121,8 +121,8 @@ struct FixedSizeListNode { #[derive(Debug)] struct FixedSizeList { nodes: Box<[Option>]>, - /// The first `None` in the nodes. - next_free: usize, + /// Also corresponds to the first `None` in the nodes. + length: usize, // TODO Also, we probably do not need one of the front and back cursors. 
front: usize, back: usize, @@ -132,7 +132,7 @@ impl FixedSizeList { fn new(capacity: usize) -> Self { Self { nodes: repeat_with(|| None).take(capacity).collect::>().into_boxed_slice(), - next_free: 0, + length: 0, front: usize::MAX, back: usize::MAX, } @@ -145,7 +145,7 @@ impl FixedSizeList { #[inline] fn len(&self) -> usize { - self.next_free + self.length } #[inline] @@ -168,8 +168,8 @@ impl FixedSizeList { if self.is_full() { None } else { - let current_free = self.next_free; - self.next_free += 1; + let current_free = self.length; + self.length += 1; Some(current_free) } } From 960060ebdf4a9804c4ede1409ca7b6961d53556f Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 25 Sep 2024 16:52:43 +0200 Subject: [PATCH 096/247] Fix fst builder when their is no previous FST --- milli/src/update/new/word_fst_builder.rs | 25 +++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/milli/src/update/new/word_fst_builder.rs b/milli/src/update/new/word_fst_builder.rs index 227a81d9d..1e02bdc3b 100644 --- a/milli/src/update/new/word_fst_builder.rs +++ b/milli/src/update/new/word_fst_builder.rs @@ -16,6 +16,9 @@ pub struct WordFstBuilder<'a> { current_prefix: Vec, current_prefix_count: Vec, prefix_count_threshold: u64, + inserted_words: usize, + registered_words: usize, + base_set_length: usize, } impl<'a> WordFstBuilder<'a> { @@ -37,10 +40,17 @@ impl<'a> WordFstBuilder<'a> { current_prefix: vec![SmallString32::new(); max_prefix_length], current_prefix_count: vec![0; max_prefix_length], prefix_count_threshold: 100, + inserted_words: 0, + registered_words: 0, + base_set_length: words_fst.len(), }) } pub fn register_word(&mut self, deladd: DelAdd, right: &[u8]) -> Result<()> { + if deladd == DelAdd::Addition { + self.registered_words += 1; + } + if let Some(left) = self.last_word.take() { let (left_inserted, right_inserted) = self.compare_and_insert(deladd, left.as_slice(), right)?; @@ -68,10 +78,15 @@ impl<'a> WordFstBuilder<'a> { // right was inserted, so we can stop if right_inserted { - break; + self.stream = Some(stream); + return Ok(()); } } + // If we reach this point, it means that the stream is empty + // and we need to insert the incoming word + self.insert_word(right)?; + self.stream = Some(stream); } @@ -118,6 +133,7 @@ impl<'a> WordFstBuilder<'a> { } fn insert_word(&mut self, bytes: &[u8]) -> Result<()> { + self.inserted_words += 1; self.word_fst_builder.insert(bytes)?; for n in 0..self.max_prefix_length { @@ -182,6 +198,13 @@ impl<'a> WordFstBuilder<'a> { let prefix_fst_file = builder.into_inner()?.into_inner().unwrap(); let prefix_fst_mmap = unsafe { Mmap::map(&prefix_fst_file)? 
}; + eprintln!("================================================"); + eprintln!( + "inserted words: {}, registered words: {}, base set len: {}", + self.inserted_words, self.registered_words, self.base_set_length + ); + eprintln!("================================================"); + Ok((words_fst_mmap, prefix_fst_mmap)) } } From 079f2b5de0ea814620f1100fa6f4f064621e3c31 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 30 Sep 2024 11:34:31 +0200 Subject: [PATCH 097/247] Format error messages consistently --- milli/src/error.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/milli/src/error.rs b/milli/src/error.rs index 400d3d3be..e6856c4ef 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -31,23 +31,23 @@ pub enum Error { pub enum InternalError { #[error("{}", HeedError::DatabaseClosing)] DatabaseClosing, - #[error("Missing {} in the {db_name} database.", key.unwrap_or("key"))] + #[error("missing {} in the {db_name} database", key.unwrap_or("key"))] DatabaseMissingEntry { db_name: &'static str, key: Option<&'static str> }, - #[error("Missing {key} in the fieldids weights mapping.")] + #[error("missing {key} in the fieldids weights mapping")] FieldidsWeightsMapMissingEntry { key: FieldId }, #[error(transparent)] FieldIdMapMissingEntry(#[from] FieldIdMapMissingEntry), - #[error("Missing {key} in the field id mapping.")] + #[error("missing {key} in the field id mapping")] FieldIdMappingMissingEntry { key: FieldId }, #[error(transparent)] Fst(#[from] fst::Error), #[error(transparent)] DocumentsError(#[from] documents::Error), - #[error("Invalid compression type have been specified to grenad")] + #[error("invalid compression type have been specified to grenad")] GrenadInvalidCompressionType, - #[error("Invalid grenad file with an invalid version format")] + #[error("invalid grenad file with an invalid version format")] GrenadInvalidFormatVersion, - #[error("Invalid merge while processing {process}")] + #[error("invalid merge while processing {process}")] IndexingMergingKeys { process: &'static str }, #[error(transparent)] RayonThreadPool(#[from] ThreadPoolBuildError), From 4b0da0ff243e374f2ad347235c2733e5ec530ae3 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 30 Sep 2024 11:34:50 +0200 Subject: [PATCH 098/247] Fix inversion of field_id and position --- .../update/new/extract/searchable/extract_word_docids.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/milli/src/update/new/extract/searchable/extract_word_docids.rs b/milli/src/update/new/extract/searchable/extract_word_docids.rs index c5c3cd2a2..fb79de7b9 100644 --- a/milli/src/update/new/extract/searchable/extract_word_docids.rs +++ b/milli/src/update/new/extract/searchable/extract_word_docids.rs @@ -296,13 +296,13 @@ impl WordDocidsCachedSorters { buffer.clear(); buffer.extend_from_slice(word.as_bytes()); buffer.push(0); - buffer.extend_from_slice(&position.to_be_bytes()); + buffer.extend_from_slice(&field_id.to_be_bytes()); self.word_fid_docids.insert_add_u32(buffer, docid)?; buffer.clear(); buffer.extend_from_slice(word.as_bytes()); buffer.push(0); - buffer.extend_from_slice(&field_id.to_be_bytes()); + buffer.extend_from_slice(&position.to_be_bytes()); self.word_position_docids.insert_add_u32(buffer, docid)?; if self.current_docid.map_or(false, |id| docid != id) { @@ -337,13 +337,13 @@ impl WordDocidsCachedSorters { buffer.clear(); buffer.extend_from_slice(word.as_bytes()); buffer.push(0); - buffer.extend_from_slice(&position.to_be_bytes()); + 
buffer.extend_from_slice(&field_id.to_be_bytes()); self.word_fid_docids.insert_del_u32(buffer, docid)?; buffer.clear(); buffer.extend_from_slice(word.as_bytes()); buffer.push(0); - buffer.extend_from_slice(&field_id.to_be_bytes()); + buffer.extend_from_slice(&position.to_be_bytes()); self.word_position_docids.insert_del_u32(buffer, docid)?; if self.current_docid.map_or(false, |id| docid != id) { From 3d8024fb2bdbd697b18452d8fe5599ff0f540d71 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 30 Sep 2024 11:35:03 +0200 Subject: [PATCH 099/247] write the weighted fields ids map --- milli/src/update/new/indexer/mod.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/milli/src/update/new/indexer/mod.rs b/milli/src/update/new/indexer/mod.rs index b317aefca..19132b5db 100644 --- a/milli/src/update/new/indexer/mod.rs +++ b/milli/src/update/new/indexer/mod.rs @@ -17,6 +17,7 @@ use super::merger::merge_grenad_entries; use super::{StdResult, TopLevelMap}; use crate::documents::{PrimaryKey, DEFAULT_PRIMARY_KEY}; use crate::update::new::channel::ExtractorSender; +use crate::update::settings::InnerIndexSettings; use crate::update::GrenadParameters; use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, Result, UserError}; @@ -245,6 +246,11 @@ where let fields_ids_map = fields_ids_map_lock.into_inner().unwrap(); index.put_fields_ids_map(wtxn, &fields_ids_map)?; + // used to update the localized and weighted maps while sharing the update code with the settings pipeline. + let mut inner_index_settings = InnerIndexSettings::from_index(index, wtxn)?; + inner_index_settings.recompute_facets(wtxn, index)?; + inner_index_settings.recompute_searchables(wtxn, index)?; + Ok(()) } From 2b51a63418625d53da66af3deea40222ba98b1ec Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 30 Sep 2024 11:42:36 +0200 Subject: [PATCH 100/247] Remove dead code --- .../extract/searchable/extract_word_docids.rs | 175 +----------------- .../extract_word_pair_proximity_docids.rs | 1 - milli/src/update/new/indexer/mod.rs | 36 ---- 3 files changed, 1 insertion(+), 211 deletions(-) diff --git a/milli/src/update/new/extract/searchable/extract_word_docids.rs b/milli/src/update/new/extract/searchable/extract_word_docids.rs index fb79de7b9..20a69d4bc 100644 --- a/milli/src/update/new/extract/searchable/extract_word_docids.rs +++ b/milli/src/update/new/extract/searchable/extract_word_docids.rs @@ -1,4 +1,3 @@ -use std::borrow::Cow; use std::collections::HashMap; use std::fs::File; use std::num::NonZero; @@ -8,7 +7,6 @@ use heed::RoTxn; use rayon::iter::{IntoParallelIterator, ParallelIterator}; use super::tokenize_document::{tokenizer_builder, DocumentTokenizer}; -use super::SearchableExtractor; use crate::update::new::extract::cache::CboCachedSorter; use crate::update::new::extract::perm_json_p::contained_in; use crate::update::new::{DocumentChange, ItemsPool}; @@ -20,178 +18,6 @@ use crate::{ const MAX_COUNTED_WORDS: usize = 30; -trait ProtoWordDocidsExtractor { - fn build_key(field_id: FieldId, position: u16, word: &str) -> Cow<'_, [u8]>; - fn attributes_to_extract<'a>( - _rtxn: &'a RoTxn, - _index: &'a Index, - ) -> Result>>; - - fn attributes_to_skip<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result>; -} - -impl SearchableExtractor for T -where - T: ProtoWordDocidsExtractor, -{ - fn extract_document_change( - rtxn: &RoTxn, - index: &Index, - document_tokenizer: &DocumentTokenizer, - fields_ids_map: &mut GlobalFieldsIdsMap, - cached_sorter: &mut CboCachedSorter, - document_change: DocumentChange, - ) -> Result<()> { - match 
document_change { - DocumentChange::Deletion(inner) => { - let mut token_fn = |_fname: &str, fid, pos, word: &str| { - let key = Self::build_key(fid, pos, word); - cached_sorter.insert_del_u32(&key, inner.docid()).map_err(crate::Error::from) - }; - document_tokenizer.tokenize_document( - inner.current(rtxn, index)?.unwrap(), - fields_ids_map, - &mut token_fn, - )?; - } - DocumentChange::Update(inner) => { - let mut token_fn = |_fname: &str, fid, pos, word: &str| { - let key = Self::build_key(fid, pos, word); - cached_sorter.insert_del_u32(&key, inner.docid()).map_err(crate::Error::from) - }; - document_tokenizer.tokenize_document( - inner.current(rtxn, index)?.unwrap(), - fields_ids_map, - &mut token_fn, - )?; - - let mut token_fn = |_fname: &str, fid, pos, word: &str| { - let key = Self::build_key(fid, pos, word); - cached_sorter.insert_add_u32(&key, inner.docid()).map_err(crate::Error::from) - }; - document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?; - } - DocumentChange::Insertion(inner) => { - let mut token_fn = |_fname: &str, fid, pos, word: &str| { - let key = Self::build_key(fid, pos, word); - cached_sorter.insert_add_u32(&key, inner.docid()).map_err(crate::Error::from) - }; - document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?; - } - } - - Ok(()) - } - - fn attributes_to_extract<'a>( - rtxn: &'a RoTxn, - index: &'a Index, - ) -> Result>> { - Self::attributes_to_extract(rtxn, index) - } - - fn attributes_to_skip<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result> { - Self::attributes_to_skip(rtxn, index) - } -} - -pub struct WordDocidsExtractor; -impl ProtoWordDocidsExtractor for WordDocidsExtractor { - fn attributes_to_extract<'a>( - rtxn: &'a RoTxn, - index: &'a Index, - ) -> Result>> { - index.user_defined_searchable_fields(rtxn).map_err(Into::into) - } - - fn attributes_to_skip<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result> { - // exact attributes must be skipped and stored in a separate DB, see `ExactWordDocidsExtractor`. - index.exact_attributes(rtxn).map_err(Into::into) - } - - /// TODO write in an external Vec buffer - fn build_key(_field_id: FieldId, _position: u16, word: &str) -> Cow<[u8]> { - Cow::Borrowed(word.as_bytes()) - } -} - -pub struct ExactWordDocidsExtractor; -impl ProtoWordDocidsExtractor for ExactWordDocidsExtractor { - fn attributes_to_extract<'a>( - rtxn: &'a RoTxn, - index: &'a Index, - ) -> Result>> { - let exact_attributes = index.exact_attributes(rtxn)?; - // If there are no user-defined searchable fields, we return all exact attributes. - // Otherwise, we return the intersection of exact attributes and user-defined searchable fields. - if let Some(searchable_attributes) = index.user_defined_searchable_fields(rtxn)? 
{ - let attributes = exact_attributes - .into_iter() - .filter(|attr| searchable_attributes.contains(attr)) - .collect(); - Ok(Some(attributes)) - } else { - Ok(Some(exact_attributes)) - } - } - - fn attributes_to_skip<'a>(_rtxn: &'a RoTxn, _index: &'a Index) -> Result> { - Ok(vec![]) - } - - fn build_key(_field_id: FieldId, _position: u16, word: &str) -> Cow<[u8]> { - Cow::Borrowed(word.as_bytes()) - } -} - -pub struct WordFidDocidsExtractor; -impl ProtoWordDocidsExtractor for WordFidDocidsExtractor { - fn attributes_to_extract<'a>( - rtxn: &'a RoTxn, - index: &'a Index, - ) -> Result>> { - index.user_defined_searchable_fields(rtxn).map_err(Into::into) - } - - fn attributes_to_skip<'a>(_rtxn: &'a RoTxn, _index: &'a Index) -> Result> { - Ok(vec![]) - } - - fn build_key(field_id: FieldId, _position: u16, word: &str) -> Cow<[u8]> { - let mut key = Vec::new(); - key.extend_from_slice(word.as_bytes()); - key.push(0); - key.extend_from_slice(&field_id.to_be_bytes()); - Cow::Owned(key) - } -} - -pub struct WordPositionDocidsExtractor; -impl ProtoWordDocidsExtractor for WordPositionDocidsExtractor { - fn attributes_to_extract<'a>( - rtxn: &'a RoTxn, - index: &'a Index, - ) -> Result>> { - index.user_defined_searchable_fields(rtxn).map_err(Into::into) - } - - fn attributes_to_skip<'a>(_rtxn: &'a RoTxn, _index: &'a Index) -> Result> { - Ok(vec![]) - } - - fn build_key(_field_id: FieldId, position: u16, word: &str) -> Cow<[u8]> { - // position must be bucketed to reduce the number of keys in the DB. - let position = bucketed_position(position); - let mut key = Vec::new(); - key.extend_from_slice(word.as_bytes()); - key.push(0); - key.extend_from_slice(&position.to_be_bytes()); - Cow::Owned(key) - } -} - -// V2 - struct WordDocidsCachedSorters { word_fid_docids: CboCachedSorter, word_docids: CboCachedSorter, @@ -340,6 +166,7 @@ impl WordDocidsCachedSorters { buffer.extend_from_slice(&field_id.to_be_bytes()); self.word_fid_docids.insert_del_u32(buffer, docid)?; + let position = bucketed_position(position); buffer.clear(); buffer.extend_from_slice(word.as_bytes()); buffer.push(0); diff --git a/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs b/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs index 5736fc1d4..7d3655be8 100644 --- a/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs +++ b/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs @@ -2,7 +2,6 @@ use std::collections::VecDeque; use std::rc::Rc; use heed::RoTxn; -use itertools::merge_join_by; use obkv::KvReader; use super::tokenize_document::DocumentTokenizer; diff --git a/milli/src/update/new/indexer/mod.rs b/milli/src/update/new/indexer/mod.rs index 19132b5db..5187e4f4c 100644 --- a/milli/src/update/new/indexer/mod.rs +++ b/milli/src/update/new/indexer/mod.rs @@ -133,42 +133,6 @@ where extractor_sender.send_searchable::(fid_word_count_docids).unwrap(); } - // { - // let span = tracing::trace_span!(target: "indexing::documents::extract", "exact_word_docids"); - // let _entered = span.enter(); - // extract_and_send_docids::( - // index, - // &global_fields_ids_map, - // grenad_parameters, - // document_changes.clone(), - // &extractor_sender, - // )?; - // } - - // { - // let span = tracing::trace_span!(target: "indexing::documents::extract", "word_position_docids"); - // let _entered = span.enter(); - // extract_and_send_docids::( - // index, - // &global_fields_ids_map, - // grenad_parameters, - // document_changes.clone(), - // 
&extractor_sender, - // )?; - // } - - // { - // let span = tracing::trace_span!(target: "indexing::documents::extract", "fid_word_count_docids"); - // let _entered = span.enter(); - // extract_and_send_docids::( - // index, - // &global_fields_ids_map, - // GrenadParameters::default(), - // document_changes.clone(), - // &extractor_sender, - // )?; - // } - { let span = tracing::trace_span!(target: "indexing::documents::extract", "word_pair_proximity_docids"); let _entered = span.enter(); From 5b552caf42f18a555655b3c09f01b4a1d396a10a Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 30 Sep 2024 11:46:32 +0200 Subject: [PATCH 101/247] Fix position in insertions --- milli/src/update/new/extract/searchable/extract_word_docids.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/milli/src/update/new/extract/searchable/extract_word_docids.rs b/milli/src/update/new/extract/searchable/extract_word_docids.rs index 20a69d4bc..caab170a4 100644 --- a/milli/src/update/new/extract/searchable/extract_word_docids.rs +++ b/milli/src/update/new/extract/searchable/extract_word_docids.rs @@ -125,6 +125,7 @@ impl WordDocidsCachedSorters { buffer.extend_from_slice(&field_id.to_be_bytes()); self.word_fid_docids.insert_add_u32(buffer, docid)?; + let position = bucketed_position(position); buffer.clear(); buffer.extend_from_slice(word.as_bytes()); buffer.push(0); From 8df6daf30836b6ded0e1823e60f280e46c21d362 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 30 Sep 2024 11:52:31 +0200 Subject: [PATCH 102/247] Remove fid_wordcount_docids.rs --- .../extract_fid_word_count_docids.rs | 124 ------------------ .../src/update/new/extract/searchable/mod.rs | 1 - 2 files changed, 125 deletions(-) delete mode 100644 milli/src/update/new/extract/searchable/extract_fid_word_count_docids.rs diff --git a/milli/src/update/new/extract/searchable/extract_fid_word_count_docids.rs b/milli/src/update/new/extract/searchable/extract_fid_word_count_docids.rs deleted file mode 100644 index b4cf50190..000000000 --- a/milli/src/update/new/extract/searchable/extract_fid_word_count_docids.rs +++ /dev/null @@ -1,124 +0,0 @@ -use std::collections::HashMap; - -use heed::RoTxn; - -use super::tokenize_document::DocumentTokenizer; -use super::SearchableExtractor; -use crate::update::new::extract::cache::CboCachedSorter; -use crate::update::new::DocumentChange; -use crate::update::MergeDeladdCboRoaringBitmaps; -use crate::{FieldId, GlobalFieldsIdsMap, Index, Result}; - -const MAX_COUNTED_WORDS: usize = 30; - -pub struct FidWordCountDocidsExtractor; -impl SearchableExtractor for FidWordCountDocidsExtractor { - fn attributes_to_extract<'a>( - rtxn: &'a RoTxn, - index: &'a Index, - ) -> Result>> { - index.user_defined_searchable_fields(rtxn).map_err(Into::into) - } - - fn attributes_to_skip<'a>(_rtxn: &'a RoTxn, _index: &'a Index) -> Result> { - Ok(vec![]) - } - - // This method is reimplemented to count the number of words in the document in each field - // and to store the docids of the documents that have a number of words in a given field equal to or under than MAX_COUNTED_WORDS. 
- fn extract_document_change( - rtxn: &RoTxn, - index: &Index, - document_tokenizer: &DocumentTokenizer, - fields_ids_map: &mut GlobalFieldsIdsMap, - cached_sorter: &mut CboCachedSorter, - document_change: DocumentChange, - ) -> Result<()> { - let mut key_buffer = Vec::new(); - match document_change { - DocumentChange::Deletion(inner) => { - let mut fid_word_count = HashMap::new(); - let mut token_fn = |_fname: &str, fid: FieldId, _pos: u16, _word: &str| { - fid_word_count.entry(fid).and_modify(|count| *count += 1).or_insert(1); - Ok(()) - }; - document_tokenizer.tokenize_document( - inner.current(rtxn, index)?.unwrap(), - fields_ids_map, - &mut token_fn, - )?; - - // The docids of the documents that have a number of words in a given field equal to or under than MAX_COUNTED_WORDS are deleted. - for (fid, count) in fid_word_count.iter() { - if *count <= MAX_COUNTED_WORDS { - let key = build_key(*fid, *count as u8, &mut key_buffer); - cached_sorter.insert_del_u32(key, inner.docid())?; - } - } - } - DocumentChange::Update(inner) => { - let mut fid_word_count = HashMap::new(); - let mut token_fn = |_fname: &str, fid: FieldId, _pos: u16, _word: &str| { - fid_word_count - .entry(fid) - .and_modify(|(current_count, _new_count)| *current_count += 1) - .or_insert((1, 0)); - Ok(()) - }; - document_tokenizer.tokenize_document( - inner.current(rtxn, index)?.unwrap(), - fields_ids_map, - &mut token_fn, - )?; - - let mut token_fn = |_fname: &str, fid: FieldId, _pos: u16, _word: &str| { - fid_word_count - .entry(fid) - .and_modify(|(_current_count, new_count)| *new_count += 1) - .or_insert((0, 1)); - Ok(()) - }; - document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?; - - // Only the fields that have a change in the number of words are updated. - for (fid, (current_count, new_count)) in fid_word_count.iter() { - if *current_count != *new_count { - if *current_count <= MAX_COUNTED_WORDS { - let key = build_key(*fid, *current_count as u8, &mut key_buffer); - cached_sorter.insert_del_u32(key, inner.docid())?; - } - if *new_count <= MAX_COUNTED_WORDS { - let key = build_key(*fid, *new_count as u8, &mut key_buffer); - cached_sorter.insert_add_u32(key, inner.docid())?; - } - } - } - } - DocumentChange::Insertion(inner) => { - let mut fid_word_count = HashMap::new(); - let mut token_fn = |_fname: &str, fid: FieldId, _pos: u16, _word: &str| { - fid_word_count.entry(fid).and_modify(|count| *count += 1).or_insert(1); - Ok(()) - }; - document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?; - - // The docids of the documents that have a number of words in a given field equal to or under than MAX_COUNTED_WORDS are stored. 
- for (fid, count) in fid_word_count.iter() { - if *count <= MAX_COUNTED_WORDS { - let key = build_key(*fid, *count as u8, &mut key_buffer); - cached_sorter.insert_add_u32(key, inner.docid())?; - } - } - } - } - - Ok(()) - } -} - -fn build_key(fid: FieldId, count: u8, key_buffer: &mut Vec) -> &[u8] { - key_buffer.clear(); - key_buffer.extend_from_slice(&fid.to_be_bytes()); - key_buffer.push(count); - key_buffer.as_slice() -} diff --git a/milli/src/update/new/extract/searchable/mod.rs b/milli/src/update/new/extract/searchable/mod.rs index fe7480fa3..c79bd4766 100644 --- a/milli/src/update/new/extract/searchable/mod.rs +++ b/milli/src/update/new/extract/searchable/mod.rs @@ -1,4 +1,3 @@ -mod extract_fid_word_count_docids; mod extract_word_docids; mod extract_word_pair_proximity_docids; mod tokenize_document; From 64589278ac2e5cb74d16b3ba0e5e5a17b606d8cd Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 30 Sep 2024 16:08:29 +0200 Subject: [PATCH 103/247] Appease *some* of clippy warnings --- milli/src/update/index_documents/transform.rs | 2 +- milli/src/update/new/channel.rs | 2 +- milli/src/update/new/extract/mod.rs | 8 ++++---- .../update/new/extract/searchable/tokenize_document.rs | 4 ++-- milli/src/update/new/indexer/document_operation.rs | 2 +- milli/src/update/new/indexer/mod.rs | 5 +++-- milli/src/update/new/indexer/partial_dump.rs | 2 +- 7 files changed, 13 insertions(+), 12 deletions(-) diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 04c9e9256..65007aa32 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -122,7 +122,7 @@ impl<'a, 'i> Transform<'a, 'i> { // We initialize the sorter with the user indexing settings. 
let original_sorter = create_sorter( grenad::SortAlgorithm::Stable, - merge_function.clone(), + merge_function, indexer_settings.chunk_compression_type, indexer_settings.chunk_compression_level, indexer_settings.max_nb_chunks, diff --git a/milli/src/update/new/channel.rs b/milli/src/update/new/channel.rs index 237581cb3..d9823096e 100644 --- a/milli/src/update/new/channel.rs +++ b/milli/src/update/new/channel.rs @@ -518,7 +518,7 @@ impl DocumentSender<'_> { impl Drop for DocumentSender<'_> { fn drop(&mut self) { if let Some(sender) = self.0.take() { - sender.send(MergerOperation::FinishedDocument); + let _ = sender.send(MergerOperation::FinishedDocument); } } } diff --git a/milli/src/update/new/extract/mod.rs b/milli/src/update/new/extract/mod.rs index d1f6bb787..6e60a4063 100644 --- a/milli/src/update/new/extract/mod.rs +++ b/milli/src/update/new/extract/mod.rs @@ -58,7 +58,7 @@ pub mod perm_json_p { seeker: &mut impl FnMut(&str, &Value) -> Result<()>, ) -> Result<()> { if value.is_empty() { - seeker(&base_key, &Value::Object(Map::with_capacity(0)))?; + seeker(base_key, &Value::Object(Map::with_capacity(0)))?; } for (key, value) in value.iter() { @@ -103,7 +103,7 @@ pub mod perm_json_p { seeker: &mut impl FnMut(&str, &Value) -> Result<()>, ) -> Result<()> { if values.is_empty() { - seeker(&base_key, &Value::Array(vec![]))?; + seeker(base_key, &Value::Array(vec![]))?; } for value in values { @@ -128,10 +128,10 @@ pub mod perm_json_p { ) -> bool { selectors.map_or(true, |selectors| { selectors.iter().any(|selector| { - contained_in(selector, &field_name) || contained_in(&field_name, selector) + contained_in(selector, field_name) || contained_in(field_name, selector) }) }) && !skip_selectors.iter().any(|skip_selector| { - contained_in(skip_selector, &field_name) || contained_in(&field_name, skip_selector) + contained_in(skip_selector, field_name) || contained_in(field_name, skip_selector) }) } } diff --git a/milli/src/update/new/extract/searchable/tokenize_document.rs b/milli/src/update/new/extract/searchable/tokenize_document.rs index d2795114e..fda619013 100644 --- a/milli/src/update/new/extract/searchable/tokenize_document.rs +++ b/milli/src/update/new/extract/searchable/tokenize_document.rs @@ -48,7 +48,7 @@ impl<'a> DocumentTokenizer<'a> { .entry(field_id) .and_modify(|counter| *counter += MAX_DISTANCE) .or_insert(0); - if *position as u32 >= self.max_positions_per_attributes { + if *position >= self.max_positions_per_attributes { return Ok(()); } @@ -72,7 +72,7 @@ impl<'a> DocumentTokenizer<'a> { *position, self.tokenizer.tokenize_with_allow_list(text.as_str(), locales), ) - .take_while(|(p, _)| (*p as u32) < self.max_positions_per_attributes); + .take_while(|(p, _)| *p < self.max_positions_per_attributes); for (index, token) in tokens { // keep a word only if it is not empty and fit in a LMDB key. 
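Editorial sketch (not part of the patch series): the `perm_json_p` hunk above reorders the `contained_in` checks without changing the selection rule — a field is extracted when it matches, nests under, or is a parent of one of the selectors, and matches none of the skip selectors. The code below is illustrative only: `must_extract` and the local `contained_in` are stand-ins written from how the helpers are used in this hunk, so the real helpers in `extract/mod.rs` may differ in detail, and the settings in the usage example (searchable `"doggo"`, skipped `"doggo.age"`) are hypothetical.

// Assumed reading of the helper: `selector` is `key` itself or a field nested under it,
// e.g. "doggo.name" is contained in "doggo".
fn contained_in(selector: &str, key: &str) -> bool {
    selector.starts_with(key)
        && selector[key.len()..].chars().next().map_or(true, |c| c == '.')
}

fn must_extract(field_name: &str, selectors: Option<&[&str]>, skip_selectors: &[&str]) -> bool {
    // Same shape as the patched expression: selected by at least one selector (or no
    // selector list at all), and rejected by none of the skip selectors.
    selectors.map_or(true, |selectors| {
        selectors.iter().any(|selector| {
            contained_in(selector, field_name) || contained_in(field_name, selector)
        })
    }) && !skip_selectors.iter().any(|skip_selector| {
        contained_in(skip_selector, field_name) || contained_in(field_name, skip_selector)
    })
}

fn selector_sketch() {
    // Hypothetical settings: only "doggo" is searchable, "doggo.age" is skipped
    // (for instance because it is routed to a separate exact-attributes database).
    assert!(must_extract("doggo.name", Some(&["doggo"]), &["doggo.age"]));
    assert!(!must_extract("doggo.age", Some(&["doggo"]), &["doggo.age"]));
    assert!(!must_extract("breed", Some(&["doggo"]), &["doggo.age"]));
}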
diff --git a/milli/src/update/new/indexer/document_operation.rs b/milli/src/update/new/indexer/document_operation.rs index f088370fb..572ea8528 100644 --- a/milli/src/update/new/indexer/document_operation.rs +++ b/milli/src/update/new/indexer/document_operation.rs @@ -2,7 +2,7 @@ use std::borrow::Cow; use std::collections::{BTreeMap, HashMap}; use std::sync::Arc; -use heed::types::{Bytes, DecodeIgnore}; +use heed::types::Bytes; use heed::RoTxn; use memmap2::Mmap; use rayon::iter::{IndexedParallelIterator, IntoParallelIterator, ParallelIterator}; diff --git a/milli/src/update/new/indexer/mod.rs b/milli/src/update/new/indexer/mod.rs index 5187e4f4c..57821c51a 100644 --- a/milli/src/update/new/indexer/mod.rs +++ b/milli/src/update/new/indexer/mod.rs @@ -6,7 +6,7 @@ pub use document_deletion::DocumentDeletion; pub use document_operation::DocumentOperation; use heed::{RoTxn, RwTxn}; pub use partial_dump::PartialDump; -use rayon::iter::{IndexedParallelIterator, IntoParallelIterator, ParallelIterator}; +use rayon::iter::{IndexedParallelIterator, IntoParallelIterator}; use rayon::ThreadPool; pub use update_by_function::UpdateByFunction; @@ -229,7 +229,8 @@ fn extract_and_send_docids( sender: &ExtractorSender, ) -> Result<()> { let merger = E::run_extraction(index, fields_ids_map, indexer, document_changes)?; - Ok(sender.send_searchable::(merger).unwrap()) + sender.send_searchable::(merger).unwrap(); + Ok(()) } /// Returns the primary key *field id* that has already been set for this index or the diff --git a/milli/src/update/new/indexer/partial_dump.rs b/milli/src/update/new/indexer/partial_dump.rs index 5f8743e31..43a89c46c 100644 --- a/milli/src/update/new/indexer/partial_dump.rs +++ b/milli/src/update/new/indexer/partial_dump.rs @@ -1,4 +1,4 @@ -use rayon::iter::{IndexedParallelIterator, ParallelBridge, ParallelIterator}; +use rayon::iter::IndexedParallelIterator; use super::DocumentChanges; use crate::documents::{DocumentIdExtractionError, PrimaryKey}; From bb7a503e5d34b9bc1757a7b8a95002ebb8c9fdde Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 1 Oct 2024 09:56:49 +0200 Subject: [PATCH 104/247] Compute prefix databases We are now computing the prefix FST and a prefix delta in the Merger thread, after all the databases are written, the main thread will recompute the prefix databases based on the prefix delta without needing any grenad temporary file anymore --- milli/src/lib.rs | 1 + milli/src/update/new/channel.rs | 13 +- milli/src/update/new/indexer/mod.rs | 21 +- milli/src/update/new/merger.rs | 31 ++- milli/src/update/new/mod.rs | 1 + milli/src/update/new/word_fst_builder.rs | 215 +++++++++++++------- milli/src/update/new/words_prefix_docids.rs | 108 ++++++++++ 7 files changed, 313 insertions(+), 77 deletions(-) create mode 100644 milli/src/update/new/words_prefix_docids.rs diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 45418c074..48b03b6cc 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -88,6 +88,7 @@ pub type Object = serde_json::Map; pub type Position = u32; pub type RelativePosition = u16; pub type SmallString32 = smallstr::SmallString<[u8; 32]>; +pub type Prefix = smallstr::SmallString<[u8; 16]>; pub type SmallVec16 = smallvec::SmallVec<[T; 16]>; pub type SmallVec32 = smallvec::SmallVec<[T; 32]>; pub type SmallVec8 = smallvec::SmallVec<[T; 8]>; diff --git a/milli/src/update/new/channel.rs b/milli/src/update/new/channel.rs index d9823096e..10c0a706b 100644 --- a/milli/src/update/new/channel.rs +++ b/milli/src/update/new/channel.rs @@ -8,7 +8,7 @@ use memmap2::Mmap; 
use super::extract::FacetKind; use super::StdResult; -use crate::index::main_key::{DOCUMENTS_IDS_KEY, WORDS_FST_KEY}; +use crate::index::main_key::{DOCUMENTS_IDS_KEY, WORDS_FST_KEY, WORDS_PREFIXES_FST_KEY}; use crate::update::new::KvReaderFieldId; use crate::update::MergeDeladdCboRoaringBitmaps; use crate::{DocumentId, Index}; @@ -257,6 +257,17 @@ impl MainSender<'_> { } } + pub fn write_words_prefixes_fst(&self, value: Mmap) -> StdResult<(), SendError<()>> { + let entry = EntryOperation::Write(KeyValueEntry::from_large_key_value( + WORDS_PREFIXES_FST_KEY.as_bytes(), + value, + )); + match self.0.send(WriterOperation { database: Database::Main, entry }) { + Ok(()) => Ok(()), + Err(SendError(_)) => Err(SendError(())), + } + } + pub fn delete(&self, key: &[u8]) -> StdResult<(), SendError<()>> { let entry = EntryOperation::Delete(KeyEntry::from_key(key)); match self.0.send(WriterOperation { database: Database::Main, entry }) { diff --git a/milli/src/update/new/indexer/mod.rs b/milli/src/update/new/indexer/mod.rs index 57821c51a..e30333b3a 100644 --- a/milli/src/update/new/indexer/mod.rs +++ b/milli/src/update/new/indexer/mod.rs @@ -14,6 +14,10 @@ use super::channel::*; use super::document_change::DocumentChange; use super::extract::*; use super::merger::merge_grenad_entries; +use super::word_fst_builder::PrefixDelta; +use super::words_prefix_docids::{ + compute_word_prefix_docids, compute_word_prefix_fid_docids, compute_word_prefix_position_docids, +}; use super::{StdResult, TopLevelMap}; use crate::documents::{PrimaryKey, DEFAULT_PRIMARY_KEY}; use crate::update::new::channel::ExtractorSender; @@ -174,7 +178,7 @@ where // TODO manage the errors correctly let current_span = tracing::Span::current(); - let handle2 = Builder::new().name(S("indexer-merger")).spawn_scoped(s, move || { + let merger_thread = Builder::new().name(S("indexer-merger")).spawn_scoped(s, move || { let span = tracing::trace_span!(target: "indexing::documents", parent: ¤t_span, "merge"); let _entered = span.enter(); @@ -202,7 +206,20 @@ where /// TODO handle the panicking threads handle.join().unwrap()?; - handle2.join().unwrap()?; + let merger_result = merger_thread.join().unwrap()?; + + if let Some(prefix_delta) = merger_result.prefix_delta { + let span = tracing::trace_span!(target: "indexing", "prefix"); + let _entered = span.enter(); + + let PrefixDelta { modified, deleted } = prefix_delta; + // Compute word prefix docids + compute_word_prefix_docids(wtxn, index, &modified, &deleted)?; + // Compute word prefix fid docids + compute_word_prefix_fid_docids(wtxn, index, &modified, &deleted)?; + // Compute word prefix position docids + compute_word_prefix_position_docids(wtxn, index, &modified, &deleted)?; + } Ok(()) as Result<_> })?; diff --git a/milli/src/update/new/merger.rs b/milli/src/update/new/merger.rs index 7e1a80888..0d80f75ec 100644 --- a/milli/src/update/new/merger.rs +++ b/milli/src/update/new/merger.rs @@ -6,15 +6,17 @@ use grenad::Merger; use heed::types::Bytes; use heed::{Database, RoTxn}; use roaring::RoaringBitmap; +use std::collections::HashSet; use super::channel::*; use super::extract::FacetKind; +use super::word_fst_builder::{PrefixData, PrefixDelta, PrefixSettings}; use super::{Deletion, DocumentChange, Insertion, KvReaderDelAdd, KvReaderFieldId, Update}; use crate::update::del_add::DelAdd; use crate::update::new::channel::MergerOperation; use crate::update::new::word_fst_builder::WordFstBuilder; use crate::update::MergeDeladdCboRoaringBitmaps; -use crate::{CboRoaringBitmapCodec, Error, GeoPoint, 
GlobalFieldsIdsMap, Index, Result}; +use crate::{CboRoaringBitmapCodec, Error, GeoPoint, GlobalFieldsIdsMap, Index, Prefix, Result}; /// TODO We must return some infos/stats #[tracing::instrument(level = "trace", skip_all, target = "indexing::documents", name = "merge")] @@ -24,10 +26,11 @@ pub fn merge_grenad_entries( rtxn: &RoTxn, index: &Index, mut global_fields_ids_map: GlobalFieldsIdsMap<'_>, -) -> Result<()> { +) -> Result { let mut buffer: Vec = Vec::new(); let mut documents_ids = index.documents_ids(rtxn)?; let mut geo_extractor = GeoExtractor::new(rtxn, index)?; + let mut merger_result = MergerResult::default(); for merger_operation in receiver { match merger_operation { @@ -59,7 +62,15 @@ pub fn merge_grenad_entries( } MergerOperation::WordDocidsMerger(merger) => { let words_fst = index.words_fst(rtxn)?; - let mut word_fst_builder = WordFstBuilder::new(&words_fst, 4)?; + let mut word_fst_builder = WordFstBuilder::new(&words_fst)?; + /// TODO make this configurable + let prefix_settings = PrefixSettings { + compute_prefixes: true, + max_prefix_length: 4, + prefix_count_threshold: 100, + }; + word_fst_builder.with_prefix_settings(prefix_settings); + { let span = tracing::trace_span!(target: "indexing::documents::merge", "word_docids"); @@ -80,8 +91,12 @@ pub fn merge_grenad_entries( tracing::trace_span!(target: "indexing::documents::merge", "words_fst"); let _entered = span.enter(); - let (word_fst_mmap, prefix_fst_mmap) = word_fst_builder.build()?; + let (word_fst_mmap, prefix_data) = word_fst_builder.build(index, rtxn)?; sender.main().write_words_fst(word_fst_mmap).unwrap(); + if let Some(PrefixData { prefixes_fst_mmap, prefix_delta }) = prefix_data { + sender.main().write_words_prefixes_fst(prefixes_fst_mmap).unwrap(); + merger_result.prefix_delta = Some(prefix_delta); + } } } MergerOperation::WordFidDocidsMerger(merger) => { @@ -185,7 +200,13 @@ pub fn merge_grenad_entries( // ... 
- Ok(()) + Ok(merger_result) +} + +#[derive(Default, Debug)] +pub struct MergerResult { + /// The delta of the prefixes + pub prefix_delta: Option, } pub struct GeoExtractor { diff --git a/milli/src/update/new/mod.rs b/milli/src/update/new/mod.rs index dedd89497..98b60378f 100644 --- a/milli/src/update/new/mod.rs +++ b/milli/src/update/new/mod.rs @@ -13,6 +13,7 @@ mod items_pool; mod merger; mod top_level_map; mod word_fst_builder; +mod words_prefix_docids; /// TODO move them elsewhere pub type StdResult = std::result::Result; diff --git a/milli/src/update/new/word_fst_builder.rs b/milli/src/update/new/word_fst_builder.rs index 1e02bdc3b..6c415c17e 100644 --- a/milli/src/update/new/word_fst_builder.rs +++ b/milli/src/update/new/word_fst_builder.rs @@ -2,50 +2,37 @@ use std::{fs::File, io::BufWriter}; use fst::{Set, SetBuilder, Streamer}; use memmap2::Mmap; +use std::collections::HashSet; use tempfile::tempfile; -use crate::{update::del_add::DelAdd, Result, SmallString32}; +use crate::{update::del_add::DelAdd, Prefix, Result}; pub struct WordFstBuilder<'a> { stream: Option>, word_fst_builder: SetBuilder>, - /// TODO: Replace the full memory allocation - prefix_fst_builders: Vec>>, - max_prefix_length: usize, last_word: Option>, - current_prefix: Vec, - current_prefix_count: Vec, - prefix_count_threshold: u64, + prefix_fst_builder: Option, inserted_words: usize, registered_words: usize, - base_set_length: usize, } impl<'a> WordFstBuilder<'a> { - pub fn new( - words_fst: &'a Set>, - max_prefix_length: usize, - ) -> Result { - let mut prefix_fst_builders = Vec::new(); - for _ in 0..max_prefix_length { - prefix_fst_builders.push(SetBuilder::memory()); - } - + pub fn new(words_fst: &'a Set>) -> Result { Ok(Self { stream: Some(words_fst.stream()), word_fst_builder: SetBuilder::new(BufWriter::new(tempfile()?))?, - prefix_fst_builders, - max_prefix_length, + prefix_fst_builder: None, last_word: None, - current_prefix: vec![SmallString32::new(); max_prefix_length], - current_prefix_count: vec![0; max_prefix_length], - prefix_count_threshold: 100, inserted_words: 0, registered_words: 0, - base_set_length: words_fst.len(), }) } + pub fn with_prefix_settings(&mut self, prefix_settings: PrefixSettings) -> &Self { + self.prefix_fst_builder = PrefixFstBuilder::new(prefix_settings); + self + } + pub fn register_word(&mut self, deladd: DelAdd, right: &[u8]) -> Result<()> { if deladd == DelAdd::Addition { self.registered_words += 1; @@ -85,7 +72,7 @@ impl<'a> WordFstBuilder<'a> { // If we reach this point, it means that the stream is empty // and we need to insert the incoming word - self.insert_word(right)?; + self.insert_word(right, deladd, true)?; self.stream = Some(stream); } @@ -104,26 +91,18 @@ impl<'a> WordFstBuilder<'a> { match left.cmp(right) { std::cmp::Ordering::Less => { // We need to insert the last word from the current fst - self.insert_word(left)?; + self.insert_word(left, DelAdd::Addition, false)?; left_inserted = true; } std::cmp::Ordering::Equal => { - // Addition: We insert the word - // Deletion: We delete the word by not inserting it - if deladd == DelAdd::Addition { - self.insert_word(right)?; - } + self.insert_word(right, deladd, true)?; left_inserted = true; right_inserted = true; } std::cmp::Ordering::Greater => { - // Addition: We insert the word and keep the last word - // Deletion: We keep the current word until the left word to delete is greater or equal - if deladd == DelAdd::Addition { - self.insert_word(right)?; - } + self.insert_word(right, deladd, true)?; right_inserted = 
true; } @@ -132,14 +111,111 @@ impl<'a> WordFstBuilder<'a> { Ok((left_inserted, right_inserted)) } - fn insert_word(&mut self, bytes: &[u8]) -> Result<()> { - self.inserted_words += 1; - self.word_fst_builder.insert(bytes)?; + fn insert_word(&mut self, bytes: &[u8], deladd: DelAdd, is_modified: bool) -> Result<()> { + // Addition: We insert the word + // Deletion: We delete the word by not inserting it + if deladd == DelAdd::Addition { + self.inserted_words += 1; + self.word_fst_builder.insert(bytes)?; + } + if let Some(prefix_fst_builder) = self.prefix_fst_builder.as_mut() { + prefix_fst_builder.insert_word(bytes, deladd, is_modified)?; + } + + Ok(()) + } + + fn drain_stream(&mut self) -> Result<()> { + if let Some(mut stream) = self.stream.take() { + while let Some(current) = stream.next() { + self.insert_word(current, DelAdd::Addition, false)?; + } + } + + Ok(()) + } + + pub fn build( + mut self, + index: &crate::Index, + rtxn: &heed::RoTxn, + ) -> Result<(Mmap, Option)> { + self.drain_stream()?; + + /// TODO: ugly unwrap + let words_fst_file = self.word_fst_builder.into_inner()?.into_inner().unwrap(); + let words_fst_mmap = unsafe { Mmap::map(&words_fst_file)? }; + + let prefix_data = self + .prefix_fst_builder + .map(|prefix_fst_builder| prefix_fst_builder.build(index, rtxn)) + .transpose()?; + + Ok((words_fst_mmap, prefix_data)) + } +} + +#[derive(Debug)] +pub struct PrefixSettings { + pub prefix_count_threshold: u64, + pub max_prefix_length: usize, + pub compute_prefixes: bool, +} + +pub struct PrefixData { + pub prefixes_fst_mmap: Mmap, + pub prefix_delta: PrefixDelta, +} + +#[derive(Debug)] +pub struct PrefixDelta { + pub modified: HashSet, + pub deleted: HashSet, +} + +struct PrefixFstBuilder { + prefix_count_threshold: u64, + max_prefix_length: usize, + /// TODO: Replace the full memory allocation + prefix_fst_builders: Vec>>, + current_prefix: Vec, + current_prefix_count: Vec, + modified_prefixes: HashSet, + current_prefix_is_modified: Vec, +} + +impl PrefixFstBuilder { + pub fn new(prefix_settings: PrefixSettings) -> Option { + let PrefixSettings { prefix_count_threshold, max_prefix_length, compute_prefixes } = + prefix_settings; + + if !compute_prefixes { + return None; + } + + let mut prefix_fst_builders = Vec::new(); + for _ in 0..max_prefix_length { + prefix_fst_builders.push(SetBuilder::memory()); + } + + Some(Self { + prefix_count_threshold, + max_prefix_length, + prefix_fst_builders, + current_prefix: vec![Prefix::new(); max_prefix_length], + current_prefix_count: vec![0; max_prefix_length], + modified_prefixes: HashSet::new(), + current_prefix_is_modified: vec![false; max_prefix_length], + }) + } + + fn insert_word(&mut self, bytes: &[u8], deladd: DelAdd, is_modified: bool) -> Result<()> { for n in 0..self.max_prefix_length { let current_prefix = &mut self.current_prefix[n]; let current_prefix_count = &mut self.current_prefix_count[n]; let builder = &mut self.prefix_fst_builders[n]; + let current_prefix_is_modified = &mut self.current_prefix_is_modified[n]; // We try to get the first n bytes out of this string but we only want // to split at valid characters bounds. If we try to split in the middle of @@ -153,43 +229,36 @@ impl<'a> WordFstBuilder<'a> { // This is the first iteration of the loop, // or the current word doesn't starts with the current prefix. 
if *current_prefix_count == 0 || prefix != current_prefix.as_str() { - *current_prefix = SmallString32::from(prefix); + *current_prefix = Prefix::from(prefix); *current_prefix_count = 0; + *current_prefix_is_modified = false; } - *current_prefix_count += 1; + *current_prefix_is_modified |= is_modified; + + if deladd == DelAdd::Addition { + *current_prefix_count += 1; + } // There is enough words corresponding to this prefix to add it to the cache. - /// TODO: (LEGACY) Replace this by `==` to avoid inserting several times the same prefix? - if *current_prefix_count >= self.prefix_count_threshold { + if *current_prefix_count == self.prefix_count_threshold { builder.insert(prefix)?; + + if *current_prefix_is_modified { + self.modified_prefixes.insert(current_prefix.clone()); + } } } Ok(()) } - fn drain_stream(&mut self) -> Result<()> { - if let Some(mut stream) = self.stream.take() { - while let Some(current) = stream.next() { - self.insert_word(current)?; - } - } - - Ok(()) - } - - pub fn build(mut self) -> Result<(Mmap, Mmap)> { - self.drain_stream()?; - - /// TODO: ugly unwrap - let words_fst_file = self.word_fst_builder.into_inner()?.into_inner().unwrap(); - let words_fst_mmap = unsafe { Mmap::map(&words_fst_file)? }; - + fn build(self, index: &crate::Index, rtxn: &heed::RoTxn) -> Result { // We merge all of the previously computed prefixes into on final set. let mut prefix_fsts = Vec::new(); - for builder in self.prefix_fst_builders { - prefix_fsts.push(builder.into_set()); + for builder in self.prefix_fst_builders.into_iter() { + let prefix_fst = builder.into_set(); + prefix_fsts.push(prefix_fst); } let op = fst::set::OpBuilder::from_iter(prefix_fsts.iter()); let mut builder = SetBuilder::new(BufWriter::new(tempfile()?))?; @@ -197,14 +266,22 @@ impl<'a> WordFstBuilder<'a> { /// TODO: ugly unwrap let prefix_fst_file = builder.into_inner()?.into_inner().unwrap(); let prefix_fst_mmap = unsafe { Mmap::map(&prefix_fst_file)? 
}; + let new_prefix_fst = Set::new(&prefix_fst_mmap)?; + let old_prefix_fst = index.words_prefixes_fst(rtxn)?; + let mut deleted_prefixes = HashSet::new(); + { + let mut deleted_prefixes_stream = old_prefix_fst.op().add(&new_prefix_fst).difference(); + while let Some(prefix) = deleted_prefixes_stream.next() { + deleted_prefixes.insert(Prefix::from(std::str::from_utf8(prefix)?)); + } + } - eprintln!("================================================"); - eprintln!( - "inserted words: {}, registered words: {}, base set len: {}", - self.inserted_words, self.registered_words, self.base_set_length - ); - eprintln!("================================================"); - - Ok((words_fst_mmap, prefix_fst_mmap)) + Ok(PrefixData { + prefixes_fst_mmap: prefix_fst_mmap, + prefix_delta: PrefixDelta { + modified: self.modified_prefixes, + deleted: deleted_prefixes, + }, + }) } } diff --git a/milli/src/update/new/words_prefix_docids.rs b/milli/src/update/new/words_prefix_docids.rs new file mode 100644 index 000000000..32a22ba73 --- /dev/null +++ b/milli/src/update/new/words_prefix_docids.rs @@ -0,0 +1,108 @@ +use std::collections::HashSet; + +use heed::Database; +use heed::{types::Bytes, RwTxn}; +use roaring::RoaringBitmap; + +use crate::{CboRoaringBitmapCodec, Index, Prefix, Result}; + +struct WordPrefixDocids { + database: Database, + prefix_database: Database, +} + +impl WordPrefixDocids { + fn new( + database: Database, + prefix_database: Database, + ) -> WordPrefixDocids { + WordPrefixDocids { database, prefix_database } + } + + fn execute( + self, + wtxn: &mut heed::RwTxn, + prefix_to_compute: &HashSet, + prefix_to_delete: &HashSet, + ) -> Result<()> { + self.delete_prefixes(wtxn, prefix_to_delete)?; + self.recompute_modified_prefixes(wtxn, prefix_to_compute) + } + + #[tracing::instrument(level = "trace", skip_all, target = "indexing::prefix")] + fn delete_prefixes(&self, wtxn: &mut heed::RwTxn, prefixes: &HashSet) -> Result<()> { + // We remove all the entries that are no more required in this word prefix docids database. + for prefix in prefixes { + let prefix = prefix.as_bytes(); + if !self.prefix_database.delete(wtxn, prefix)? { + unreachable!("We tried to delete an unknown key") + } + } + + Ok(()) + } + + #[tracing::instrument(level = "trace", skip_all, target = "indexing::prefix")] + fn recompute_modified_prefixes( + &self, + wtxn: &mut RwTxn, + prefixes: &HashSet, + ) -> Result<()> { + // We fetch the docids associated to the newly added word prefix fst only. + let mut docids = RoaringBitmap::new(); + for prefix in prefixes { + docids.clear(); + let prefix = prefix.as_bytes(); + for result in self.database.prefix_iter(wtxn, prefix)? 
{ + let (_word, data) = result?; + docids |= &data; + } + + self.prefix_database.put(wtxn, prefix, &docids)?; + } + + Ok(()) + } +} + +#[tracing::instrument(level = "trace", skip_all, target = "indexing::prefix")] +pub fn compute_word_prefix_docids( + wtxn: &mut RwTxn, + index: &Index, + prefix_to_compute: &HashSet, + prefix_to_delete: &HashSet, +) -> Result<()> { + WordPrefixDocids::new( + index.word_docids.remap_key_type(), + index.word_prefix_docids.remap_key_type(), + ) + .execute(wtxn, prefix_to_compute, prefix_to_delete) +} + +#[tracing::instrument(level = "trace", skip_all, target = "indexing::prefix")] +pub fn compute_word_prefix_fid_docids( + wtxn: &mut RwTxn, + index: &Index, + prefix_to_compute: &HashSet, + prefix_to_delete: &HashSet, +) -> Result<()> { + WordPrefixDocids::new( + index.word_fid_docids.remap_key_type(), + index.word_prefix_fid_docids.remap_key_type(), + ) + .execute(wtxn, prefix_to_compute, prefix_to_delete) +} + +#[tracing::instrument(level = "trace", skip_all, target = "indexing::prefix")] +pub fn compute_word_prefix_position_docids( + wtxn: &mut RwTxn, + index: &Index, + prefix_to_compute: &HashSet, + prefix_to_delete: &HashSet, +) -> Result<()> { + WordPrefixDocids::new( + index.word_position_docids.remap_key_type(), + index.word_prefix_position_docids.remap_key_type(), + ) + .execute(wtxn, prefix_to_compute, prefix_to_delete) +} From 5b776556feae34dc2a675f99996a169f57f7e289 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 26 Sep 2024 17:46:58 +0200 Subject: [PATCH 105/247] Add ParallelIteratorExt --- milli/src/update/new/items_pool.rs | 32 ++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/milli/src/update/new/items_pool.rs b/milli/src/update/new/items_pool.rs index e90ce97db..c57bc86f1 100644 --- a/milli/src/update/new/items_pool.rs +++ b/milli/src/update/new/items_pool.rs @@ -1,4 +1,36 @@ use crossbeam_channel::{Receiver, Sender, TryRecvError}; +use rayon::iter::{MapInit, ParallelIterator}; + +pub trait ParallelIteratorExt: ParallelIterator { + fn try_map_try_init( + self, + init: INIT, + map_op: F, + ) -> MapInit< + Self, + impl Fn() -> Result> + Sync + Send + Clone, + impl Fn(&mut Result>, Self::Item) -> Result> + Sync + Send + Clone, + > + where + E: Send, + F: Fn(&mut T, Self::Item) -> Result + Sync + Send + Clone, + INIT: Fn() -> Result + Sync + Send + Clone, + R: Send, + { + self.map_init( + move || match init() { + Ok(t) => Ok(t), + Err(err) => Err(Some(err)), + }, + move |maybe_t, item| match maybe_t { + Ok(t) => map_op(t, item).map_err(Some), + Err(maybe_err) => Err(maybe_err.take()), + }, + ) + } +} + +impl ParallelIteratorExt for T {} /// A pool of items that can be pull and generated on demand. 
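A minimal sketch of how the extension method added above is meant to be called, assuming an `index: &Index` and a `to_delete: Vec<DocumentId>` in scope (it mirrors the document-deletion indexer reworked later in this series). The init closure builds the per-thread state, here a read transaction, roughly once per Rayon worker thread, and the per-item closure reuses it:

    use rayon::iter::IntoParallelIterator;
    use crate::update::new::items_pool::ParallelIteratorExt as _;
    use crate::update::new::{Deletion, DocumentChange};

    let changes = to_delete.into_par_iter().try_map_try_init(
        // Per-thread state: one read transaction per worker thread.
        || index.read_txn().map_err(crate::Error::from),
        // Reuse that transaction for every document id handled by the thread.
        |rtxn, docid| {
            let current = index.document(rtxn, docid)?;
            Ok(DocumentChange::Deletion(Deletion::create(docid, current.boxed())))
        },
    );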
pub struct ItemsPool From 8cb5e7437d180603438f958cc3fe8ded026ffd9b Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 26 Sep 2024 17:48:32 +0200 Subject: [PATCH 106/247] try using try_map_try_init --- .../update/new/indexer/document_operation.rs | 40 ++++++++++--------- milli/src/update/new/indexer/mod.rs | 13 ++++-- 2 files changed, 32 insertions(+), 21 deletions(-) diff --git a/milli/src/update/new/indexer/document_operation.rs b/milli/src/update/new/indexer/document_operation.rs index 572ea8528..66f981bdd 100644 --- a/milli/src/update/new/indexer/document_operation.rs +++ b/milli/src/update/new/indexer/document_operation.rs @@ -13,6 +13,7 @@ use super::super::items_pool::ItemsPool; use super::super::{CowStr, TopLevelMap}; use super::DocumentChanges; use crate::documents::{DocumentIdExtractionError, PrimaryKey}; +use crate::update::new::items_pool::ParallelIteratorExt as _; use crate::update::new::{Deletion, Insertion, KvReaderFieldId, KvWriterFieldId, Update}; use crate::update::{AvailableIds, IndexDocumentsMethod}; use crate::{DocumentId, Error, FieldsIdsMap, Index, Result, UserError}; @@ -73,7 +74,12 @@ impl<'p, 'pl: 'p> DocumentChanges<'p> for DocumentOperation<'pl> { self, fields_ids_map: &mut FieldsIdsMap, param: Self::Parameter, - ) -> Result> + Clone + 'p> { + ) -> Result< + impl IndexedParallelIterator< + Item = std::result::Result>, + > + Clone + + 'p, + > { let (index, rtxn, primary_key) = param; let documents_ids = index.documents_ids(rtxn)?; @@ -199,24 +205,22 @@ impl<'p, 'pl: 'p> DocumentChanges<'p> for DocumentOperation<'pl> { // And finally sort them docids_version_offsets.sort_unstable_by_key(|(_, (_, docops))| sort_function_key(docops)); - Ok(docids_version_offsets.into_par_iter().map_with( - Arc::new(ItemsPool::new(|| index.read_txn().map_err(crate::Error::from))), - move |context_pool, (external_docid, (internal_docid, operations))| { - context_pool.with(|rtxn| { - let document_merge_function = match self.index_documents_method { - Idm::ReplaceDocuments => MergeDocumentForReplacement::merge, - Idm::UpdateDocuments => MergeDocumentForUpdates::merge, - }; + Ok(docids_version_offsets.into_par_iter().try_map_try_init( + || index.read_txn().map_err(crate::Error::from), + move |rtxn, (external_docid, (internal_docid, operations))| { + let document_merge_function = match self.index_documents_method { + Idm::ReplaceDocuments => MergeDocumentForReplacement::merge, + Idm::UpdateDocuments => MergeDocumentForUpdates::merge, + }; - document_merge_function( - rtxn, - index, - &fields_ids_map, - internal_docid, - external_docid.to_string(), // TODO do not clone - &operations, - ) - }) + document_merge_function( + rtxn, + index, + &fields_ids_map, + internal_docid, + external_docid.to_string(), // TODO do not clone + &operations, + ) }, )) } diff --git a/milli/src/update/new/indexer/mod.rs b/milli/src/update/new/indexer/mod.rs index e30333b3a..cc8af1312 100644 --- a/milli/src/update/new/indexer/mod.rs +++ b/milli/src/update/new/indexer/mod.rs @@ -23,7 +23,7 @@ use crate::documents::{PrimaryKey, DEFAULT_PRIMARY_KEY}; use crate::update::new::channel::ExtractorSender; use crate::update::settings::InnerIndexSettings; use crate::update::GrenadParameters; -use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, Result, UserError}; +use crate::{Error, FieldsIdsMap, GlobalFieldsIdsMap, Index, Result, UserError}; mod document_deletion; mod document_operation; @@ -37,7 +37,12 @@ pub trait DocumentChanges<'p> { self, fields_ids_map: &mut FieldsIdsMap, param: Self::Parameter, - ) -> Result> + 
Clone + 'p>; + ) -> Result< + impl IndexedParallelIterator< + Item = std::result::Result>, + > + Clone + + 'p, + >; } /// This is the main function of this crate. @@ -53,7 +58,9 @@ pub fn index( document_changes: PI, ) -> Result<()> where - PI: IndexedParallelIterator> + Send + Clone, + PI: IndexedParallelIterator>> + + Send + + Clone, { let (merger_sender, writer_receiver) = merger_writer_channel(10_000); // This channel acts as a rendezvous point to ensure that we are one task ahead From 3843240940465f3a299d5949fcf53022c42b220e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 26 Sep 2024 18:43:23 +0200 Subject: [PATCH 107/247] Prefer using Ars instead of Options --- milli/src/update/new/items_pool.rs | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/milli/src/update/new/items_pool.rs b/milli/src/update/new/items_pool.rs index c57bc86f1..92a6d5e64 100644 --- a/milli/src/update/new/items_pool.rs +++ b/milli/src/update/new/items_pool.rs @@ -1,18 +1,21 @@ +use std::sync::Arc; + use crossbeam_channel::{Receiver, Sender, TryRecvError}; use rayon::iter::{MapInit, ParallelIterator}; pub trait ParallelIteratorExt: ParallelIterator { + /// A method on a parallel iterator to map fn try_map_try_init( self, init: INIT, map_op: F, ) -> MapInit< Self, - impl Fn() -> Result> + Sync + Send + Clone, - impl Fn(&mut Result>, Self::Item) -> Result> + Sync + Send + Clone, + impl Fn() -> Result> + Sync + Send + Clone, + impl Fn(&mut Result>, Self::Item) -> Result> + Sync + Send + Clone, > where - E: Send, + E: Send + Sync, F: Fn(&mut T, Self::Item) -> Result + Sync + Send + Clone, INIT: Fn() -> Result + Sync + Send + Clone, R: Send, @@ -20,11 +23,11 @@ pub trait ParallelIteratorExt: ParallelIterator { self.map_init( move || match init() { Ok(t) => Ok(t), - Err(err) => Err(Some(err)), + Err(err) => Err(Arc::new(err)), }, move |maybe_t, item| match maybe_t { - Ok(t) => map_op(t, item).map_err(Some), - Err(maybe_err) => Err(maybe_err.take()), + Ok(t) => map_op(t, item).map_err(Arc::new), + Err(maybe_err) => Err(maybe_err.clone()), }, ) } From 31de5c747e05d7c7109114a8440e361cc9d7a7e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 26 Sep 2024 18:59:28 +0200 Subject: [PATCH 108/247] WIP using try_map_try_init --- .../new/extract/faceted/extract_facets.rs | 9 ++++++-- milli/src/update/new/extract/mod.rs | 7 ++++-- .../src/update/new/extract/searchable/mod.rs | 11 ++++++--- .../update/new/indexer/document_deletion.rs | 23 +++++++++++-------- .../update/new/indexer/document_operation.rs | 7 +++--- milli/src/update/new/indexer/mod.rs | 11 ++++----- milli/src/update/new/indexer/partial_dump.rs | 6 ++++- .../update/new/indexer/update_by_function.rs | 10 ++++++-- 8 files changed, 55 insertions(+), 29 deletions(-) diff --git a/milli/src/update/new/extract/faceted/extract_facets.rs b/milli/src/update/new/extract/faceted/extract_facets.rs index 41bce2215..17de26831 100644 --- a/milli/src/update/new/extract/faceted/extract_facets.rs +++ b/milli/src/update/new/extract/faceted/extract_facets.rs @@ -1,6 +1,7 @@ use std::collections::HashSet; use std::fmt::Debug; use std::fs::File; +use std::sync::Arc; use grenad::{MergeFunction, Merger}; use heed::RoTxn; @@ -14,7 +15,9 @@ use crate::facet::value_encoding::f64_into_bytes; use crate::update::new::extract::DocidsExtractor; use crate::update::new::{DocumentChange, ItemsPool}; use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps}; -use crate::{DocumentId, FieldId, 
GlobalFieldsIdsMap, Index, Result, MAX_FACET_VALUE_LENGTH}; +use crate::{ + DocumentId, Error, FieldId, GlobalFieldsIdsMap, Index, Result, MAX_FACET_VALUE_LENGTH, +}; pub struct FacetedDocidsExtractor; impl FacetedDocidsExtractor { @@ -195,7 +198,9 @@ impl DocidsExtractor for FacetedDocidsExtractor { index: &Index, fields_ids_map: &GlobalFieldsIdsMap, indexer: GrenadParameters, - document_changes: impl IntoParallelIterator>, + document_changes: impl IntoParallelIterator< + Item = std::result::Result>, + >, ) -> Result> { let max_memory = indexer.max_memory_by_thread(); diff --git a/milli/src/update/new/extract/mod.rs b/milli/src/update/new/extract/mod.rs index 6e60a4063..c12634563 100644 --- a/milli/src/update/new/extract/mod.rs +++ b/milli/src/update/new/extract/mod.rs @@ -4,6 +4,7 @@ mod lru; mod searchable; use std::fs::File; +use std::sync::Arc; pub use faceted::*; use grenad::Merger; @@ -12,14 +13,16 @@ pub use searchable::*; use super::DocumentChange; use crate::update::{GrenadParameters, MergeDeladdCboRoaringBitmaps}; -use crate::{GlobalFieldsIdsMap, Index, Result}; +use crate::{Error, GlobalFieldsIdsMap, Index, Result}; pub trait DocidsExtractor { fn run_extraction( index: &Index, fields_ids_map: &GlobalFieldsIdsMap, indexer: GrenadParameters, - document_changes: impl IntoParallelIterator>, + document_changes: impl IntoParallelIterator< + Item = std::result::Result>, + >, ) -> Result>; } diff --git a/milli/src/update/new/extract/searchable/mod.rs b/milli/src/update/new/extract/searchable/mod.rs index c79bd4766..2557862a2 100644 --- a/milli/src/update/new/extract/searchable/mod.rs +++ b/milli/src/update/new/extract/searchable/mod.rs @@ -3,6 +3,7 @@ mod extract_word_pair_proximity_docids; mod tokenize_document; use std::fs::File; +use std::sync::Arc; pub use extract_word_docids::{WordDocidsExtractors, WordDocidsMergers}; pub use extract_word_pair_proximity_docids::WordPairProximityDocidsExtractor; @@ -15,14 +16,16 @@ use super::cache::CboCachedSorter; use super::DocidsExtractor; use crate::update::new::{DocumentChange, ItemsPool}; use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps}; -use crate::{GlobalFieldsIdsMap, Index, Result, MAX_POSITION_PER_ATTRIBUTE}; +use crate::{Error, GlobalFieldsIdsMap, Index, Result, MAX_POSITION_PER_ATTRIBUTE}; pub trait SearchableExtractor { fn run_extraction( index: &Index, fields_ids_map: &GlobalFieldsIdsMap, indexer: GrenadParameters, - document_changes: impl IntoParallelIterator>, + document_changes: impl IntoParallelIterator< + Item = std::result::Result>, + >, ) -> Result> { let max_memory = indexer.max_memory_by_thread(); @@ -132,7 +135,9 @@ impl DocidsExtractor for T { index: &Index, fields_ids_map: &GlobalFieldsIdsMap, indexer: GrenadParameters, - document_changes: impl IntoParallelIterator>, + document_changes: impl IntoParallelIterator< + Item = std::result::Result>, + >, ) -> Result> { Self::run_extraction(index, fields_ids_map, indexer, document_changes) } diff --git a/milli/src/update/new/indexer/document_deletion.rs b/milli/src/update/new/indexer/document_deletion.rs index bad72d3b2..eab4331b6 100644 --- a/milli/src/update/new/indexer/document_deletion.rs +++ b/milli/src/update/new/indexer/document_deletion.rs @@ -1,11 +1,12 @@ use std::sync::Arc; -use rayon::iter::{IndexedParallelIterator, IntoParallelIterator, ParallelIterator}; +use rayon::iter::{IndexedParallelIterator, IntoParallelIterator}; use roaring::RoaringBitmap; use super::DocumentChanges; -use crate::update::new::{Deletion, DocumentChange, 
ItemsPool}; -use crate::{FieldsIdsMap, Index, Result}; +use crate::update::new::items_pool::ParallelIteratorExt as _; +use crate::update::new::{Deletion, DocumentChange}; +use crate::{Error, FieldsIdsMap, Index, Result}; pub struct DocumentDeletion { pub to_delete: RoaringBitmap, @@ -28,15 +29,19 @@ impl<'p> DocumentChanges<'p> for DocumentDeletion { self, _fields_ids_map: &mut FieldsIdsMap, param: Self::Parameter, - ) -> Result> + Clone + 'p> { + ) -> Result< + impl IndexedParallelIterator>> + + Clone + + 'p, + > { let index = param; - let items = Arc::new(ItemsPool::new(|| index.read_txn().map_err(crate::Error::from))); let to_delete: Vec<_> = self.to_delete.into_iter().collect(); - Ok(to_delete.into_par_iter().map_with(items, |items, docid| { - items.with(|rtxn| { + Ok(to_delete.into_par_iter().try_map_try_init( + || index.read_txn().map_err(crate::Error::from), + |rtxn, docid| { let current = index.document(rtxn, docid)?; Ok(DocumentChange::Deletion(Deletion::create(docid, current.boxed()))) - }) - })) + }, + )) } } diff --git a/milli/src/update/new/indexer/document_operation.rs b/milli/src/update/new/indexer/document_operation.rs index 66f981bdd..b2dc67ce1 100644 --- a/milli/src/update/new/indexer/document_operation.rs +++ b/milli/src/update/new/indexer/document_operation.rs @@ -75,9 +75,8 @@ impl<'p, 'pl: 'p> DocumentChanges<'p> for DocumentOperation<'pl> { fields_ids_map: &mut FieldsIdsMap, param: Self::Parameter, ) -> Result< - impl IndexedParallelIterator< - Item = std::result::Result>, - > + Clone + impl IndexedParallelIterator>> + + Clone + 'p, > { let (index, rtxn, primary_key) = param; @@ -206,7 +205,7 @@ impl<'p, 'pl: 'p> DocumentChanges<'p> for DocumentOperation<'pl> { docids_version_offsets.sort_unstable_by_key(|(_, (_, docops))| sort_function_key(docops)); Ok(docids_version_offsets.into_par_iter().try_map_try_init( - || index.read_txn().map_err(crate::Error::from), + || index.read_txn().map_err(Error::from), move |rtxn, (external_docid, (internal_docid, operations))| { let document_merge_function = match self.index_documents_method { Idm::ReplaceDocuments => MergeDocumentForReplacement::merge, diff --git a/milli/src/update/new/indexer/mod.rs b/milli/src/update/new/indexer/mod.rs index cc8af1312..caae956af 100644 --- a/milli/src/update/new/indexer/mod.rs +++ b/milli/src/update/new/indexer/mod.rs @@ -1,4 +1,4 @@ -use std::sync::RwLock; +use std::sync::{Arc, RwLock}; use std::thread::{self, Builder}; use big_s::S; @@ -38,9 +38,8 @@ pub trait DocumentChanges<'p> { fields_ids_map: &mut FieldsIdsMap, param: Self::Parameter, ) -> Result< - impl IndexedParallelIterator< - Item = std::result::Result>, - > + Clone + impl IndexedParallelIterator>> + + Clone + 'p, >; } @@ -58,7 +57,7 @@ pub fn index( document_changes: PI, ) -> Result<()> where - PI: IndexedParallelIterator>> + PI: IndexedParallelIterator>> + Send + Clone, { @@ -249,7 +248,7 @@ fn extract_and_send_docids( index: &Index, fields_ids_map: &GlobalFieldsIdsMap, indexer: GrenadParameters, - document_changes: impl IntoParallelIterator>, + document_changes: impl IntoParallelIterator>>, sender: &ExtractorSender, ) -> Result<()> { let merger = E::run_extraction(index, fields_ids_map, indexer, document_changes)?; diff --git a/milli/src/update/new/indexer/partial_dump.rs b/milli/src/update/new/indexer/partial_dump.rs index 43a89c46c..02c9d68fc 100644 --- a/milli/src/update/new/indexer/partial_dump.rs +++ b/milli/src/update/new/indexer/partial_dump.rs @@ -30,7 +30,11 @@ where self, _fields_ids_map: &mut FieldsIdsMap, param: 
Self::Parameter, - ) -> Result> + Clone + 'p> { + ) -> Result< + impl IndexedParallelIterator>> + + Clone + + 'p, + > { let (fields_ids_map, concurrent_available_ids, primary_key) = param; Ok(self.iter.map(|object| { diff --git a/milli/src/update/new/indexer/update_by_function.rs b/milli/src/update/new/indexer/update_by_function.rs index d4c0f837b..d6d532433 100644 --- a/milli/src/update/new/indexer/update_by_function.rs +++ b/milli/src/update/new/indexer/update_by_function.rs @@ -1,8 +1,10 @@ +use std::sync::Arc; + use rayon::iter::{IndexedParallelIterator, IntoParallelIterator, ParallelIterator}; use super::DocumentChanges; use crate::update::new::DocumentChange; -use crate::{FieldsIdsMap, Result}; +use crate::{Error, FieldsIdsMap, Result}; pub struct UpdateByFunction; @@ -13,7 +15,11 @@ impl<'p> DocumentChanges<'p> for UpdateByFunction { self, _fields_ids_map: &mut FieldsIdsMap, _param: Self::Parameter, - ) -> Result> + Clone + 'p> { + ) -> Result< + impl IndexedParallelIterator>> + + Clone + + 'p, + > { Ok((0..100).into_par_iter().map(|_| todo!())) } } From f3356ddaa4f551c672bc937c63b5a39b0693c786 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sun, 29 Sep 2024 16:46:58 +0200 Subject: [PATCH 109/247] Fix the errors when using the try_map_try_init method --- .../extract/searchable/extract_word_docids.rs | 7 +- milli/src/update/new/indexer/partial_dump.rs | 76 ++++++++++--------- milli/src/update/new/items_pool.rs | 4 +- 3 files changed, 50 insertions(+), 37 deletions(-) diff --git a/milli/src/update/new/extract/searchable/extract_word_docids.rs b/milli/src/update/new/extract/searchable/extract_word_docids.rs index caab170a4..828219b41 100644 --- a/milli/src/update/new/extract/searchable/extract_word_docids.rs +++ b/milli/src/update/new/extract/searchable/extract_word_docids.rs @@ -1,6 +1,7 @@ use std::collections::HashMap; use std::fs::File; use std::num::NonZero; +use std::sync::Arc; use grenad::{Merger, MergerBuilder}; use heed::RoTxn; @@ -12,7 +13,7 @@ use crate::update::new::extract::perm_json_p::contained_in; use crate::update::new::{DocumentChange, ItemsPool}; use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps}; use crate::{ - bucketed_position, DocumentId, FieldId, GlobalFieldsIdsMap, Index, Result, + bucketed_position, DocumentId, Error, FieldId, GlobalFieldsIdsMap, Index, Result, MAX_POSITION_PER_ATTRIBUTE, }; @@ -303,7 +304,9 @@ impl WordDocidsExtractors { index: &Index, fields_ids_map: &GlobalFieldsIdsMap, indexer: GrenadParameters, - document_changes: impl IntoParallelIterator>, + document_changes: impl IntoParallelIterator< + Item = std::result::Result>, + >, ) -> Result { let max_memory = indexer.max_memory_by_thread(); diff --git a/milli/src/update/new/indexer/partial_dump.rs b/milli/src/update/new/indexer/partial_dump.rs index 02c9d68fc..aa01f6547 100644 --- a/milli/src/update/new/indexer/partial_dump.rs +++ b/milli/src/update/new/indexer/partial_dump.rs @@ -1,8 +1,11 @@ -use rayon::iter::IndexedParallelIterator; +use std::sync::Arc; + +use rayon::iter::{IndexedParallelIterator, ParallelBridge, ParallelIterator}; use super::DocumentChanges; use crate::documents::{DocumentIdExtractionError, PrimaryKey}; use crate::update::concurrent_available_ids::ConcurrentAvailableIds; +use crate::update::new::items_pool::ParallelIteratorExt; use crate::update::new::{DocumentChange, Insertion, KvWriterFieldId}; use crate::{all_obkv_to_json, Error, FieldsIdsMap, Object, Result, UserError}; @@ -37,41 +40,46 @@ where > { let 
(fields_ids_map, concurrent_available_ids, primary_key) = param; - Ok(self.iter.map(|object| { - let docid = match concurrent_available_ids.next() { - Some(id) => id, - None => return Err(Error::UserError(UserError::DocumentLimitReached)), - }; + Ok(self.iter.try_map_try_init( + || Ok(()), + |_, object| { + let docid = match concurrent_available_ids.next() { + Some(id) => id, + None => return Err(Error::UserError(UserError::DocumentLimitReached)), + }; - let mut writer = KvWriterFieldId::memory(); - object.iter().for_each(|(key, value)| { - let key = fields_ids_map.id(key).unwrap(); - /// TODO better error management - let value = serde_json::to_vec(&value).unwrap(); - /// TODO it is not ordered - writer.insert(key, value).unwrap(); - }); + let mut writer = KvWriterFieldId::memory(); + object.iter().for_each(|(key, value)| { + let key = fields_ids_map.id(key).unwrap(); + /// TODO better error management + let value = serde_json::to_vec(&value).unwrap(); + /// TODO it is not ordered + writer.insert(key, value).unwrap(); + }); - let document = writer.into_boxed(); - let external_docid = match primary_key.document_id(&document, fields_ids_map)? { - Ok(document_id) => Ok(document_id), - Err(DocumentIdExtractionError::InvalidDocumentId(user_error)) => Err(user_error), - Err(DocumentIdExtractionError::MissingDocumentId) => { - Err(UserError::MissingDocumentId { - primary_key: primary_key.name().to_string(), - document: all_obkv_to_json(&document, fields_ids_map)?, - }) - } - Err(DocumentIdExtractionError::TooManyDocumentIds(_)) => { - Err(UserError::TooManyDocumentIds { - primary_key: primary_key.name().to_string(), - document: all_obkv_to_json(&document, fields_ids_map)?, - }) - } - }?; + let document = writer.into_boxed(); + let external_docid = match primary_key.document_id(&document, fields_ids_map)? { + Ok(document_id) => Ok(document_id), + Err(DocumentIdExtractionError::InvalidDocumentId(user_error)) => { + Err(user_error) + } + Err(DocumentIdExtractionError::MissingDocumentId) => { + Err(UserError::MissingDocumentId { + primary_key: primary_key.name().to_string(), + document: all_obkv_to_json(&document, fields_ids_map)?, + }) + } + Err(DocumentIdExtractionError::TooManyDocumentIds(_)) => { + Err(UserError::TooManyDocumentIds { + primary_key: primary_key.name().to_string(), + document: all_obkv_to_json(&document, fields_ids_map)?, + }) + } + }?; - let insertion = Insertion::create(docid, document); - Ok(DocumentChange::Insertion(insertion)) - })) + let insertion = Insertion::create(docid, document); + Ok(DocumentChange::Insertion(insertion)) + }, + )) } } diff --git a/milli/src/update/new/items_pool.rs b/milli/src/update/new/items_pool.rs index 92a6d5e64..01a2cf933 100644 --- a/milli/src/update/new/items_pool.rs +++ b/milli/src/update/new/items_pool.rs @@ -4,7 +4,9 @@ use crossbeam_channel::{Receiver, Sender, TryRecvError}; use rayon::iter::{MapInit, ParallelIterator}; pub trait ParallelIteratorExt: ParallelIterator { - /// A method on a parallel iterator to map + /// Maps items based on the init function. + /// + /// The init function is ran only as necessary which is basically once by thread. 
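When there is no per-thread state to build, the same method still applies with a unit init, as the partial-dump change above does; note that if the init itself fails, the resulting `Arc<E>` is cloned and returned for every item that worker thread would have handled. A reduced sketch, assuming `objects` is a parallel iterator over the dumped documents and `build_insertion` is a hypothetical helper returning `Result<DocumentChange>`:

    let insertions = objects.try_map_try_init(
        || Ok(()),                           // nothing to initialise per thread
        |_, object| build_insertion(object), // hypothetical helper, stands in for the obkv writing above
    );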
fn try_map_try_init( self, init: INIT, From d83c9a4074a9d03373f97a3f9ebbfdd3c5ef1a6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sun, 29 Sep 2024 17:21:11 +0200 Subject: [PATCH 110/247] Introduce the try_for_each_try_init method to be used with Arced Errors --- .../new/extract/faceted/extract_facets.rs | 33 +++++++++++-------- .../extract/searchable/extract_word_docids.rs | 31 ++++++++++------- .../src/update/new/extract/searchable/mod.rs | 31 ++++++++++------- milli/src/update/new/indexer/mod.rs | 5 +-- milli/src/update/new/items_pool.rs | 31 +++++++++++++++-- 5 files changed, 89 insertions(+), 42 deletions(-) diff --git a/milli/src/update/new/extract/faceted/extract_facets.rs b/milli/src/update/new/extract/faceted/extract_facets.rs index 17de26831..d2daf756a 100644 --- a/milli/src/update/new/extract/faceted/extract_facets.rs +++ b/milli/src/update/new/extract/faceted/extract_facets.rs @@ -13,6 +13,7 @@ use super::facet_document::extract_document_facets; use super::FacetKind; use crate::facet::value_encoding::f64_into_bytes; use crate::update::new::extract::DocidsExtractor; +use crate::update::new::items_pool::ParallelIteratorExt; use crate::update::new::{DocumentChange, ItemsPool}; use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps}; use crate::{ @@ -211,7 +212,7 @@ impl DocidsExtractor for FacetedDocidsExtractor { let context_pool = ItemsPool::new(|| { Ok(( - index.read_txn()?, + index.read_txn().map_err(Error::from).map_err(Arc::new)?, fields_ids_map.clone(), Vec::new(), CboCachedSorter::new( @@ -233,19 +234,23 @@ impl DocidsExtractor for FacetedDocidsExtractor { let span = tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction"); let _entered = span.enter(); - document_changes.into_par_iter().try_for_each(|document_change| { - context_pool.with(|(rtxn, fields_ids_map, buffer, cached_sorter)| { - Self::extract_document_change( - &*rtxn, - index, - buffer, - fields_ids_map, - &attributes_to_extract, - cached_sorter, - document_change?, - ) - }) - })?; + document_changes.into_par_iter().try_for_each_try_init( + || Ok(()), + |_, document_change| { + context_pool.with(|(rtxn, fields_ids_map, buffer, cached_sorter)| { + Self::extract_document_change( + &*rtxn, + index, + buffer, + fields_ids_map, + &attributes_to_extract, + cached_sorter, + document_change?, + ) + .map_err(Arc::new) + }) + }, + )?; } { let mut builder = grenad::MergerBuilder::new(MergeDeladdCboRoaringBitmaps); diff --git a/milli/src/update/new/extract/searchable/extract_word_docids.rs b/milli/src/update/new/extract/searchable/extract_word_docids.rs index 828219b41..a9552b499 100644 --- a/milli/src/update/new/extract/searchable/extract_word_docids.rs +++ b/milli/src/update/new/extract/searchable/extract_word_docids.rs @@ -10,6 +10,7 @@ use rayon::iter::{IntoParallelIterator, ParallelIterator}; use super::tokenize_document::{tokenizer_builder, DocumentTokenizer}; use crate::update::new::extract::cache::CboCachedSorter; use crate::update::new::extract::perm_json_p::contained_in; +use crate::update::new::items_pool::ParallelIteratorExt; use crate::update::new::{DocumentChange, ItemsPool}; use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps}; use crate::{ @@ -340,7 +341,7 @@ impl WordDocidsExtractors { let context_pool = ItemsPool::new(|| { Ok(( - index.read_txn()?, + index.read_txn().map_err(Error::from).map_err(Arc::new)?, &document_tokenizer, fields_ids_map.clone(), WordDocidsCachedSorters::new( @@ -356,18 +357,24 
@@ impl WordDocidsExtractors { let span = tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction"); let _entered = span.enter(); - document_changes.into_par_iter().try_for_each(|document_change| { - context_pool.with(|(rtxn, document_tokenizer, fields_ids_map, cached_sorter)| { - Self::extract_document_change( - &*rtxn, - index, - document_tokenizer, - fields_ids_map, - cached_sorter, - document_change?, + document_changes.into_par_iter().try_for_each_try_init( + || Ok(()), + |_, document_change| { + context_pool.with( + |(rtxn, document_tokenizer, fields_ids_map, cached_sorter)| { + Self::extract_document_change( + &*rtxn, + index, + document_tokenizer, + fields_ids_map, + cached_sorter, + document_change?, + ) + .map_err(Arc::new) + }, ) - }) - })?; + }, + )?; } { diff --git a/milli/src/update/new/extract/searchable/mod.rs b/milli/src/update/new/extract/searchable/mod.rs index 2557862a2..b3f27ec78 100644 --- a/milli/src/update/new/extract/searchable/mod.rs +++ b/milli/src/update/new/extract/searchable/mod.rs @@ -14,6 +14,7 @@ use tokenize_document::{tokenizer_builder, DocumentTokenizer}; use super::cache::CboCachedSorter; use super::DocidsExtractor; +use crate::update::new::items_pool::ParallelIteratorExt; use crate::update::new::{DocumentChange, ItemsPool}; use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps}; use crate::{Error, GlobalFieldsIdsMap, Index, Result, MAX_POSITION_PER_ATTRIBUTE}; @@ -59,7 +60,7 @@ pub trait SearchableExtractor { let context_pool = ItemsPool::new(|| { Ok(( - index.read_txn()?, + index.read_txn().map_err(Error::from).map_err(Arc::new)?, &document_tokenizer, fields_ids_map.clone(), CboCachedSorter::new( @@ -81,18 +82,24 @@ pub trait SearchableExtractor { let span = tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction"); let _entered = span.enter(); - document_changes.into_par_iter().try_for_each(|document_change| { - context_pool.with(|(rtxn, document_tokenizer, fields_ids_map, cached_sorter)| { - Self::extract_document_change( - &*rtxn, - index, - document_tokenizer, - fields_ids_map, - cached_sorter, - document_change?, + document_changes.into_par_iter().try_for_each_try_init( + || Ok(()), + |_, document_change| { + context_pool.with( + |(rtxn, document_tokenizer, fields_ids_map, cached_sorter)| { + Self::extract_document_change( + &*rtxn, + index, + document_tokenizer, + fields_ids_map, + cached_sorter, + document_change?, + ) + .map_err(Arc::new) + }, ) - }) - })?; + }, + )?; } { let mut builder = grenad::MergerBuilder::new(MergeDeladdCboRoaringBitmaps); diff --git a/milli/src/update/new/indexer/mod.rs b/milli/src/update/new/indexer/mod.rs index caae956af..d6064e4fb 100644 --- a/milli/src/update/new/indexer/mod.rs +++ b/milli/src/update/new/indexer/mod.rs @@ -22,6 +22,7 @@ use super::{StdResult, TopLevelMap}; use crate::documents::{PrimaryKey, DEFAULT_PRIMARY_KEY}; use crate::update::new::channel::ExtractorSender; use crate::update::settings::InnerIndexSettings; +use crate::update::new::items_pool::ParallelIteratorExt; use crate::update::GrenadParameters; use crate::{Error, FieldsIdsMap, GlobalFieldsIdsMap, Index, Result, UserError}; @@ -80,7 +81,7 @@ where // document but we need to create a function that collects and compresses documents. 
let document_sender = extractor_sender.document_sender(); - document_changes.clone().into_par_iter().try_for_each(|result| { + document_changes.clone().into_par_iter().try_for_each_try_init(|| Ok(()) as Result<_>, |_, result| { match result? { DocumentChange::Deletion(deletion) => { let docid = deletion.docid(); @@ -98,7 +99,7 @@ where // extracted_dictionary_sender.send(self, dictionary: &[u8]); } } - Ok(()) as Result<_> + Ok(()) as std::result::Result<_, Arc<_>> })?; document_sender.finish().unwrap(); diff --git a/milli/src/update/new/items_pool.rs b/milli/src/update/new/items_pool.rs index 01a2cf933..649f09105 100644 --- a/milli/src/update/new/items_pool.rs +++ b/milli/src/update/new/items_pool.rs @@ -1,3 +1,4 @@ +use std::convert::identity; use std::sync::Arc; use crossbeam_channel::{Receiver, Sender, TryRecvError}; @@ -27,12 +28,38 @@ pub trait ParallelIteratorExt: ParallelIterator { Ok(t) => Ok(t), Err(err) => Err(Arc::new(err)), }, - move |maybe_t, item| match maybe_t { + move |result, item| match result { Ok(t) => map_op(t, item).map_err(Arc::new), - Err(maybe_err) => Err(maybe_err.clone()), + Err(err) => Err(err.clone()), }, ) } + + /// A method to run a closure of all the items and return an owned error. + /// + /// The init function is ran only as necessary which is basically once by thread. + fn try_for_each_try_init(self, init: INIT, op: F) -> Result<(), E> + where + E: Send + Sync, + F: Fn(&mut T, Self::Item) -> Result<(), Arc> + Sync + Send + Clone, + INIT: Fn() -> Result + Sync + Send + Clone, + { + let result = self.try_for_each_init( + move || match init() { + Ok(t) => Ok(t), + Err(err) => Err(Arc::new(err)), + }, + move |result, item| match result { + Ok(t) => op(t, item), + Err(err) => Err(err.clone()), + }, + ); + + match result { + Ok(()) => Ok(()), + Err(err) => Err(Arc::into_inner(err).expect("the error must be only owned by us")), + } + } } impl ParallelIteratorExt for T {} From 00e045b24977947ed526b35afa6f5f5f775baaf1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sun, 29 Sep 2024 17:42:26 +0200 Subject: [PATCH 111/247] Rename and use the try_arc_for_each_try_init method --- .../new/extract/faceted/extract_facets.rs | 13 ++++--- .../extract/searchable/extract_word_docids.rs | 35 +++++++++---------- .../src/update/new/extract/searchable/mod.rs | 33 ++++++++--------- .../update/new/indexer/document_operation.rs | 3 +- milli/src/update/new/indexer/mod.rs | 5 +-- milli/src/update/new/indexer/partial_dump.rs | 2 +- milli/src/update/new/items_pool.rs | 14 ++++++-- 7 files changed, 54 insertions(+), 51 deletions(-) diff --git a/milli/src/update/new/extract/faceted/extract_facets.rs b/milli/src/update/new/extract/faceted/extract_facets.rs index d2daf756a..40f561b97 100644 --- a/milli/src/update/new/extract/faceted/extract_facets.rs +++ b/milli/src/update/new/extract/faceted/extract_facets.rs @@ -212,7 +212,6 @@ impl DocidsExtractor for FacetedDocidsExtractor { let context_pool = ItemsPool::new(|| { Ok(( - index.read_txn().map_err(Error::from).map_err(Arc::new)?, fields_ids_map.clone(), Vec::new(), CboCachedSorter::new( @@ -234,12 +233,12 @@ impl DocidsExtractor for FacetedDocidsExtractor { let span = tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction"); let _entered = span.enter(); - document_changes.into_par_iter().try_for_each_try_init( - || Ok(()), - |_, document_change| { - context_pool.with(|(rtxn, fields_ids_map, buffer, cached_sorter)| { + document_changes.into_par_iter().try_arc_for_each_try_init( + || 
index.read_txn().map_err(Error::from), + |rtxn, document_change| { + context_pool.with(|(fields_ids_map, buffer, cached_sorter)| { Self::extract_document_change( - &*rtxn, + rtxn, index, buffer, fields_ids_map, @@ -261,7 +260,7 @@ impl DocidsExtractor for FacetedDocidsExtractor { let readers: Vec<_> = context_pool .into_items() .par_bridge() - .map(|(_rtxn, _tokenizer, _fields_ids_map, cached_sorter)| { + .map(|(_tokenizer, _fields_ids_map, cached_sorter)| { let sorter = cached_sorter.into_sorter()?; sorter.into_reader_cursors() }) diff --git a/milli/src/update/new/extract/searchable/extract_word_docids.rs b/milli/src/update/new/extract/searchable/extract_word_docids.rs index a9552b499..f59f5a03d 100644 --- a/milli/src/update/new/extract/searchable/extract_word_docids.rs +++ b/milli/src/update/new/extract/searchable/extract_word_docids.rs @@ -5,7 +5,7 @@ use std::sync::Arc; use grenad::{Merger, MergerBuilder}; use heed::RoTxn; -use rayon::iter::{IntoParallelIterator, ParallelIterator}; +use rayon::iter::IntoParallelIterator; use super::tokenize_document::{tokenizer_builder, DocumentTokenizer}; use crate::update::new::extract::cache::CboCachedSorter; @@ -341,7 +341,6 @@ impl WordDocidsExtractors { let context_pool = ItemsPool::new(|| { Ok(( - index.read_txn().map_err(Error::from).map_err(Arc::new)?, &document_tokenizer, fields_ids_map.clone(), WordDocidsCachedSorters::new( @@ -357,22 +356,20 @@ impl WordDocidsExtractors { let span = tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction"); let _entered = span.enter(); - document_changes.into_par_iter().try_for_each_try_init( - || Ok(()), - |_, document_change| { - context_pool.with( - |(rtxn, document_tokenizer, fields_ids_map, cached_sorter)| { - Self::extract_document_change( - &*rtxn, - index, - document_tokenizer, - fields_ids_map, - cached_sorter, - document_change?, - ) - .map_err(Arc::new) - }, - ) + document_changes.into_par_iter().try_arc_for_each_try_init( + || index.read_txn().map_err(Error::from), + |rtxn, document_change| { + context_pool.with(|(document_tokenizer, fields_ids_map, cached_sorter)| { + Self::extract_document_change( + rtxn, + index, + document_tokenizer, + fields_ids_map, + cached_sorter, + document_change?, + ) + .map_err(Arc::new) + }) }, )?; } @@ -382,7 +379,7 @@ impl WordDocidsExtractors { tracing::trace_span!(target: "indexing::documents::extract", "merger_building"); let _entered = span.enter(); let mut builder = WordDocidsMergerBuilders::new(); - for (_rtxn, _tokenizer, _fields_ids_map, cache) in context_pool.into_items() { + for (_tokenizer, _fields_ids_map, cache) in context_pool.into_items() { builder.add_sorters(cache)?; } diff --git a/milli/src/update/new/extract/searchable/mod.rs b/milli/src/update/new/extract/searchable/mod.rs index b3f27ec78..b6cda3a87 100644 --- a/milli/src/update/new/extract/searchable/mod.rs +++ b/milli/src/update/new/extract/searchable/mod.rs @@ -60,7 +60,6 @@ pub trait SearchableExtractor { let context_pool = ItemsPool::new(|| { Ok(( - index.read_txn().map_err(Error::from).map_err(Arc::new)?, &document_tokenizer, fields_ids_map.clone(), CboCachedSorter::new( @@ -82,22 +81,20 @@ pub trait SearchableExtractor { let span = tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction"); let _entered = span.enter(); - document_changes.into_par_iter().try_for_each_try_init( - || Ok(()), - |_, document_change| { - context_pool.with( - |(rtxn, document_tokenizer, fields_ids_map, cached_sorter)| { - Self::extract_document_change( - &*rtxn, - 
index, - document_tokenizer, - fields_ids_map, - cached_sorter, - document_change?, - ) - .map_err(Arc::new) - }, - ) + document_changes.into_par_iter().try_arc_for_each_try_init( + || index.read_txn().map_err(Error::from), + |rtxn, document_change| { + context_pool.with(|(document_tokenizer, fields_ids_map, cached_sorter)| { + Self::extract_document_change( + rtxn, + index, + document_tokenizer, + fields_ids_map, + cached_sorter, + document_change?, + ) + .map_err(Arc::new) + }) }, )?; } @@ -110,7 +107,7 @@ pub trait SearchableExtractor { let readers: Vec<_> = context_pool .into_items() .par_bridge() - .map(|(_rtxn, _tokenizer, _fields_ids_map, cached_sorter)| { + .map(|(_tokenizer, _fields_ids_map, cached_sorter)| { let sorter = cached_sorter.into_sorter()?; sorter.into_reader_cursors() }) diff --git a/milli/src/update/new/indexer/document_operation.rs b/milli/src/update/new/indexer/document_operation.rs index b2dc67ce1..38d4a408f 100644 --- a/milli/src/update/new/indexer/document_operation.rs +++ b/milli/src/update/new/indexer/document_operation.rs @@ -5,11 +5,10 @@ use std::sync::Arc; use heed::types::Bytes; use heed::RoTxn; use memmap2::Mmap; -use rayon::iter::{IndexedParallelIterator, IntoParallelIterator, ParallelIterator}; +use rayon::iter::{IndexedParallelIterator, IntoParallelIterator}; use IndexDocumentsMethod as Idm; use super::super::document_change::DocumentChange; -use super::super::items_pool::ItemsPool; use super::super::{CowStr, TopLevelMap}; use super::DocumentChanges; use crate::documents::{DocumentIdExtractionError, PrimaryKey}; diff --git a/milli/src/update/new/indexer/mod.rs b/milli/src/update/new/indexer/mod.rs index d6064e4fb..934d0a364 100644 --- a/milli/src/update/new/indexer/mod.rs +++ b/milli/src/update/new/indexer/mod.rs @@ -81,7 +81,8 @@ where // document but we need to create a function that collects and compresses documents. let document_sender = extractor_sender.document_sender(); - document_changes.clone().into_par_iter().try_for_each_try_init(|| Ok(()) as Result<_>, |_, result| { + document_changes.clone().into_par_iter().try_arc_for_each::<_, Error>( + |result| { match result? { DocumentChange::Deletion(deletion) => { let docid = deletion.docid(); @@ -99,7 +100,7 @@ where // extracted_dictionary_sender.send(self, dictionary: &[u8]); } } - Ok(()) as std::result::Result<_, Arc<_>> + Ok(()) })?; document_sender.finish().unwrap(); diff --git a/milli/src/update/new/indexer/partial_dump.rs b/milli/src/update/new/indexer/partial_dump.rs index aa01f6547..db63256a6 100644 --- a/milli/src/update/new/indexer/partial_dump.rs +++ b/milli/src/update/new/indexer/partial_dump.rs @@ -1,6 +1,6 @@ use std::sync::Arc; -use rayon::iter::{IndexedParallelIterator, ParallelBridge, ParallelIterator}; +use rayon::iter::IndexedParallelIterator; use super::DocumentChanges; use crate::documents::{DocumentIdExtractionError, PrimaryKey}; diff --git a/milli/src/update/new/items_pool.rs b/milli/src/update/new/items_pool.rs index 649f09105..8fa22b75b 100644 --- a/milli/src/update/new/items_pool.rs +++ b/milli/src/update/new/items_pool.rs @@ -1,4 +1,3 @@ -use std::convert::identity; use std::sync::Arc; use crossbeam_channel::{Receiver, Sender, TryRecvError}; @@ -38,7 +37,7 @@ pub trait ParallelIteratorExt: ParallelIterator { /// A method to run a closure of all the items and return an owned error. /// /// The init function is ran only as necessary which is basically once by thread. 
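The method renamed just below keeps the same shape for `for_each`: the init value is reused for every item a worker thread handles, so a failed init must yield an error that can be handed out repeatedly, which is why it is stored as an `Arc`. Once the parallel loop has finished, `Arc::into_inner` recovers the owned error, so callers still see a plain `Result<(), E>`. A usage sketch, assuming `document_changes` yields plain `DocumentChange`s and `extract_one` stands in for the real per-document extraction:

    use std::sync::Arc;

    document_changes.into_par_iter().try_arc_for_each_try_init(
        // Open one read transaction per worker thread, as the extractors above do.
        || index.read_txn().map_err(Error::from),
        // Wrap any extraction error in an Arc so it can be shared across the pool.
        |rtxn, change| extract_one(rtxn, change).map_err(Arc::new),
    )?;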
- fn try_for_each_try_init(self, init: INIT, op: F) -> Result<(), E> + fn try_arc_for_each_try_init(self, init: INIT, op: F) -> Result<(), E> where E: Send + Sync, F: Fn(&mut T, Self::Item) -> Result<(), Arc> + Sync + Send + Clone, @@ -60,6 +59,17 @@ pub trait ParallelIteratorExt: ParallelIterator { Err(err) => Err(Arc::into_inner(err).expect("the error must be only owned by us")), } } + + fn try_arc_for_each(self, op: F) -> Result<(), E> + where + E: Send + Sync, + F: Fn(Self::Item) -> Result<(), Arc> + Sync + Send + Clone, + { + match self.try_for_each(op) { + Ok(()) => Ok(()), + Err(err) => Err(Arc::into_inner(err).expect("the error must be only owned by us")), + } + } } impl ParallelIteratorExt for T {} From 0a8cb471dff621c963d9d86d18b6d027c893ad1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 30 Sep 2024 23:50:43 +0200 Subject: [PATCH 112/247] Introduce the AppendOnlyVec struct for the parallel computing --- milli/src/update/new/append_only_vec.rs | 327 ++++++++++++++++++++++++ milli/src/update/new/mod.rs | 1 + 2 files changed, 328 insertions(+) create mode 100644 milli/src/update/new/append_only_vec.rs diff --git a/milli/src/update/new/append_only_vec.rs b/milli/src/update/new/append_only_vec.rs new file mode 100644 index 000000000..fe05dd782 --- /dev/null +++ b/milli/src/update/new/append_only_vec.rs @@ -0,0 +1,327 @@ +// Code taken from +// and modified in order to get a ref mut instead of the index of newly inserted items. + +//! AppendOnlyVec +//! +//! This is a pretty simple type, which is a vector that you can push into and +//! receive a reference to the item you just inserted. The data structure never +//! moves an element once allocated, so you can push to the vec even while holding +//! mutable references to elements that have already been pushed. +//! +//! ### Scaling +//! +//! 1. Accessing an element is O(1), but slightly more expensive than for a +//! standard `Vec`. +//! +//! 2. Pushing a new element amortizes to O(1), but may require allocation of a +//! new chunk. +//! +//! ### Example +//! +//! ``` +//! use append_only_vec::AppendOnlyVec; +//! +//! static V: AppendOnlyVec = AppendOnlyVec::::new(); +//! let mut threads = Vec::new(); +//! for thread_num in 0..10 { +//! threads.push(std::thread::spawn(move || { +//! for n in 0..100 { +//! let s = format!("thread {} says {}", thread_num, n); +//! let which = V.push(s.clone()); +//! assert_eq!(&which, &s); +//! } +//! })); +//! } +//! +//! for t in threads { +//! t.join(); +//! } +//! +//! assert_eq!(V.len(), 1000); +//! 
``` + +use std::cell::UnsafeCell; +use std::fmt::Debug; +use std::ptr; +use std::sync::atomic::{AtomicUsize, Ordering}; + +pub struct AppendOnlyVec { + count: AtomicUsize, + _reserved: AtomicUsize, + data: [UnsafeCell<*mut T>; BITS_USED - 1 - 3], +} + +unsafe impl Send for AppendOnlyVec {} +unsafe impl Sync for AppendOnlyVec {} + +const BITS: usize = std::mem::size_of::() * 8; + +#[cfg(target_arch = "x86_64")] +const BITS_USED: usize = 48; +#[cfg(all(not(target_arch = "x86_64"), target_pointer_width = "64"))] +const BITS_USED: usize = 64; +#[cfg(target_pointer_width = "32")] +const BITS_USED: usize = 32; + +// This takes an index into a vec, and determines which data array will hold it +// (the first return value), and what the index will be into that data array +// (second return value) +// +// The ith data array holds 1< (u32, usize) { + let i = i + 8; + let bin = BITS as u32 - 1 - i.leading_zeros(); + let bin = bin - 3; + let offset = i - bin_size(bin); + (bin, offset) +} + +const fn bin_size(array: u32) -> usize { + (1 << 3) << array +} + +#[test] +fn test_indices() { + for i in 0..32 { + println!("{:3}: {} {}", i, indices(i).0, indices(i).1); + } + let mut array = 0; + let mut offset = 0; + let mut index = 0; + while index < 1000 { + index += 1; + offset += 1; + if offset >= bin_size(array) { + offset = 0; + array += 1; + } + assert_eq!(indices(index), (array, offset)); + } +} + +impl Default for AppendOnlyVec { + fn default() -> Self { + Self::new() + } +} + +impl AppendOnlyVec { + const EMPTY: UnsafeCell<*mut T> = UnsafeCell::new(ptr::null_mut()); + + /// Allocate a new empty array. + pub const fn new() -> Self { + AppendOnlyVec { + count: AtomicUsize::new(0), + _reserved: AtomicUsize::new(0), + data: [Self::EMPTY; BITS_USED - 1 - 3], + } + } + + /// Find the length of the array. + #[inline] + pub fn len(&self) -> usize { + self.count.load(Ordering::Acquire) + } + + fn layout(array: u32) -> std::alloc::Layout { + std::alloc::Layout::array::(bin_size(array)).unwrap() + } + + /// Append an element to the array and get a mutable ref to it. + /// + /// This is notable in that it doesn't require a `&mut self`, because it + /// does appropriate atomic synchronization. + pub fn push(&self, val: T) -> &mut T { + let idx = self._reserved.fetch_add(1, Ordering::Relaxed); + let (array, offset) = indices(idx); + let ptr = if self.len() < 1 + idx - offset { + // We are working on a new array, which may not have been allocated... + if offset == 0 { + // It is our job to allocate the array! The size of the array + // is determined in the self.layout method, which needs to be + // consistent with the indices function. + let layout = Self::layout(array); + let ptr = unsafe { std::alloc::alloc(layout) } as *mut T; + unsafe { + *self.data[array as usize].get() = ptr; + } + ptr + } else { + // We need to wait for the array to be allocated. + while self.len() < 1 + idx - offset { + std::hint::spin_loop(); + } + // The Ordering::Acquire semantics of self.len() ensures that + // this pointer read will get the non-null pointer allocated + // above. + unsafe { *self.data[array as usize].get() } + } + } else { + // The Ordering::Acquire semantics of self.len() ensures that + // this pointer read will get the non-null pointer allocated + // above. 
+ unsafe { *self.data[array as usize].get() } + }; + + // The contents of this offset are guaranteed to be unused (so far) + // because we got the idx from our fetch_add above, and ptr is + // guaranteed to be valid because of the loop we used above, which used + // self.len() which has Ordering::Acquire semantics. + unsafe { (ptr.add(offset)).write(val) }; + + // Now we need to increase the size of the vec, so it can get read. We + // use Release upon success, to ensure that the value which we wrote is + // visible to any thread that has confirmed that the count is big enough + // to read that element. In case of failure, we can be relaxed, since + // we don't do anything with the result other than try again. + while self + .count + .compare_exchange(idx, idx + 1, Ordering::Release, Ordering::Relaxed) + .is_err() + { + // This means that someone else *started* pushing before we started, + // but hasn't yet finished. We have to wait for them to finish + // pushing before we can update the count. Note that using a + // spinloop here isn't really ideal, but except when allocating a + // new array, the window between reserving space and using it is + // pretty small, so contention will hopefully be rare, and having a + // context switch during that interval will hopefully be vanishingly + // unlikely. + std::hint::spin_loop(); + } + + unsafe { &mut *ptr } + } + + /// Convert into a standard `Vec`. + pub fn into_vec(self) -> Vec { + let mut vec = Vec::with_capacity(self.len()); + + for idx in 0..self.len() { + let (array, offset) = indices(idx); + // We use a Relaxed load of the pointer, because the loop above (which + // ends before `self.len()`) should ensure that the data we want is + // already visible, since it Acquired `self.count` which synchronizes + // with the write in `self.push`. + let ptr = unsafe { *self.data[array as usize].get() }; + + // Copy the element value. The copy remaining in the array must not + // be used again (i.e. make sure we do not drop it) + let value = unsafe { ptr.add(offset).read() }; + + vec.push(value); + } + + // Prevent dropping the copied-out values by marking the count as 0 before + // our own drop is run + self.count.store(0, Ordering::Relaxed); + + vec + } +} + +impl Debug for AppendOnlyVec { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("AppendOnlyVec").field("len", &self.len()).finish() + } +} + +impl Drop for AppendOnlyVec { + fn drop(&mut self) { + // First we'll drop all the `T` in a slightly sloppy way. FIXME this + // could be optimized to avoid reloading the `ptr`. + for idx in 0..self.len() { + let (array, offset) = indices(idx); + // We use a Relaxed load of the pointer, because the loop above (which + // ends before `self.len()`) should ensure that the data we want is + // already visible, since it Acquired `self.count` which synchronizes + // with the write in `self.push`. + let ptr = unsafe { *self.data[array as usize].get() }; + unsafe { + ptr::drop_in_place(ptr.add(offset)); + } + } + // Now we will free all the arrays. + for array in 0..self.data.len() as u32 { + // This load is relaxed because no other thread can have a reference + // to Self because we have a &mut self. 
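// The `push`/`len` pair above is the classic Release/Acquire publication
// idiom: a reader that observes the bumped `count` through an Acquire load is
// guaranteed to also see the element written before the Release store. A
// minimal, self-contained sketch of that idiom (illustrative only, not milli
// code), assuming a single publishing thread:
mod publication_sketch {
    use std::cell::UnsafeCell;
    use std::sync::atomic::{AtomicBool, Ordering};

    pub struct PublishOne {
        slot: UnsafeCell<u64>,
        ready: AtomicBool,
    }

    // Sound only under the single-writer assumption stated above.
    unsafe impl Sync for PublishOne {}

    impl PublishOne {
        pub const fn new() -> Self {
            PublishOne { slot: UnsafeCell::new(0), ready: AtomicBool::new(false) }
        }

        pub fn publish(&self, value: u64) {
            unsafe { *self.slot.get() = value }; // plain write into the slot
            self.ready.store(true, Ordering::Release); // then publish it
        }

        pub fn read(&self) -> Option<u64> {
            if self.ready.load(Ordering::Acquire) {
                // Synchronizes with the Release store: the slot write is visible.
                Some(unsafe { *self.slot.get() })
            } else {
                None
            }
        }
    }
}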
+ let ptr = unsafe { *self.data[array as usize].get() }; + if !ptr.is_null() { + let layout = Self::layout(array); + unsafe { std::alloc::dealloc(ptr as *mut u8, layout) }; + } else { + break; + } + } + } +} + +impl IntoIterator for AppendOnlyVec { + type Item = T; + type IntoIter = std::vec::IntoIter; + + fn into_iter(self) -> Self::IntoIter { + self.into_vec().into_iter() + } +} + +#[test] +fn test_parallel_pushing() { + use std::sync::Arc; + let v = Arc::new(AppendOnlyVec::::new()); + let mut threads = Vec::new(); + const N: u64 = 100; + for thread_num in 0..N { + let v = v.clone(); + threads.push(std::thread::spawn(move || { + let which1 = v.push(thread_num); + let which2 = v.push(thread_num); + assert_eq!(*which1, thread_num); + assert_eq!(*which2, thread_num); + })); + } + for t in threads { + t.join().unwrap(); + } + let v = Arc::into_inner(v).unwrap().into_vec(); + for thread_num in 0..N { + assert_eq!(2, v.iter().copied().filter(|&x| x == thread_num).count()); + } +} + +#[test] +fn test_into_vec() { + struct SafeToDrop(bool); + + impl Drop for SafeToDrop { + fn drop(&mut self) { + assert!(self.0); + } + } + + let v = AppendOnlyVec::new(); + + for _ in 0..50 { + v.push(SafeToDrop(false)); + } + + let mut v = v.into_vec(); + assert_eq!(v.len(), 50); + + for i in v.iter_mut() { + i.0 = true; + } +} + +#[test] +fn test_push_then_index_mut() { + let v = AppendOnlyVec::::new(); + for i in 0..1024 { + *v.push(i) += 1; + } + + let v = v.into_vec(); + for i in 0..1024 { + assert_eq!(v[i], 2 * i); + } +} diff --git a/milli/src/update/new/mod.rs b/milli/src/update/new/mod.rs index 98b60378f..6a48e0407 100644 --- a/milli/src/update/new/mod.rs +++ b/milli/src/update/new/mod.rs @@ -5,6 +5,7 @@ pub use top_level_map::{CowStr, TopLevelMap}; use super::del_add::DelAdd; use crate::FieldId; +mod append_only_vec; mod channel; mod document_change; mod extract; From dead7a56a32b2331ac9886f793dfea5b653ef7ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 1 Oct 2024 11:07:56 +0200 Subject: [PATCH 113/247] Keep the caches in the AppendOnlyVec --- milli/src/update/new/append_only_vec.rs | 12 ++-- .../new/extract/faceted/extract_facets.rs | 69 +++++++++---------- .../extract/searchable/extract_word_docids.rs | 50 +++++++------- .../src/update/new/extract/searchable/mod.rs | 67 +++++++++--------- 4 files changed, 95 insertions(+), 103 deletions(-) diff --git a/milli/src/update/new/append_only_vec.rs b/milli/src/update/new/append_only_vec.rs index fe05dd782..d4a30c1b1 100644 --- a/milli/src/update/new/append_only_vec.rs +++ b/milli/src/update/new/append_only_vec.rs @@ -99,12 +99,6 @@ fn test_indices() { } } -impl Default for AppendOnlyVec { - fn default() -> Self { - Self::new() - } -} - impl AppendOnlyVec { const EMPTY: UnsafeCell<*mut T> = UnsafeCell::new(ptr::null_mut()); @@ -220,6 +214,12 @@ impl AppendOnlyVec { } } +impl Default for AppendOnlyVec { + fn default() -> Self { + Self::new() + } +} + impl Debug for AppendOnlyVec { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("AppendOnlyVec").field("len", &self.len()).finish() diff --git a/milli/src/update/new/extract/faceted/extract_facets.rs b/milli/src/update/new/extract/faceted/extract_facets.rs index 40f561b97..ef983c4e6 100644 --- a/milli/src/update/new/extract/faceted/extract_facets.rs +++ b/milli/src/update/new/extract/faceted/extract_facets.rs @@ -12,6 +12,7 @@ use super::super::cache::CboCachedSorter; use super::facet_document::extract_document_facets; use super::FacetKind; use 
crate::facet::value_encoding::f64_into_bytes; +use crate::update::new::append_only_vec::AppendOnlyVec; use crate::update::new::extract::DocidsExtractor; use crate::update::new::items_pool::ParallelIteratorExt; use crate::update::new::{DocumentChange, ItemsPool}; @@ -209,45 +210,40 @@ impl DocidsExtractor for FacetedDocidsExtractor { let attributes_to_extract = Self::attributes_to_extract(&rtxn, index)?; let attributes_to_extract: Vec<_> = attributes_to_extract.iter().map(|s| s.as_ref()).collect(); - - let context_pool = ItemsPool::new(|| { - Ok(( - fields_ids_map.clone(), - Vec::new(), - CboCachedSorter::new( - // TODO use a better value - 100.try_into().unwrap(), - create_sorter( - grenad::SortAlgorithm::Stable, - MergeDeladdCboRoaringBitmaps, - indexer.chunk_compression_type, - indexer.chunk_compression_level, - indexer.max_nb_chunks, - max_memory, - ), - ), - )) - }); + let caches = AppendOnlyVec::new(); { let span = tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction"); let _entered = span.enter(); document_changes.into_par_iter().try_arc_for_each_try_init( - || index.read_txn().map_err(Error::from), - |rtxn, document_change| { - context_pool.with(|(fields_ids_map, buffer, cached_sorter)| { - Self::extract_document_change( - rtxn, - index, - buffer, - fields_ids_map, - &attributes_to_extract, - cached_sorter, - document_change?, - ) - .map_err(Arc::new) - }) + || { + let rtxn = index.read_txn().map_err(Error::from)?; + let cache = caches.push(CboCachedSorter::new( + // TODO use a better value + 100.try_into().unwrap(), + create_sorter( + grenad::SortAlgorithm::Stable, + MergeDeladdCboRoaringBitmaps, + indexer.chunk_compression_type, + indexer.chunk_compression_level, + indexer.max_nb_chunks, + max_memory, + ), + )); + Ok((rtxn, fields_ids_map.clone(), Vec::new(), cache)) + }, + |(rtxn, fields_ids_map, buffer, cached_sorter), document_change| { + Self::extract_document_change( + rtxn, + index, + buffer, + fields_ids_map, + &attributes_to_extract, + cached_sorter, + document_change?, + ) + .map_err(Arc::new) }, )?; } @@ -257,14 +253,15 @@ impl DocidsExtractor for FacetedDocidsExtractor { tracing::trace_span!(target: "indexing::documents::extract", "merger_building"); let _entered = span.enter(); - let readers: Vec<_> = context_pool - .into_items() + let readers: Vec<_> = caches + .into_iter() .par_bridge() - .map(|(_tokenizer, _fields_ids_map, cached_sorter)| { + .map(|cached_sorter| { let sorter = cached_sorter.into_sorter()?; sorter.into_reader_cursors() }) .collect(); + for reader in readers { builder.extend(reader?); } diff --git a/milli/src/update/new/extract/searchable/extract_word_docids.rs b/milli/src/update/new/extract/searchable/extract_word_docids.rs index f59f5a03d..a19ac3891 100644 --- a/milli/src/update/new/extract/searchable/extract_word_docids.rs +++ b/milli/src/update/new/extract/searchable/extract_word_docids.rs @@ -8,6 +8,8 @@ use heed::RoTxn; use rayon::iter::IntoParallelIterator; use super::tokenize_document::{tokenizer_builder, DocumentTokenizer}; +use super::SearchableExtractor; +use crate::update::new::append_only_vec::AppendOnlyVec; use crate::update::new::extract::cache::CboCachedSorter; use crate::update::new::extract::perm_json_p::contained_in; use crate::update::new::items_pool::ParallelIteratorExt; @@ -339,37 +341,33 @@ impl WordDocidsExtractors { max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE, }; - let context_pool = ItemsPool::new(|| { - Ok(( - &document_tokenizer, - fields_ids_map.clone(), - 
WordDocidsCachedSorters::new( - indexer, - max_memory, - // TODO use a better value - 200_000.try_into().unwrap(), - ), - )) - }); + let caches = AppendOnlyVec::new(); { let span = tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction"); let _entered = span.enter(); document_changes.into_par_iter().try_arc_for_each_try_init( - || index.read_txn().map_err(Error::from), - |rtxn, document_change| { - context_pool.with(|(document_tokenizer, fields_ids_map, cached_sorter)| { - Self::extract_document_change( - rtxn, - index, - document_tokenizer, - fields_ids_map, - cached_sorter, - document_change?, - ) - .map_err(Arc::new) - }) + || { + let rtxn = index.read_txn().map_err(Error::from)?; + let cache = caches.push(WordDocidsCachedSorters::new( + indexer, + max_memory, + // TODO use a better value + 200_000.try_into().unwrap(), + )); + Ok((rtxn, &document_tokenizer, fields_ids_map.clone(), cache)) + }, + |(rtxn, document_tokenizer, fields_ids_map, cached_sorter), document_change| { + Self::extract_document_change( + rtxn, + index, + document_tokenizer, + fields_ids_map, + cached_sorter, + document_change?, + ) + .map_err(Arc::new) }, )?; } @@ -379,7 +377,7 @@ impl WordDocidsExtractors { tracing::trace_span!(target: "indexing::documents::extract", "merger_building"); let _entered = span.enter(); let mut builder = WordDocidsMergerBuilders::new(); - for (_tokenizer, _fields_ids_map, cache) in context_pool.into_items() { + for cache in caches.into_iter() { builder.add_sorters(cache)?; } diff --git a/milli/src/update/new/extract/searchable/mod.rs b/milli/src/update/new/extract/searchable/mod.rs index b6cda3a87..f09f573e0 100644 --- a/milli/src/update/new/extract/searchable/mod.rs +++ b/milli/src/update/new/extract/searchable/mod.rs @@ -14,6 +14,7 @@ use tokenize_document::{tokenizer_builder, DocumentTokenizer}; use super::cache::CboCachedSorter; use super::DocidsExtractor; +use crate::update::new::append_only_vec::AppendOnlyVec; use crate::update::new::items_pool::ParallelIteratorExt; use crate::update::new::{DocumentChange, ItemsPool}; use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps}; @@ -57,44 +58,39 @@ pub trait SearchableExtractor { localized_attributes_rules: &localized_attributes_rules, max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE, }; - - let context_pool = ItemsPool::new(|| { - Ok(( - &document_tokenizer, - fields_ids_map.clone(), - CboCachedSorter::new( - // TODO use a better value - 1_000_000.try_into().unwrap(), - create_sorter( - grenad::SortAlgorithm::Stable, - MergeDeladdCboRoaringBitmaps, - indexer.chunk_compression_type, - indexer.chunk_compression_level, - indexer.max_nb_chunks, - max_memory, - ), - ), - )) - }); + let caches = AppendOnlyVec::new(); { let span = tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction"); let _entered = span.enter(); document_changes.into_par_iter().try_arc_for_each_try_init( - || index.read_txn().map_err(Error::from), - |rtxn, document_change| { - context_pool.with(|(document_tokenizer, fields_ids_map, cached_sorter)| { - Self::extract_document_change( - rtxn, - index, - document_tokenizer, - fields_ids_map, - cached_sorter, - document_change?, - ) - .map_err(Arc::new) - }) + || { + let rtxn = index.read_txn().map_err(Error::from)?; + let cache = caches.push(CboCachedSorter::new( + // TODO use a better value + 1_000_000.try_into().unwrap(), + create_sorter( + grenad::SortAlgorithm::Stable, + MergeDeladdCboRoaringBitmaps, + indexer.chunk_compression_type, + 
indexer.chunk_compression_level, + indexer.max_nb_chunks, + max_memory, + ), + )); + Ok((rtxn, &document_tokenizer, fields_ids_map.clone(), cache)) + }, + |(rtxn, document_tokenizer, fields_ids_map, cached_sorter), document_change| { + Self::extract_document_change( + rtxn, + index, + document_tokenizer, + fields_ids_map, + cached_sorter, + document_change?, + ) + .map_err(Arc::new) }, )?; } @@ -104,14 +100,15 @@ pub trait SearchableExtractor { tracing::trace_span!(target: "indexing::documents::extract", "merger_building"); let _entered = span.enter(); - let readers: Vec<_> = context_pool - .into_items() + let readers: Vec<_> = caches + .into_iter() .par_bridge() - .map(|(_tokenizer, _fields_ids_map, cached_sorter)| { + .map(|cached_sorter| { let sorter = cached_sorter.into_sorter()?; sorter.into_reader_cursors() }) .collect(); + for reader in readers { builder.extend(reader?); } From b7a5ba100edc4bb7e065b41908b1f949b4f2ff3e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 1 Oct 2024 11:10:18 +0200 Subject: [PATCH 114/247] Move the ParallelIteratorExt into the parallel_iterator_ext module --- .../new/extract/faceted/extract_facets.rs | 4 +- .../extract/searchable/extract_word_docids.rs | 4 +- .../src/update/new/extract/searchable/mod.rs | 4 +- .../update/new/indexer/document_deletion.rs | 2 +- .../update/new/indexer/document_operation.rs | 2 +- milli/src/update/new/indexer/mod.rs | 2 +- milli/src/update/new/indexer/partial_dump.rs | 2 +- milli/src/update/new/mod.rs | 3 +- ...items_pool.rs => parallel_iterator_ext.rs} | 54 ------------------- 9 files changed, 11 insertions(+), 66 deletions(-) rename milli/src/update/new/{items_pool.rs => parallel_iterator_ext.rs} (59%) diff --git a/milli/src/update/new/extract/faceted/extract_facets.rs b/milli/src/update/new/extract/faceted/extract_facets.rs index ef983c4e6..e4e6f7010 100644 --- a/milli/src/update/new/extract/faceted/extract_facets.rs +++ b/milli/src/update/new/extract/faceted/extract_facets.rs @@ -14,8 +14,8 @@ use super::FacetKind; use crate::facet::value_encoding::f64_into_bytes; use crate::update::new::append_only_vec::AppendOnlyVec; use crate::update::new::extract::DocidsExtractor; -use crate::update::new::items_pool::ParallelIteratorExt; -use crate::update::new::{DocumentChange, ItemsPool}; +use crate::update::new::parallel_iterator_ext::ParallelIteratorExt; +use crate::update::new::DocumentChange; use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps}; use crate::{ DocumentId, Error, FieldId, GlobalFieldsIdsMap, Index, Result, MAX_FACET_VALUE_LENGTH, diff --git a/milli/src/update/new/extract/searchable/extract_word_docids.rs b/milli/src/update/new/extract/searchable/extract_word_docids.rs index a19ac3891..f4346ba52 100644 --- a/milli/src/update/new/extract/searchable/extract_word_docids.rs +++ b/milli/src/update/new/extract/searchable/extract_word_docids.rs @@ -12,8 +12,8 @@ use super::SearchableExtractor; use crate::update::new::append_only_vec::AppendOnlyVec; use crate::update::new::extract::cache::CboCachedSorter; use crate::update::new::extract::perm_json_p::contained_in; -use crate::update::new::items_pool::ParallelIteratorExt; -use crate::update::new::{DocumentChange, ItemsPool}; +use crate::update::new::parallel_iterator_ext::ParallelIteratorExt; +use crate::update::new::DocumentChange; use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps}; use crate::{ bucketed_position, DocumentId, Error, FieldId, GlobalFieldsIdsMap, Index, Result, diff 
--git a/milli/src/update/new/extract/searchable/mod.rs b/milli/src/update/new/extract/searchable/mod.rs index f09f573e0..b3fa646b9 100644 --- a/milli/src/update/new/extract/searchable/mod.rs +++ b/milli/src/update/new/extract/searchable/mod.rs @@ -15,8 +15,8 @@ use tokenize_document::{tokenizer_builder, DocumentTokenizer}; use super::cache::CboCachedSorter; use super::DocidsExtractor; use crate::update::new::append_only_vec::AppendOnlyVec; -use crate::update::new::items_pool::ParallelIteratorExt; -use crate::update::new::{DocumentChange, ItemsPool}; +use crate::update::new::parallel_iterator_ext::ParallelIteratorExt; +use crate::update::new::DocumentChange; use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps}; use crate::{Error, GlobalFieldsIdsMap, Index, Result, MAX_POSITION_PER_ATTRIBUTE}; diff --git a/milli/src/update/new/indexer/document_deletion.rs b/milli/src/update/new/indexer/document_deletion.rs index eab4331b6..400b51af6 100644 --- a/milli/src/update/new/indexer/document_deletion.rs +++ b/milli/src/update/new/indexer/document_deletion.rs @@ -4,7 +4,7 @@ use rayon::iter::{IndexedParallelIterator, IntoParallelIterator}; use roaring::RoaringBitmap; use super::DocumentChanges; -use crate::update::new::items_pool::ParallelIteratorExt as _; +use crate::update::new::parallel_iterator_ext::ParallelIteratorExt as _; use crate::update::new::{Deletion, DocumentChange}; use crate::{Error, FieldsIdsMap, Index, Result}; diff --git a/milli/src/update/new/indexer/document_operation.rs b/milli/src/update/new/indexer/document_operation.rs index 38d4a408f..f9e1bb8f3 100644 --- a/milli/src/update/new/indexer/document_operation.rs +++ b/milli/src/update/new/indexer/document_operation.rs @@ -12,7 +12,7 @@ use super::super::document_change::DocumentChange; use super::super::{CowStr, TopLevelMap}; use super::DocumentChanges; use crate::documents::{DocumentIdExtractionError, PrimaryKey}; -use crate::update::new::items_pool::ParallelIteratorExt as _; +use crate::update::new::parallel_iterator_ext::ParallelIteratorExt as _; use crate::update::new::{Deletion, Insertion, KvReaderFieldId, KvWriterFieldId, Update}; use crate::update::{AvailableIds, IndexDocumentsMethod}; use crate::{DocumentId, Error, FieldsIdsMap, Index, Result, UserError}; diff --git a/milli/src/update/new/indexer/mod.rs b/milli/src/update/new/indexer/mod.rs index 934d0a364..28165c3a8 100644 --- a/milli/src/update/new/indexer/mod.rs +++ b/milli/src/update/new/indexer/mod.rs @@ -22,7 +22,7 @@ use super::{StdResult, TopLevelMap}; use crate::documents::{PrimaryKey, DEFAULT_PRIMARY_KEY}; use crate::update::new::channel::ExtractorSender; use crate::update::settings::InnerIndexSettings; -use crate::update::new::items_pool::ParallelIteratorExt; +use crate::update::new::parallel_iterator_ext::ParallelIteratorExt; use crate::update::GrenadParameters; use crate::{Error, FieldsIdsMap, GlobalFieldsIdsMap, Index, Result, UserError}; diff --git a/milli/src/update/new/indexer/partial_dump.rs b/milli/src/update/new/indexer/partial_dump.rs index db63256a6..325e13cc4 100644 --- a/milli/src/update/new/indexer/partial_dump.rs +++ b/milli/src/update/new/indexer/partial_dump.rs @@ -5,7 +5,7 @@ use rayon::iter::IndexedParallelIterator; use super::DocumentChanges; use crate::documents::{DocumentIdExtractionError, PrimaryKey}; use crate::update::concurrent_available_ids::ConcurrentAvailableIds; -use crate::update::new::items_pool::ParallelIteratorExt; +use crate::update::new::parallel_iterator_ext::ParallelIteratorExt; use 
crate::update::new::{DocumentChange, Insertion, KvWriterFieldId}; use crate::{all_obkv_to_json, Error, FieldsIdsMap, Object, Result, UserError}; diff --git a/milli/src/update/new/mod.rs b/milli/src/update/new/mod.rs index 6a48e0407..264241caa 100644 --- a/milli/src/update/new/mod.rs +++ b/milli/src/update/new/mod.rs @@ -1,5 +1,4 @@ pub use document_change::{Deletion, DocumentChange, Insertion, Update}; -pub use items_pool::ItemsPool; pub use top_level_map::{CowStr, TopLevelMap}; use super::del_add::DelAdd; @@ -10,8 +9,8 @@ mod channel; mod document_change; mod extract; pub mod indexer; -mod items_pool; mod merger; +mod parallel_iterator_ext; mod top_level_map; mod word_fst_builder; mod words_prefix_docids; diff --git a/milli/src/update/new/items_pool.rs b/milli/src/update/new/parallel_iterator_ext.rs similarity index 59% rename from milli/src/update/new/items_pool.rs rename to milli/src/update/new/parallel_iterator_ext.rs index 8fa22b75b..043457cfd 100644 --- a/milli/src/update/new/items_pool.rs +++ b/milli/src/update/new/parallel_iterator_ext.rs @@ -1,6 +1,5 @@ use std::sync::Arc; -use crossbeam_channel::{Receiver, Sender, TryRecvError}; use rayon::iter::{MapInit, ParallelIterator}; pub trait ParallelIteratorExt: ParallelIterator { @@ -73,56 +72,3 @@ pub trait ParallelIteratorExt: ParallelIterator { } impl ParallelIteratorExt for T {} - -/// A pool of items that can be pull and generated on demand. -pub struct ItemsPool -where - F: Fn() -> Result, -{ - init: F, - sender: Sender, - receiver: Receiver, -} - -impl ItemsPool -where - F: Fn() -> Result, -{ - /// Create a new unbounded items pool with the specified function - /// to generate items when needed. - /// - /// The `init` function will be invoked whenever a call to `with` requires new items. - pub fn new(init: F) -> Self { - let (sender, receiver) = crossbeam_channel::unbounded(); - ItemsPool { init, sender, receiver } - } - - /// Consumes the pool to retrieve all remaining items. - /// - /// This method is useful for cleaning up and managing the items once they are no longer needed. - pub fn into_items(self) -> crossbeam_channel::IntoIter { - self.receiver.into_iter() - } - - /// Allows running a function on an item from the pool, - /// potentially generating a new item if the pool is empty. 
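// The pattern that replaces `ItemsPool` in the extractors above, sketched in
// isolation (assumes `rayon` plus the `AppendOnlyVec` type introduced earlier
// in this series; illustrative only): each worker lazily builds its own cache
// in the `init` closure, `push` hands back a `&mut` to it, and all caches are
// gathered once the parallel loop is done.
use rayon::iter::{IntoParallelIterator, ParallelIterator};

fn collect_per_worker_caches(items: Vec<u32>) -> Vec<Vec<u32>> {
    let caches: AppendOnlyVec<Vec<u32>> = AppendOnlyVec::new();
    items.into_par_iter().for_each_init(
        || caches.push(Vec::new()),     // one cache per rayon worker task
        |cache, item| cache.push(item), // `cache` derefs to that worker's Vec
    );
    caches.into_iter().collect()        // gather every worker's cache
}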
- pub fn with(&self, f: G) -> Result - where - G: FnOnce(&mut T) -> Result, - { - let mut item = match self.receiver.try_recv() { - Ok(item) => item, - Err(TryRecvError::Empty) => (self.init)()?, - Err(TryRecvError::Disconnected) => unreachable!(), - }; - - // Run the user's closure with the retrieved item - let result = f(&mut item); - - if let Err(e) = self.sender.send(item) { - unreachable!("error when sending into channel {e}"); - } - - result - } -} From d79f75f63045586aa7b027c24e8157be3f7bcac1 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 2 Oct 2024 11:32:19 +0200 Subject: [PATCH 115/247] Compute and Write external-documents-ids database --- milli/src/update/new/channel.rs | 31 ++++++++++--- milli/src/update/new/document_change.rs | 44 ++++++++++++++----- .../update/new/indexer/document_deletion.rs | 20 +++++++-- .../update/new/indexer/document_operation.rs | 14 +++--- milli/src/update/new/indexer/mod.rs | 19 +++----- milli/src/update/new/indexer/partial_dump.rs | 2 +- milli/src/update/new/merger.rs | 29 ++++++++---- 7 files changed, 111 insertions(+), 48 deletions(-) diff --git a/milli/src/update/new/channel.rs b/milli/src/update/new/channel.rs index 10c0a706b..bd06b5123 100644 --- a/milli/src/update/new/channel.rs +++ b/milli/src/update/new/channel.rs @@ -122,6 +122,7 @@ pub struct WriterOperation { pub enum Database { Documents, + ExternalDocumentsIds, ExactWordDocids, FidWordCountDocids, Main, @@ -140,6 +141,7 @@ impl Database { pub fn database(&self, index: &Index) -> heed::Database { match self { Database::Documents => index.documents.remap_types(), + Database::ExternalDocumentsIds => index.external_documents_ids.remap_types(), Database::ExactWordDocids => index.exact_word_docids.remap_types(), Database::Main => index.main.remap_types(), Database::WordDocids => index.word_docids.remap_types(), @@ -431,6 +433,7 @@ impl DocumentsSender<'_> { pub fn uncompressed( &self, docid: DocumentId, + external_id: String, document: &KvReaderFieldId, ) -> StdResult<(), SendError<()>> { let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value( @@ -440,14 +443,29 @@ impl DocumentsSender<'_> { match self.0.send(WriterOperation { database: Database::Documents, entry }) { Ok(()) => Ok(()), Err(SendError(_)) => Err(SendError(())), + }?; + + let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value( + external_id.as_bytes(), + &docid.to_be_bytes(), + )); + match self.0.send(WriterOperation { database: Database::ExternalDocumentsIds, entry }) { + Ok(()) => Ok(()), + Err(SendError(_)) => Err(SendError(())), } } - pub fn delete(&self, docid: DocumentId) -> StdResult<(), SendError<()>> { + pub fn delete(&self, docid: DocumentId, external_id: String) -> StdResult<(), SendError<()>> { let entry = EntryOperation::Delete(KeyEntry::from_key(&docid.to_be_bytes())); match self.0.send(WriterOperation { database: Database::Documents, entry }) { Ok(()) => Ok(()), Err(SendError(_)) => Err(SendError(())), + }?; + + let entry = EntryOperation::Delete(KeyEntry::from_key(external_id.as_bytes())); + match self.0.send(WriterOperation { database: Database::ExternalDocumentsIds, entry }) { + Ok(()) => Ok(()), + Err(SendError(_)) => Err(SendError(())), } } } @@ -460,8 +478,8 @@ pub enum MergerOperation { WordPairProximityDocidsMerger(Merger), WordPositionDocidsMerger(Merger), FacetDocidsMerger(Merger), - DeleteDocument { docid: DocumentId }, - InsertDocument { docid: DocumentId, document: Box }, + DeleteDocument { docid: DocumentId, external_id: String }, + InsertDocument { docid: 
DocumentId, external_id: String, document: Box }, FinishedDocument, } @@ -500,18 +518,19 @@ impl DocumentSender<'_> { pub fn insert( &self, docid: DocumentId, + external_id: String, document: Box, ) -> StdResult<(), SendError<()>> { let sender = self.0.unwrap(); - match sender.send(MergerOperation::InsertDocument { docid, document }) { + match sender.send(MergerOperation::InsertDocument { docid, external_id, document }) { Ok(()) => Ok(()), Err(SendError(_)) => Err(SendError(())), } } - pub fn delete(&self, docid: DocumentId) -> StdResult<(), SendError<()>> { + pub fn delete(&self, docid: DocumentId, external_id: String) -> StdResult<(), SendError<()>> { let sender = self.0.unwrap(); - match sender.send(MergerOperation::DeleteDocument { docid }) { + match sender.send(MergerOperation::DeleteDocument { docid, external_id }) { Ok(()) => Ok(()), Err(SendError(_)) => Err(SendError(())), } diff --git a/milli/src/update/new/document_change.rs b/milli/src/update/new/document_change.rs index 3e6473e77..7be8d1958 100644 --- a/milli/src/update/new/document_change.rs +++ b/milli/src/update/new/document_change.rs @@ -11,19 +11,22 @@ pub enum DocumentChange { } pub struct Deletion { - docid: DocumentId, + pub docid: DocumentId, + pub external_document_id: String, current: Box, } pub struct Update { - docid: DocumentId, + pub docid: DocumentId, + pub external_document_id: String, current: Box, - new: Box, + pub new: Box, } pub struct Insertion { - docid: DocumentId, - new: Box, + pub docid: DocumentId, + pub external_document_id: String, + pub new: Box, } impl DocumentChange { @@ -37,14 +40,22 @@ impl DocumentChange { } impl Deletion { - pub fn create(docid: DocumentId, current: Box) -> Self { - Self { docid, current } + pub fn create( + docid: DocumentId, + external_document_id: String, + current: Box, + ) -> Self { + Self { docid, external_document_id, current } } pub fn docid(&self) -> DocumentId { self.docid } + pub fn external_document_id(&self) -> &str { + &self.external_document_id + } + // TODO shouldn't we use the one in self? 
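// What a single inserted document now produces on the writer side, per the
// `DocumentsSender` change in this commit: one entry in `documents` keyed by
// the big-endian internal id, plus one entry in `external_documents_ids`
// mapping the user-facing id back to that internal id (and a deletion removes
// both). The helper below is a sketch for illustration, not a milli API.
fn entries_for_insert(docid: u32, external_id: &str, obkv: &[u8]) -> [(Vec<u8>, Vec<u8>); 2] {
    [
        // documents: BE(docid) -> obkv bytes
        (docid.to_be_bytes().to_vec(), obkv.to_vec()),
        // external_documents_ids: external id bytes -> BE(docid)
        (external_id.as_bytes().to_vec(), docid.to_be_bytes().to_vec()),
    ]
}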
pub fn current<'a>( &self, @@ -56,14 +67,22 @@ impl Deletion { } impl Insertion { - pub fn create(docid: DocumentId, new: Box) -> Self { - Insertion { docid, new } + pub fn create( + docid: DocumentId, + external_document_id: String, + new: Box, + ) -> Self { + Insertion { docid, external_document_id, new } } pub fn docid(&self) -> DocumentId { self.docid } + pub fn external_document_id(&self) -> &str { + &self.external_document_id + } + pub fn new(&self) -> &KvReader { self.new.as_ref() } @@ -72,16 +91,21 @@ impl Insertion { impl Update { pub fn create( docid: DocumentId, + external_document_id: String, current: Box, new: Box, ) -> Self { - Update { docid, current, new } + Update { docid, external_document_id, current, new } } pub fn docid(&self) -> DocumentId { self.docid } + pub fn external_document_id(&self) -> &str { + &self.external_document_id + } + pub fn current<'a>( &self, rtxn: &'a RoTxn, diff --git a/milli/src/update/new/indexer/document_deletion.rs b/milli/src/update/new/indexer/document_deletion.rs index 400b51af6..21d7635c9 100644 --- a/milli/src/update/new/indexer/document_deletion.rs +++ b/milli/src/update/new/indexer/document_deletion.rs @@ -4,9 +4,11 @@ use rayon::iter::{IndexedParallelIterator, IntoParallelIterator}; use roaring::RoaringBitmap; use super::DocumentChanges; +use crate::documents::PrimaryKey; +use crate::index::db_name::EXTERNAL_DOCUMENTS_IDS; use crate::update::new::parallel_iterator_ext::ParallelIteratorExt as _; use crate::update::new::{Deletion, DocumentChange}; -use crate::{Error, FieldsIdsMap, Index, Result}; +use crate::{Error, FieldsIdsMap, Index, InternalError, Result}; pub struct DocumentDeletion { pub to_delete: RoaringBitmap, @@ -23,7 +25,7 @@ impl DocumentDeletion { } impl<'p> DocumentChanges<'p> for DocumentDeletion { - type Parameter = &'p Index; + type Parameter = (&'p Index, &'p FieldsIdsMap, &'p PrimaryKey<'p>); fn document_changes( self, @@ -34,13 +36,23 @@ impl<'p> DocumentChanges<'p> for DocumentDeletion { + Clone + 'p, > { - let index = param; + let (index, fields_ids_map, primary_key) = param; let to_delete: Vec<_> = self.to_delete.into_iter().collect(); Ok(to_delete.into_par_iter().try_map_try_init( || index.read_txn().map_err(crate::Error::from), |rtxn, docid| { let current = index.document(rtxn, docid)?; - Ok(DocumentChange::Deletion(Deletion::create(docid, current.boxed()))) + let external_document_id = primary_key + .document_id(¤t, fields_ids_map)? 
+ .map_err(|_| InternalError::DatabaseMissingEntry { + db_name: EXTERNAL_DOCUMENTS_IDS, + key: None, + })?; + Ok(DocumentChange::Deletion(Deletion::create( + docid, + external_document_id, + current.boxed(), + ))) }, )) } diff --git a/milli/src/update/new/indexer/document_operation.rs b/milli/src/update/new/indexer/document_operation.rs index f9e1bb8f3..7341f4e5c 100644 --- a/milli/src/update/new/indexer/document_operation.rs +++ b/milli/src/update/new/indexer/document_operation.rs @@ -288,15 +288,17 @@ impl MergeChanges for MergeDocumentForReplacement { match current { Some(current) => { - let update = Update::create(docid, current.boxed(), new); + let update = Update::create(docid, external_docid, current.boxed(), new); Ok(DocumentChange::Update(update)) } - None => Ok(DocumentChange::Insertion(Insertion::create(docid, new))), + None => { + Ok(DocumentChange::Insertion(Insertion::create(docid, external_docid, new))) + } } } Some(InnerDocOp::Deletion) => { let deletion = match current { - Some(current) => Deletion::create(docid, current.boxed()), + Some(current) => Deletion::create(docid, external_docid, current.boxed()), None => todo!("Do that with Louis"), }; Ok(DocumentChange::Deletion(deletion)) @@ -355,7 +357,7 @@ impl MergeChanges for MergeDocumentForUpdates { if operations.is_empty() { let deletion = match current { - Some(current) => Deletion::create(docid, current.boxed()), + Some(current) => Deletion::create(docid, external_docid, current.boxed()), None => todo!("Do that with Louis"), }; return Ok(DocumentChange::Deletion(deletion)); @@ -382,11 +384,11 @@ impl MergeChanges for MergeDocumentForUpdates { match current { Some(current) => { - let update = Update::create(docid, current.boxed(), new); + let update = Update::create(docid, external_docid, current.boxed(), new); Ok(DocumentChange::Update(update)) } None => { - let insertion = Insertion::create(docid, new); + let insertion = Insertion::create(docid, external_docid, new); Ok(DocumentChange::Insertion(insertion)) } } diff --git a/milli/src/update/new/indexer/mod.rs b/milli/src/update/new/indexer/mod.rs index 28165c3a8..17de2b310 100644 --- a/milli/src/update/new/indexer/mod.rs +++ b/milli/src/update/new/indexer/mod.rs @@ -11,7 +11,7 @@ use rayon::ThreadPool; pub use update_by_function::UpdateByFunction; use super::channel::*; -use super::document_change::DocumentChange; +use super::document_change::{Deletion, DocumentChange, Insertion, Update}; use super::extract::*; use super::merger::merge_grenad_entries; use super::word_fst_builder::PrefixDelta; @@ -84,19 +84,14 @@ where document_changes.clone().into_par_iter().try_arc_for_each::<_, Error>( |result| { match result? 
{ - DocumentChange::Deletion(deletion) => { - let docid = deletion.docid(); - document_sender.delete(docid).unwrap(); + DocumentChange::Deletion(Deletion { docid, external_document_id, ..}) => { + document_sender.delete(docid, external_document_id).unwrap(); } - DocumentChange::Update(update) => { - let docid = update.docid(); - let content = update.new(); - document_sender.insert(docid, content.boxed()).unwrap(); + DocumentChange::Update(Update { docid, external_document_id, new, ..}) => { + document_sender.insert(docid, external_document_id, new).unwrap(); } - DocumentChange::Insertion(insertion) => { - let docid = insertion.docid(); - let content = insertion.new(); - document_sender.insert(docid, content.boxed()).unwrap(); + DocumentChange::Insertion(Insertion { docid, external_document_id, new, ..}) => { + document_sender.insert(docid, external_document_id, new).unwrap(); // extracted_dictionary_sender.send(self, dictionary: &[u8]); } } diff --git a/milli/src/update/new/indexer/partial_dump.rs b/milli/src/update/new/indexer/partial_dump.rs index 325e13cc4..08b97b931 100644 --- a/milli/src/update/new/indexer/partial_dump.rs +++ b/milli/src/update/new/indexer/partial_dump.rs @@ -77,7 +77,7 @@ where } }?; - let insertion = Insertion::create(docid, document); + let insertion = Insertion::create(docid, external_docid, document); Ok(DocumentChange::Insertion(insertion)) }, )) diff --git a/milli/src/update/new/merger.rs b/milli/src/update/new/merger.rs index 0d80f75ec..c010a5d83 100644 --- a/milli/src/update/new/merger.rs +++ b/milli/src/update/new/merger.rs @@ -136,37 +136,48 @@ pub fn merge_grenad_entries( |_, _key| Ok(()), )?; } - MergerOperation::InsertDocument { docid, document } => { + MergerOperation::InsertDocument { docid, external_id, document } => { let span = tracing::trace_span!(target: "indexing::documents::merge", "insert_document"); let _entered = span.enter(); documents_ids.insert(docid); - sender.documents().uncompressed(docid, &document).unwrap(); + sender.documents().uncompressed(docid, external_id.clone(), &document).unwrap(); if let Some(geo_extractor) = geo_extractor.as_mut() { let current = index.documents.remap_data_type::().get(rtxn, &docid)?; let current: Option<&KvReaderFieldId> = current.map(Into::into); let change = match current { - Some(current) => { - DocumentChange::Update(Update::create(docid, current.boxed(), document)) - } - None => DocumentChange::Insertion(Insertion::create(docid, document)), + Some(current) => DocumentChange::Update(Update::create( + docid, + external_id, + current.boxed(), + document, + )), + None => DocumentChange::Insertion(Insertion::create( + docid, + external_id, + document, + )), }; geo_extractor.manage_change(&mut global_fields_ids_map, &change)?; } } - MergerOperation::DeleteDocument { docid } => { + MergerOperation::DeleteDocument { docid, external_id } => { let span = tracing::trace_span!(target: "indexing::documents::merge", "delete_document"); let _entered = span.enter(); if !documents_ids.remove(docid) { unreachable!("Tried deleting a document that we do not know about"); } - sender.documents().delete(docid).unwrap(); + sender.documents().delete(docid, external_id.clone()).unwrap(); if let Some(geo_extractor) = geo_extractor.as_mut() { let current = index.document(rtxn, docid)?; - let change = DocumentChange::Deletion(Deletion::create(docid, current.boxed())); + let change = DocumentChange::Deletion(Deletion::create( + docid, + external_id, + current.boxed(), + )); geo_extractor.manage_change(&mut global_fields_ids_map, 
&change)?; } } From 774ed2853900acd91fafbea7f6d9780fbd98c4a2 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 2 Oct 2024 12:48:41 +0200 Subject: [PATCH 116/247] Fix Prefix FST when a document is modified --- milli/src/update/new/word_fst_builder.rs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/milli/src/update/new/word_fst_builder.rs b/milli/src/update/new/word_fst_builder.rs index 6c415c17e..97cd47e73 100644 --- a/milli/src/update/new/word_fst_builder.rs +++ b/milli/src/update/new/word_fst_builder.rs @@ -234,12 +234,18 @@ impl PrefixFstBuilder { *current_prefix_is_modified = false; } - *current_prefix_is_modified |= is_modified; - if deladd == DelAdd::Addition { *current_prefix_count += 1; } + if is_modified && !*current_prefix_is_modified { + if *current_prefix_count > self.prefix_count_threshold { + self.modified_prefixes.insert(current_prefix.clone()); + } + + *current_prefix_is_modified = true; + } + // There is enough words corresponding to this prefix to add it to the cache. if *current_prefix_count == self.prefix_count_threshold { builder.insert(prefix)?; From 14261f8f045fdaa026526fe54e4d830e9301d988 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 1 Oct 2024 16:13:08 +0200 Subject: [PATCH 117/247] Integrate facet level bulk update Only the facet bulk update has been added so far, the incremental must be completely rewritten Factorize facet merging Fix facet level extraction --- milli/src/update/new/channel.rs | 43 ++++--- .../new/extract/faceted/extract_facets.rs | 4 +- milli/src/update/new/extract/faceted/mod.rs | 8 ++ milli/src/update/new/indexer/mod.rs | 73 ++++++++--- milli/src/update/new/merger.rs | 120 +++++++++++++----- 5 files changed, 181 insertions(+), 67 deletions(-) diff --git a/milli/src/update/new/channel.rs b/milli/src/update/new/channel.rs index bd06b5123..bcac0fa03 100644 --- a/milli/src/update/new/channel.rs +++ b/milli/src/update/new/channel.rs @@ -120,6 +120,7 @@ pub struct WriterOperation { entry: EntryOperation, } +#[derive(Debug)] pub enum Database { Documents, ExternalDocumentsIds, @@ -158,6 +159,18 @@ impl Database { } } +impl From for Database { + fn from(value: FacetKind) -> Self { + match value { + FacetKind::Number => Database::FacetIdF64NumberDocids, + FacetKind::String => Database::FacetIdStringDocids, + FacetKind::Null => Database::FacetIdIsNullDocids, + FacetKind::Empty => Database::FacetIdIsEmptyDocids, + FacetKind::Exists => Database::FacetIdExistsDocids, + } + } +} + impl WriterOperation { pub fn database(&self, index: &Index) -> heed::Database { self.database.database(index) @@ -395,8 +408,18 @@ pub struct FacetDocidsSender<'a> { impl DocidsSender for FacetDocidsSender<'_> { fn write(&self, key: &[u8], value: &[u8]) -> StdResult<(), SendError<()>> { - let (database, key) = self.extract_database(key); - let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value(key, value)); + let (facet_kind, key) = FacetKind::extract_from_key(key); + let database = Database::from(facet_kind); + // let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value(key, value)); + let entry = match facet_kind { + // skip level group size + FacetKind::String | FacetKind::Number => { + // add facet group size + let value = [&[1], value].concat(); + EntryOperation::Write(KeyValueEntry::from_small_key_value(key, &value)) + } + _ => EntryOperation::Write(KeyValueEntry::from_small_key_value(key, value)), + }; match self.sender.send(WriterOperation { database, entry }) { Ok(()) => Ok(()), Err(SendError(_)) => 
Err(SendError(())), @@ -404,7 +427,8 @@ impl DocidsSender for FacetDocidsSender<'_> { } fn delete(&self, key: &[u8]) -> StdResult<(), SendError<()>> { - let (database, key) = self.extract_database(key); + let (facet_kind, key) = FacetKind::extract_from_key(key); + let database = Database::from(facet_kind); let entry = EntryOperation::Delete(KeyEntry::from_key(key)); match self.sender.send(WriterOperation { database, entry }) { Ok(()) => Ok(()), @@ -413,19 +437,6 @@ impl DocidsSender for FacetDocidsSender<'_> { } } -impl FacetDocidsSender<'_> { - fn extract_database<'a>(&self, key: &'a [u8]) -> (Database, &'a [u8]) { - let database = match FacetKind::from(key[0]) { - FacetKind::Number => Database::FacetIdF64NumberDocids, - FacetKind::String => Database::FacetIdStringDocids, - FacetKind::Null => Database::FacetIdIsNullDocids, - FacetKind::Empty => Database::FacetIdIsEmptyDocids, - FacetKind::Exists => Database::FacetIdExistsDocids, - }; - (database, &key[1..]) - } -} - pub struct DocumentsSender<'a>(&'a MergerSender); impl DocumentsSender<'_> { diff --git a/milli/src/update/new/extract/faceted/extract_facets.rs b/milli/src/update/new/extract/faceted/extract_facets.rs index e4e6f7010..8ffec68f3 100644 --- a/milli/src/update/new/extract/faceted/extract_facets.rs +++ b/milli/src/update/new/extract/faceted/extract_facets.rs @@ -129,7 +129,7 @@ impl FacetedDocidsExtractor { buffer.clear(); buffer.push(FacetKind::Number as u8); buffer.extend_from_slice(&fid.to_be_bytes()); - buffer.push(1); // level 0 + buffer.push(0); // level 0 buffer.extend_from_slice(&ordered); buffer.extend_from_slice(&n.to_be_bytes()); @@ -145,7 +145,7 @@ impl FacetedDocidsExtractor { buffer.clear(); buffer.push(FacetKind::String as u8); buffer.extend_from_slice(&fid.to_be_bytes()); - buffer.push(1); // level 0 + buffer.push(0); // level 0 buffer.extend_from_slice(truncated.as_bytes()); cache_fn(cached_sorter, &*buffer, docid).map_err(Into::into) } diff --git a/milli/src/update/new/extract/faceted/mod.rs b/milli/src/update/new/extract/faceted/mod.rs index a59c64d9a..65e90cdf4 100644 --- a/milli/src/update/new/extract/faceted/mod.rs +++ b/milli/src/update/new/extract/faceted/mod.rs @@ -4,6 +4,7 @@ mod facet_document; pub use extract_facets::FacetedDocidsExtractor; #[repr(u8)] +#[derive(Debug, Clone, Copy)] pub enum FacetKind { Number = 0, String = 1, @@ -24,3 +25,10 @@ impl From for FacetKind { } } } + +impl FacetKind { + pub fn extract_from_key<'k>(key: &'k [u8]) -> (FacetKind, &'k [u8]) { + debug_assert!(key.len() > 3); + (FacetKind::from(key[0]), &key[1..]) + } +} diff --git a/milli/src/update/new/indexer/mod.rs b/milli/src/update/new/indexer/mod.rs index 17de2b310..4d89a839e 100644 --- a/milli/src/update/new/indexer/mod.rs +++ b/milli/src/update/new/indexer/mod.rs @@ -13,18 +13,19 @@ pub use update_by_function::UpdateByFunction; use super::channel::*; use super::document_change::{Deletion, DocumentChange, Insertion, Update}; use super::extract::*; -use super::merger::merge_grenad_entries; +use super::merger::{merge_grenad_entries, FacetFieldIdsDelta}; use super::word_fst_builder::PrefixDelta; use super::words_prefix_docids::{ compute_word_prefix_docids, compute_word_prefix_fid_docids, compute_word_prefix_position_docids, }; use super::{StdResult, TopLevelMap}; use crate::documents::{PrimaryKey, DEFAULT_PRIMARY_KEY}; +use crate::facet::FacetType; use crate::update::new::channel::ExtractorSender; use crate::update::settings::InnerIndexSettings; use crate::update::new::parallel_iterator_ext::ParallelIteratorExt; -use 
crate::update::GrenadParameters; use crate::{Error, FieldsIdsMap, GlobalFieldsIdsMap, Index, Result, UserError}; +use crate::update::{FacetsUpdateBulk, GrenadParameters}; mod document_deletion; mod document_operation; @@ -71,11 +72,11 @@ where let global_fields_ids_map_clone = global_fields_ids_map.clone(); thread::scope(|s| { + let indexer_span = tracing::Span::current(); // TODO manage the errors correctly - let current_span = tracing::Span::current(); let handle = Builder::new().name(S("indexer-extractors")).spawn_scoped(s, move || { pool.in_place_scope(|_s| { - let span = tracing::trace_span!(target: "indexing::documents", parent: ¤t_span, "extract"); + let span = tracing::trace_span!(target: "indexing::documents", parent: &indexer_span, "extract"); let _entered = span.enter(); let document_changes = document_changes.into_par_iter(); @@ -179,11 +180,11 @@ where }) })?; + let indexer_span = tracing::Span::current(); // TODO manage the errors correctly - let current_span = tracing::Span::current(); let merger_thread = Builder::new().name(S("indexer-merger")).spawn_scoped(s, move || { let span = - tracing::trace_span!(target: "indexing::documents", parent: ¤t_span, "merge"); + tracing::trace_span!(target: "indexing::documents", parent: &indexer_span, "merge"); let _entered = span.enter(); let rtxn = index.read_txn().unwrap(); merge_grenad_entries( @@ -211,17 +212,12 @@ where handle.join().unwrap()?; let merger_result = merger_thread.join().unwrap()?; - if let Some(prefix_delta) = merger_result.prefix_delta { - let span = tracing::trace_span!(target: "indexing", "prefix"); - let _entered = span.enter(); + if let Some(facet_field_ids_delta) = merger_result.facet_field_ids_delta { + compute_facet_level_database(index, wtxn, facet_field_ids_delta)?; + } - let PrefixDelta { modified, deleted } = prefix_delta; - // Compute word prefix docids - compute_word_prefix_docids(wtxn, index, &modified, &deleted)?; - // Compute word prefix fid docids - compute_word_prefix_fid_docids(wtxn, index, &modified, &deleted)?; - // Compute word prefix position docids - compute_word_prefix_position_docids(wtxn, index, &modified, &deleted)?; + if let Some(prefix_delta) = merger_result.prefix_delta { + compute_prefix_database(index, wtxn, prefix_delta)?; } Ok(()) as Result<_> @@ -238,6 +234,51 @@ where Ok(()) } +#[tracing::instrument(level = "trace", skip_all, target = "indexing::prefix")] +fn compute_prefix_database( + index: &Index, + wtxn: &mut RwTxn, + prefix_delta: PrefixDelta, +) -> Result<()> { + let PrefixDelta { modified, deleted } = prefix_delta; + // Compute word prefix docids + compute_word_prefix_docids(wtxn, index, &modified, &deleted)?; + // Compute word prefix fid docids + compute_word_prefix_fid_docids(wtxn, index, &modified, &deleted)?; + // Compute word prefix position docids + compute_word_prefix_position_docids(wtxn, index, &modified, &deleted) +} + +#[tracing::instrument(level = "trace", skip_all, target = "indexing::facet_field_ids")] +fn compute_facet_level_database( + index: &Index, + wtxn: &mut RwTxn, + facet_field_ids_delta: FacetFieldIdsDelta, +) -> Result<()> { + if let Some(modified_facet_string_ids) = facet_field_ids_delta.modified_facet_string_ids() { + let span = tracing::trace_span!(target: "indexing::facet_field_ids", "string"); + let _entered = span.enter(); + FacetsUpdateBulk::new_not_updating_level_0( + index, + modified_facet_string_ids, + FacetType::String, + ) + .execute(wtxn)?; + } + if let Some(modified_facet_number_ids) = facet_field_ids_delta.modified_facet_number_ids() { 
+ let span = tracing::trace_span!(target: "indexing::facet_field_ids", "number"); + let _entered = span.enter(); + FacetsUpdateBulk::new_not_updating_level_0( + index, + modified_facet_number_ids, + FacetType::Number, + ) + .execute(wtxn)?; + } + + Ok(()) +} + /// TODO: GrenadParameters::default() should be removed in favor a passed parameter /// TODO: manage the errors correctly /// TODO: we must have a single trait that also gives the extractor type diff --git a/milli/src/update/new/merger.rs b/milli/src/update/new/merger.rs index c010a5d83..9751be66c 100644 --- a/milli/src/update/new/merger.rs +++ b/milli/src/update/new/merger.rs @@ -16,7 +16,9 @@ use crate::update::del_add::DelAdd; use crate::update::new::channel::MergerOperation; use crate::update::new::word_fst_builder::WordFstBuilder; use crate::update::MergeDeladdCboRoaringBitmaps; -use crate::{CboRoaringBitmapCodec, Error, GeoPoint, GlobalFieldsIdsMap, Index, Prefix, Result}; +use crate::{ + CboRoaringBitmapCodec, Error, FieldId, GeoPoint, GlobalFieldsIdsMap, Index, Prefix, Result, +}; /// TODO We must return some infos/stats #[tracing::instrument(level = "trace", skip_all, target = "indexing::documents", name = "merge")] @@ -188,13 +190,17 @@ pub fn merge_grenad_entries( let span = tracing::trace_span!(target: "indexing::documents::merge", "facet_docids"); let _entered = span.enter(); + let mut facet_field_ids_delta = FacetFieldIdsDelta::new(); merge_and_send_facet_docids( merger, FacetDatabases::new(index), rtxn, &mut buffer, sender.facet_docids(), + &mut facet_field_ids_delta, )?; + + merger_result.facet_field_ids_delta = Some(facet_field_ids_delta); } } } @@ -218,6 +224,8 @@ pub fn merge_grenad_entries( pub struct MergerResult { /// The delta of the prefixes pub prefix_delta: Option, + /// The field ids that have been modified + pub facet_field_ids_delta: Option, } pub struct GeoExtractor { @@ -308,20 +316,23 @@ fn merge_and_send_facet_docids( rtxn: &RoTxn<'_>, buffer: &mut Vec, docids_sender: impl DocidsSender, + facet_field_ids_delta: &mut FacetFieldIdsDelta, ) -> Result<()> { let mut merger_iter = merger.into_stream_merger_iter().unwrap(); while let Some((key, deladd)) = merger_iter.next().unwrap() { - let current = database.get(rtxn, key)?; + let current = database.get_cbo_roaring_bytes_value(rtxn, key)?; let deladd: &KvReaderDelAdd = deladd.into(); let del = deladd.get(DelAdd::Deletion); let add = deladd.get(DelAdd::Addition); match merge_cbo_bitmaps(current, del, add)? { Operation::Write(bitmap) => { + facet_field_ids_delta.register_from_key(key); let value = cbo_bitmap_serialize_into_vec(&bitmap, buffer); docids_sender.write(key, value).unwrap(); } Operation::Delete => { + facet_field_ids_delta.register_from_key(key); docids_sender.delete(key).unwrap(); } Operation::Ignore => (), @@ -331,43 +342,84 @@ fn merge_and_send_facet_docids( Ok(()) } -struct FacetDatabases { - /// Maps the facet field id and the docids for which this field exists - facet_id_exists_docids: Database, - /// Maps the facet field id and the docids for which this field is set as null - facet_id_is_null_docids: Database, - /// Maps the facet field id and the docids for which this field is considered empty - facet_id_is_empty_docids: Database, - /// Maps the facet field id and ranges of numbers with the docids that corresponds to them. - facet_id_f64_docids: Database, - /// Maps the facet field id and ranges of strings with the docids that corresponds to them. 
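// Layout of the facet keys that `FacetKind::extract_from_key` and
// `FacetFieldIdsDelta::extract_key_data` take apart in this commit, as the
// facet extractor builds them: a one-byte kind tag, the big-endian field id,
// then (for the string/number kinds) the level-0 byte and the facet value.
// The builder below is a sketch for illustration, not a milli helper.
fn level0_facet_number_key(field_id: u16, ordered_value_bytes: &[u8], value: f64) -> Vec<u8> {
    let mut key = Vec::new();
    key.push(0u8);                                  // FacetKind::Number tag
    key.extend_from_slice(&field_id.to_be_bytes()); // field id, big-endian
    key.push(0);                                    // level 0
    key.extend_from_slice(ordered_value_bytes);     // sortable f64 encoding
    key.extend_from_slice(&value.to_be_bytes());    // raw f64 bytes
    key
}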
- facet_id_string_docids: Database, +struct FacetDatabases<'a> { + index: &'a Index, } -impl FacetDatabases { - fn new(index: &Index) -> Self { - Self { - facet_id_exists_docids: index.facet_id_exists_docids.remap_types(), - facet_id_is_null_docids: index.facet_id_is_null_docids.remap_types(), - facet_id_is_empty_docids: index.facet_id_is_empty_docids.remap_types(), - facet_id_f64_docids: index.facet_id_f64_docids.remap_types(), - facet_id_string_docids: index.facet_id_string_docids.remap_types(), - } +impl<'a> FacetDatabases<'a> { + fn new(index: &'a Index) -> Self { + Self { index } } - fn get<'a>(&self, rtxn: &'a RoTxn<'_>, key: &[u8]) -> heed::Result> { - let (facet_kind, key) = self.extract_facet_kind(key); + fn get_cbo_roaring_bytes_value<'t>( + &self, + rtxn: &'t RoTxn<'_>, + key: &[u8], + ) -> heed::Result> { + let (facet_kind, key) = FacetKind::extract_from_key(key); + + let value = + super::channel::Database::from(facet_kind).database(self.index).get(rtxn, key)?; match facet_kind { - FacetKind::Exists => self.facet_id_exists_docids.get(rtxn, key), - FacetKind::Null => self.facet_id_is_null_docids.get(rtxn, key), - FacetKind::Empty => self.facet_id_is_empty_docids.get(rtxn, key), - FacetKind::Number => self.facet_id_f64_docids.get(rtxn, key), - FacetKind::String => self.facet_id_string_docids.get(rtxn, key), + // skip level group size + FacetKind::String | FacetKind::Number => Ok(value.map(|v| &v[1..])), + _ => Ok(value), + } + } +} + +#[derive(Debug)] +pub struct FacetFieldIdsDelta { + /// The field ids that have been modified + modified_facet_string_ids: HashSet, + modified_facet_number_ids: HashSet, +} + +impl FacetFieldIdsDelta { + fn new() -> Self { + Self { + modified_facet_string_ids: HashSet::new(), + modified_facet_number_ids: HashSet::new(), } } - fn extract_facet_kind<'a>(&self, key: &'a [u8]) -> (FacetKind, &'a [u8]) { - (FacetKind::from(key[0]), &key[1..]) + fn register_facet_string_id(&mut self, field_id: FieldId) { + self.modified_facet_string_ids.insert(field_id); + } + + fn register_facet_number_id(&mut self, field_id: FieldId) { + self.modified_facet_number_ids.insert(field_id); + } + + fn register_from_key(&mut self, key: &[u8]) { + let (facet_kind, field_id) = self.extract_key_data(key); + match facet_kind { + FacetKind::Number => self.register_facet_number_id(field_id), + FacetKind::String => self.register_facet_string_id(field_id), + _ => (), + } + } + + fn extract_key_data<'a>(&self, key: &'a [u8]) -> (FacetKind, FieldId) { + let facet_kind = FacetKind::from(key[0]); + let field_id = FieldId::from_be_bytes([key[1], key[2]]); + (facet_kind, field_id) + } + + pub fn modified_facet_string_ids(&self) -> Option> { + if self.modified_facet_string_ids.is_empty() { + None + } else { + Some(self.modified_facet_string_ids.iter().copied().collect()) + } + } + + pub fn modified_facet_number_ids(&self) -> Option> { + if self.modified_facet_number_ids.is_empty() { + None + } else { + Some(self.modified_facet_number_ids.iter().copied().collect()) + } } } @@ -396,11 +448,13 @@ fn merge_cbo_bitmaps( (Some(current), None, Some(add)) => Ok(Operation::Write(current | add)), (Some(current), Some(del), add) => { let output = match add { - Some(add) => (current - del) | add, - None => current - del, + Some(add) => (¤t - del) | add, + None => ¤t - del, }; if output.is_empty() { Ok(Operation::Delete) + } else if current == output { + Ok(Operation::Ignore) } else { Ok(Operation::Write(output)) } From 35f78b542350716e3dc28098ce6bf24734af0abb Mon Sep 17 00:00:00 2001 From: ManyTheFish 
Date: Thu, 3 Oct 2024 10:40:31 +0200 Subject: [PATCH 118/247] TO REMOVE: usefull debug prints --- milli/src/update/new/indexer/mod.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/milli/src/update/new/indexer/mod.rs b/milli/src/update/new/indexer/mod.rs index 4d89a839e..f231527f6 100644 --- a/milli/src/update/new/indexer/mod.rs +++ b/milli/src/update/new/indexer/mod.rs @@ -240,6 +240,7 @@ fn compute_prefix_database( wtxn: &mut RwTxn, prefix_delta: PrefixDelta, ) -> Result<()> { + eprintln!("prefix_delta: {:?}", &prefix_delta); let PrefixDelta { modified, deleted } = prefix_delta; // Compute word prefix docids compute_word_prefix_docids(wtxn, index, &modified, &deleted)?; @@ -255,6 +256,7 @@ fn compute_facet_level_database( wtxn: &mut RwTxn, facet_field_ids_delta: FacetFieldIdsDelta, ) -> Result<()> { + eprintln!("facet_field_ids_delta: {:?}", &facet_field_ids_delta); if let Some(modified_facet_string_ids) = facet_field_ids_delta.modified_facet_string_ids() { let span = tracing::trace_span!(target: "indexing::facet_field_ids", "string"); let _entered = span.enter(); From 0409a26cd881a37d1f30fad272247256607f811f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 3 Oct 2024 14:59:29 +0200 Subject: [PATCH 119/247] Replace the concurrent vec by a linked list --- .../src/update/new/append_only_linked_list.rs | 170 +++++++++ milli/src/update/new/append_only_vec.rs | 327 ------------------ .../new/extract/faceted/extract_facets.rs | 4 +- .../extract/searchable/extract_word_docids.rs | 4 +- .../src/update/new/extract/searchable/mod.rs | 4 +- milli/src/update/new/mod.rs | 2 +- 6 files changed, 177 insertions(+), 334 deletions(-) create mode 100644 milli/src/update/new/append_only_linked_list.rs delete mode 100644 milli/src/update/new/append_only_vec.rs diff --git a/milli/src/update/new/append_only_linked_list.rs b/milli/src/update/new/append_only_linked_list.rs new file mode 100644 index 000000000..88b05c0ec --- /dev/null +++ b/milli/src/update/new/append_only_linked_list.rs @@ -0,0 +1,170 @@ +use std::fmt; +use std::mem::{self, ManuallyDrop}; +use std::sync::atomic::AtomicPtr; + +/// An append-only linked-list that returns a mutable references to the pushed items. +pub struct AppendOnlyLinkedList { + head: AtomicPtr>, +} + +struct Node { + item: ManuallyDrop, + parent: AtomicPtr>, +} + +impl AppendOnlyLinkedList { + /// Creates an empty list. + pub fn new() -> AppendOnlyLinkedList { + AppendOnlyLinkedList { head: AtomicPtr::default() } + } + + /// Pushes the item at the front of the linked-list and returns a unique and mutable reference to it. 
+ #[allow(clippy::mut_from_ref)] // the mut ref is derived from T and unique each time + pub fn push(&self, item: T) -> &mut T { + use std::sync::atomic::Ordering::{Relaxed, SeqCst}; + + let node = Box::leak(Box::new(Node { + item: ManuallyDrop::new(item), + parent: AtomicPtr::default(), + })); + + let mut head = self.head.load(SeqCst); + loop { + std::hint::spin_loop(); + match self.head.compare_exchange_weak(head, node, SeqCst, Relaxed) { + Ok(parent) => { + node.parent = AtomicPtr::new(parent); + break; + } + Err(new) => head = new, + } + } + + &mut node.item + } +} + +impl Default for AppendOnlyLinkedList { + fn default() -> Self { + Self::new() + } +} + +impl Drop for AppendOnlyLinkedList { + fn drop(&mut self) { + // Let's use the drop implementation of the IntoIter struct + IntoIter(mem::take(&mut self.head)); + } +} + +impl fmt::Debug for AppendOnlyLinkedList { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("AppendOnlyLinkedList").finish() + } +} + +impl IntoIterator for AppendOnlyLinkedList { + type Item = T; + type IntoIter = IntoIter; + + fn into_iter(mut self) -> Self::IntoIter { + IntoIter(mem::take(&mut self.head)) + } +} + +pub struct IntoIter(AtomicPtr>); + +impl Iterator for IntoIter { + type Item = T; + + fn next(&mut self) -> Option { + let ptr = *self.0.get_mut(); + if ptr.is_null() { + None + } else { + let mut node = unsafe { Box::from_raw(ptr) }; + // Let's set the next node to read to be the parent of this one + self.0 = node.parent; + // ...and take the item from the Node before it is dropped + let item = unsafe { ManuallyDrop::take(&mut node.item) }; + Some(item) + // ...then drop the Node itself + } + } +} + +impl Drop for IntoIter { + fn drop(&mut self) { + let mut ptr = *self.0.get_mut(); + while !ptr.is_null() { + let mut node = unsafe { Box::from_raw(ptr) }; + // Let's set the next node to read to be the parent of this one + ptr = *node.parent.get_mut(); + // ...and drop the item ourselves. 
+ unsafe { ManuallyDrop::drop(&mut node.item) } + // ...then drop the Node itself + } + } +} + +#[test] +fn test_parallel_pushing() { + use std::sync::Arc; + let v = Arc::new(AppendOnlyLinkedList::::new()); + let mut threads = Vec::new(); + const N: u64 = 100; + for thread_num in 0..N { + let v = v.clone(); + threads.push(std::thread::spawn(move || { + let which1 = v.push(thread_num); + let which2 = v.push(thread_num); + assert_eq!(*which1, thread_num); + assert_eq!(*which2, thread_num); + })); + } + for t in threads { + t.join().unwrap(); + } + let v = Arc::into_inner(v).unwrap().into_iter().collect::>(); + for thread_num in (0..N).rev() { + assert_eq!(2, v.iter().copied().filter(|&x| x == thread_num).count()); + } +} + +#[test] +fn test_into_vec() { + struct SafeToDrop(bool); + + impl Drop for SafeToDrop { + fn drop(&mut self) { + assert!(self.0); + } + } + + let v = AppendOnlyLinkedList::new(); + + for _ in 0..50 { + v.push(SafeToDrop(false)); + } + + let mut v = v.into_iter().collect::>(); + assert_eq!(v.len(), 50); + + for i in v.iter_mut() { + i.0 = true; + } +} + +#[test] +fn test_push_then_index_mut() { + let v = AppendOnlyLinkedList::::new(); + let mut w = Vec::new(); + for i in 0..1024 { + *v.push(i) += 1; + w.push(i + 1); + } + + let mut v = v.into_iter().collect::>(); + v.reverse(); + assert_eq!(v, w); +} diff --git a/milli/src/update/new/append_only_vec.rs b/milli/src/update/new/append_only_vec.rs deleted file mode 100644 index d4a30c1b1..000000000 --- a/milli/src/update/new/append_only_vec.rs +++ /dev/null @@ -1,327 +0,0 @@ -// Code taken from -// and modified in order to get a ref mut instead of the index of newly inserted items. - -//! AppendOnlyVec -//! -//! This is a pretty simple type, which is a vector that you can push into and -//! receive a reference to the item you just inserted. The data structure never -//! moves an element once allocated, so you can push to the vec even while holding -//! mutable references to elements that have already been pushed. -//! -//! ### Scaling -//! -//! 1. Accessing an element is O(1), but slightly more expensive than for a -//! standard `Vec`. -//! -//! 2. Pushing a new element amortizes to O(1), but may require allocation of a -//! new chunk. -//! -//! ### Example -//! -//! ``` -//! use append_only_vec::AppendOnlyVec; -//! -//! static V: AppendOnlyVec = AppendOnlyVec::::new(); -//! let mut threads = Vec::new(); -//! for thread_num in 0..10 { -//! threads.push(std::thread::spawn(move || { -//! for n in 0..100 { -//! let s = format!("thread {} says {}", thread_num, n); -//! let which = V.push(s.clone()); -//! assert_eq!(&which, &s); -//! } -//! })); -//! } -//! -//! for t in threads { -//! t.join(); -//! } -//! -//! assert_eq!(V.len(), 1000); -//! 
``` - -use std::cell::UnsafeCell; -use std::fmt::Debug; -use std::ptr; -use std::sync::atomic::{AtomicUsize, Ordering}; - -pub struct AppendOnlyVec { - count: AtomicUsize, - _reserved: AtomicUsize, - data: [UnsafeCell<*mut T>; BITS_USED - 1 - 3], -} - -unsafe impl Send for AppendOnlyVec {} -unsafe impl Sync for AppendOnlyVec {} - -const BITS: usize = std::mem::size_of::() * 8; - -#[cfg(target_arch = "x86_64")] -const BITS_USED: usize = 48; -#[cfg(all(not(target_arch = "x86_64"), target_pointer_width = "64"))] -const BITS_USED: usize = 64; -#[cfg(target_pointer_width = "32")] -const BITS_USED: usize = 32; - -// This takes an index into a vec, and determines which data array will hold it -// (the first return value), and what the index will be into that data array -// (second return value) -// -// The ith data array holds 1< (u32, usize) { - let i = i + 8; - let bin = BITS as u32 - 1 - i.leading_zeros(); - let bin = bin - 3; - let offset = i - bin_size(bin); - (bin, offset) -} - -const fn bin_size(array: u32) -> usize { - (1 << 3) << array -} - -#[test] -fn test_indices() { - for i in 0..32 { - println!("{:3}: {} {}", i, indices(i).0, indices(i).1); - } - let mut array = 0; - let mut offset = 0; - let mut index = 0; - while index < 1000 { - index += 1; - offset += 1; - if offset >= bin_size(array) { - offset = 0; - array += 1; - } - assert_eq!(indices(index), (array, offset)); - } -} - -impl AppendOnlyVec { - const EMPTY: UnsafeCell<*mut T> = UnsafeCell::new(ptr::null_mut()); - - /// Allocate a new empty array. - pub const fn new() -> Self { - AppendOnlyVec { - count: AtomicUsize::new(0), - _reserved: AtomicUsize::new(0), - data: [Self::EMPTY; BITS_USED - 1 - 3], - } - } - - /// Find the length of the array. - #[inline] - pub fn len(&self) -> usize { - self.count.load(Ordering::Acquire) - } - - fn layout(array: u32) -> std::alloc::Layout { - std::alloc::Layout::array::(bin_size(array)).unwrap() - } - - /// Append an element to the array and get a mutable ref to it. - /// - /// This is notable in that it doesn't require a `&mut self`, because it - /// does appropriate atomic synchronization. - pub fn push(&self, val: T) -> &mut T { - let idx = self._reserved.fetch_add(1, Ordering::Relaxed); - let (array, offset) = indices(idx); - let ptr = if self.len() < 1 + idx - offset { - // We are working on a new array, which may not have been allocated... - if offset == 0 { - // It is our job to allocate the array! The size of the array - // is determined in the self.layout method, which needs to be - // consistent with the indices function. - let layout = Self::layout(array); - let ptr = unsafe { std::alloc::alloc(layout) } as *mut T; - unsafe { - *self.data[array as usize].get() = ptr; - } - ptr - } else { - // We need to wait for the array to be allocated. - while self.len() < 1 + idx - offset { - std::hint::spin_loop(); - } - // The Ordering::Acquire semantics of self.len() ensures that - // this pointer read will get the non-null pointer allocated - // above. - unsafe { *self.data[array as usize].get() } - } - } else { - // The Ordering::Acquire semantics of self.len() ensures that - // this pointer read will get the non-null pointer allocated - // above. - unsafe { *self.data[array as usize].get() } - }; - - // The contents of this offset are guaranteed to be unused (so far) - // because we got the idx from our fetch_add above, and ptr is - // guaranteed to be valid because of the loop we used above, which used - // self.len() which has Ordering::Acquire semantics. 
- unsafe { (ptr.add(offset)).write(val) }; - - // Now we need to increase the size of the vec, so it can get read. We - // use Release upon success, to ensure that the value which we wrote is - // visible to any thread that has confirmed that the count is big enough - // to read that element. In case of failure, we can be relaxed, since - // we don't do anything with the result other than try again. - while self - .count - .compare_exchange(idx, idx + 1, Ordering::Release, Ordering::Relaxed) - .is_err() - { - // This means that someone else *started* pushing before we started, - // but hasn't yet finished. We have to wait for them to finish - // pushing before we can update the count. Note that using a - // spinloop here isn't really ideal, but except when allocating a - // new array, the window between reserving space and using it is - // pretty small, so contention will hopefully be rare, and having a - // context switch during that interval will hopefully be vanishingly - // unlikely. - std::hint::spin_loop(); - } - - unsafe { &mut *ptr } - } - - /// Convert into a standard `Vec`. - pub fn into_vec(self) -> Vec { - let mut vec = Vec::with_capacity(self.len()); - - for idx in 0..self.len() { - let (array, offset) = indices(idx); - // We use a Relaxed load of the pointer, because the loop above (which - // ends before `self.len()`) should ensure that the data we want is - // already visible, since it Acquired `self.count` which synchronizes - // with the write in `self.push`. - let ptr = unsafe { *self.data[array as usize].get() }; - - // Copy the element value. The copy remaining in the array must not - // be used again (i.e. make sure we do not drop it) - let value = unsafe { ptr.add(offset).read() }; - - vec.push(value); - } - - // Prevent dropping the copied-out values by marking the count as 0 before - // our own drop is run - self.count.store(0, Ordering::Relaxed); - - vec - } -} - -impl Default for AppendOnlyVec { - fn default() -> Self { - Self::new() - } -} - -impl Debug for AppendOnlyVec { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("AppendOnlyVec").field("len", &self.len()).finish() - } -} - -impl Drop for AppendOnlyVec { - fn drop(&mut self) { - // First we'll drop all the `T` in a slightly sloppy way. FIXME this - // could be optimized to avoid reloading the `ptr`. - for idx in 0..self.len() { - let (array, offset) = indices(idx); - // We use a Relaxed load of the pointer, because the loop above (which - // ends before `self.len()`) should ensure that the data we want is - // already visible, since it Acquired `self.count` which synchronizes - // with the write in `self.push`. - let ptr = unsafe { *self.data[array as usize].get() }; - unsafe { - ptr::drop_in_place(ptr.add(offset)); - } - } - // Now we will free all the arrays. - for array in 0..self.data.len() as u32 { - // This load is relaxed because no other thread can have a reference - // to Self because we have a &mut self. 
- let ptr = unsafe { *self.data[array as usize].get() }; - if !ptr.is_null() { - let layout = Self::layout(array); - unsafe { std::alloc::dealloc(ptr as *mut u8, layout) }; - } else { - break; - } - } - } -} - -impl IntoIterator for AppendOnlyVec { - type Item = T; - type IntoIter = std::vec::IntoIter; - - fn into_iter(self) -> Self::IntoIter { - self.into_vec().into_iter() - } -} - -#[test] -fn test_parallel_pushing() { - use std::sync::Arc; - let v = Arc::new(AppendOnlyVec::::new()); - let mut threads = Vec::new(); - const N: u64 = 100; - for thread_num in 0..N { - let v = v.clone(); - threads.push(std::thread::spawn(move || { - let which1 = v.push(thread_num); - let which2 = v.push(thread_num); - assert_eq!(*which1, thread_num); - assert_eq!(*which2, thread_num); - })); - } - for t in threads { - t.join().unwrap(); - } - let v = Arc::into_inner(v).unwrap().into_vec(); - for thread_num in 0..N { - assert_eq!(2, v.iter().copied().filter(|&x| x == thread_num).count()); - } -} - -#[test] -fn test_into_vec() { - struct SafeToDrop(bool); - - impl Drop for SafeToDrop { - fn drop(&mut self) { - assert!(self.0); - } - } - - let v = AppendOnlyVec::new(); - - for _ in 0..50 { - v.push(SafeToDrop(false)); - } - - let mut v = v.into_vec(); - assert_eq!(v.len(), 50); - - for i in v.iter_mut() { - i.0 = true; - } -} - -#[test] -fn test_push_then_index_mut() { - let v = AppendOnlyVec::::new(); - for i in 0..1024 { - *v.push(i) += 1; - } - - let v = v.into_vec(); - for i in 0..1024 { - assert_eq!(v[i], 2 * i); - } -} diff --git a/milli/src/update/new/extract/faceted/extract_facets.rs b/milli/src/update/new/extract/faceted/extract_facets.rs index 8ffec68f3..f4ad50bfe 100644 --- a/milli/src/update/new/extract/faceted/extract_facets.rs +++ b/milli/src/update/new/extract/faceted/extract_facets.rs @@ -12,7 +12,7 @@ use super::super::cache::CboCachedSorter; use super::facet_document::extract_document_facets; use super::FacetKind; use crate::facet::value_encoding::f64_into_bytes; -use crate::update::new::append_only_vec::AppendOnlyVec; +use crate::update::new::append_only_linked_list::AppendOnlyLinkedList; use crate::update::new::extract::DocidsExtractor; use crate::update::new::parallel_iterator_ext::ParallelIteratorExt; use crate::update::new::DocumentChange; @@ -210,7 +210,7 @@ impl DocidsExtractor for FacetedDocidsExtractor { let attributes_to_extract = Self::attributes_to_extract(&rtxn, index)?; let attributes_to_extract: Vec<_> = attributes_to_extract.iter().map(|s| s.as_ref()).collect(); - let caches = AppendOnlyVec::new(); + let caches = AppendOnlyLinkedList::new(); { let span = diff --git a/milli/src/update/new/extract/searchable/extract_word_docids.rs b/milli/src/update/new/extract/searchable/extract_word_docids.rs index f4346ba52..702b8f4e9 100644 --- a/milli/src/update/new/extract/searchable/extract_word_docids.rs +++ b/milli/src/update/new/extract/searchable/extract_word_docids.rs @@ -9,7 +9,7 @@ use rayon::iter::IntoParallelIterator; use super::tokenize_document::{tokenizer_builder, DocumentTokenizer}; use super::SearchableExtractor; -use crate::update::new::append_only_vec::AppendOnlyVec; +use crate::update::new::append_only_linked_list::AppendOnlyLinkedList; use crate::update::new::extract::cache::CboCachedSorter; use crate::update::new::extract::perm_json_p::contained_in; use crate::update::new::parallel_iterator_ext::ParallelIteratorExt; @@ -341,7 +341,7 @@ impl WordDocidsExtractors { max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE, }; - let caches = AppendOnlyVec::new(); + let 
caches = AppendOnlyLinkedList::new(); { let span = diff --git a/milli/src/update/new/extract/searchable/mod.rs b/milli/src/update/new/extract/searchable/mod.rs index b3fa646b9..ba1d53f54 100644 --- a/milli/src/update/new/extract/searchable/mod.rs +++ b/milli/src/update/new/extract/searchable/mod.rs @@ -14,7 +14,7 @@ use tokenize_document::{tokenizer_builder, DocumentTokenizer}; use super::cache::CboCachedSorter; use super::DocidsExtractor; -use crate::update::new::append_only_vec::AppendOnlyVec; +use crate::update::new::append_only_linked_list::AppendOnlyLinkedList; use crate::update::new::parallel_iterator_ext::ParallelIteratorExt; use crate::update::new::DocumentChange; use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps}; @@ -58,7 +58,7 @@ pub trait SearchableExtractor { localized_attributes_rules: &localized_attributes_rules, max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE, }; - let caches = AppendOnlyVec::new(); + let caches = AppendOnlyLinkedList::new(); { let span = diff --git a/milli/src/update/new/mod.rs b/milli/src/update/new/mod.rs index 264241caa..862dd4dac 100644 --- a/milli/src/update/new/mod.rs +++ b/milli/src/update/new/mod.rs @@ -4,7 +4,7 @@ pub use top_level_map::{CowStr, TopLevelMap}; use super::del_add::DelAdd; use crate::FieldId; -mod append_only_vec; +mod append_only_linked_list; mod channel; mod document_change; mod extract; From a7a01646cf52f1f70cf687d1a9ea850adbf3b4bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 3 Oct 2024 15:57:31 +0200 Subject: [PATCH 120/247] Remove the useless Manually drop --- .../src/update/new/append_only_linked_list.rs | 20 +++++-------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/milli/src/update/new/append_only_linked_list.rs b/milli/src/update/new/append_only_linked_list.rs index 88b05c0ec..c08b9c090 100644 --- a/milli/src/update/new/append_only_linked_list.rs +++ b/milli/src/update/new/append_only_linked_list.rs @@ -1,6 +1,5 @@ -use std::fmt; -use std::mem::{self, ManuallyDrop}; use std::sync::atomic::AtomicPtr; +use std::{fmt, mem}; /// An append-only linked-list that returns a mutable references to the pushed items. pub struct AppendOnlyLinkedList { @@ -8,7 +7,7 @@ pub struct AppendOnlyLinkedList { } struct Node { - item: ManuallyDrop, + item: T, parent: AtomicPtr>, } @@ -23,10 +22,7 @@ impl AppendOnlyLinkedList { pub fn push(&self, item: T) -> &mut T { use std::sync::atomic::Ordering::{Relaxed, SeqCst}; - let node = Box::leak(Box::new(Node { - item: ManuallyDrop::new(item), - parent: AtomicPtr::default(), - })); + let node = Box::leak(Box::new(Node { item, parent: AtomicPtr::default() })); let mut head = self.head.load(SeqCst); loop { @@ -82,13 +78,10 @@ impl Iterator for IntoIter { if ptr.is_null() { None } else { - let mut node = unsafe { Box::from_raw(ptr) }; + let node = unsafe { Box::from_raw(ptr) }; // Let's set the next node to read to be the parent of this one self.0 = node.parent; - // ...and take the item from the Node before it is dropped - let item = unsafe { ManuallyDrop::take(&mut node.item) }; - Some(item) - // ...then drop the Node itself + Some(node.item) } } } @@ -100,9 +93,6 @@ impl Drop for IntoIter { let mut node = unsafe { Box::from_raw(ptr) }; // Let's set the next node to read to be the parent of this one ptr = *node.parent.get_mut(); - // ...and drop the item ourselves. 
- unsafe { ManuallyDrop::drop(&mut node.item) } - // ...then drop the Node itself } } } From 4665bfcb195182c40cdaf8f364d0d86fe1f19fbe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 3 Oct 2024 16:14:23 +0200 Subject: [PATCH 121/247] Move the parent assignation before the exchange operation --- milli/src/update/new/append_only_linked_list.rs | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/milli/src/update/new/append_only_linked_list.rs b/milli/src/update/new/append_only_linked_list.rs index c08b9c090..697d5583f 100644 --- a/milli/src/update/new/append_only_linked_list.rs +++ b/milli/src/update/new/append_only_linked_list.rs @@ -23,15 +23,13 @@ impl AppendOnlyLinkedList { use std::sync::atomic::Ordering::{Relaxed, SeqCst}; let node = Box::leak(Box::new(Node { item, parent: AtomicPtr::default() })); - let mut head = self.head.load(SeqCst); + loop { std::hint::spin_loop(); + node.parent = AtomicPtr::new(head); match self.head.compare_exchange_weak(head, node, SeqCst, Relaxed) { - Ok(parent) => { - node.parent = AtomicPtr::new(parent); - break; - } + Ok(_) => break, Err(new) => head = new, } } From 58d96fbea3cfedb1155668d3180582cc65fcc264 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 3 Oct 2024 16:15:05 +0200 Subject: [PATCH 122/247] Rename Node parent to next --- milli/src/update/new/append_only_linked_list.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/milli/src/update/new/append_only_linked_list.rs b/milli/src/update/new/append_only_linked_list.rs index 697d5583f..274d3eea4 100644 --- a/milli/src/update/new/append_only_linked_list.rs +++ b/milli/src/update/new/append_only_linked_list.rs @@ -8,7 +8,7 @@ pub struct AppendOnlyLinkedList { struct Node { item: T, - parent: AtomicPtr>, + next: AtomicPtr>, } impl AppendOnlyLinkedList { @@ -22,12 +22,12 @@ impl AppendOnlyLinkedList { pub fn push(&self, item: T) -> &mut T { use std::sync::atomic::Ordering::{Relaxed, SeqCst}; - let node = Box::leak(Box::new(Node { item, parent: AtomicPtr::default() })); + let node = Box::leak(Box::new(Node { item, next: AtomicPtr::default() })); let mut head = self.head.load(SeqCst); loop { std::hint::spin_loop(); - node.parent = AtomicPtr::new(head); + node.next = AtomicPtr::new(head); match self.head.compare_exchange_weak(head, node, SeqCst, Relaxed) { Ok(_) => break, Err(new) => head = new, @@ -77,8 +77,8 @@ impl Iterator for IntoIter { None } else { let node = unsafe { Box::from_raw(ptr) }; - // Let's set the next node to read to be the parent of this one - self.0 = node.parent; + // Let's set the next node to read to be the next of this one + self.0 = node.next; Some(node.item) } } @@ -89,8 +89,8 @@ impl Drop for IntoIter { let mut ptr = *self.0.get_mut(); while !ptr.is_null() { let mut node = unsafe { Box::from_raw(ptr) }; - // Let's set the next node to read to be the parent of this one - ptr = *node.parent.get_mut(); + // Let's set the next node to read to be the next of this one + ptr = *node.next.get_mut(); } } } From c11b7e5c0f6d6e2e6abe26006b72ea934397d52a Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 7 Oct 2024 15:58:16 +0200 Subject: [PATCH 123/247] Reduce number of cache created by using thread_local --- Cargo.lock | 5 ++- milli/Cargo.toml | 1 + .../new/extract/faceted/extract_facets.rs | 36 ++++++++++------- .../extract/searchable/extract_word_docids.rs | 27 ++++++++----- .../src/update/new/extract/searchable/mod.rs | 40 ++++++++++++------- 5 files changed, 68 
insertions(+), 41 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 06bd9c234..335445956 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3598,6 +3598,7 @@ dependencies = [ "smartstring", "tempfile", "thiserror", + "thread_local", "tiktoken-rs", "time", "tokenizers", @@ -5332,9 +5333,9 @@ dependencies = [ [[package]] name = "thread_local" -version = "1.1.7" +version = "1.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3fdd6f064ccff2d6567adcb3873ca630700f00b5ad3f060c25b5dcfd9a4ce152" +checksum = "8b9ef9bad013ada3808854ceac7b46812a6465ba368859a37e2100283d2d719c" dependencies = [ "cfg-if", "once_cell", diff --git a/milli/Cargo.toml b/milli/Cargo.toml index bae3dd64b..72f3daa4e 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -89,6 +89,7 @@ ureq = { version = "2.10.0", features = ["json"] } url = "2.5.2" rayon-par-bridge = "0.1.0" hashbrown = "0.14.5" +thread_local = "1.1.8" [dev-dependencies] mimalloc = { version = "0.1.43", default-features = false } diff --git a/milli/src/update/new/extract/faceted/extract_facets.rs b/milli/src/update/new/extract/faceted/extract_facets.rs index f4ad50bfe..8e8b71676 100644 --- a/milli/src/update/new/extract/faceted/extract_facets.rs +++ b/milli/src/update/new/extract/faceted/extract_facets.rs @@ -1,3 +1,4 @@ +use std::cell::RefCell; use std::collections::HashSet; use std::fmt::Debug; use std::fs::File; @@ -7,6 +8,7 @@ use grenad::{MergeFunction, Merger}; use heed::RoTxn; use rayon::iter::{IntoParallelIterator, ParallelBridge, ParallelIterator}; use serde_json::Value; +use thread_local::ThreadLocal; use super::super::cache::CboCachedSorter; use super::facet_document::extract_document_facets; @@ -216,24 +218,28 @@ impl DocidsExtractor for FacetedDocidsExtractor { let span = tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction"); let _entered = span.enter(); + let local = ThreadLocal::new(); document_changes.into_par_iter().try_arc_for_each_try_init( || { - let rtxn = index.read_txn().map_err(Error::from)?; - let cache = caches.push(CboCachedSorter::new( - // TODO use a better value - 100.try_into().unwrap(), - create_sorter( - grenad::SortAlgorithm::Stable, - MergeDeladdCboRoaringBitmaps, - indexer.chunk_compression_type, - indexer.chunk_compression_level, - indexer.max_nb_chunks, - max_memory, - ), - )); - Ok((rtxn, fields_ids_map.clone(), Vec::new(), cache)) + local.get_or_try(|| { + let rtxn = index.read_txn().map_err(Error::from)?; + let cache = caches.push(CboCachedSorter::new( + /// TODO use a better value + 100.try_into().unwrap(), + create_sorter( + grenad::SortAlgorithm::Stable, + MergeDeladdCboRoaringBitmaps, + indexer.chunk_compression_type, + indexer.chunk_compression_level, + indexer.max_nb_chunks, + max_memory, + ), + )); + Ok((rtxn, RefCell::new((fields_ids_map.clone(), Vec::new(), cache)))) + }) }, - |(rtxn, fields_ids_map, buffer, cached_sorter), document_change| { + |(rtxn, rc), document_change| { + let (fields_ids_map, buffer, cached_sorter) = &mut *rc.borrow_mut(); Self::extract_document_change( rtxn, index, diff --git a/milli/src/update/new/extract/searchable/extract_word_docids.rs b/milli/src/update/new/extract/searchable/extract_word_docids.rs index 702b8f4e9..df8409618 100644 --- a/milli/src/update/new/extract/searchable/extract_word_docids.rs +++ b/milli/src/update/new/extract/searchable/extract_word_docids.rs @@ -1,3 +1,4 @@ +use std::cell::RefCell; use std::collections::HashMap; use std::fs::File; use std::num::NonZero; @@ -6,6 +7,7 @@ use std::sync::Arc; use 
grenad::{Merger, MergerBuilder}; use heed::RoTxn; use rayon::iter::IntoParallelIterator; +use thread_local::ThreadLocal; use super::tokenize_document::{tokenizer_builder, DocumentTokenizer}; use super::SearchableExtractor; @@ -347,18 +349,23 @@ impl WordDocidsExtractors { let span = tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction"); let _entered = span.enter(); + let local = ThreadLocal::new(); document_changes.into_par_iter().try_arc_for_each_try_init( || { - let rtxn = index.read_txn().map_err(Error::from)?; - let cache = caches.push(WordDocidsCachedSorters::new( - indexer, - max_memory, - // TODO use a better value - 200_000.try_into().unwrap(), - )); - Ok((rtxn, &document_tokenizer, fields_ids_map.clone(), cache)) + local.get_or_try(|| { + let rtxn = index.read_txn().map_err(Error::from)?; + let fields_ids_map = fields_ids_map.clone(); + let cache = caches.push(WordDocidsCachedSorters::new( + indexer, + max_memory, + // TODO use a better value + 200_000.try_into().unwrap(), + )); + Ok((rtxn, &document_tokenizer, RefCell::new((fields_ids_map, cache)))) + }) }, - |(rtxn, document_tokenizer, fields_ids_map, cached_sorter), document_change| { + |(rtxn, document_tokenizer, rc), document_change| { + let (fields_ids_map, cached_sorter) = &mut *rc.borrow_mut(); Self::extract_document_change( rtxn, index, @@ -377,7 +384,9 @@ impl WordDocidsExtractors { tracing::trace_span!(target: "indexing::documents::extract", "merger_building"); let _entered = span.enter(); let mut builder = WordDocidsMergerBuilders::new(); + let mut count = 0; for cache in caches.into_iter() { + count += 1; builder.add_sorters(cache)?; } diff --git a/milli/src/update/new/extract/searchable/mod.rs b/milli/src/update/new/extract/searchable/mod.rs index ba1d53f54..272bff4d3 100644 --- a/milli/src/update/new/extract/searchable/mod.rs +++ b/milli/src/update/new/extract/searchable/mod.rs @@ -2,6 +2,7 @@ mod extract_word_docids; mod extract_word_pair_proximity_docids; mod tokenize_document; +use std::cell::RefCell; use std::fs::File; use std::sync::Arc; @@ -10,6 +11,7 @@ pub use extract_word_pair_proximity_docids::WordPairProximityDocidsExtractor; use grenad::Merger; use heed::RoTxn; use rayon::iter::{IntoParallelIterator, ParallelBridge, ParallelIterator}; +use thread_local::ThreadLocal; use tokenize_document::{tokenizer_builder, DocumentTokenizer}; use super::cache::CboCachedSorter; @@ -64,24 +66,32 @@ pub trait SearchableExtractor { let span = tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction"); let _entered = span.enter(); + let local = ThreadLocal::new(); document_changes.into_par_iter().try_arc_for_each_try_init( || { - let rtxn = index.read_txn().map_err(Error::from)?; - let cache = caches.push(CboCachedSorter::new( - // TODO use a better value - 1_000_000.try_into().unwrap(), - create_sorter( - grenad::SortAlgorithm::Stable, - MergeDeladdCboRoaringBitmaps, - indexer.chunk_compression_type, - indexer.chunk_compression_level, - indexer.max_nb_chunks, - max_memory, - ), - )); - Ok((rtxn, &document_tokenizer, fields_ids_map.clone(), cache)) + local.get_or_try(|| { + let rtxn = index.read_txn().map_err(Error::from)?; + let cache = caches.push(CboCachedSorter::new( + /// TODO use a better value + 1_000_000.try_into().unwrap(), + create_sorter( + grenad::SortAlgorithm::Stable, + MergeDeladdCboRoaringBitmaps, + indexer.chunk_compression_type, + indexer.chunk_compression_level, + indexer.max_nb_chunks, + max_memory, + ), + )); + Ok(( + rtxn, + &document_tokenizer, + 
RefCell::new((fields_ids_map.clone(), cache)), + )) + }) }, - |(rtxn, document_tokenizer, fields_ids_map, cached_sorter), document_change| { + |(rtxn, document_tokenizer, rc), document_change| { + let (fields_ids_map, cached_sorter) = &mut *rc.borrow_mut(); Self::extract_document_change( rtxn, index, From 83c09d0db0f418ad84c69c9a6ae582452708cdc4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 7 Oct 2024 16:38:45 +0200 Subject: [PATCH 124/247] Remove the now, useless AppendOnlyVec library --- .../src/update/new/append_only_linked_list.rs | 158 ------------------ .../new/extract/faceted/extract_facets.rs | 15 +- .../extract/searchable/extract_word_docids.rs | 15 +- .../src/update/new/extract/searchable/mod.rs | 16 +- milli/src/update/new/mod.rs | 1 - 5 files changed, 21 insertions(+), 184 deletions(-) delete mode 100644 milli/src/update/new/append_only_linked_list.rs diff --git a/milli/src/update/new/append_only_linked_list.rs b/milli/src/update/new/append_only_linked_list.rs deleted file mode 100644 index 274d3eea4..000000000 --- a/milli/src/update/new/append_only_linked_list.rs +++ /dev/null @@ -1,158 +0,0 @@ -use std::sync::atomic::AtomicPtr; -use std::{fmt, mem}; - -/// An append-only linked-list that returns a mutable references to the pushed items. -pub struct AppendOnlyLinkedList { - head: AtomicPtr>, -} - -struct Node { - item: T, - next: AtomicPtr>, -} - -impl AppendOnlyLinkedList { - /// Creates an empty list. - pub fn new() -> AppendOnlyLinkedList { - AppendOnlyLinkedList { head: AtomicPtr::default() } - } - - /// Pushes the item at the front of the linked-list and returns a unique and mutable reference to it. - #[allow(clippy::mut_from_ref)] // the mut ref is derived from T and unique each time - pub fn push(&self, item: T) -> &mut T { - use std::sync::atomic::Ordering::{Relaxed, SeqCst}; - - let node = Box::leak(Box::new(Node { item, next: AtomicPtr::default() })); - let mut head = self.head.load(SeqCst); - - loop { - std::hint::spin_loop(); - node.next = AtomicPtr::new(head); - match self.head.compare_exchange_weak(head, node, SeqCst, Relaxed) { - Ok(_) => break, - Err(new) => head = new, - } - } - - &mut node.item - } -} - -impl Default for AppendOnlyLinkedList { - fn default() -> Self { - Self::new() - } -} - -impl Drop for AppendOnlyLinkedList { - fn drop(&mut self) { - // Let's use the drop implementation of the IntoIter struct - IntoIter(mem::take(&mut self.head)); - } -} - -impl fmt::Debug for AppendOnlyLinkedList { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("AppendOnlyLinkedList").finish() - } -} - -impl IntoIterator for AppendOnlyLinkedList { - type Item = T; - type IntoIter = IntoIter; - - fn into_iter(mut self) -> Self::IntoIter { - IntoIter(mem::take(&mut self.head)) - } -} - -pub struct IntoIter(AtomicPtr>); - -impl Iterator for IntoIter { - type Item = T; - - fn next(&mut self) -> Option { - let ptr = *self.0.get_mut(); - if ptr.is_null() { - None - } else { - let node = unsafe { Box::from_raw(ptr) }; - // Let's set the next node to read to be the next of this one - self.0 = node.next; - Some(node.item) - } - } -} - -impl Drop for IntoIter { - fn drop(&mut self) { - let mut ptr = *self.0.get_mut(); - while !ptr.is_null() { - let mut node = unsafe { Box::from_raw(ptr) }; - // Let's set the next node to read to be the next of this one - ptr = *node.next.get_mut(); - } - } -} - -#[test] -fn test_parallel_pushing() { - use std::sync::Arc; - let v = Arc::new(AppendOnlyLinkedList::::new()); - let mut threads 
= Vec::new(); - const N: u64 = 100; - for thread_num in 0..N { - let v = v.clone(); - threads.push(std::thread::spawn(move || { - let which1 = v.push(thread_num); - let which2 = v.push(thread_num); - assert_eq!(*which1, thread_num); - assert_eq!(*which2, thread_num); - })); - } - for t in threads { - t.join().unwrap(); - } - let v = Arc::into_inner(v).unwrap().into_iter().collect::>(); - for thread_num in (0..N).rev() { - assert_eq!(2, v.iter().copied().filter(|&x| x == thread_num).count()); - } -} - -#[test] -fn test_into_vec() { - struct SafeToDrop(bool); - - impl Drop for SafeToDrop { - fn drop(&mut self) { - assert!(self.0); - } - } - - let v = AppendOnlyLinkedList::new(); - - for _ in 0..50 { - v.push(SafeToDrop(false)); - } - - let mut v = v.into_iter().collect::>(); - assert_eq!(v.len(), 50); - - for i in v.iter_mut() { - i.0 = true; - } -} - -#[test] -fn test_push_then_index_mut() { - let v = AppendOnlyLinkedList::::new(); - let mut w = Vec::new(); - for i in 0..1024 { - *v.push(i) += 1; - w.push(i + 1); - } - - let mut v = v.into_iter().collect::>(); - v.reverse(); - assert_eq!(v, w); -} diff --git a/milli/src/update/new/extract/faceted/extract_facets.rs b/milli/src/update/new/extract/faceted/extract_facets.rs index 8e8b71676..e6c3b02e6 100644 --- a/milli/src/update/new/extract/faceted/extract_facets.rs +++ b/milli/src/update/new/extract/faceted/extract_facets.rs @@ -14,7 +14,6 @@ use super::super::cache::CboCachedSorter; use super::facet_document::extract_document_facets; use super::FacetKind; use crate::facet::value_encoding::f64_into_bytes; -use crate::update::new::append_only_linked_list::AppendOnlyLinkedList; use crate::update::new::extract::DocidsExtractor; use crate::update::new::parallel_iterator_ext::ParallelIteratorExt; use crate::update::new::DocumentChange; @@ -212,18 +211,17 @@ impl DocidsExtractor for FacetedDocidsExtractor { let attributes_to_extract = Self::attributes_to_extract(&rtxn, index)?; let attributes_to_extract: Vec<_> = attributes_to_extract.iter().map(|s| s.as_ref()).collect(); - let caches = AppendOnlyLinkedList::new(); + let thread_local = ThreadLocal::new(); { let span = tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction"); let _entered = span.enter(); - let local = ThreadLocal::new(); document_changes.into_par_iter().try_arc_for_each_try_init( || { - local.get_or_try(|| { + thread_local.get_or_try(|| { let rtxn = index.read_txn().map_err(Error::from)?; - let cache = caches.push(CboCachedSorter::new( + let cache = CboCachedSorter::new( /// TODO use a better value 100.try_into().unwrap(), create_sorter( @@ -234,7 +232,7 @@ impl DocidsExtractor for FacetedDocidsExtractor { indexer.max_nb_chunks, max_memory, ), - )); + ); Ok((rtxn, RefCell::new((fields_ids_map.clone(), Vec::new(), cache)))) }) }, @@ -259,10 +257,11 @@ impl DocidsExtractor for FacetedDocidsExtractor { tracing::trace_span!(target: "indexing::documents::extract", "merger_building"); let _entered = span.enter(); - let readers: Vec<_> = caches + let readers: Vec<_> = thread_local .into_iter() .par_bridge() - .map(|cached_sorter| { + .map(|(_, rc)| { + let (_, _, cached_sorter) = rc.into_inner(); let sorter = cached_sorter.into_sorter()?; sorter.into_reader_cursors() }) diff --git a/milli/src/update/new/extract/searchable/extract_word_docids.rs b/milli/src/update/new/extract/searchable/extract_word_docids.rs index df8409618..6da793276 100644 --- a/milli/src/update/new/extract/searchable/extract_word_docids.rs +++ 
b/milli/src/update/new/extract/searchable/extract_word_docids.rs @@ -11,7 +11,6 @@ use thread_local::ThreadLocal; use super::tokenize_document::{tokenizer_builder, DocumentTokenizer}; use super::SearchableExtractor; -use crate::update::new::append_only_linked_list::AppendOnlyLinkedList; use crate::update::new::extract::cache::CboCachedSorter; use crate::update::new::extract::perm_json_p::contained_in; use crate::update::new::parallel_iterator_ext::ParallelIteratorExt; @@ -343,24 +342,23 @@ impl WordDocidsExtractors { max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE, }; - let caches = AppendOnlyLinkedList::new(); + let thread_local = ThreadLocal::new(); { let span = tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction"); let _entered = span.enter(); - let local = ThreadLocal::new(); document_changes.into_par_iter().try_arc_for_each_try_init( || { - local.get_or_try(|| { + thread_local.get_or_try(|| { let rtxn = index.read_txn().map_err(Error::from)?; let fields_ids_map = fields_ids_map.clone(); - let cache = caches.push(WordDocidsCachedSorters::new( + let cache = WordDocidsCachedSorters::new( indexer, max_memory, // TODO use a better value 200_000.try_into().unwrap(), - )); + ); Ok((rtxn, &document_tokenizer, RefCell::new((fields_ids_map, cache)))) }) }, @@ -384,9 +382,8 @@ impl WordDocidsExtractors { tracing::trace_span!(target: "indexing::documents::extract", "merger_building"); let _entered = span.enter(); let mut builder = WordDocidsMergerBuilders::new(); - let mut count = 0; - for cache in caches.into_iter() { - count += 1; + for (_, _, rc) in thread_local.into_iter() { + let (_, cache) = rc.into_inner(); builder.add_sorters(cache)?; } diff --git a/milli/src/update/new/extract/searchable/mod.rs b/milli/src/update/new/extract/searchable/mod.rs index 272bff4d3..25f1eda14 100644 --- a/milli/src/update/new/extract/searchable/mod.rs +++ b/milli/src/update/new/extract/searchable/mod.rs @@ -16,7 +16,6 @@ use tokenize_document::{tokenizer_builder, DocumentTokenizer}; use super::cache::CboCachedSorter; use super::DocidsExtractor; -use crate::update::new::append_only_linked_list::AppendOnlyLinkedList; use crate::update::new::parallel_iterator_ext::ParallelIteratorExt; use crate::update::new::DocumentChange; use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps}; @@ -60,18 +59,18 @@ pub trait SearchableExtractor { localized_attributes_rules: &localized_attributes_rules, max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE, }; - let caches = AppendOnlyLinkedList::new(); + + let thread_local = ThreadLocal::new(); { let span = tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction"); let _entered = span.enter(); - let local = ThreadLocal::new(); document_changes.into_par_iter().try_arc_for_each_try_init( || { - local.get_or_try(|| { + thread_local.get_or_try(|| { let rtxn = index.read_txn().map_err(Error::from)?; - let cache = caches.push(CboCachedSorter::new( + let cache = CboCachedSorter::new( /// TODO use a better value 1_000_000.try_into().unwrap(), create_sorter( @@ -82,7 +81,7 @@ pub trait SearchableExtractor { indexer.max_nb_chunks, max_memory, ), - )); + ); Ok(( rtxn, &document_tokenizer, @@ -110,10 +109,11 @@ pub trait SearchableExtractor { tracing::trace_span!(target: "indexing::documents::extract", "merger_building"); let _entered = span.enter(); - let readers: Vec<_> = caches + let readers: Vec<_> = thread_local .into_iter() .par_bridge() - .map(|cached_sorter| { + .map(|(_, _, rc)| { + let (_, 
cached_sorter) = rc.into_inner(); let sorter = cached_sorter.into_sorter()?; sorter.into_reader_cursors() }) diff --git a/milli/src/update/new/mod.rs b/milli/src/update/new/mod.rs index 862dd4dac..4a83529dc 100644 --- a/milli/src/update/new/mod.rs +++ b/milli/src/update/new/mod.rs @@ -4,7 +4,6 @@ pub use top_level_map::{CowStr, TopLevelMap}; use super::del_add::DelAdd; use crate::FieldId; -mod append_only_linked_list; mod channel; mod document_change; mod extract; From eb09dfed04e377bf44cafe132458004af8222158 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 7 Oct 2024 16:41:17 +0200 Subject: [PATCH 125/247] Avoid reallocation with the ThreadLocal pool --- milli/src/update/new/extract/faceted/extract_facets.rs | 2 +- milli/src/update/new/extract/searchable/extract_word_docids.rs | 2 +- milli/src/update/new/extract/searchable/mod.rs | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/milli/src/update/new/extract/faceted/extract_facets.rs b/milli/src/update/new/extract/faceted/extract_facets.rs index e6c3b02e6..8ca9a8b20 100644 --- a/milli/src/update/new/extract/faceted/extract_facets.rs +++ b/milli/src/update/new/extract/faceted/extract_facets.rs @@ -211,7 +211,7 @@ impl DocidsExtractor for FacetedDocidsExtractor { let attributes_to_extract = Self::attributes_to_extract(&rtxn, index)?; let attributes_to_extract: Vec<_> = attributes_to_extract.iter().map(|s| s.as_ref()).collect(); - let thread_local = ThreadLocal::new(); + let thread_local = ThreadLocal::with_capacity(rayon::current_num_threads()); { let span = diff --git a/milli/src/update/new/extract/searchable/extract_word_docids.rs b/milli/src/update/new/extract/searchable/extract_word_docids.rs index 6da793276..dde969614 100644 --- a/milli/src/update/new/extract/searchable/extract_word_docids.rs +++ b/milli/src/update/new/extract/searchable/extract_word_docids.rs @@ -342,7 +342,7 @@ impl WordDocidsExtractors { max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE, }; - let thread_local = ThreadLocal::new(); + let thread_local = ThreadLocal::with_capacity(rayon::current_num_threads()); { let span = diff --git a/milli/src/update/new/extract/searchable/mod.rs b/milli/src/update/new/extract/searchable/mod.rs index 25f1eda14..a261efda3 100644 --- a/milli/src/update/new/extract/searchable/mod.rs +++ b/milli/src/update/new/extract/searchable/mod.rs @@ -60,7 +60,7 @@ pub trait SearchableExtractor { max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE, }; - let thread_local = ThreadLocal::new(); + let thread_local = ThreadLocal::with_capacity(rayon::current_num_threads()); { let span = From 470c2272ddb288affaf1403cecdb87fed75e86cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 8 Oct 2024 15:29:24 +0200 Subject: [PATCH 126/247] Show much more stats about the LRU caches --- Cargo.lock | 2 +- milli/src/update/new/extract/cache.rs | 145 +++++++++++++++++--------- 2 files changed, 98 insertions(+), 49 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 335445956..ed62f0716 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4655,7 +4655,7 @@ dependencies = [ [[package]] name = "roaring" version = "0.10.6" -source = "git+https://github.com/RoaringBitmap/roaring-rs?branch=clone-iter-slice#6bba84b1a47da1d6e52d5c4dc0ce8593ae4646a5" +source = "git+https://github.com/RoaringBitmap/roaring-rs?branch=clone-iter-slice#8ff028e484fb6192a0acf5a669eaf18c30cada6e" dependencies = [ "bytemuck", "byteorder", diff --git a/milli/src/update/new/extract/cache.rs 
b/milli/src/update/new/extract/cache.rs index 1b7a58472..2fbe427f3 100644 --- a/milli/src/update/new/extract/cache.rs +++ b/milli/src/update/new/extract/cache.rs @@ -1,7 +1,9 @@ +use std::fmt::Write as _; use std::mem; use std::num::NonZeroUsize; use grenad::{MergeFunction, Sorter}; +use roaring::bitmap::Statistics; use roaring::RoaringBitmap; use smallvec::SmallVec; @@ -38,7 +40,7 @@ impl CboCachedSorter { pub fn insert_del_u32(&mut self, key: &[u8], n: u32) -> grenad::Result<(), MF::Error> { match self.cache.get_mut(key) { Some(DelAddRoaringBitmap { del, add: _ }) => { - del.get_or_insert_with(PushOptimizedBitmap::default).insert(n); + del.get_or_insert_with(RoaringBitmap::default).insert(n); } None => { self.total_insertions += 1; @@ -60,7 +62,7 @@ impl CboCachedSorter { ) -> grenad::Result<(), MF::Error> { match self.cache.get_mut(key) { Some(DelAddRoaringBitmap { del, add: _ }) => { - del.get_or_insert_with(PushOptimizedBitmap::default).union_with_bitmap(bitmap); + *del.get_or_insert_with(RoaringBitmap::default) |= bitmap; } None => { self.total_insertions += 1; @@ -78,7 +80,7 @@ impl CboCachedSorter { pub fn insert_add_u32(&mut self, key: &[u8], n: u32) -> grenad::Result<(), MF::Error> { match self.cache.get_mut(key) { Some(DelAddRoaringBitmap { del: _, add }) => { - add.get_or_insert_with(PushOptimizedBitmap::default).insert(n); + add.get_or_insert_with(RoaringBitmap::default).insert(n); } None => { self.total_insertions += 1; @@ -100,7 +102,7 @@ impl CboCachedSorter { ) -> grenad::Result<(), MF::Error> { match self.cache.get_mut(key) { Some(DelAddRoaringBitmap { del: _, add }) => { - add.get_or_insert_with(PushOptimizedBitmap::default).union_with_bitmap(bitmap); + *add.get_or_insert_with(RoaringBitmap::default) |= bitmap; } None => { self.total_insertions += 1; @@ -118,8 +120,8 @@ impl CboCachedSorter { pub fn insert_del_add_u32(&mut self, key: &[u8], n: u32) -> grenad::Result<(), MF::Error> { match self.cache.get_mut(key) { Some(DelAddRoaringBitmap { del, add }) => { - del.get_or_insert_with(PushOptimizedBitmap::default).insert(n); - add.get_or_insert_with(PushOptimizedBitmap::default).insert(n); + del.get_or_insert_with(RoaringBitmap::default).insert(n); + add.get_or_insert_with(RoaringBitmap::default).insert(n); } None => { self.total_insertions += 1; @@ -145,21 +147,21 @@ impl CboCachedSorter { match deladd { DelAddRoaringBitmap { del: Some(del), add: None } => { self.cbo_buffer.clear(); - CboRoaringBitmapCodec::serialize_into(&del.bitmap, &mut self.cbo_buffer); + CboRoaringBitmapCodec::serialize_into(&del, &mut self.cbo_buffer); value_writer.insert(DelAdd::Deletion, &self.cbo_buffer)?; } DelAddRoaringBitmap { del: None, add: Some(add) } => { self.cbo_buffer.clear(); - CboRoaringBitmapCodec::serialize_into(&add.bitmap, &mut self.cbo_buffer); + CboRoaringBitmapCodec::serialize_into(&add, &mut self.cbo_buffer); value_writer.insert(DelAdd::Addition, &self.cbo_buffer)?; } DelAddRoaringBitmap { del: Some(del), add: Some(add) } => { self.cbo_buffer.clear(); - CboRoaringBitmapCodec::serialize_into(&del.bitmap, &mut self.cbo_buffer); + CboRoaringBitmapCodec::serialize_into(&del, &mut self.cbo_buffer); value_writer.insert(DelAdd::Deletion, &self.cbo_buffer)?; self.cbo_buffer.clear(); - CboRoaringBitmapCodec::serialize_into(&add.bitmap, &mut self.cbo_buffer); + CboRoaringBitmapCodec::serialize_into(&add, &mut self.cbo_buffer); value_writer.insert(DelAdd::Addition, &self.cbo_buffer)?; } DelAddRoaringBitmap { del: None, add: None } => return Ok(()), @@ -173,78 +175,125 @@ impl 
CboCachedSorter { } pub fn into_sorter(mut self) -> grenad::Result, MF::Error> { + let mut all_n_containers = Vec::new(); + let mut all_n_array_containers = Vec::new(); + let mut all_n_bitset_containers = Vec::new(); + let mut all_n_values_array_containers = Vec::new(); + let mut all_n_values_bitset_containers = Vec::new(); + let mut all_cardinality = Vec::new(); + let default_arc = Lru::new(NonZeroUsize::MIN); for (key, deladd) in mem::replace(&mut self.cache, default_arc) { + for bitmap in [&deladd.del, &deladd.add].into_iter().flatten() { + let Statistics { + n_containers, + n_array_containers, + n_bitset_containers, + n_values_array_containers, + n_values_bitset_containers, + cardinality, + .. + } = bitmap.statistics(); + all_n_containers.push(n_containers); + all_n_array_containers.push(n_array_containers); + all_n_bitset_containers.push(n_bitset_containers); + all_n_values_array_containers.push(n_values_array_containers); + all_n_values_bitset_containers.push(n_values_bitset_containers); + all_cardinality.push(cardinality as u32); + } + self.write_entry(key, deladd)?; } - eprintln!( + let mut output = String::new(); + + for (name, mut slice) in [ + ("n_containers", all_n_containers), + ("n_array_containers", all_n_array_containers), + ("n_bitset_containers", all_n_bitset_containers), + ("n_values_array_containers", all_n_values_array_containers), + ("n_values_bitset_containers", all_n_values_bitset_containers), + ("cardinality", all_cardinality), + ] { + let _ = writeln!(&mut output, "{name} (p100) {:?}", Stats::from_slice(&mut slice)); + // let _ = writeln!(&mut output, "{name} (p99) {:?}", Stats::from_slice_p99(&mut slice)); + } + + let _ = writeln!( + &mut output, "LruCache stats: {} <= {KEY_SIZE} bytes ({}%) on a total of {} insertions", self.fitted_in_key, (self.fitted_in_key as f32 / self.total_insertions as f32) * 100.0, self.total_insertions, ); + eprintln!("{output}"); + Ok(self.sorter) } } +#[derive(Default, Debug)] +struct Stats { + pub len: usize, + pub average: f32, + pub mean: u32, + pub min: u32, + pub max: u32, +} + +impl Stats { + fn from_slice(slice: &mut [u32]) -> Stats { + slice.sort_unstable(); + Self::from_sorted_slice(slice) + } + + fn from_slice_p99(slice: &mut [u32]) -> Stats { + slice.sort_unstable(); + let new_len = slice.len() - (slice.len() as f32 / 100.0) as usize; + match slice.get(..new_len) { + Some(slice) => Self::from_sorted_slice(slice), + None => Stats::default(), + } + } + + fn from_sorted_slice(slice: &[u32]) -> Stats { + let sum: f64 = slice.iter().map(|i| *i as f64).sum(); + let average = (sum / slice.len() as f64) as f32; + let mean = *slice.len().checked_div(2).and_then(|middle| slice.get(middle)).unwrap_or(&0); + let min = *slice.first().unwrap_or(&0); + let max = *slice.last().unwrap_or(&0); + Stats { len: slice.len(), average, mean, min, max } + } +} + #[derive(Debug, Clone)] pub struct DelAddRoaringBitmap { - pub(crate) del: Option, - pub(crate) add: Option, + pub(crate) del: Option, + pub(crate) add: Option, } impl DelAddRoaringBitmap { fn new_del_add_u32(n: u32) -> Self { DelAddRoaringBitmap { - del: Some(PushOptimizedBitmap::from_single(n)), - add: Some(PushOptimizedBitmap::from_single(n)), + del: Some(RoaringBitmap::from([n])), + add: Some(RoaringBitmap::from([n])), } } fn new_del(bitmap: RoaringBitmap) -> Self { - DelAddRoaringBitmap { del: Some(PushOptimizedBitmap::from_bitmap(bitmap)), add: None } + DelAddRoaringBitmap { del: Some(bitmap), add: None } } fn new_del_u32(n: u32) -> Self { - DelAddRoaringBitmap { del: 
Some(PushOptimizedBitmap::from_single(n)), add: None } + DelAddRoaringBitmap { del: Some(RoaringBitmap::from([n])), add: None } } fn new_add(bitmap: RoaringBitmap) -> Self { - DelAddRoaringBitmap { del: None, add: Some(PushOptimizedBitmap::from_bitmap(bitmap)) } + DelAddRoaringBitmap { del: None, add: Some(bitmap) } } fn new_add_u32(n: u32) -> Self { - DelAddRoaringBitmap { del: None, add: Some(PushOptimizedBitmap::from_single(n)) } - } -} - -#[derive(Debug, Clone, Default)] -struct PushOptimizedBitmap { - bitmap: RoaringBitmap, -} - -impl PushOptimizedBitmap { - #[inline] - fn from_bitmap(bitmap: RoaringBitmap) -> PushOptimizedBitmap { - PushOptimizedBitmap { bitmap } - } - - #[inline] - fn from_single(single: u32) -> PushOptimizedBitmap { - PushOptimizedBitmap { bitmap: RoaringBitmap::from([single]) } - } - - #[inline] - fn insert(&mut self, n: u32) { - if !self.bitmap.push(n) { - self.bitmap.insert(n); - } - } - - #[inline] - fn union_with_bitmap(&mut self, bitmap: RoaringBitmap) { - self.bitmap |= bitmap; + DelAddRoaringBitmap { del: None, add: Some(RoaringBitmap::from([n])) } } } From 39b27e42be3dde17231360e21ba07b255a07c58d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 8 Oct 2024 16:04:19 +0200 Subject: [PATCH 127/247] Plug the deletion pipeline --- index-scheduler/src/batch.rs | 43 +++++++++++-------- .../update/new/indexer/document_deletion.rs | 2 +- 2 files changed, 27 insertions(+), 18 deletions(-) diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs index 28cb8a9e6..69eb28372 100644 --- a/index-scheduler/src/batch.rs +++ b/index-scheduler/src/batch.rs @@ -1500,26 +1500,35 @@ impl IndexScheduler { } } - let config = IndexDocumentsConfig { - update_method: IndexDocumentsMethod::ReplaceDocuments, - ..Default::default() - }; + let rtxn = index.read_txn()?; + let mut fields_ids_map = index.fields_ids_map(&rtxn)?; - let must_stop_processing = self.must_stop_processing.clone(); - let mut builder = milli::update::IndexDocuments::new( - index_wtxn, - index, - self.index_mapper.indexer_config(), - config, - |indexing_step| tracing::debug!(update = ?indexing_step), - || must_stop_processing.get(), - )?; + let primary_key = + retrieve_or_guess_primary_key(&rtxn, index, &mut fields_ids_map, None)? + .unwrap(); - let (new_builder, _count) = - builder.remove_documents_from_db_no_batch(&to_delete)?; - builder = new_builder; + if !tasks.iter().all(|res| res.error.is_some()) { + /// TODO create a pool if needed + // let pool = indexer_config.thread_pool.unwrap(); + let pool = rayon::ThreadPoolBuilder::new().build().unwrap(); - let _ = builder.execute()?; + let param = (index, &fields_ids_map, &primary_key); + let mut indexer = indexer::DocumentDeletion::new(); + indexer.delete_documents_by_docids(to_delete); + /// TODO remove this fields-ids-map, it's useless for the deletion pipeline (the &mut cloned one). 
+ let document_changes = + indexer.document_changes(&mut fields_ids_map.clone(), param)?; + /// TODO pass/write the FieldsIdsMap + indexer::index( + index_wtxn, + index, + fields_ids_map.clone(), + &pool, + document_changes, + )?; + + // tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done"); + } Ok(tasks) } diff --git a/milli/src/update/new/indexer/document_deletion.rs b/milli/src/update/new/indexer/document_deletion.rs index 21d7635c9..9dbc4e52d 100644 --- a/milli/src/update/new/indexer/document_deletion.rs +++ b/milli/src/update/new/indexer/document_deletion.rs @@ -43,7 +43,7 @@ impl<'p> DocumentChanges<'p> for DocumentDeletion { |rtxn, docid| { let current = index.document(rtxn, docid)?; let external_document_id = primary_key - .document_id(¤t, fields_ids_map)? + .document_id(current, fields_ids_map)? .map_err(|_| InternalError::DatabaseMissingEntry { db_name: EXTERNAL_DOCUMENTS_IDS, key: None, From 68a2502388f68ac6dff1c7c58fd310192ba96674 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 3 Oct 2024 18:08:09 +0200 Subject: [PATCH 128/247] Introduce indexer level bumpalo --- Cargo.lock | 46 ++- index-scheduler/Cargo.toml | 1 + index-scheduler/src/batch.rs | 61 ++- milli/Cargo.toml | 14 +- milli/src/documents/mod.rs | 8 +- milli/src/documents/primary_key.rs | 19 +- milli/src/fields_ids_map.rs | 14 + milli/src/fields_ids_map/global.rs | 24 +- milli/src/update/new/document.rs | 255 ++++++++++++ milli/src/update/new/document_change.rs | 116 ++++-- .../new/extract/faceted/extract_facets.rs | 170 ++++---- .../new/extract/faceted/facet_document.rs | 19 +- milli/src/update/new/extract/mod.rs | 20 +- .../extract/searchable/extract_word_docids.rs | 131 +++--- .../extract_word_pair_proximity_docids.rs | 55 ++- .../src/update/new/extract/searchable/mod.rs | 171 ++++---- .../extract/searchable/tokenize_document.rs | 83 ++-- milli/src/update/new/indexer/de.rs | 163 ++++++++ .../update/new/indexer/document_changes.rs | 378 ++++++++++++++++++ .../update/new/indexer/document_deletion.rs | 197 +++++++-- .../update/new/indexer/document_operation.rs | 369 +++++++++-------- milli/src/update/new/indexer/mod.rs | 172 +++++--- milli/src/update/new/indexer/partial_dump.rs | 150 +++---- .../update/new/indexer/update_by_function.rs | 46 ++- milli/src/update/new/merger.rs | 22 +- milli/src/update/new/mod.rs | 1 + 26 files changed, 1984 insertions(+), 721 deletions(-) create mode 100644 milli/src/update/new/document.rs create mode 100644 milli/src/update/new/indexer/de.rs create mode 100644 milli/src/update/new/indexer/document_changes.rs diff --git a/Cargo.lock b/Cargo.lock index ed62f0716..961ebab28 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -296,9 +296,9 @@ dependencies = [ [[package]] name = "allocator-api2" -version = "0.2.16" +version = "0.2.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5" +checksum = "5c6cb57a04249c6480766f7f7cef5467412af1490f8d1e243141daddada3264f" [[package]] name = "anes" @@ -664,6 +664,10 @@ name = "bumpalo" version = "3.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" +dependencies = [ + "allocator-api2", + "serde", +] [[package]] name = "byte-unit" @@ -1887,6 +1891,12 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" +[[package]] +name = "foldhash" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f81ec6369c545a7d40e4589b5597581fa1c441fe1cce96dd1de43159910a36a2" + [[package]] name = "form_urlencoded" version = "1.2.1" @@ -2315,6 +2325,18 @@ dependencies = [ "allocator-api2", ] +[[package]] +name = "hashbrown" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e087f84d4f86bf4b218b927129862374b72199ae7d8657835f1e89000eea4fb" +dependencies = [ + "allocator-api2", + "equivalent", + "foldhash", + "serde", +] + [[package]] name = "heapless" version = "0.8.0" @@ -2557,6 +2579,7 @@ dependencies = [ "arroy", "big_s", "bincode", + "bumpalo", "crossbeam", "csv", "derive_builder 0.20.0", @@ -3549,6 +3572,7 @@ dependencies = [ "bimap", "bincode", "bstr", + "bumpalo", "bytemuck", "byteorder", "candle-core", @@ -3585,6 +3609,7 @@ dependencies = [ "once_cell", "ordered-float", "rand", + "raw-collections", "rayon", "rayon-par-bridge", "rhai", @@ -4406,6 +4431,18 @@ dependencies = [ "rand", ] +[[package]] +name = "raw-collections" +version = "0.1.0" +source = "git+https://github.com/dureuill/raw-collections.git#0ecd143c1707d237e3c4d749bc685418da2fccc2" +dependencies = [ + "allocator-api2", + "bumpalo", + "hashbrown 0.15.0", + "serde", + "serde_json", +] + [[package]] name = "raw-cpuid" version = "10.7.0" @@ -4869,12 +4906,13 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.120" +version = "1.0.128" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e0d21c9a8cae1235ad58a00c11cb40d4b1e5c784f1ef2c537876ed6ffd8b7c5" +checksum = "6ff5456707a1de34e7e37f2a6fd3d3f808c318259cbd01ab6377795054b483d8" dependencies = [ "indexmap", "itoa", + "memchr", "ryu", "serde", ] diff --git a/index-scheduler/Cargo.toml b/index-scheduler/Cargo.toml index 4731be68b..88f9488b5 100644 --- a/index-scheduler/Cargo.toml +++ b/index-scheduler/Cargo.toml @@ -39,6 +39,7 @@ time = { version = "0.3.36", features = [ tracing = "0.1.40" ureq = "2.10.0" uuid = { version = "1.10.0", features = ["serde", "v4"] } +bumpalo = "3.16.0" [dev-dependencies] arroy = { git = "https://github.com/meilisearch/arroy/", rev = "2386594dfb009ce08821a925ccc89fb8e30bf73d" } diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs index 69eb28372..446efd0c4 100644 --- a/index-scheduler/src/batch.rs +++ b/index-scheduler/src/batch.rs @@ -23,14 +23,15 @@ use std::fmt; use std::fs::{self, File}; use std::io::BufWriter; +use bumpalo::collections::CollectIn; +use bumpalo::Bump; use dump::IndexMetadata; use meilisearch_types::error::Code; use meilisearch_types::heed::{RoTxn, RwTxn}; use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader}; use meilisearch_types::milli::heed::CompactionOption; -use meilisearch_types::milli::update::new::indexer::{ - self, retrieve_or_guess_primary_key, DocumentChanges, -}; +use meilisearch_types::milli::update::new::indexer::document_changes::DocumentChanges; +use meilisearch_types::milli::update::new::indexer::{self, retrieve_or_guess_primary_key}; use meilisearch_types::milli::update::{ IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings as MilliSettings, }; @@ -1219,6 +1220,8 @@ impl IndexScheduler { index: &'i Index, operation: IndexOperation, ) -> Result> { + let indexer_alloc = Bump::new(); + match operation { IndexOperation::DocumentClear { mut tasks, .. 
} => { let count = milli::update::ClearDocuments::new(index_wtxn, index).execute()?; @@ -1252,6 +1255,9 @@ impl IndexScheduler { let mut primary_key_has_been_set = false; let must_stop_processing = self.must_stop_processing.clone(); let indexer_config = self.index_mapper.indexer_config(); + // TODO: at some point, for better efficiency we might want to reuse the bumpalo for successive batches. + // this is made difficult by the fact we're doing private clones of the index scheduler and sending it + // to a fresh thread. /// TODO manage errors correctly let rtxn = index.read_txn()?; @@ -1274,7 +1280,9 @@ impl IndexScheduler { } } - let mut fields_ids_map = index.fields_ids_map(&rtxn)?; + let db_fields_ids_map = index.fields_ids_map(&rtxn)?; + let mut new_fields_ids_map = db_fields_ids_map.clone(); + let first_document = match content_files.first() { Some(mmap) => { let mut iter = serde_json::Deserializer::from_slice(mmap).into_iter(); @@ -1286,7 +1294,7 @@ impl IndexScheduler { let primary_key = retrieve_or_guess_primary_key( &rtxn, index, - &mut fields_ids_map, + &mut new_fields_ids_map, first_document.as_ref(), )? .unwrap(); @@ -1320,7 +1328,11 @@ impl IndexScheduler { } DocumentOperation::Delete(document_ids) => { let count = document_ids.len(); - indexer.delete_documents(document_ids); + let document_ids: bumpalo::collections::vec::Vec<_> = document_ids + .iter() + .map(|s| &*indexer_alloc.alloc_str(s)) + .collect_in(&indexer_alloc); + indexer.delete_documents(document_ids.into_bump_slice()); // Uses Invariant: remove documents actually always returns Ok for the inner result // let count = user_result.unwrap(); let provided_ids = @@ -1347,10 +1359,22 @@ impl IndexScheduler { // let pool = indexer_config.thread_pool.unwrap(); let pool = rayon::ThreadPoolBuilder::new().build().unwrap(); - let param = (index, &rtxn, &primary_key); - let document_changes = indexer.document_changes(&mut fields_ids_map, param)?; - /// TODO pass/write the FieldsIdsMap - indexer::index(index_wtxn, index, fields_ids_map, &pool, document_changes)?; + let document_changes = indexer.into_changes( + &indexer_alloc, + index, + &rtxn, + &primary_key, + &mut new_fields_ids_map, + )?; + + indexer::index( + index_wtxn, + index, + &db_fields_ids_map, + new_fields_ids_map, + &pool, + &document_changes, + )?; // tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done"); } @@ -1501,10 +1525,11 @@ impl IndexScheduler { } let rtxn = index.read_txn()?; - let mut fields_ids_map = index.fields_ids_map(&rtxn)?; + let db_fields_ids_map = index.fields_ids_map(&rtxn)?; + let mut new_fields_ids_map = db_fields_ids_map.clone(); let primary_key = - retrieve_or_guess_primary_key(&rtxn, index, &mut fields_ids_map, None)? + retrieve_or_guess_primary_key(&rtxn, index, &mut new_fields_ids_map, None)? .unwrap(); if !tasks.iter().all(|res| res.error.is_some()) { @@ -1512,19 +1537,17 @@ impl IndexScheduler { // let pool = indexer_config.thread_pool.unwrap(); let pool = rayon::ThreadPoolBuilder::new().build().unwrap(); - let param = (index, &fields_ids_map, &primary_key); let mut indexer = indexer::DocumentDeletion::new(); indexer.delete_documents_by_docids(to_delete); - /// TODO remove this fields-ids-map, it's useless for the deletion pipeline (the &mut cloned one). 
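Editor's aside (not part of the patch): the hunks above hinge on a single per-batch `Bump` arena, `indexer_alloc`. Transient data for the run, such as the deleted document ids, is copied into the arena with `alloc_str` and gathered with `CollectIn`, so the resulting `&[&str]` borrows from the arena, stays valid for the whole batch, and is freed in one shot when the arena is dropped. A minimal sketch of that pattern, with hypothetical names, assuming bumpalo's `collections` feature is enabled as the patch's own code requires:

```rust
use bumpalo::collections::CollectIn;
use bumpalo::Bump;

/// Copy owned ids into the arena and return a slice that borrows from it,
/// mirroring how `delete_documents` receives bump-allocated ids above.
fn collect_ids<'bump>(ids: &[String], arena: &'bump Bump) -> &'bump [&'bump str] {
    let ids: bumpalo::collections::Vec<'bump, &'bump str> =
        ids.iter().map(|s| &*arena.alloc_str(s)).collect_in(arena);
    ids.into_bump_slice()
}

fn main() {
    // One arena per indexing batch: everything allocated in it is released at
    // once when `arena` goes out of scope, with no per-document deallocation.
    let arena = Bump::new();
    let ids = vec!["doc-1".to_string(), "doc-2".to_string()];
    let slice = collect_ids(&ids, &arena);
    assert_eq!(slice.len(), 2);
    assert_eq!(slice[0], "doc-1");
}
```

As the TODO in the hunk notes, the arena is not yet reused across successive batches because each batch runs on a fresh thread with a private clone of the scheduler; the sketch sidesteps that by scoping the arena to a single run.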
- let document_changes = - indexer.document_changes(&mut fields_ids_map.clone(), param)?; - /// TODO pass/write the FieldsIdsMap + let document_changes = indexer.into_changes(&indexer_alloc, primary_key); + indexer::index( index_wtxn, index, - fields_ids_map.clone(), + &db_fields_ids_map, + new_fields_ids_map, &pool, - document_changes, + &document_changes, )?; // tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done"); diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 72f3daa4e..fc522994e 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -29,8 +29,8 @@ fst = "0.4.7" fxhash = "0.2.1" geoutils = "0.5.1" grenad = { version = "0.4.7", default-features = false, features = [ - "rayon", # TODO Should we keep this feature - "tempfile" + "rayon", # TODO Should we keep this feature + "tempfile", ], git = "https://github.com/meilisearch/grenad", branch = "various-improvements" } heed = { version = "0.20.3", default-features = false, features = [ "serde-json", @@ -81,7 +81,13 @@ hf-hub = { git = "https://github.com/dureuill/hf-hub.git", branch = "rust_tls", ] } tiktoken-rs = "0.5.9" liquid = "0.26.6" -rhai = { version = "1.19.0", features = ["serde", "no_module", "no_custom_syntax", "no_time", "sync"] } +rhai = { version = "1.19.0", features = [ + "serde", + "no_module", + "no_custom_syntax", + "no_time", + "sync", +] } arroy = { git = "https://github.com/meilisearch/arroy/", rev = "2386594dfb009ce08821a925ccc89fb8e30bf73d" } rand = "0.8.5" tracing = "0.1.40" @@ -89,6 +95,8 @@ ureq = { version = "2.10.0", features = ["json"] } url = "2.5.2" rayon-par-bridge = "0.1.0" hashbrown = "0.14.5" +raw-collections = { git = "https://github.com/dureuill/raw-collections.git", version = "0.1.0" } +bumpalo = "3.16.0" thread_local = "1.1.8" [dev-dependencies] diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs index 036981b65..001e2293a 100644 --- a/milli/src/documents/mod.rs +++ b/milli/src/documents/mod.rs @@ -13,8 +13,8 @@ pub use builder::DocumentsBatchBuilder; pub use enriched::{EnrichedDocument, EnrichedDocumentsBatchCursor, EnrichedDocumentsBatchReader}; use obkv::KvReader; pub use primary_key::{ - validate_document_id_value, DocumentIdExtractionError, FieldIdMapper, PrimaryKey, - DEFAULT_PRIMARY_KEY, + validate_document_id_str, validate_document_id_value, DocumentIdExtractionError, FieldIdMapper, + PrimaryKey, DEFAULT_PRIMARY_KEY, }; pub use reader::{DocumentsBatchCursor, DocumentsBatchCursorError, DocumentsBatchReader}; use serde::{Deserialize, Serialize}; @@ -96,6 +96,10 @@ impl FieldIdMapper for DocumentsBatchIndex { fn id(&self, name: &str) -> Option { self.id(name) } + + fn name(&self, id: FieldId) -> Option<&str> { + self.name(id) + } } #[derive(Debug, thiserror::Error)] diff --git a/milli/src/documents/primary_key.rs b/milli/src/documents/primary_key.rs index b6a236623..1662ed2e6 100644 --- a/milli/src/documents/primary_key.rs +++ b/milli/src/documents/primary_key.rs @@ -19,6 +19,21 @@ pub trait FieldIdMapper { /// /// `None` if the field with this name was not found. 
fn id(&self, name: &str) -> Option; + + fn name(&self, id: FieldId) -> Option<&str>; +} + +impl FieldIdMapper for &T +where + T: FieldIdMapper, +{ + fn id(&self, name: &str) -> Option { + T::id(self, name) + } + + fn name(&self, id: FieldId) -> Option<&str> { + T::name(self, id) + } } /// A type that represent the type of primary key that has been set @@ -190,7 +205,7 @@ fn starts_with(selector: &str, key: &str) -> bool { // FIXME: move to a DocumentId struct -fn validate_document_id(document_id: &str) -> Option<&str> { +pub fn validate_document_id_str(document_id: &str) -> Option<&str> { if !document_id.is_empty() && document_id.chars().all(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_')) { @@ -202,7 +217,7 @@ fn validate_document_id(document_id: &str) -> Option<&str> { pub fn validate_document_id_value(document_id: Value) -> StdResult { match document_id { - Value::String(string) => match validate_document_id(&string) { + Value::String(string) => match validate_document_id_str(&string) { Some(s) if s.len() == string.len() => Ok(string), Some(s) => Ok(s.to_string()), None => Err(UserError::InvalidDocumentId { document_id: Value::String(string) }), diff --git a/milli/src/fields_ids_map.rs b/milli/src/fields_ids_map.rs index 52e02045d..af96f6a86 100644 --- a/milli/src/fields_ids_map.rs +++ b/milli/src/fields_ids_map.rs @@ -98,6 +98,20 @@ impl crate::documents::FieldIdMapper for FieldsIdsMap { fn id(&self, name: &str) -> Option { self.id(name) } + + fn name(&self, id: FieldId) -> Option<&str> { + self.name(id) + } +} + +pub trait MutFieldIdMapper { + fn insert(&mut self, name: &str) -> Option; +} + +impl MutFieldIdMapper for FieldsIdsMap { + fn insert(&mut self, name: &str) -> Option { + self.insert(name) + } } #[cfg(test)] diff --git a/milli/src/fields_ids_map/global.rs b/milli/src/fields_ids_map/global.rs index 93908aea8..40d7f389b 100644 --- a/milli/src/fields_ids_map/global.rs +++ b/milli/src/fields_ids_map/global.rs @@ -1,6 +1,8 @@ use std::collections::BTreeMap; use std::sync::RwLock; +use super::MutFieldIdMapper; +use crate::documents::FieldIdMapper; use crate::{FieldId, FieldsIdsMap}; /// A fields ids map that can be globally updated to add fields @@ -11,11 +13,21 @@ pub struct GlobalFieldsIdsMap<'indexing> { } #[derive(Debug, Clone)] -struct LocalFieldsIdsMap { +pub struct LocalFieldsIdsMap { names_ids: BTreeMap, ids_names: BTreeMap, } +impl FieldIdMapper for LocalFieldsIdsMap { + fn id(&self, name: &str) -> Option { + self.id(name) + } + + fn name(&self, id: FieldId) -> Option<&str> { + self.name(id) + } +} + impl LocalFieldsIdsMap { fn new(global: &RwLock) -> Self { let global = global.read().unwrap(); @@ -83,4 +95,14 @@ impl<'indexing> GlobalFieldsIdsMap<'indexing> { self.local.name(id) } + + pub fn local_map(&self) -> &LocalFieldsIdsMap { + &self.local + } +} + +impl<'indexing> MutFieldIdMapper for GlobalFieldsIdsMap<'indexing> { + fn insert(&mut self, name: &str) -> Option { + self.id_or_insert(name) + } } diff --git a/milli/src/update/new/document.rs b/milli/src/update/new/document.rs new file mode 100644 index 000000000..96d0e9cca --- /dev/null +++ b/milli/src/update/new/document.rs @@ -0,0 +1,255 @@ +use std::collections::BTreeSet; + +use heed::RoTxn; +use serde_json::value::RawValue; + +use super::document_change::{Entry, Versions}; +use super::{KvReaderFieldId, KvWriterFieldId}; +use crate::documents::FieldIdMapper; +use crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME; +use crate::{DocumentId, FieldId, Index, InternalError, Result}; + +/// A view 
into a document that can represent either the current version from the DB, +/// the update data from payload or other means, or the merged updated version. +/// +/// The 'doc lifetime is meant to live sufficiently for the document to be handled by the extractors. +pub trait Document<'doc> { + /// Iterate over all **top-level** fields of the document, returning their name and raw JSON value. + /// + /// - The returned values *may* contain nested fields. + /// - The `_vectors` field is **ignored** by this method, meaning it is **not returned** by this method. + fn iter_top_level_fields(&self) -> impl Iterator>; +} + +#[derive(Clone, Copy)] +pub struct DocumentFromDb<'t, Mapper: FieldIdMapper> +where + Mapper: FieldIdMapper, +{ + fields_ids_map: &'t Mapper, + content: &'t KvReaderFieldId, +} + +impl<'t, Mapper: FieldIdMapper> Document<'t> for DocumentFromDb<'t, Mapper> { + fn iter_top_level_fields(&self) -> impl Iterator> { + let mut it = self.content.iter(); + + std::iter::from_fn(move || { + let (fid, value) = it.next()?; + + let res = (|| { + let value = + serde_json::from_slice(value).map_err(crate::InternalError::SerdeJson)?; + + let name = self.fields_ids_map.name(fid).ok_or( + InternalError::FieldIdMapMissingEntry(crate::FieldIdMapMissingEntry::FieldId { + field_id: fid, + process: "getting current document", + }), + )?; + Ok((name, value)) + })(); + + Some(res) + }) + } +} + +impl<'t, Mapper: FieldIdMapper> DocumentFromDb<'t, Mapper> { + pub fn new( + docid: DocumentId, + rtxn: &'t RoTxn, + index: &'t Index, + db_fields_ids_map: &'t Mapper, + ) -> Result> { + index.documents.get(rtxn, &docid).map_err(crate::Error::from).map(|reader| { + reader.map(|reader| Self { fields_ids_map: db_fields_ids_map, content: reader }) + }) + } + + fn field_from_fid(&self, fid: FieldId) -> Result> { + Ok(self + .content + .get(fid) + .map(|v| serde_json::from_slice(v).map_err(InternalError::SerdeJson)) + .transpose()?) 
+ } +} + +#[derive(Clone, Copy)] +pub struct DocumentFromVersions<'doc> { + versions: Versions<'doc>, +} + +impl<'doc> DocumentFromVersions<'doc> { + pub fn new(versions: Versions<'doc>) -> Self { + Self { versions } + } +} + +impl<'doc> Document<'doc> for DocumentFromVersions<'doc> { + fn iter_top_level_fields(&self) -> impl Iterator> { + match &self.versions { + Versions::Single(version) => either::Either::Left(version.iter_top_level_fields()), + Versions::Multiple(versions) => { + let mut seen_fields = BTreeSet::new(); + let mut it = versions.iter().rev().flat_map(|version| version.iter()).copied(); + either::Either::Right(std::iter::from_fn(move || loop { + let (name, value) = it.next()?; + + if seen_fields.contains(name) { + continue; + } + seen_fields.insert(name); + return Some(Ok((name, value))); + })) + } + } + } +} + +// used in document from payload +impl<'doc> Document<'doc> for &'doc [Entry<'doc>] { + fn iter_top_level_fields(&self) -> impl Iterator>> { + self.iter().copied().map(|(k, v)| Ok((k, v))) + } +} + +pub struct MergedDocument<'doc, 't, Mapper: FieldIdMapper> { + new_doc: DocumentFromVersions<'doc>, + db: Option>, +} + +impl<'doc, 't, Mapper: FieldIdMapper> MergedDocument<'doc, 't, Mapper> { + pub fn new( + new_doc: DocumentFromVersions<'doc>, + db: Option>, + ) -> Self { + Self { new_doc, db } + } + + pub fn with_db( + docid: DocumentId, + rtxn: &'t RoTxn, + index: &'t Index, + db_fields_ids_map: &'t Mapper, + new_doc: DocumentFromVersions<'doc>, + ) -> Result { + let db = DocumentFromDb::new(docid, rtxn, index, db_fields_ids_map)?; + Ok(Self { new_doc, db }) + } + + pub fn without_db(new_doc: DocumentFromVersions<'doc>) -> Self { + Self { new_doc, db: None } + } +} + +impl<'d, 'doc: 'd, 't: 'd, Mapper: FieldIdMapper> Document<'d> + for MergedDocument<'doc, 't, Mapper> +{ + fn iter_top_level_fields(&self) -> impl Iterator> { + let mut new_doc_it = self.new_doc.iter_top_level_fields(); + let mut db_it = self.db.iter().flat_map(|db| db.iter_top_level_fields()); + + std::iter::from_fn(move || { + let mut seen_fields = BTreeSet::new(); + if let Some(next) = new_doc_it.next() { + if let Ok((name, _)) = next { + seen_fields.insert(name); + } + return Some(next); + } + loop { + match db_it.next()? { + Ok((name, value)) => { + if seen_fields.contains(name) { + continue; + } + return Some(Ok((name, value))); + } + Err(err) => return Some(Err(err)), + } + } + }) + } +} + +impl<'doc, D> Document<'doc> for &D +where + D: Document<'doc>, +{ + fn iter_top_level_fields(&self) -> impl Iterator> { + D::iter_top_level_fields(self) + } +} + +/// Turn this document into an obkv, whose fields are indexed by the provided `FieldIdMapper`. +/// +/// The produced obkv is suitable for storing into the documents DB, meaning: +/// +/// - It contains the contains of `_vectors` that are not configured as an embedder +/// - It contains all the top-level fields of the document, with their raw JSON value as value. +/// +/// # Panics +/// +/// - If the document contains a top-level field that is not present in `fields_ids_map`. 
+/// +pub fn write_to_obkv<'s, 'a, 'b>( + document: &'s impl Document<'s>, + vector_document: Option<()>, + fields_ids_map: &'a impl FieldIdMapper, + mut document_buffer: &'a mut Vec, +) -> Result<&'a KvReaderFieldId> +where + 's: 'a, + 's: 'b, +{ + // will be used in 'inject_vectors + let vectors_value: Box; + + document_buffer.clear(); + let mut unordered_field_buffer = Vec::new(); + unordered_field_buffer.clear(); + + let mut writer = KvWriterFieldId::new(&mut document_buffer); + + for res in document.iter_top_level_fields() { + let (field_name, value) = res?; + let field_id = fields_ids_map.id(field_name).unwrap(); + unordered_field_buffer.push((field_id, value)); + } + + 'inject_vectors: { + let Some(vector_document) = vector_document else { break 'inject_vectors }; + + let Some(vectors_fid) = fields_ids_map.id(RESERVED_VECTORS_FIELD_NAME) else { + break 'inject_vectors; + }; + /* + let mut vectors = BTreeMap::new(); + for (name, entry) in vector_document.iter_vectors() { + if entry.has_configured_embedder { + continue; // we don't write vectors with configured embedder in documents + } + vectors.insert( + name, + serde_json::json!({ + "regenerate": entry.regenerate, + // TODO: consider optimizing the shape of embedders here to store an array of f32 rather than a JSON object + "embeddings": entry.embeddings, + }), + ); + } + + vectors_value = serde_json::value::to_raw_value(&vectors).unwrap(); + unordered_field_buffer.push((vectors_fid, &vectors_value));*/ + } + + unordered_field_buffer.sort_by_key(|(fid, _)| *fid); + for (fid, value) in unordered_field_buffer.iter() { + writer.insert(*fid, value.get().as_bytes()).unwrap(); + } + + writer.finish().unwrap(); + Ok(KvReaderFieldId::from_slice(document_buffer)) +} diff --git a/milli/src/update/new/document_change.rs b/milli/src/update/new/document_change.rs index 7be8d1958..a789b32b7 100644 --- a/milli/src/update/new/document_change.rs +++ b/milli/src/update/new/document_change.rs @@ -1,35 +1,35 @@ use heed::RoTxn; -use obkv::KvReader; +use serde_json::value::RawValue; -use crate::update::new::KvReaderFieldId; -use crate::{DocumentId, FieldId, Index, Result}; +use super::document::{DocumentFromDb, DocumentFromVersions, MergedDocument}; +use crate::documents::FieldIdMapper; +use crate::{DocumentId, Index, Result}; -pub enum DocumentChange { +pub enum DocumentChange<'doc> { Deletion(Deletion), - Update(Update), - Insertion(Insertion), + Update(Update<'doc>), + Insertion(Insertion<'doc>), } pub struct Deletion { - pub docid: DocumentId, - pub external_document_id: String, - current: Box, + docid: DocumentId, + external_document_id: String, } -pub struct Update { - pub docid: DocumentId, - pub external_document_id: String, - current: Box, - pub new: Box, +pub struct Update<'doc> { + docid: DocumentId, + external_document_id: String, + new: DocumentFromVersions<'doc>, + has_deletion: bool, } -pub struct Insertion { - pub docid: DocumentId, - pub external_document_id: String, - pub new: Box, +pub struct Insertion<'doc> { + docid: DocumentId, + external_document_id: String, + new: DocumentFromVersions<'doc>, } -impl DocumentChange { +impl<'doc> DocumentChange<'doc> { pub fn docid(&self) -> DocumentId { match &self { Self::Deletion(inner) => inner.docid(), @@ -37,15 +37,19 @@ impl DocumentChange { Self::Insertion(inner) => inner.docid(), } } + + pub fn external_docid(&self) -> &str { + match self { + DocumentChange::Deletion(deletion) => deletion.external_document_id(), + DocumentChange::Update(update) => update.external_document_id(), + 
DocumentChange::Insertion(insertion) => insertion.external_document_id(), + } + } } impl Deletion { - pub fn create( - docid: DocumentId, - external_document_id: String, - current: Box, - ) -> Self { - Self { docid, external_document_id, current } + pub fn create(docid: DocumentId, external_document_id: String) -> Self { + Self { docid, external_document_id } } pub fn docid(&self) -> DocumentId { @@ -56,21 +60,23 @@ impl Deletion { &self.external_document_id } - // TODO shouldn't we use the one in self? - pub fn current<'a>( + pub fn current<'a, Mapper: FieldIdMapper>( &self, rtxn: &'a RoTxn, index: &'a Index, - ) -> Result>> { - index.documents.get(rtxn, &self.docid).map_err(crate::Error::from) + mapper: &'a Mapper, + ) -> Result> { + Ok(DocumentFromDb::new(self.docid, rtxn, index, mapper)?.ok_or( + crate::error::UserError::UnknownInternalDocumentId { document_id: self.docid }, + )?) } } -impl Insertion { +impl<'doc> Insertion<'doc> { pub fn create( docid: DocumentId, external_document_id: String, - new: Box, + new: DocumentFromVersions<'doc>, ) -> Self { Insertion { docid, external_document_id, new } } @@ -82,20 +88,19 @@ impl Insertion { pub fn external_document_id(&self) -> &str { &self.external_document_id } - - pub fn new(&self) -> &KvReader { - self.new.as_ref() + pub fn new(&self) -> DocumentFromVersions<'doc> { + self.new } } -impl Update { +impl<'doc> Update<'doc> { pub fn create( docid: DocumentId, external_document_id: String, - current: Box, - new: Box, + new: DocumentFromVersions<'doc>, + has_deletion: bool, ) -> Self { - Update { docid, external_document_id, current, new } + Update { docid, new, external_document_id, has_deletion } } pub fn docid(&self) -> DocumentId { @@ -105,16 +110,39 @@ impl Update { pub fn external_document_id(&self) -> &str { &self.external_document_id } - - pub fn current<'a>( + pub fn current<'a, Mapper: FieldIdMapper>( &self, rtxn: &'a RoTxn, index: &'a Index, - ) -> Result>> { - index.documents.get(rtxn, &self.docid).map_err(crate::Error::from) + mapper: &'a Mapper, + ) -> Result> { + Ok(DocumentFromDb::new(self.docid, rtxn, index, mapper)?.ok_or( + crate::error::UserError::UnknownInternalDocumentId { document_id: self.docid }, + )?) 
} - pub fn new(&self) -> &KvReader { - self.new.as_ref() + pub fn updated(&self) -> DocumentFromVersions<'doc> { + self.new + } + + pub fn new<'a, Mapper: FieldIdMapper>( + &self, + rtxn: &'a RoTxn, + index: &'a Index, + mapper: &'a Mapper, + ) -> Result> { + if self.has_deletion { + Ok(MergedDocument::without_db(self.new)) + } else { + MergedDocument::with_db(self.docid, rtxn, index, mapper, self.new) + } } } + +pub type Entry<'doc> = (&'doc str, &'doc RawValue); + +#[derive(Clone, Copy)] +pub enum Versions<'doc> { + Single(&'doc [Entry<'doc>]), + Multiple(&'doc [&'doc [Entry<'doc>]]), +} diff --git a/milli/src/update/new/extract/faceted/extract_facets.rs b/milli/src/update/new/extract/faceted/extract_facets.rs index 8ca9a8b20..a3f05ce0e 100644 --- a/milli/src/update/new/extract/faceted/extract_facets.rs +++ b/milli/src/update/new/extract/faceted/extract_facets.rs @@ -2,46 +2,90 @@ use std::cell::RefCell; use std::collections::HashSet; use std::fmt::Debug; use std::fs::File; -use std::sync::Arc; +use std::ops::DerefMut as _; +use bumpalo::Bump; use grenad::{MergeFunction, Merger}; use heed::RoTxn; -use rayon::iter::{IntoParallelIterator, ParallelBridge, ParallelIterator}; +use rayon::iter::{ParallelBridge as _, ParallelIterator as _}; use serde_json::Value; -use thread_local::ThreadLocal; use super::super::cache::CboCachedSorter; use super::facet_document::extract_document_facets; use super::FacetKind; use crate::facet::value_encoding::f64_into_bytes; use crate::update::new::extract::DocidsExtractor; -use crate::update::new::parallel_iterator_ext::ParallelIteratorExt; +use crate::update::new::indexer::document_changes::{ + for_each_document_change, DocumentChangeContext, DocumentChanges, Extractor, FullySend, + IndexingContext, ThreadLocal, +}; use crate::update::new::DocumentChange; use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps}; -use crate::{ - DocumentId, Error, FieldId, GlobalFieldsIdsMap, Index, Result, MAX_FACET_VALUE_LENGTH, -}; +use crate::{DocumentId, FieldId, Index, Result, MAX_FACET_VALUE_LENGTH}; + +pub struct FacetedExtractorData<'extractor> { + attributes_to_extract: &'extractor [&'extractor str], + grenad_parameters: GrenadParameters, + max_memory: Option, +} + +impl<'extractor> Extractor<'extractor> for FacetedExtractorData<'extractor> { + type Data = FullySend>>; + + fn init_data( + &self, + _extractor_alloc: raw_collections::alloc::RefBump<'extractor>, + ) -> Result { + Ok(FullySend(RefCell::new(CboCachedSorter::new( + // TODO use a better value + 1_000_000.try_into().unwrap(), + create_sorter( + grenad::SortAlgorithm::Stable, + MergeDeladdCboRoaringBitmaps, + self.grenad_parameters.chunk_compression_type, + self.grenad_parameters.chunk_compression_level, + self.grenad_parameters.max_nb_chunks, + self.max_memory, + ), + )))) + } + + fn process( + &self, + change: DocumentChange, + context: &crate::update::new::indexer::document_changes::DocumentChangeContext, + ) -> Result<()> { + FacetedDocidsExtractor::extract_document_change( + &context, + self.attributes_to_extract, + change, + ) + } +} + pub struct FacetedDocidsExtractor; impl FacetedDocidsExtractor { fn extract_document_change( - rtxn: &RoTxn, - index: &Index, - buffer: &mut Vec, - fields_ids_map: &mut GlobalFieldsIdsMap, + context: &DocumentChangeContext< + FullySend>>, + >, attributes_to_extract: &[&str], - cached_sorter: &mut CboCachedSorter, document_change: DocumentChange, ) -> Result<()> { + let index = &context.index; + let rtxn = &context.txn; + let mut new_fields_ids_map 
= context.new_fields_ids_map.borrow_mut(); + let mut cached_sorter = context.data.0.borrow_mut(); match document_change { DocumentChange::Deletion(inner) => extract_document_facets( attributes_to_extract, - inner.current(rtxn, index)?.unwrap(), - fields_ids_map, + inner.current(rtxn, index, context.db_fields_ids_map)?, + new_fields_ids_map.deref_mut(), &mut |fid, value| { Self::facet_fn_with_options( - buffer, - cached_sorter, + &context.doc_alloc, + cached_sorter.deref_mut(), CboCachedSorter::insert_del_u32, inner.docid(), fid, @@ -52,12 +96,12 @@ impl FacetedDocidsExtractor { DocumentChange::Update(inner) => { extract_document_facets( attributes_to_extract, - inner.current(rtxn, index)?.unwrap(), - fields_ids_map, + inner.current(rtxn, index, context.db_fields_ids_map)?, + new_fields_ids_map.deref_mut(), &mut |fid, value| { Self::facet_fn_with_options( - buffer, - cached_sorter, + &context.doc_alloc, + cached_sorter.deref_mut(), CboCachedSorter::insert_del_u32, inner.docid(), fid, @@ -68,12 +112,12 @@ impl FacetedDocidsExtractor { extract_document_facets( attributes_to_extract, - inner.new(), - fields_ids_map, + inner.new(rtxn, index, context.db_fields_ids_map)?, + new_fields_ids_map.deref_mut(), &mut |fid, value| { Self::facet_fn_with_options( - buffer, - cached_sorter, + &context.doc_alloc, + cached_sorter.deref_mut(), CboCachedSorter::insert_add_u32, inner.docid(), fid, @@ -85,11 +129,11 @@ impl FacetedDocidsExtractor { DocumentChange::Insertion(inner) => extract_document_facets( attributes_to_extract, inner.new(), - fields_ids_map, + new_fields_ids_map.deref_mut(), &mut |fid, value| { Self::facet_fn_with_options( - buffer, - cached_sorter, + &context.doc_alloc, + cached_sorter.deref_mut(), CboCachedSorter::insert_add_u32, inner.docid(), fid, @@ -101,7 +145,7 @@ impl FacetedDocidsExtractor { } fn facet_fn_with_options( - buffer: &mut Vec, + doc_alloc: &Bump, cached_sorter: &mut CboCachedSorter, cache_fn: impl Fn(&mut CboCachedSorter, &[u8], u32) -> grenad::Result<(), MF::Error>, docid: DocumentId, @@ -113,9 +157,9 @@ impl FacetedDocidsExtractor { MF::Error: Debug, grenad::Error: Into, { + let mut buffer = bumpalo::collections::Vec::new_in(doc_alloc); // Exists // key: fid - buffer.clear(); buffer.push(FacetKind::Exists as u8); buffer.extend_from_slice(&fid.to_be_bytes()); cache_fn(cached_sorter, &*buffer, docid).map_err(Into::into)?; @@ -197,58 +241,38 @@ fn truncate_str(s: &str) -> &str { impl DocidsExtractor for FacetedDocidsExtractor { #[tracing::instrument(level = "trace", skip_all, target = "indexing::extract::faceted")] - fn run_extraction( - index: &Index, - fields_ids_map: &GlobalFieldsIdsMap, - indexer: GrenadParameters, - document_changes: impl IntoParallelIterator< - Item = std::result::Result>, - >, + fn run_extraction<'pl, 'fid, 'indexer, 'index, DC: DocumentChanges<'pl>>( + grenad_parameters: GrenadParameters, + document_changes: &DC, + indexing_context: IndexingContext<'fid, 'indexer, 'index>, + extractor_allocs: &mut ThreadLocal>>, ) -> Result> { - let max_memory = indexer.max_memory_by_thread(); + let max_memory = grenad_parameters.max_memory_by_thread(); + + let index = indexing_context.index; let rtxn = index.read_txn()?; let attributes_to_extract = Self::attributes_to_extract(&rtxn, index)?; let attributes_to_extract: Vec<_> = attributes_to_extract.iter().map(|s| s.as_ref()).collect(); - let thread_local = ThreadLocal::with_capacity(rayon::current_num_threads()); + let datastore = ThreadLocal::new(); { let span = tracing::trace_span!(target: 
"indexing::documents::extract", "docids_extraction"); let _entered = span.enter(); - document_changes.into_par_iter().try_arc_for_each_try_init( - || { - thread_local.get_or_try(|| { - let rtxn = index.read_txn().map_err(Error::from)?; - let cache = CboCachedSorter::new( - /// TODO use a better value - 100.try_into().unwrap(), - create_sorter( - grenad::SortAlgorithm::Stable, - MergeDeladdCboRoaringBitmaps, - indexer.chunk_compression_type, - indexer.chunk_compression_level, - indexer.max_nb_chunks, - max_memory, - ), - ); - Ok((rtxn, RefCell::new((fields_ids_map.clone(), Vec::new(), cache)))) - }) - }, - |(rtxn, rc), document_change| { - let (fields_ids_map, buffer, cached_sorter) = &mut *rc.borrow_mut(); - Self::extract_document_change( - rtxn, - index, - buffer, - fields_ids_map, - &attributes_to_extract, - cached_sorter, - document_change?, - ) - .map_err(Arc::new) - }, + + let extractor = FacetedExtractorData { + attributes_to_extract: &attributes_to_extract, + grenad_parameters, + max_memory, + }; + for_each_document_change( + document_changes, + &extractor, + indexing_context, + extractor_allocs, + &datastore, )?; } { @@ -257,11 +281,11 @@ impl DocidsExtractor for FacetedDocidsExtractor { tracing::trace_span!(target: "indexing::documents::extract", "merger_building"); let _entered = span.enter(); - let readers: Vec<_> = thread_local + let readers: Vec<_> = datastore .into_iter() .par_bridge() - .map(|(_, rc)| { - let (_, _, cached_sorter) = rc.into_inner(); + .map(|cached_sorter| { + let cached_sorter = cached_sorter.0.into_inner(); let sorter = cached_sorter.into_sorter()?; sorter.into_reader_cursors() }) diff --git a/milli/src/update/new/extract/faceted/facet_document.rs b/milli/src/update/new/extract/faceted/facet_document.rs index 4525e866f..cf8984f9c 100644 --- a/milli/src/update/new/extract/faceted/facet_document.rs +++ b/milli/src/update/new/extract/faceted/facet_document.rs @@ -1,24 +1,17 @@ use serde_json::Value; +use crate::update::new::document::Document; use crate::update::new::extract::perm_json_p; -use crate::update::new::KvReaderFieldId; use crate::{FieldId, GlobalFieldsIdsMap, InternalError, Result, UserError}; -pub fn extract_document_facets( +pub fn extract_document_facets<'doc>( attributes_to_extract: &[&str], - obkv: &KvReaderFieldId, + document: impl Document<'doc>, field_id_map: &mut GlobalFieldsIdsMap, facet_fn: &mut impl FnMut(FieldId, &Value) -> Result<()>, ) -> Result<()> { - let mut field_name = String::new(); - for (field_id, field_bytes) in obkv { - let Some(field_name) = field_id_map.name(field_id).map(|s| { - field_name.clear(); - field_name.push_str(s); - &field_name - }) else { - unreachable!("field id not found in field id map"); - }; + for res in document.iter_top_level_fields() { + let (field_name, value) = res?; let mut tokenize_field = |name: &str, value: &Value| match field_id_map.id_or_insert(name) { Some(field_id) => facet_fn(field_id, value), @@ -28,7 +21,7 @@ pub fn extract_document_facets( // if the current field is searchable or contains a searchable attribute if perm_json_p::select_field(field_name, Some(attributes_to_extract), &[]) { // parse json. - match serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)? { + match serde_json::value::to_value(value).map_err(InternalError::SerdeJson)? 
{ Value::Object(object) => perm_json_p::seek_leaf_values_in_object( &object, Some(attributes_to_extract), diff --git a/milli/src/update/new/extract/mod.rs b/milli/src/update/new/extract/mod.rs index c12634563..1c86d80af 100644 --- a/milli/src/update/new/extract/mod.rs +++ b/milli/src/update/new/extract/mod.rs @@ -3,26 +3,24 @@ mod faceted; mod lru; mod searchable; +use std::cell::RefCell; use std::fs::File; -use std::sync::Arc; +use bumpalo::Bump; pub use faceted::*; use grenad::Merger; -use rayon::iter::IntoParallelIterator; pub use searchable::*; -use super::DocumentChange; +use super::indexer::document_changes::{DocumentChanges, FullySend, IndexingContext, ThreadLocal}; use crate::update::{GrenadParameters, MergeDeladdCboRoaringBitmaps}; -use crate::{Error, GlobalFieldsIdsMap, Index, Result}; +use crate::Result; pub trait DocidsExtractor { - fn run_extraction( - index: &Index, - fields_ids_map: &GlobalFieldsIdsMap, - indexer: GrenadParameters, - document_changes: impl IntoParallelIterator< - Item = std::result::Result>, - >, + fn run_extraction<'pl, 'fid, 'indexer, 'index, DC: DocumentChanges<'pl>>( + grenad_parameters: GrenadParameters, + document_changes: &DC, + indexing_context: IndexingContext<'fid, 'indexer, 'index>, + extractor_allocs: &mut ThreadLocal>>, ) -> Result>; } diff --git a/milli/src/update/new/extract/searchable/extract_word_docids.rs b/milli/src/update/new/extract/searchable/extract_word_docids.rs index dde969614..82bb0ec86 100644 --- a/milli/src/update/new/extract/searchable/extract_word_docids.rs +++ b/milli/src/update/new/extract/searchable/extract_word_docids.rs @@ -2,17 +2,23 @@ use std::cell::RefCell; use std::collections::HashMap; use std::fs::File; use std::num::NonZero; +use std::ops::DerefMut as _; use std::sync::Arc; +use bumpalo::Bump; use grenad::{Merger, MergerBuilder}; use heed::RoTxn; use rayon::iter::IntoParallelIterator; -use thread_local::ThreadLocal; use super::tokenize_document::{tokenizer_builder, DocumentTokenizer}; use super::SearchableExtractor; +use crate::update::new::document::Document; use crate::update::new::extract::cache::CboCachedSorter; use crate::update::new::extract::perm_json_p::contained_in; +use crate::update::new::indexer::document_changes::{ + for_each_document_change, DocumentChangeContext, DocumentChanges, Extractor, FullySend, + IndexingContext, ThreadLocal, +}; use crate::update::new::parallel_iterator_ext::ParallelIteratorExt; use crate::update::new::DocumentChange; use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps}; @@ -23,7 +29,7 @@ use crate::{ const MAX_COUNTED_WORDS: usize = 30; -struct WordDocidsCachedSorters { +pub struct WordDocidsCachedSorters { word_fid_docids: CboCachedSorter, word_docids: CboCachedSorter, exact_word_docids: CboCachedSorter, @@ -301,18 +307,47 @@ impl WordDocidsMergerBuilders { } } +pub struct WordDocidsExtractorData<'extractor> { + tokenizer: &'extractor DocumentTokenizer<'extractor>, + grenad_parameters: GrenadParameters, + max_memory: Option, +} + +impl<'extractor> Extractor<'extractor> for WordDocidsExtractorData<'extractor> { + type Data = FullySend>; + + fn init_data( + &self, + _extractor_alloc: raw_collections::alloc::RefBump<'extractor>, + ) -> Result { + Ok(FullySend(RefCell::new(WordDocidsCachedSorters::new( + self.grenad_parameters, + self.max_memory, + // TODO use a better value + 200_000.try_into().unwrap(), + )))) + } + + fn process( + &self, + change: DocumentChange, + context: &crate::update::new::indexer::document_changes::DocumentChangeContext, + 
) -> Result<()> { + WordDocidsExtractors::extract_document_change(context, self.tokenizer, change) + } +} + pub struct WordDocidsExtractors; impl WordDocidsExtractors { - pub fn run_extraction( - index: &Index, - fields_ids_map: &GlobalFieldsIdsMap, - indexer: GrenadParameters, - document_changes: impl IntoParallelIterator< - Item = std::result::Result>, - >, + pub fn run_extraction<'pl, 'fid, 'indexer, 'index, DC: DocumentChanges<'pl>>( + grenad_parameters: GrenadParameters, + document_changes: &DC, + indexing_context: IndexingContext<'fid, 'indexer, 'index>, + extractor_allocs: &mut ThreadLocal>>, ) -> Result { - let max_memory = indexer.max_memory_by_thread(); + let max_memory = grenad_parameters.max_memory_by_thread(); + let index = indexing_context.index; let rtxn = index.read_txn()?; let stop_words = index.stop_words(&rtxn)?; @@ -342,38 +377,25 @@ impl WordDocidsExtractors { max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE, }; - let thread_local = ThreadLocal::with_capacity(rayon::current_num_threads()); + let datastore = ThreadLocal::new(); { let span = tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction"); let _entered = span.enter(); - document_changes.into_par_iter().try_arc_for_each_try_init( - || { - thread_local.get_or_try(|| { - let rtxn = index.read_txn().map_err(Error::from)?; - let fields_ids_map = fields_ids_map.clone(); - let cache = WordDocidsCachedSorters::new( - indexer, - max_memory, - // TODO use a better value - 200_000.try_into().unwrap(), - ); - Ok((rtxn, &document_tokenizer, RefCell::new((fields_ids_map, cache)))) - }) - }, - |(rtxn, document_tokenizer, rc), document_change| { - let (fields_ids_map, cached_sorter) = &mut *rc.borrow_mut(); - Self::extract_document_change( - rtxn, - index, - document_tokenizer, - fields_ids_map, - cached_sorter, - document_change?, - ) - .map_err(Arc::new) - }, + + let extractor = WordDocidsExtractorData { + tokenizer: &document_tokenizer, + grenad_parameters, + max_memory, + }; + + for_each_document_change( + document_changes, + &extractor, + indexing_context, + extractor_allocs, + &datastore, )?; } @@ -382,8 +404,7 @@ impl WordDocidsExtractors { tracing::trace_span!(target: "indexing::documents::extract", "merger_building"); let _entered = span.enter(); let mut builder = WordDocidsMergerBuilders::new(); - for (_, _, rc) in thread_local.into_iter() { - let (_, cache) = rc.into_inner(); + for cache in datastore.into_iter().map(|cache| cache.0.into_inner()) { builder.add_sorters(cache)?; } @@ -392,13 +413,17 @@ impl WordDocidsExtractors { } fn extract_document_change( - rtxn: &RoTxn, - index: &Index, + context: &DocumentChangeContext>>, document_tokenizer: &DocumentTokenizer, - fields_ids_map: &mut GlobalFieldsIdsMap, - cached_sorter: &mut WordDocidsCachedSorters, document_change: DocumentChange, ) -> Result<()> { + let index = &context.index; + let rtxn = &context.txn; + let mut cached_sorter = context.data.0.borrow_mut(); + let cached_sorter = cached_sorter.deref_mut(); + let mut new_fields_ids_map = context.new_fields_ids_map.borrow_mut(); + let new_fields_ids_map = new_fields_ids_map.deref_mut(); + let exact_attributes = index.exact_attributes(rtxn)?; let is_exact_attribute = |fname: &str| exact_attributes.iter().any(|attr| contained_in(fname, attr)); @@ -418,8 +443,8 @@ impl WordDocidsExtractors { .map_err(crate::Error::from) }; document_tokenizer.tokenize_document( - inner.current(rtxn, index)?.unwrap(), - fields_ids_map, + inner.current(rtxn, index, context.db_fields_ids_map)?, + 
new_fields_ids_map, &mut token_fn, )?; } @@ -437,8 +462,8 @@ impl WordDocidsExtractors { .map_err(crate::Error::from) }; document_tokenizer.tokenize_document( - inner.current(rtxn, index)?.unwrap(), - fields_ids_map, + inner.current(rtxn, index, context.db_fields_ids_map)?, + new_fields_ids_map, &mut token_fn, )?; @@ -454,7 +479,11 @@ impl WordDocidsExtractors { ) .map_err(crate::Error::from) }; - document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?; + document_tokenizer.tokenize_document( + inner.new(rtxn, index, context.db_fields_ids_map)?, + new_fields_ids_map, + &mut token_fn, + )?; } DocumentChange::Insertion(inner) => { let mut token_fn = |fname: &str, fid, pos, word: &str| { @@ -469,7 +498,11 @@ impl WordDocidsExtractors { ) .map_err(crate::Error::from) }; - document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?; + document_tokenizer.tokenize_document( + inner.new(), + new_fields_ids_map, + &mut token_fn, + )?; } } diff --git a/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs b/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs index 7d3655be8..d47ab606c 100644 --- a/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs +++ b/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs @@ -1,13 +1,17 @@ +use std::cell::RefCell; use std::collections::VecDeque; use std::rc::Rc; +use bumpalo::Bump; use heed::RoTxn; use obkv::KvReader; use super::tokenize_document::DocumentTokenizer; use super::SearchableExtractor; use crate::proximity::{index_proximity, MAX_DISTANCE}; +use crate::update::new::document::Document; use crate::update::new::extract::cache::CboCachedSorter; +use crate::update::new::indexer::document_changes::{DocumentChangeContext, FullySend}; use crate::update::new::DocumentChange; use crate::update::MergeDeladdCboRoaringBitmaps; use crate::{FieldId, GlobalFieldsIdsMap, Index, Result}; @@ -28,27 +32,39 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { // This method is reimplemented to count the number of words in the document in each field // and to store the docids of the documents that have a number of words in a given field equal to or under than MAX_COUNTED_WORDS. 
fn extract_document_change( - rtxn: &RoTxn, - index: &Index, + context: &DocumentChangeContext< + FullySend>>, + >, document_tokenizer: &DocumentTokenizer, - fields_ids_map: &mut GlobalFieldsIdsMap, - cached_sorter: &mut CboCachedSorter, document_change: DocumentChange, ) -> Result<()> { - let mut key_buffer = Vec::new(); - let mut del_word_pair_proximity = Vec::new(); - let mut add_word_pair_proximity = Vec::new(); + let doc_alloc = &context.doc_alloc; + + let index = context.index; + let rtxn = &context.txn; + + let mut key_buffer = bumpalo::collections::Vec::new_in(doc_alloc); + let mut del_word_pair_proximity = bumpalo::collections::Vec::new_in(doc_alloc); + let mut add_word_pair_proximity = bumpalo::collections::Vec::new_in(doc_alloc); + + let mut new_fields_ids_map = context.new_fields_ids_map.borrow_mut(); + let new_fields_ids_map = &mut *new_fields_ids_map; + + let mut cached_sorter = context.data.0.borrow_mut(); + let cached_sorter = &mut *cached_sorter; + + // is a vecdequeue, and will be smol, so can stay on the heap for now let mut word_positions: VecDeque<(Rc, u16)> = VecDeque::with_capacity(MAX_DISTANCE as usize); let docid = document_change.docid(); match document_change { DocumentChange::Deletion(inner) => { - let document = inner.current(rtxn, index)?.unwrap(); + let document = inner.current(rtxn, index, context.db_fields_ids_map)?; process_document_tokens( document, document_tokenizer, - fields_ids_map, + new_fields_ids_map, &mut word_positions, &mut |(w1, w2), prox| { del_word_pair_proximity.push(((w1, w2), prox)); @@ -56,21 +72,21 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { )?; } DocumentChange::Update(inner) => { - let document = inner.current(rtxn, index)?.unwrap(); + let document = inner.current(rtxn, index, context.db_fields_ids_map)?; process_document_tokens( document, document_tokenizer, - fields_ids_map, + new_fields_ids_map, &mut word_positions, &mut |(w1, w2), prox| { del_word_pair_proximity.push(((w1, w2), prox)); }, )?; - let document = inner.new(); + let document = inner.new(rtxn, index, context.db_fields_ids_map)?; process_document_tokens( document, document_tokenizer, - fields_ids_map, + new_fields_ids_map, &mut word_positions, &mut |(w1, w2), prox| { add_word_pair_proximity.push(((w1, w2), prox)); @@ -82,7 +98,7 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { process_document_tokens( document, document_tokenizer, - fields_ids_map, + new_fields_ids_map, &mut word_positions, &mut |(w1, w2), prox| { add_word_pair_proximity.push(((w1, w2), prox)); @@ -108,7 +124,12 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { } } -fn build_key<'a>(prox: u8, w1: &str, w2: &str, key_buffer: &'a mut Vec) -> &'a [u8] { +fn build_key<'a>( + prox: u8, + w1: &str, + w2: &str, + key_buffer: &'a mut bumpalo::collections::Vec, +) -> &'a [u8] { key_buffer.clear(); key_buffer.push(prox); key_buffer.extend_from_slice(w1.as_bytes()); @@ -131,8 +152,8 @@ fn word_positions_into_word_pair_proximity( Ok(()) } -fn process_document_tokens( - document: &KvReader, +fn process_document_tokens<'doc>( + document: impl Document<'doc>, document_tokenizer: &DocumentTokenizer, fields_ids_map: &mut GlobalFieldsIdsMap, word_positions: &mut VecDeque<(Rc, u16)>, diff --git a/milli/src/update/new/extract/searchable/mod.rs b/milli/src/update/new/extract/searchable/mod.rs index a261efda3..758b3b6a1 100644 --- a/milli/src/update/new/extract/searchable/mod.rs +++ b/milli/src/update/new/extract/searchable/mod.rs @@ -4,40 +4,81 @@ mod 
tokenize_document; use std::cell::RefCell; use std::fs::File; -use std::sync::Arc; +use std::marker::PhantomData; +use std::ops::DerefMut; +use bumpalo::Bump; pub use extract_word_docids::{WordDocidsExtractors, WordDocidsMergers}; pub use extract_word_pair_proximity_docids::WordPairProximityDocidsExtractor; use grenad::Merger; use heed::RoTxn; -use rayon::iter::{IntoParallelIterator, ParallelBridge, ParallelIterator}; -use thread_local::ThreadLocal; +use rayon::iter::{ParallelBridge, ParallelIterator}; use tokenize_document::{tokenizer_builder, DocumentTokenizer}; use super::cache::CboCachedSorter; use super::DocidsExtractor; -use crate::update::new::parallel_iterator_ext::ParallelIteratorExt; +use crate::update::new::indexer::document_changes::{ + for_each_document_change, DocumentChangeContext, DocumentChanges, Extractor, FullySend, + IndexingContext, ThreadLocal, +}; use crate::update::new::DocumentChange; use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps}; -use crate::{Error, GlobalFieldsIdsMap, Index, Result, MAX_POSITION_PER_ATTRIBUTE}; +use crate::{GlobalFieldsIdsMap, Index, Result, MAX_POSITION_PER_ATTRIBUTE}; -pub trait SearchableExtractor { - fn run_extraction( - index: &Index, - fields_ids_map: &GlobalFieldsIdsMap, - indexer: GrenadParameters, - document_changes: impl IntoParallelIterator< - Item = std::result::Result>, - >, +pub struct SearchableExtractorData<'extractor, EX: SearchableExtractor> { + tokenizer: &'extractor DocumentTokenizer<'extractor>, + grenad_parameters: GrenadParameters, + max_memory: Option, + _ex: PhantomData, +} + +impl<'extractor, EX: SearchableExtractor + Sync> Extractor<'extractor> + for SearchableExtractorData<'extractor, EX> +{ + type Data = FullySend>>; + + fn init_data( + &self, + _extractor_alloc: raw_collections::alloc::RefBump<'extractor>, + ) -> Result { + Ok(FullySend(RefCell::new(CboCachedSorter::new( + // TODO use a better value + 1_000_000.try_into().unwrap(), + create_sorter( + grenad::SortAlgorithm::Stable, + MergeDeladdCboRoaringBitmaps, + self.grenad_parameters.chunk_compression_type, + self.grenad_parameters.chunk_compression_level, + self.grenad_parameters.max_nb_chunks, + self.max_memory, + ), + )))) + } + + fn process( + &self, + change: DocumentChange, + context: &crate::update::new::indexer::document_changes::DocumentChangeContext, + ) -> Result<()> { + EX::extract_document_change(context, self.tokenizer, change) + } +} + +pub trait SearchableExtractor: Sized + Sync { + fn run_extraction<'pl, 'fid, 'indexer, 'index, DC: DocumentChanges<'pl>>( + grenad_parameters: GrenadParameters, + document_changes: &DC, + indexing_context: IndexingContext<'fid, 'indexer, 'index>, + extractor_allocs: &mut ThreadLocal>>, ) -> Result> { - let max_memory = indexer.max_memory_by_thread(); + let max_memory = grenad_parameters.max_memory_by_thread(); - let rtxn = index.read_txn()?; - let stop_words = index.stop_words(&rtxn)?; - let allowed_separators = index.allowed_separators(&rtxn)?; + let rtxn = indexing_context.index.read_txn()?; + let stop_words = indexing_context.index.stop_words(&rtxn)?; + let allowed_separators = indexing_context.index.allowed_separators(&rtxn)?; let allowed_separators: Option> = allowed_separators.as_ref().map(|s| s.iter().map(String::as_str).collect()); - let dictionary = index.dictionary(&rtxn)?; + let dictionary = indexing_context.index.dictionary(&rtxn)?; let dictionary: Option> = dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect()); let builder = tokenizer_builder( @@ -47,10 
+88,10 @@ pub trait SearchableExtractor { ); let tokenizer = builder.into_tokenizer(); - let attributes_to_extract = Self::attributes_to_extract(&rtxn, index)?; - let attributes_to_skip = Self::attributes_to_skip(&rtxn, index)?; + let attributes_to_extract = Self::attributes_to_extract(&rtxn, indexing_context.index)?; + let attributes_to_skip = Self::attributes_to_skip(&rtxn, indexing_context.index)?; let localized_attributes_rules = - index.localized_attributes_rules(&rtxn)?.unwrap_or_default(); + indexing_context.index.localized_attributes_rules(&rtxn)?.unwrap_or_default(); let document_tokenizer = DocumentTokenizer { tokenizer: &tokenizer, @@ -60,48 +101,26 @@ pub trait SearchableExtractor { max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE, }; - let thread_local = ThreadLocal::with_capacity(rayon::current_num_threads()); + let extractor_data: SearchableExtractorData = SearchableExtractorData { + tokenizer: &document_tokenizer, + grenad_parameters, + max_memory, + _ex: PhantomData, + }; + + let datastore = ThreadLocal::new(); { let span = tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction"); let _entered = span.enter(); - document_changes.into_par_iter().try_arc_for_each_try_init( - || { - thread_local.get_or_try(|| { - let rtxn = index.read_txn().map_err(Error::from)?; - let cache = CboCachedSorter::new( - /// TODO use a better value - 1_000_000.try_into().unwrap(), - create_sorter( - grenad::SortAlgorithm::Stable, - MergeDeladdCboRoaringBitmaps, - indexer.chunk_compression_type, - indexer.chunk_compression_level, - indexer.max_nb_chunks, - max_memory, - ), - ); - Ok(( - rtxn, - &document_tokenizer, - RefCell::new((fields_ids_map.clone(), cache)), - )) - }) - }, - |(rtxn, document_tokenizer, rc), document_change| { - let (fields_ids_map, cached_sorter) = &mut *rc.borrow_mut(); - Self::extract_document_change( - rtxn, - index, - document_tokenizer, - fields_ids_map, - cached_sorter, - document_change?, - ) - .map_err(Arc::new) - }, - )?; + for_each_document_change( + document_changes, + &extractor_data, + indexing_context, + extractor_allocs, + &datastore, + ); } { let mut builder = grenad::MergerBuilder::new(MergeDeladdCboRoaringBitmaps); @@ -109,11 +128,14 @@ pub trait SearchableExtractor { tracing::trace_span!(target: "indexing::documents::extract", "merger_building"); let _entered = span.enter(); - let readers: Vec<_> = thread_local + let readers: Vec<_> = datastore .into_iter() .par_bridge() - .map(|(_, _, rc)| { - let (_, cached_sorter) = rc.into_inner(); + .map(|cache_entry| { + let cached_sorter: FullySend< + RefCell>, + > = cache_entry; + let cached_sorter = cached_sorter.0.into_inner(); let sorter = cached_sorter.into_sorter()?; sorter.into_reader_cursors() }) @@ -122,16 +144,16 @@ pub trait SearchableExtractor { for reader in readers { builder.extend(reader?); } + Ok(builder.build()) } } fn extract_document_change( - rtxn: &RoTxn, - index: &Index, + context: &DocumentChangeContext< + FullySend>>, + >, document_tokenizer: &DocumentTokenizer, - fields_ids_map: &mut GlobalFieldsIdsMap, - cached_sorter: &mut CboCachedSorter, document_change: DocumentChange, ) -> Result<()>; @@ -142,14 +164,17 @@ pub trait SearchableExtractor { } impl DocidsExtractor for T { - fn run_extraction( - index: &Index, - fields_ids_map: &GlobalFieldsIdsMap, - indexer: GrenadParameters, - document_changes: impl IntoParallelIterator< - Item = std::result::Result>, - >, + fn run_extraction<'pl, 'fid, 'indexer, 'index, DC: DocumentChanges<'pl>>( + grenad_parameters: 
GrenadParameters, + document_changes: &DC, + indexing_context: IndexingContext<'fid, 'indexer, 'index>, + extractor_allocs: &mut ThreadLocal>>, ) -> Result> { - Self::run_extraction(index, fields_ids_map, indexer, document_changes) + Self::run_extraction( + grenad_parameters, + document_changes, + indexing_context, + extractor_allocs, + ) } } diff --git a/milli/src/update/new/extract/searchable/tokenize_document.rs b/milli/src/update/new/extract/searchable/tokenize_document.rs index fda619013..71585c8d2 100644 --- a/milli/src/update/new/extract/searchable/tokenize_document.rs +++ b/milli/src/update/new/extract/searchable/tokenize_document.rs @@ -4,6 +4,7 @@ use charabia::{SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder}; use serde_json::Value; use crate::proximity::MAX_DISTANCE; +use crate::update::new::document::Document; use crate::update::new::extract::perm_json_p::{ seek_leaf_values_in_array, seek_leaf_values_in_object, select_field, }; @@ -22,22 +23,16 @@ pub struct DocumentTokenizer<'a> { } impl<'a> DocumentTokenizer<'a> { - pub fn tokenize_document( + pub fn tokenize_document<'doc>( &self, - obkv: &KvReaderFieldId, + document: impl Document<'doc>, field_id_map: &mut GlobalFieldsIdsMap, token_fn: &mut impl FnMut(&str, FieldId, u16, &str) -> Result<()>, ) -> Result<()> { let mut field_position = HashMap::new(); - let mut field_name = String::new(); - for (field_id, field_bytes) in obkv { - let Some(field_name) = field_id_map.name(field_id).map(|s| { - field_name.clear(); - field_name.push_str(s); - &field_name - }) else { - unreachable!("field id not found in field id map"); - }; + + for entry in document.iter_top_level_fields() { + let (field_name, value) = entry?; let mut tokenize_field = |name: &str, value: &Value| { let Some(field_id) = field_id_map.id_or_insert(name) else { @@ -94,7 +89,7 @@ impl<'a> DocumentTokenizer<'a> { // if the current field is searchable or contains a searchable attribute if select_field(field_name, self.attribute_to_extract, self.attribute_to_skip) { // parse json. - match serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)? { + match serde_json::to_value(value).map_err(InternalError::SerdeJson)? 
{ Value::Object(object) => seek_leaf_values_in_object( &object, self.attribute_to_extract, @@ -174,10 +169,13 @@ pub fn tokenizer_builder<'a>( #[cfg(test)] mod test { + use bumpalo::Bump; use charabia::TokenizerBuilder; use meili_snap::snapshot; use obkv::KvReader; + use raw_collections::RawMap; use serde_json::json; + use serde_json::value::RawValue; use super::*; use crate::FieldsIdsMap; @@ -186,40 +184,25 @@ mod test { fn test_tokenize_document() { let mut fields_ids_map = FieldsIdsMap::new(); - let field_1 = json!({ - "name": "doggo", - "age": 10, - }); - - let field_2 = json!({ + let document = json!({ + "doggo": { "name": "doggo", + "age": 10,}, + "catto": { "catto": { "name": "pesti", "age": 23, } + }, + "doggo.name": ["doggo", "catto"], + "not-me": "UNSEARCHABLE", + "me-nether": {"nope": "unsearchable"} }); - let field_3 = json!(["doggo", "catto"]); - let field_4 = json!("UNSEARCHABLE"); - let field_5 = json!({"nope": "unsearchable"}); - - let mut obkv = obkv::KvWriter::memory(); - let field_1_id = fields_ids_map.insert("doggo").unwrap(); - let field_1 = serde_json::to_string(&field_1).unwrap(); - obkv.insert(field_1_id, field_1.as_bytes()).unwrap(); - let field_2_id = fields_ids_map.insert("catto").unwrap(); - let field_2 = serde_json::to_string(&field_2).unwrap(); - obkv.insert(field_2_id, field_2.as_bytes()).unwrap(); - let field_3_id = fields_ids_map.insert("doggo.name").unwrap(); - let field_3 = serde_json::to_string(&field_3).unwrap(); - obkv.insert(field_3_id, field_3.as_bytes()).unwrap(); - let field_4_id = fields_ids_map.insert("not-me").unwrap(); - let field_4 = serde_json::to_string(&field_4).unwrap(); - obkv.insert(field_4_id, field_4.as_bytes()).unwrap(); - let field_5_id = fields_ids_map.insert("me-nether").unwrap(); - let field_5 = serde_json::to_string(&field_5).unwrap(); - obkv.insert(field_5_id, field_5.as_bytes()).unwrap(); - let value = obkv.into_inner().unwrap(); - let obkv = KvReader::from_slice(value.as_slice()); + let _field_1_id = fields_ids_map.insert("doggo").unwrap(); + let _field_2_id = fields_ids_map.insert("catto").unwrap(); + let _field_3_id = fields_ids_map.insert("doggo.name").unwrap(); + let _field_4_id = fields_ids_map.insert("not-me").unwrap(); + let _field_5_id = fields_ids_map.insert("me-nether").unwrap(); let mut tb = TokenizerBuilder::default(); let document_tokenizer = DocumentTokenizer { @@ -234,11 +217,23 @@ mod test { let mut global_fields_ids_map = GlobalFieldsIdsMap::new(&fields_ids_map_lock); let mut words = std::collections::BTreeMap::new(); + + let document = document.to_string(); + + let bump = Bump::new(); + let document: &RawValue = serde_json::from_str(&document).unwrap(); + let document = RawMap::from_raw_value(document, &bump).unwrap(); + let document = document.into_bump_slice(); + document_tokenizer - .tokenize_document(obkv, &mut global_fields_ids_map, &mut |_fname, fid, pos, word| { - words.insert([fid, pos], word.to_string()); - Ok(()) - }) + .tokenize_document( + document, + &mut global_fields_ids_map, + &mut |_fname, fid, pos, word| { + words.insert([fid, pos], word.to_string()); + Ok(()) + }, + ) .unwrap(); snapshot!(format!("{:#?}", words), @r###" diff --git a/milli/src/update/new/indexer/de.rs b/milli/src/update/new/indexer/de.rs new file mode 100644 index 000000000..749588c86 --- /dev/null +++ b/milli/src/update/new/indexer/de.rs @@ -0,0 +1,163 @@ +use bumpalo::Bump; +use serde_json::value::RawValue; + +use crate::documents::{validate_document_id_str, DocumentIdExtractionError, PrimaryKey}; +use 
crate::fields_ids_map::MutFieldIdMapper; +use crate::{FieldId, UserError}; + +// visits a document to fill the top level fields of the field id map and retrieve the external document id. +pub struct DocumentVisitor<'p, 'indexer, Mapper: MutFieldIdMapper> { + fields_ids_map: &'p mut Mapper, + primary_key: &'p PrimaryKey<'p>, + indexer: &'indexer Bump, +} + +impl<'p, 'indexer, Mapper: MutFieldIdMapper> DocumentVisitor<'p, 'indexer, Mapper> { + pub fn new( + fields_ids_map: &'p mut Mapper, + primary_key: &'p PrimaryKey<'p>, + indexer: &'indexer Bump, + ) -> Self { + Self { fields_ids_map, primary_key, indexer } + } +} + +impl<'de, 'p, 'indexer: 'de, Mapper: MutFieldIdMapper> serde::de::Visitor<'de> + for DocumentVisitor<'p, 'indexer, Mapper> +{ + type Value = std::result::Result<&'de str, DocumentIdExtractionError>; + + fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(formatter, "a map") + } + + fn visit_map(mut self, mut map: A) -> std::result::Result + where + A: serde::de::MapAccess<'de>, + { + let mut docid = None; + while let Some((fid, fields_ids_map)) = + map.next_key_seed(FieldIdMapSeed(self.fields_ids_map))? + { + use serde::de::Deserializer as _; + self.fields_ids_map = fields_ids_map; + /// FIXME unwrap => too many fields + let fid = fid.unwrap(); + + match self.primary_key { + PrimaryKey::Flat { name, field_id } => { + let value: &'de RawValue = map.next_value()?; + if fid == *field_id { + let value = match value + .deserialize_any(DocumentIdVisitor(self.indexer)) + .map_err(|_err| { + DocumentIdExtractionError::InvalidDocumentId( + UserError::InvalidDocumentId { + document_id: serde_json::to_value(value).unwrap(), + }, + ) + }) { + Ok(Ok(value)) => value, + Ok(Err(err)) | Err(err) => return Ok(Err(err)), + }; + if let Some(_previous_value) = docid.replace(value) { + return Ok(Err(DocumentIdExtractionError::TooManyDocumentIds(2))); + } + } + } + PrimaryKey::Nested { name } => todo!(), + } + } + Ok(match docid { + Some(docid) => Ok(docid), + None => Err(DocumentIdExtractionError::MissingDocumentId), + }) + } +} + +struct FieldIdMapSeed<'a, Mapper: MutFieldIdMapper>(&'a mut Mapper); + +impl<'de, 'a, Mapper: MutFieldIdMapper> serde::de::DeserializeSeed<'de> + for FieldIdMapSeed<'a, Mapper> +{ + type Value = (Option, &'a mut Mapper); + + fn deserialize(self, deserializer: D) -> std::result::Result + where + D: serde::Deserializer<'de>, + { + struct FieldIdMapVisitor<'a, Mapper: MutFieldIdMapper>(&'a mut Mapper); + impl<'de, 'a, Mapper: MutFieldIdMapper> serde::de::Visitor<'de> for FieldIdMapVisitor<'a, Mapper> { + type Value = (Option, &'a mut Mapper); + + fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(formatter, "expecting a string") + } + fn visit_borrowed_str(self, v: &'de str) -> std::result::Result + where + E: serde::de::Error, + { + Ok((self.0.insert(v), self.0)) + } + + fn visit_str(self, v: &str) -> std::result::Result + where + E: serde::de::Error, + { + Ok((self.0.insert(v), self.0)) + } + } + deserializer.deserialize_str(FieldIdMapVisitor(self.0)) + } +} + +struct DocumentIdVisitor<'indexer>(&'indexer Bump); + +impl<'de, 'indexer: 'de> serde::de::Visitor<'de> for DocumentIdVisitor<'indexer> { + type Value = std::result::Result<&'de str, DocumentIdExtractionError>; + + fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(formatter, "an integer or a string") + } + + fn visit_borrowed_str(self, v: &'de str) -> std::result::Result + where + E: serde::de::Error, + 
{ + Ok(validate_document_id_str(v).ok_or_else(|| { + DocumentIdExtractionError::InvalidDocumentId(UserError::InvalidDocumentId { + document_id: serde_json::Value::String(v.to_owned()), + }) + })) + } + + fn visit_str(self, v: &str) -> std::result::Result + where + E: serde::de::Error, + { + let v = self.0.alloc_str(v); + self.visit_borrowed_str(v) + } + + fn visit_u64(self, v: u64) -> std::result::Result + where + E: serde::de::Error, + { + use std::fmt::Write as _; + + let mut out = bumpalo::collections::String::new_in(&self.0); + write!(&mut out, "{v}"); + Ok(Ok(out.into_bump_str())) + } + + fn visit_i64(self, v: i64) -> std::result::Result + where + E: serde::de::Error, + { + use std::fmt::Write as _; + + let mut out = bumpalo::collections::String::new_in(&self.0); + write!(&mut out, "{v}"); + Ok(Ok(out.into_bump_str())) + } +} diff --git a/milli/src/update/new/indexer/document_changes.rs b/milli/src/update/new/indexer/document_changes.rs new file mode 100644 index 000000000..8bab9903f --- /dev/null +++ b/milli/src/update/new/indexer/document_changes.rs @@ -0,0 +1,378 @@ +use std::cell::{Cell, RefCell}; +use std::sync::{Arc, RwLock}; + +use bumpalo::Bump; +use heed::RoTxn; +use raw_collections::alloc::RefBump; +use rayon::iter::IndexedParallelIterator; + +use super::super::document_change::DocumentChange; +use crate::update::new::parallel_iterator_ext::ParallelIteratorExt as _; +use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, Result}; + +/// A trait for types that are **not** [`Send`] only because they would then allow concurrent access to a type that is not [`Sync`]. +/// +/// The primary example of such a type is `&T`, with `T: !Sync`. +/// +/// In the authors' understanding, a type can be `!Send` for two distinct reasons: +/// +/// 1. Because it contains data that *genuinely* cannot be moved between threads, such as thread-local data. +/// 2. Because sending the type would allow concurrent access to a `!Sync` type, which is undefined behavior. +/// +/// `MostlySend` exists to be used in bounds where you need a type whose data is **not** *attached* to a thread +/// because you might access it from a different thread, but where you will never access the type **concurrently** from +/// multiple threads. +/// +/// Like [`Send`], `MostlySend` assumes properties on types that cannot be verified by the compiler, which is why implementing +/// this trait is unsafe. +/// +/// # Safety +/// +/// Implementers of this trait promises that the following properties hold on the implementing type: +/// +/// 1. Its data can be accessed from any thread and will be the same regardless of the thread accessing it. +/// 2. Any operation that can be performed on the type does not depend on the thread that executes it. +/// +/// As these properties are subtle and are not generally tracked by the Rust type system, great care should be taken before +/// implementing `MostlySend` on a type, especially a foreign type. +/// +/// - An example of a type that verifies (1) and (2) is [`std::rc::Rc`] (when `T` is `Send` and `Sync`). +/// - An example of a type that doesn't verify (1) is thread-local data. +/// - An example of a type that doesn't verify (2) is [`std::sync::MutexGuard`]: a lot of mutex implementations require that +/// a lock is returned to the operating system on the same thread that initially locked the mutex, failing to uphold this +/// invariant will cause Undefined Behavior +/// (see last § in [the nomicon](https://doc.rust-lang.org/nomicon/send-and-sync.html)). 
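As a minimal sketch of the contract described above (illustrative only; `PerThreadCache` and `demo` are made-up names, and the `MostlySend` trait plus the `ThreadLocal` wrapper defined just below in this file are assumed):

    use std::rc::Rc;

    // `Rc<str>` is `!Send`, yet its data is the same from whichever thread reads it and
    // none of its operations depend on the executing thread, so it upholds (1) and (2).
    struct PerThreadCache {
        shared: Rc<str>,
    }

    // SAFETY: values of this type are only reached through `ThreadLocal`, which never
    // hands a reference out to two threads at the same time.
    unsafe impl MostlySend for PerThreadCache {}

    fn demo(tls: &ThreadLocal<PerThreadCache>) -> usize {
        // `get_or` lazily builds the value on the calling thread; no `Sync` bound is needed.
        tls.get_or(|| PerThreadCache { shared: Rc::from("per-thread data") }).shared.len()
    }
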
+/// +/// It is **always safe** to implement this trait on a type that is `Send`, but no placeholder impl is provided due to limitations in +/// coherency. Use the [`FullySend`] wrapper in this situation. +pub unsafe trait MostlySend {} + +#[derive(Debug, Clone, Copy, Default, Eq, PartialEq, Ord, PartialOrd, Hash)] +pub struct FullySend(pub T); + +// SAFETY: a type **fully** send is always mostly send as well. +unsafe impl MostlySend for FullySend where T: Send {} + +impl FullySend { + pub fn into(self) -> T { + self.0 + } +} + +impl From for FullySend { + fn from(value: T) -> Self { + Self(value) + } +} + +#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, PartialOrd, Ord, Hash)] +struct MostlySendWrapper(T); + +impl MostlySendWrapper { + /// # Safety + /// + /// - (P1) Users of this type will never access the type concurrently from multiple threads without synchronization + unsafe fn new(t: T) -> Self { + Self(t) + } + + fn new_send(t: T) -> Self + where + T: Send, + { + Self(t) + } + + fn get(&self) -> T + where + T: Copy, + { + self.0 + } + + fn as_ref(&self) -> &T { + &self.0 + } + + fn as_mut(&mut self) -> &mut T { + &mut self.0 + } + + fn into_inner(self) -> T { + self.0 + } +} + +/// # Safety +/// +/// 1. `T` is [`MostlySend`], so by its safety contract it can be accessed by any thread and all of its operations are available +/// from any thread. +/// 2. (P1) of `MostlySendWrapper::new` forces the user to never access the value from multiple threads concurrently. +unsafe impl Send for MostlySendWrapper {} + +/// A wrapper around [`thread_local::ThreadLocal`] that accepts [`MostlySend`] `T`s. +pub struct ThreadLocal { + inner: thread_local::ThreadLocal>, + // FIXME: this should be necessary + //_no_send: PhantomData<*mut ()>, +} + +impl ThreadLocal { + pub fn new() -> Self { + Self { inner: thread_local::ThreadLocal::new() } + } + + pub fn with_capacity(capacity: usize) -> Self { + Self { inner: thread_local::ThreadLocal::with_capacity(capacity) } + } + + pub fn clear(&mut self) { + self.inner.clear() + } + + pub fn get(&self) -> Option<&T> { + self.inner.get().map(|t| t.as_ref()) + } + + pub fn get_or(&self, create: F) -> &T + where + F: FnOnce() -> T, + { + self.inner.get_or(|| unsafe { MostlySendWrapper::new(create()) }).as_ref() + } + + pub fn get_or_try(&self, create: F) -> std::result::Result<&T, E> + where + F: FnOnce() -> std::result::Result, + { + self.inner + .get_or_try(|| unsafe { Ok(MostlySendWrapper::new(create()?)) }) + .map(MostlySendWrapper::as_ref) + } + + pub fn get_or_default(&self) -> &T + where + T: Default, + { + self.inner.get_or_default().as_ref() + } + + pub fn iter_mut(&mut self) -> IterMut { + IterMut(self.inner.iter_mut()) + } +} + +impl IntoIterator for ThreadLocal { + type Item = T; + + type IntoIter = IntoIter; + + fn into_iter(self) -> Self::IntoIter { + IntoIter(self.inner.into_iter()) + } +} + +pub struct IterMut<'a, T: MostlySend>(thread_local::IterMut<'a, MostlySendWrapper>); + +impl<'a, T: MostlySend> Iterator for IterMut<'a, T> { + type Item = &'a mut T; + + fn next(&mut self) -> Option { + self.0.next().map(|t| t.as_mut()) + } +} + +pub struct IntoIter(thread_local::IntoIter>); + +impl Iterator for IntoIter { + type Item = T; + + fn next(&mut self) -> Option { + self.0.next().map(|t| t.into_inner()) + } +} + +pub struct DocumentChangeContext< + 'doc, // covariant lifetime of a single `process` call + 'extractor: 'doc, // invariant lifetime of the extractor_allocs + 'fid: 'doc, // invariant lifetime of the new_fields_ids_map + 'indexer: 'doc, 
// covariant lifetime of objects that outlive a single `process` call + T: MostlySend, +> { + /// The index we're indexing in + pub index: &'indexer Index, + /// The fields ids map as it was at the start of this indexing process. Contains at least all top-level fields from documents + /// inside of the DB. + pub db_fields_ids_map: &'indexer FieldsIdsMap, + /// A transaction providing data from the DB before all indexing operations + pub txn: RoTxn<'indexer>, + + /// Global field id map that is up to date with the current state of the indexing process. + /// + /// - Inserting a field will take a lock + /// - Retrieving a field may take a lock as well + pub new_fields_ids_map: &'doc std::cell::RefCell>, + + /// Data allocated in this allocator is cleared between each call to `process`. + pub doc_alloc: Bump, + + /// Data allocated in this allocator is not cleared between each call to `process`, unless the data spills. + pub extractor_alloc: RefBump<'extractor>, + + /// Pool of doc allocators, used to retrieve the doc allocator we provided for the documents + doc_allocs: &'doc ThreadLocal>>, + + /// Extractor-specific data + pub data: &'doc T, +} + +impl< + 'doc, // covariant lifetime of a single `process` call + 'data: 'doc, // invariant on T lifetime of the datastore + 'extractor: 'doc, // invariant lifetime of extractor_allocs + 'fid: 'doc, // invariant lifetime of fields ids map + 'indexer: 'doc, // covariant lifetime of objects that survive a `process` call + T: MostlySend, + > DocumentChangeContext<'doc, 'extractor, 'fid, 'indexer, T> +{ + pub fn new( + index: &'indexer Index, + db_fields_ids_map: &'indexer FieldsIdsMap, + new_fields_ids_map: &'fid RwLock, + extractor_allocs: &'extractor ThreadLocal>>, + doc_allocs: &'doc ThreadLocal>>, + datastore: &'data ThreadLocal, + fields_ids_map_store: &'doc ThreadLocal>>>, + init_data: F, + ) -> Result + where + F: FnOnce(RefBump<'extractor>) -> Result, + { + let doc_alloc = + doc_allocs.get_or(|| FullySend(Cell::new(Bump::with_capacity(1024 * 1024 * 1024)))); + let doc_alloc = doc_alloc.0.take(); + let fields_ids_map = fields_ids_map_store + .get_or(|| RefCell::new(GlobalFieldsIdsMap::new(&new_fields_ids_map)).into()); + + let fields_ids_map = &fields_ids_map.0; + let extractor_alloc = extractor_allocs.get_or_default(); + + let extractor_alloc = RefBump::new(extractor_alloc.0.borrow()); + + let data = datastore.get_or_try(|| init_data(RefBump::clone(&extractor_alloc)))?; + + let txn = index.read_txn()?; + Ok(DocumentChangeContext { + index, + txn, + db_fields_ids_map, + new_fields_ids_map: fields_ids_map, + doc_alloc, + extractor_alloc, + data, + doc_allocs, + }) + } +} + +/// An internal iterator (i.e. 
using `foreach`) of `DocumentChange`s +pub trait Extractor<'extractor>: Sync { + type Data: MostlySend; + + fn init_data<'doc>(&'doc self, extractor_alloc: RefBump<'extractor>) -> Result; + + fn process<'doc>( + &'doc self, + change: DocumentChange<'doc>, + context: &'doc DocumentChangeContext, + ) -> Result<()>; +} + +pub trait DocumentChanges<'pl // lifetime of the underlying payload +>: Sync { + type Item; + + fn iter(&self) -> impl IndexedParallelIterator; + + fn item_to_document_change<'doc, // lifetime of a single `process` call + T: MostlySend>( + &'doc self, + context: &'doc DocumentChangeContext, + item: Self::Item, + ) -> Result> where 'pl: 'doc // the payload must survive the process calls + ; +} + +#[derive(Clone, Copy)] +pub struct IndexingContext< + 'fid, // invariant lifetime of fields ids map + 'indexer, // covariant lifetime of objects that are borrowed during the entire indexing operation + 'index, // covariant lifetime of the index +> { + pub index: &'index Index, + pub db_fields_ids_map: &'indexer FieldsIdsMap, + pub new_fields_ids_map: &'fid RwLock, + pub doc_allocs: &'indexer ThreadLocal>>, + pub fields_ids_map_store: &'indexer ThreadLocal>>>, +} + +pub fn for_each_document_change< + 'pl, // covariant lifetime of the underlying payload + 'extractor, // invariant lifetime of extractor_alloc + 'fid, // invariant lifetime of fields ids map + 'indexer, // covariant lifetime of objects that are borrowed during the entire indexing + 'data, // invariant on EX::Data lifetime of datastore + 'index, // covariant lifetime of the index + EX, + DC: DocumentChanges<'pl>, +>( + document_changes: &DC, + extractor: &EX, + IndexingContext { + index, + db_fields_ids_map, + new_fields_ids_map, + doc_allocs, + fields_ids_map_store, + }: IndexingContext<'fid, 'indexer, 'index>, + extractor_allocs: &'extractor mut ThreadLocal>>, + datastore: &'data ThreadLocal, +) -> Result<()> +where + EX: Extractor<'extractor>, +{ + // Clean up and reuse the extractor allocs + for extractor_alloc in extractor_allocs.iter_mut() { + extractor_alloc.0.get_mut().reset(); + } + + let pi = document_changes.iter(); + pi.try_arc_for_each_try_init( + || { + DocumentChangeContext::new( + index, + db_fields_ids_map, + new_fields_ids_map, + extractor_allocs, + doc_allocs, + datastore, + fields_ids_map_store, + move |index_alloc| extractor.init_data(index_alloc), + ) + }, + |context, item| { + // Clean up and reuse the document-specific allocator + context.doc_alloc.reset(); + + let change = + document_changes.item_to_document_change(context, item).map_err(Arc::new)?; + + let res = extractor.process(change, context).map_err(Arc::new); + + // send back the doc_alloc in the pool + context.doc_allocs.get_or_default().0.set(std::mem::take(&mut context.doc_alloc)); + + res + }, + ) +} diff --git a/milli/src/update/new/indexer/document_deletion.rs b/milli/src/update/new/indexer/document_deletion.rs index 9dbc4e52d..cafc59221 100644 --- a/milli/src/update/new/indexer/document_deletion.rs +++ b/milli/src/update/new/indexer/document_deletion.rs @@ -1,14 +1,14 @@ -use std::sync::Arc; - -use rayon::iter::{IndexedParallelIterator, IntoParallelIterator}; +use bumpalo::collections::CollectIn; +use bumpalo::Bump; +use rayon::iter::{IntoParallelIterator, ParallelIterator as _}; use roaring::RoaringBitmap; -use super::DocumentChanges; +use super::document_changes::{DocumentChangeContext, DocumentChanges, MostlySend}; use crate::documents::PrimaryKey; use crate::index::db_name::EXTERNAL_DOCUMENTS_IDS; use 
crate::update::new::parallel_iterator_ext::ParallelIteratorExt as _; use crate::update::new::{Deletion, DocumentChange}; -use crate::{Error, FieldsIdsMap, Index, InternalError, Result}; +use crate::{DocumentId, InternalError, Result}; pub struct DocumentDeletion { pub to_delete: RoaringBitmap, @@ -22,38 +22,163 @@ impl DocumentDeletion { pub fn delete_documents_by_docids(&mut self, docids: RoaringBitmap) { self.to_delete |= docids; } -} -impl<'p> DocumentChanges<'p> for DocumentDeletion { - type Parameter = (&'p Index, &'p FieldsIdsMap, &'p PrimaryKey<'p>); - - fn document_changes( + pub fn into_changes<'indexer>( self, - _fields_ids_map: &mut FieldsIdsMap, - param: Self::Parameter, - ) -> Result< - impl IndexedParallelIterator>> - + Clone - + 'p, - > { - let (index, fields_ids_map, primary_key) = param; - let to_delete: Vec<_> = self.to_delete.into_iter().collect(); - Ok(to_delete.into_par_iter().try_map_try_init( - || index.read_txn().map_err(crate::Error::from), - |rtxn, docid| { - let current = index.document(rtxn, docid)?; - let external_document_id = primary_key - .document_id(current, fields_ids_map)? - .map_err(|_| InternalError::DatabaseMissingEntry { - db_name: EXTERNAL_DOCUMENTS_IDS, - key: None, - })?; - Ok(DocumentChange::Deletion(Deletion::create( - docid, - external_document_id, - current.boxed(), - ))) - }, - )) + indexer: &'indexer Bump, + primary_key: PrimaryKey<'indexer>, + ) -> DocumentDeletionChanges<'indexer> { + let to_delete: bumpalo::collections::Vec<_> = + self.to_delete.into_iter().collect_in(indexer); + + let to_delete = to_delete.into_bump_slice(); + + DocumentDeletionChanges { to_delete, primary_key } + } +} + +pub struct DocumentDeletionChanges<'indexer> { + to_delete: &'indexer [DocumentId], + primary_key: PrimaryKey<'indexer>, +} + +impl<'pl> DocumentChanges<'pl> for DocumentDeletionChanges<'pl> { + type Item = DocumentId; + + fn iter(&self) -> impl rayon::prelude::IndexedParallelIterator { + self.to_delete.into_par_iter().copied() + } + + fn item_to_document_change< + 'doc, // lifetime of a single `process` call + T: MostlySend, + >( + &'doc self, + context: &'doc DocumentChangeContext, + docid: Self::Item, + ) -> Result> + where + 'pl: 'doc, // the payload must survive the process calls + { + let current = context.index.document(&context.txn, docid)?; + let new_fields_ids_map = context.new_fields_ids_map.borrow(); + let new_fields_ids_map = new_fields_ids_map.local_map(); + let external_document_id = + self.primary_key.document_id(current, new_fields_ids_map)?.map_err(|_| { + InternalError::DatabaseMissingEntry { db_name: EXTERNAL_DOCUMENTS_IDS, key: None } + })?; + Ok(DocumentChange::Deletion(Deletion::create(docid, external_document_id))) + } +} + +// TODO: implement Allocator for Ref<'bump, Bump> + +#[cfg(test)] +mod test { + use std::cell::RefCell; + use std::marker::PhantomData; + use std::sync::RwLock; + + use bumpalo::Bump; + use raw_collections::alloc::RefBump; + + use crate::index::tests::TempIndex; + use crate::update::new::indexer::document_changes::{ + for_each_document_change, DocumentChangeContext, Extractor, IndexingContext, MostlySend, + ThreadLocal, + }; + use crate::update::new::indexer::DocumentDeletion; + use crate::update::new::DocumentChange; + use crate::DocumentId; + + #[test] + fn test_deletions() { + struct DeletionWithData<'extractor> { + deleted: RefCell< + hashbrown::HashSet< + DocumentId, + hashbrown::hash_map::DefaultHashBuilder, + RefBump<'extractor>, + >, + >, + } + + unsafe impl<'extractor> MostlySend for 
DeletionWithData<'extractor> {} + + struct TrackDeletion<'extractor>(PhantomData<&'extractor ()>); + + impl<'extractor> Extractor<'extractor> for TrackDeletion<'extractor> { + type Data = DeletionWithData<'extractor>; + + fn init_data( + &self, + extractor_alloc: raw_collections::alloc::RefBump<'extractor>, + ) -> crate::Result { + let deleted = RefCell::new(hashbrown::HashSet::new_in(extractor_alloc)); + Ok(DeletionWithData { deleted }) + } + + fn process( + &self, + change: DocumentChange, + context: &DocumentChangeContext, + ) -> crate::Result<()> { + context.data.deleted.borrow_mut().insert(change.docid()); + Ok(()) + } + } + + let mut deletions = DocumentDeletion::new(); + deletions.delete_documents_by_docids(vec![0, 2, 42].into_iter().collect()); + let indexer = Bump::new(); + + let index = TempIndex::new(); + + let rtxn = index.read_txn().unwrap(); + + let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let fields_ids_map = RwLock::new(db_fields_ids_map.clone()); + + let fields_ids_map_store = ThreadLocal::new(); + + let mut extractor_allocs = ThreadLocal::new(); + let doc_allocs = ThreadLocal::new(); + + let deletion_tracker = TrackDeletion(PhantomData); + + let changes = deletions + .into_changes(&indexer, crate::documents::PrimaryKey::Flat { name: "id", field_id: 0 }); + + let context = IndexingContext { + index: &index, + db_fields_ids_map: &db_fields_ids_map, + new_fields_ids_map: &fields_ids_map, + doc_allocs: &doc_allocs, + fields_ids_map_store: &fields_ids_map_store, + }; + + for _ in 0..3 { + let datastore = ThreadLocal::new(); + + for_each_document_change( + &changes, + &deletion_tracker, + context, + &mut extractor_allocs, + &datastore, + ) + .unwrap(); + + for (index, data) in datastore.into_iter().enumerate() { + println!("deleted by {index}: {:?}", data.deleted.borrow()); + } + for alloc in extractor_allocs.iter_mut() { + let alloc = &mut alloc.0; + alloc.get_mut().reset(); + } + } + drop(deletion_tracker); + drop(changes); + drop(rtxn); } } diff --git a/milli/src/update/new/indexer/document_operation.rs b/milli/src/update/new/indexer/document_operation.rs index 7341f4e5c..7978fc46c 100644 --- a/milli/src/update/new/indexer/document_operation.rs +++ b/milli/src/update/new/indexer/document_operation.rs @@ -1,19 +1,18 @@ -use std::borrow::Cow; -use std::collections::{BTreeMap, HashMap}; -use std::sync::Arc; - -use heed::types::Bytes; +use bumpalo::collections::CollectIn; +use bumpalo::Bump; use heed::RoTxn; use memmap2::Mmap; -use rayon::iter::{IndexedParallelIterator, IntoParallelIterator}; +use rayon::iter::IntoParallelIterator; +use serde_json::value::RawValue; use IndexDocumentsMethod as Idm; use super::super::document_change::DocumentChange; -use super::super::{CowStr, TopLevelMap}; -use super::DocumentChanges; +use super::document_changes::{DocumentChangeContext, DocumentChanges, MostlySend}; use crate::documents::{DocumentIdExtractionError, PrimaryKey}; -use crate::update::new::parallel_iterator_ext::ParallelIteratorExt as _; -use crate::update::new::{Deletion, Insertion, KvReaderFieldId, KvWriterFieldId, Update}; +use crate::update::new::document::DocumentFromVersions; +use crate::update::new::document_change::Versions; +use crate::update::new::indexer::de::DocumentVisitor; +use crate::update::new::{Deletion, Insertion, Update}; use crate::update::{AvailableIds, IndexDocumentsMethod}; use crate::{DocumentId, Error, FieldsIdsMap, Index, Result, UserError}; @@ -22,9 +21,14 @@ pub struct DocumentOperation<'pl> { index_documents_method: 
IndexDocumentsMethod, } +pub struct DocumentOperationChanges<'pl> { + docids_version_offsets: &'pl [(&'pl str, ((u32, bool), &'pl [InnerDocOp<'pl>]))], + index_documents_method: IndexDocumentsMethod, +} + pub enum Payload<'pl> { Addition(&'pl [u8]), - Deletion(Vec), + Deletion(&'pl [&'pl str]), } pub struct PayloadStats { @@ -33,7 +37,7 @@ pub struct PayloadStats { } #[derive(Clone)] -enum InnerDocOp<'pl> { +pub enum InnerDocOp<'pl> { Addition(DocumentOffset<'pl>), Deletion, } @@ -61,83 +65,89 @@ impl<'pl> DocumentOperation<'pl> { Ok(PayloadStats { bytes: payload.len() as u64, document_count }) } - pub fn delete_documents(&mut self, to_delete: Vec) { + pub fn delete_documents(&mut self, to_delete: &'pl [&'pl str]) { self.operations.push(Payload::Deletion(to_delete)) } -} -impl<'p, 'pl: 'p> DocumentChanges<'p> for DocumentOperation<'pl> { - type Parameter = (&'p Index, &'p RoTxn<'p>, &'p PrimaryKey<'p>); - - fn document_changes( + pub fn into_changes( self, - fields_ids_map: &mut FieldsIdsMap, - param: Self::Parameter, - ) -> Result< - impl IndexedParallelIterator>> - + Clone - + 'p, - > { - let (index, rtxn, primary_key) = param; + indexer: &'pl Bump, + index: &Index, + rtxn: &RoTxn, + primary_key: &PrimaryKey, + new_fields_ids_map: &mut FieldsIdsMap, + ) -> Result> { + use serde::de::Deserializer; + // will contain nodes from the intermediate hashmap + let document_changes_alloc = Bump::with_capacity(1024 * 1024 * 1024); // 1 MiB let documents_ids = index.documents_ids(rtxn)?; let mut available_docids = AvailableIds::new(&documents_ids); - let mut docids_version_offsets = HashMap::, _>::new(); + let mut docids_version_offsets = + hashbrown::HashMap::<&'pl str, _, _, _>::new_in(&document_changes_alloc); for operation in self.operations { match operation { Payload::Addition(payload) => { let mut iter = - serde_json::Deserializer::from_slice(payload).into_iter::(); + serde_json::Deserializer::from_slice(payload).into_iter::<&RawValue>(); /// TODO manage the error let mut previous_offset = 0; - while let Some(document) = iter.next().transpose().unwrap() { - // TODO Fetch all document fields to fill the fields ids map - document.0.keys().for_each(|key| { - fields_ids_map.insert(key.as_ref()); - }); + while let Some(document) = + iter.next().transpose().map_err(UserError::SerdeJson)? + { + let res = document + .deserialize_map(DocumentVisitor::new( + new_fields_ids_map, + primary_key, + indexer, + )) + .map_err(UserError::SerdeJson)?; - // TODO we must manage the TooManyDocumentIds,InvalidDocumentId - // we must manage the unwrap - let external_document_id = - match primary_key.document_id_from_top_level_map(&document)? 
{ - Ok(document_id) => Ok(document_id), - Err(DocumentIdExtractionError::InvalidDocumentId(e)) => Err(e), - Err(DocumentIdExtractionError::MissingDocumentId) => { - Err(UserError::MissingDocumentId { - primary_key: primary_key.name().to_string(), - document: document.try_into().unwrap(), - }) - } - Err(DocumentIdExtractionError::TooManyDocumentIds(_)) => { - Err(UserError::TooManyDocumentIds { - primary_key: primary_key.name().to_string(), - document: document.try_into().unwrap(), - }) - } - }?; + let external_document_id = match res { + Ok(document_id) => Ok(document_id), + Err(DocumentIdExtractionError::InvalidDocumentId(e)) => Err(e), + Err(DocumentIdExtractionError::MissingDocumentId) => { + Err(UserError::MissingDocumentId { + primary_key: primary_key.name().to_string(), + document: serde_json::from_str(document.get()).unwrap(), + }) + } + Err(DocumentIdExtractionError::TooManyDocumentIds(_)) => { + Err(UserError::TooManyDocumentIds { + primary_key: primary_key.name().to_string(), + document: serde_json::from_str(document.get()).unwrap(), + }) + } + }?; let current_offset = iter.byte_offset(); let document_operation = InnerDocOp::Addition(DocumentOffset { content: &payload[previous_offset..current_offset], }); - match docids_version_offsets.get_mut(external_document_id.as_ref()) { + match docids_version_offsets.get_mut(external_document_id) { None => { - let docid = match index + let (docid, is_new) = match index .external_documents_ids() .get(rtxn, &external_document_id)? { - Some(docid) => docid, - None => available_docids - .next() - .ok_or(Error::UserError(UserError::DocumentLimitReached))?, + Some(docid) => (docid, false), + None => ( + available_docids.next().ok_or(Error::UserError( + UserError::DocumentLimitReached, + ))?, + true, + ), }; docids_version_offsets.insert( external_document_id, - (docid, vec![document_operation]), + ( + (docid, is_new), + bumpalo::vec![in indexer; document_operation], + ), ); } Some((_, offsets)) => { @@ -163,21 +173,27 @@ impl<'p, 'pl: 'p> DocumentChanges<'p> for DocumentOperation<'pl> { } Payload::Deletion(to_delete) => { for external_document_id in to_delete { - match docids_version_offsets.get_mut(external_document_id.as_str()) { + match docids_version_offsets.get_mut(external_document_id) { None => { - let docid = match index + let (docid, is_new) = match index .external_documents_ids() - .get(rtxn, &external_document_id)? + .get(rtxn, external_document_id)? { - Some(docid) => docid, - None => available_docids - .next() - .ok_or(Error::UserError(UserError::DocumentLimitReached))?, + Some(docid) => (docid, false), + None => ( + available_docids.next().ok_or(Error::UserError( + UserError::DocumentLimitReached, + ))?, + true, + ), }; docids_version_offsets.insert( - CowStr(external_document_id.into()), - (docid, vec![InnerDocOp::Deletion]), + external_document_id, + ( + (docid, is_new), + bumpalo::vec![in indexer; InnerDocOp::Deletion], + ), ); } Some((_, offsets)) => { @@ -190,10 +206,11 @@ impl<'p, 'pl: 'p> DocumentChanges<'p> for DocumentOperation<'pl> { } } - /// TODO is it the best way to provide FieldsIdsMap to the parallel iterator? 
- let fields_ids_map = fields_ids_map.clone(); // TODO We must drain the HashMap into a Vec because rayon::hash_map::IntoIter: !Clone - let mut docids_version_offsets: Vec<_> = docids_version_offsets.drain().collect(); + let mut docids_version_offsets: bumpalo::collections::vec::Vec<_> = docids_version_offsets + .drain() + .map(|(item, (docid, v))| (item, (docid, v.into_bump_slice()))) + .collect_in(indexer); // Reorder the offsets to make sure we iterate on the file sequentially let sort_function_key = match self.index_documents_method { Idm::ReplaceDocuments => MergeDocumentForReplacement::sort_key, @@ -202,43 +219,61 @@ impl<'p, 'pl: 'p> DocumentChanges<'p> for DocumentOperation<'pl> { // And finally sort them docids_version_offsets.sort_unstable_by_key(|(_, (_, docops))| sort_function_key(docops)); + let docids_version_offsets = docids_version_offsets.into_bump_slice(); + Ok(DocumentOperationChanges { + docids_version_offsets, + index_documents_method: self.index_documents_method, + }) + } +} - Ok(docids_version_offsets.into_par_iter().try_map_try_init( - || index.read_txn().map_err(Error::from), - move |rtxn, (external_docid, (internal_docid, operations))| { - let document_merge_function = match self.index_documents_method { - Idm::ReplaceDocuments => MergeDocumentForReplacement::merge, - Idm::UpdateDocuments => MergeDocumentForUpdates::merge, - }; +impl<'pl> DocumentChanges<'pl> for DocumentOperationChanges<'pl> { + type Item = &'pl (&'pl str, ((u32, bool), &'pl [InnerDocOp<'pl>])); - document_merge_function( - rtxn, - index, - &fields_ids_map, - internal_docid, - external_docid.to_string(), // TODO do not clone - &operations, - ) - }, - )) + fn iter(&self) -> impl rayon::prelude::IndexedParallelIterator { + self.docids_version_offsets.into_par_iter() + } + + fn item_to_document_change<'doc, T: MostlySend + 'doc>( + &'doc self, + context: &'doc DocumentChangeContext, + item: Self::Item, + ) -> Result> + where + 'pl: 'doc, + { + let document_merge_function = match self.index_documents_method { + Idm::ReplaceDocuments => MergeDocumentForReplacement::merge, + Idm::UpdateDocuments => MergeDocumentForUpdates::merge, + }; + + let (external_doc, ((internal_docid, is_new), operations)) = *item; + + let change = document_merge_function( + internal_docid, + external_doc, + is_new, + &context.doc_alloc, + operations, + )?; + Ok(change) } } trait MergeChanges { - /// Wether the payloads in the list of operations are useless or not. + /// Whether the payloads in the list of operations are useless or not. const USELESS_PREVIOUS_CHANGES: bool; /// Returns a key that is used to order the payloads the right way. fn sort_key(docops: &[InnerDocOp]) -> usize; - fn merge( - rtxn: &RoTxn, - index: &Index, - fields_ids_map: &FieldsIdsMap, + fn merge<'doc>( docid: DocumentId, - external_docid: String, - operations: &[InnerDocOp], - ) -> Result; + external_docid: &'doc str, + is_new: bool, + doc_alloc: &'doc Bump, + operations: &'doc [InnerDocOp], + ) -> Result>; } struct MergeDocumentForReplacement; @@ -258,48 +293,42 @@ impl MergeChanges for MergeDocumentForReplacement { /// Returns only the most recent version of a document based on the updates from the payloads. /// /// This function is only meant to be used when doing a replacement and not an update. 
- fn merge( - rtxn: &RoTxn, - index: &Index, - fields_ids_map: &FieldsIdsMap, + fn merge<'doc>( docid: DocumentId, - external_docid: String, - operations: &[InnerDocOp], - ) -> Result { - let current = index.documents.remap_data_type::().get(rtxn, &docid)?; - let current: Option<&KvReaderFieldId> = current.map(Into::into); - + external_doc: &'doc str, + is_new: bool, + doc_alloc: &'doc Bump, + operations: &'doc [InnerDocOp], + ) -> Result> { match operations.last() { Some(InnerDocOp::Addition(DocumentOffset { content })) => { - let map: TopLevelMap = serde_json::from_slice(content).unwrap(); - let mut document_entries = Vec::new(); - for (key, v) in map.0 { - let id = fields_ids_map.id(key.as_ref()).unwrap(); - document_entries.push((id, v)); - } + let document = serde_json::from_slice(content).unwrap(); + let document = raw_collections::RawMap::from_raw_value(document, doc_alloc) + .map_err(UserError::SerdeJson)?; - document_entries.sort_unstable_by_key(|(id, _)| *id); + let document = document.into_bump_slice(); + let document = DocumentFromVersions::new(Versions::Single(document)); - let mut writer = KvWriterFieldId::memory(); - document_entries - .into_iter() - .for_each(|(id, value)| writer.insert(id, value.get()).unwrap()); - let new = writer.into_boxed(); - - match current { - Some(current) => { - let update = Update::create(docid, external_docid, current.boxed(), new); - Ok(DocumentChange::Update(update)) - } - None => { - Ok(DocumentChange::Insertion(Insertion::create(docid, external_docid, new))) - } + if is_new { + Ok(DocumentChange::Insertion(Insertion::create( + docid, + external_doc.to_owned(), + document, + ))) + } else { + Ok(DocumentChange::Update(Update::create( + docid, + external_doc.to_owned(), + document, + true, + ))) } } Some(InnerDocOp::Deletion) => { - let deletion = match current { - Some(current) => Deletion::create(docid, external_docid, current.boxed()), - None => todo!("Do that with Louis"), + let deletion = if is_new { + Deletion::create(docid, external_doc.to_owned()) + } else { + todo!("Do that with Louis") }; Ok(DocumentChange::Deletion(deletion)) } @@ -326,18 +355,13 @@ impl MergeChanges for MergeDocumentForUpdates { /// in the grenad update files and merges them to generate a new boxed obkv. /// /// This function is only meant to be used when doing an update and not a replacement. - fn merge( - rtxn: &RoTxn, - index: &Index, - fields_ids_map: &FieldsIdsMap, + fn merge<'doc>( docid: DocumentId, - external_docid: String, - operations: &[InnerDocOp], - ) -> Result { - let mut document = BTreeMap::<_, Cow<_>>::new(); - let current = index.documents.remap_data_type::().get(rtxn, &docid)?; - let current: Option<&KvReaderFieldId> = current.map(Into::into); - + external_docid: &'doc str, + is_new: bool, + doc_alloc: &'doc Bump, + operations: &'doc [InnerDocOp], + ) -> Result> { if operations.is_empty() { unreachable!("We must not have empty set of operations on a document"); } @@ -345,24 +369,20 @@ impl MergeChanges for MergeDocumentForUpdates { let last_deletion = operations.iter().rposition(|op| matches!(op, InnerDocOp::Deletion)); let operations = &operations[last_deletion.map_or(0, |i| i + 1)..]; - // If there was a deletion we must not start - // from the original document but from scratch. 
- if last_deletion.is_none() { - if let Some(current) = current { - current.into_iter().for_each(|(k, v)| { - document.insert(k, v.into()); - }); - } - } + let has_deletion = last_deletion.is_some(); if operations.is_empty() { - let deletion = match current { - Some(current) => Deletion::create(docid, external_docid, current.boxed()), - None => todo!("Do that with Louis"), + let deletion = if !is_new { + Deletion::create(docid, external_docid.to_owned()) + } else { + todo!("Do that with Louis") }; + return Ok(DocumentChange::Deletion(deletion)); } + let mut versions = bumpalo::collections::Vec::with_capacity_in(operations.len(), doc_alloc); + for operation in operations { let DocumentOffset { content } = match operation { InnerDocOp::Addition(offset) => offset, @@ -371,26 +391,35 @@ impl MergeChanges for MergeDocumentForUpdates { } }; - let map: TopLevelMap = serde_json::from_slice(content).unwrap(); - for (key, v) in map.0 { - let id = fields_ids_map.id(key.as_ref()).unwrap(); - document.insert(id, v.get().as_bytes().to_vec().into()); - } + let document = serde_json::from_slice(content).unwrap(); + let document = raw_collections::RawMap::from_raw_value(document, doc_alloc) + .map_err(UserError::SerdeJson)?; + + let document = document.into_bump_slice(); + versions.push(document); } - let mut writer = KvWriterFieldId::memory(); - document.into_iter().for_each(|(id, value)| writer.insert(id, value).unwrap()); - let new = writer.into_boxed(); + let versions = versions.into_bump_slice(); + let versions = match versions { + [single] => Versions::Single(*single), + versions => Versions::Multiple(versions), + }; - match current { - Some(current) => { - let update = Update::create(docid, external_docid, current.boxed(), new); - Ok(DocumentChange::Update(update)) - } - None => { - let insertion = Insertion::create(docid, external_docid, new); - Ok(DocumentChange::Insertion(insertion)) - } + let document = DocumentFromVersions::new(versions); + + if is_new { + Ok(DocumentChange::Insertion(Insertion::create( + docid, + external_docid.to_owned(), + document, + ))) + } else { + Ok(DocumentChange::Update(Update::create( + docid, + external_docid.to_owned(), + document, + has_deletion, + ))) } } } diff --git a/milli/src/update/new/indexer/mod.rs b/milli/src/update/new/indexer/mod.rs index f231527f6..673cd402e 100644 --- a/milli/src/update/new/indexer/mod.rs +++ b/milli/src/update/new/indexer/mod.rs @@ -1,7 +1,12 @@ +use std::cell::RefCell; use std::sync::{Arc, RwLock}; use std::thread::{self, Builder}; use big_s::S; +use bumpalo::Bump; +use document_changes::{ + for_each_document_change, DocumentChanges, Extractor, FullySend, IndexingContext, ThreadLocal, +}; pub use document_deletion::DocumentDeletion; pub use document_operation::DocumentOperation; use heed::{RoTxn, RwTxn}; @@ -11,6 +16,7 @@ use rayon::ThreadPool; pub use update_by_function::UpdateByFunction; use super::channel::*; +use super::document::write_to_obkv; use super::document_change::{Deletion, DocumentChange, Insertion, Update}; use super::extract::*; use super::merger::{merge_grenad_entries, FacetFieldIdsDelta}; @@ -18,32 +24,75 @@ use super::word_fst_builder::PrefixDelta; use super::words_prefix_docids::{ compute_word_prefix_docids, compute_word_prefix_fid_docids, compute_word_prefix_position_docids, }; -use super::{StdResult, TopLevelMap}; +use super::{extract, StdResult, TopLevelMap}; use crate::documents::{PrimaryKey, DEFAULT_PRIMARY_KEY}; use crate::facet::FacetType; use crate::update::new::channel::ExtractorSender; -use 
crate::update::settings::InnerIndexSettings; use crate::update::new::parallel_iterator_ext::ParallelIteratorExt; -use crate::{Error, FieldsIdsMap, GlobalFieldsIdsMap, Index, Result, UserError}; +use crate::update::settings::InnerIndexSettings; use crate::update::{FacetsUpdateBulk, GrenadParameters}; +use crate::{fields_ids_map, Error, FieldsIdsMap, GlobalFieldsIdsMap, Index, Result, UserError}; +mod de; +pub mod document_changes; mod document_deletion; mod document_operation; mod partial_dump; mod update_by_function; -pub trait DocumentChanges<'p> { - type Parameter: 'p; +struct DocumentExtractor<'a> { + document_sender: &'a DocumentSender<'a>, +} - fn document_changes( - self, - fields_ids_map: &mut FieldsIdsMap, - param: Self::Parameter, - ) -> Result< - impl IndexedParallelIterator>> - + Clone - + 'p, - >; +impl<'a, 'extractor> Extractor<'extractor> for DocumentExtractor<'a> { + type Data = FullySend<()>; + + fn init_data( + &self, + extractor_alloc: raw_collections::alloc::RefBump<'extractor>, + ) -> Result { + Ok(FullySend(())) + } + + fn process( + &self, + change: DocumentChange, + context: &document_changes::DocumentChangeContext, + ) -> Result<()> { + let mut document_buffer = Vec::new(); + + let new_fields_ids_map = context.new_fields_ids_map.borrow(); + let new_fields_ids_map = &*new_fields_ids_map; + let new_fields_ids_map = new_fields_ids_map.local_map(); + + let external_docid = change.external_docid().to_owned(); + + // document but we need to create a function that collects and compresses documents. + match change { + DocumentChange::Deletion(deletion) => { + let docid = deletion.docid(); + self.document_sender.delete(docid, external_docid).unwrap(); + } + /// TODO: change NONE by SOME(vector) when implemented + DocumentChange::Update(update) => { + let docid = update.docid(); + let content = + update.new(&context.txn, context.index, &context.db_fields_ids_map)?; + let content = + write_to_obkv(&content, None, new_fields_ids_map, &mut document_buffer)?; + self.document_sender.insert(docid, external_docid, content.boxed()).unwrap(); + } + DocumentChange::Insertion(insertion) => { + let docid = insertion.docid(); + let content = insertion.new(); + let content = + write_to_obkv(&content, None, new_fields_ids_map, &mut document_buffer)?; + self.document_sender.insert(docid, external_docid, content.boxed()).unwrap(); + // extracted_dictionary_sender.send(self, dictionary: &[u8]); + } + } + Ok(()) + } } /// This is the main function of this crate. @@ -51,25 +100,34 @@ pub trait DocumentChanges<'p> { /// Give it the output of the [`Indexer::document_changes`] method and it will execute it in the [`rayon::ThreadPool`]. 
/// /// TODO return stats -pub fn index( +pub fn index<'pl, 'indexer, 'index, DC>( wtxn: &mut RwTxn, - index: &Index, - fields_ids_map: FieldsIdsMap, + index: &'index Index, + db_fields_ids_map: &'indexer FieldsIdsMap, + new_fields_ids_map: FieldsIdsMap, pool: &ThreadPool, - document_changes: PI, + document_changes: &DC, ) -> Result<()> where - PI: IndexedParallelIterator>> - + Send - + Clone, + DC: DocumentChanges<'pl>, { let (merger_sender, writer_receiver) = merger_writer_channel(10_000); // This channel acts as a rendezvous point to ensure that we are one task ahead let (extractor_sender, merger_receiver) = extractors_merger_channels(4); - let fields_ids_map_lock = RwLock::new(fields_ids_map); - let global_fields_ids_map = GlobalFieldsIdsMap::new(&fields_ids_map_lock); - let global_fields_ids_map_clone = global_fields_ids_map.clone(); + let new_fields_ids_map = RwLock::new(new_fields_ids_map); + + let fields_ids_map_store = ThreadLocal::with_capacity(pool.current_num_threads()); + let mut extractor_allocs = ThreadLocal::with_capacity(pool.current_num_threads()); + let doc_allocs = ThreadLocal::with_capacity(pool.current_num_threads()); + + let indexing_context = IndexingContext { + index, + db_fields_ids_map, + new_fields_ids_map: &new_fields_ids_map, + doc_allocs: &doc_allocs, + fields_ids_map_store: &fields_ids_map_store, + }; thread::scope(|s| { let indexer_span = tracing::Span::current(); @@ -78,26 +136,12 @@ where pool.in_place_scope(|_s| { let span = tracing::trace_span!(target: "indexing::documents", parent: &indexer_span, "extract"); let _entered = span.enter(); - let document_changes = document_changes.into_par_iter(); // document but we need to create a function that collects and compresses documents. let document_sender = extractor_sender.document_sender(); - document_changes.clone().into_par_iter().try_arc_for_each::<_, Error>( - |result| { - match result? 
{ - DocumentChange::Deletion(Deletion { docid, external_document_id, ..}) => { - document_sender.delete(docid, external_document_id).unwrap(); - } - DocumentChange::Update(Update { docid, external_document_id, new, ..}) => { - document_sender.insert(docid, external_document_id, new).unwrap(); - } - DocumentChange::Insertion(Insertion { docid, external_document_id, new, ..}) => { - document_sender.insert(docid, external_document_id, new).unwrap(); - // extracted_dictionary_sender.send(self, dictionary: &[u8]); - } - } - Ok(()) - })?; + let document_extractor = DocumentExtractor { document_sender: &document_sender}; + let datastore = ThreadLocal::with_capacity(pool.current_num_threads()); + for_each_document_change(document_changes, &document_extractor, indexing_context, &mut extractor_allocs, &datastore)?; document_sender.finish().unwrap(); @@ -112,13 +156,14 @@ where let span = tracing::trace_span!(target: "indexing::documents::extract", "faceted"); let _entered = span.enter(); extract_and_send_docids::< + _, FacetedDocidsExtractor, FacetDocids, >( - index, - &global_fields_ids_map, grenad_parameters, - document_changes.clone(), + document_changes, + indexing_context, + &mut extractor_allocs, &extractor_sender, )?; } @@ -133,7 +178,7 @@ where exact_word_docids, word_position_docids, fid_word_count_docids, - } = WordDocidsExtractors::run_extraction(index, &global_fields_ids_map, grenad_parameters, document_changes.clone())?; + } = WordDocidsExtractors::run_extraction(grenad_parameters, document_changes, indexing_context, &mut extractor_allocs)?; extractor_sender.send_searchable::(word_docids).unwrap(); extractor_sender.send_searchable::(word_fid_docids).unwrap(); extractor_sender.send_searchable::(exact_word_docids).unwrap(); @@ -145,13 +190,14 @@ where let span = tracing::trace_span!(target: "indexing::documents::extract", "word_pair_proximity_docids"); let _entered = span.enter(); extract_and_send_docids::< + _, WordPairProximityDocidsExtractor, WordPairProximityDocids, >( - index, - &global_fields_ids_map, grenad_parameters, - document_changes.clone(), + document_changes, + indexing_context, + &mut extractor_allocs, &extractor_sender, )?; } @@ -180,6 +226,8 @@ where }) })?; + let global_fields_ids_map = GlobalFieldsIdsMap::new(&new_fields_ids_map); + let indexer_span = tracing::Span::current(); // TODO manage the errors correctly let merger_thread = Builder::new().name(S("indexer-merger")).spawn_scoped(s, move || { @@ -192,7 +240,7 @@ where merger_sender, &rtxn, index, - global_fields_ids_map_clone, + global_fields_ids_map, ) })?; @@ -223,7 +271,10 @@ where Ok(()) as Result<_> })?; - let fields_ids_map = fields_ids_map_lock.into_inner().unwrap(); + drop(indexing_context); + drop(fields_ids_map_store); + + let fields_ids_map = new_fields_ids_map.into_inner().unwrap(); index.put_fields_ids_map(wtxn, &fields_ids_map)?; // used to update the localized and weighted maps while sharing the update code with the settings pipeline. 
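A rough sketch of how this entry point is driven end to end for a deletion, assuming an open `Index`, a rayon `ThreadPool`, and a flat "id" primary key at field id 0; the `delete_by_docids` helper, the module paths, and the error handling are illustrative, not taken from this patch:

    use bumpalo::Bump;
    use roaring::RoaringBitmap;

    use crate::documents::PrimaryKey;
    use crate::update::new::indexer::{self, DocumentDeletion};
    use crate::{Index, Result};

    fn delete_by_docids(
        index: &Index,
        pool: &rayon::ThreadPool,
        docids: RoaringBitmap,
    ) -> Result<()> {
        let mut deletions = DocumentDeletion::new();
        deletions.delete_documents_by_docids(docids);

        // The bump allocator must outlive the change list built from it.
        let indexer_alloc = Bump::new();
        let primary_key = PrimaryKey::Flat { name: "id", field_id: 0 };
        let changes = deletions.into_changes(&indexer_alloc, primary_key);

        let mut wtxn = index.write_txn()?;
        let db_fields_ids_map = index.fields_ids_map(&wtxn)?;
        // The new fields ids map starts as a copy of the DB map and is filled in during extraction.
        indexer::index(&mut wtxn, index, &db_fields_ids_map, db_fields_ids_map.clone(), pool, &changes)?;
        wtxn.commit()?;
        Ok(())
    }
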
@@ -284,14 +335,23 @@ fn compute_facet_level_database( /// TODO: GrenadParameters::default() should be removed in favor a passed parameter /// TODO: manage the errors correctly /// TODO: we must have a single trait that also gives the extractor type -fn extract_and_send_docids( - index: &Index, - fields_ids_map: &GlobalFieldsIdsMap, - indexer: GrenadParameters, - document_changes: impl IntoParallelIterator>>, +fn extract_and_send_docids< + 'pl, + 'fid, + 'indexer, + 'index, + DC: DocumentChanges<'pl>, + E: DocidsExtractor, + D: MergerOperationType, +>( + grenad_parameters: GrenadParameters, + document_changes: &DC, + indexing_context: IndexingContext<'fid, 'indexer, 'index>, + extractor_allocs: &mut ThreadLocal>>, sender: &ExtractorSender, ) -> Result<()> { - let merger = E::run_extraction(index, fields_ids_map, indexer, document_changes)?; + let merger = + E::run_extraction(grenad_parameters, document_changes, indexing_context, extractor_allocs)?; sender.send_searchable::(merger).unwrap(); Ok(()) } diff --git a/milli/src/update/new/indexer/partial_dump.rs b/milli/src/update/new/indexer/partial_dump.rs index 08b97b931..527f5c751 100644 --- a/milli/src/update/new/indexer/partial_dump.rs +++ b/milli/src/update/new/indexer/partial_dump.rs @@ -1,13 +1,17 @@ -use std::sync::Arc; +use std::ops::DerefMut; use rayon::iter::IndexedParallelIterator; +use serde::Deserializer; +use serde_json::value::RawValue; -use super::DocumentChanges; +use super::de::DocumentVisitor; +use super::document_changes::{DocumentChangeContext, DocumentChanges, MostlySend}; use crate::documents::{DocumentIdExtractionError, PrimaryKey}; use crate::update::concurrent_available_ids::ConcurrentAvailableIds; -use crate::update::new::parallel_iterator_ext::ParallelIteratorExt; -use crate::update::new::{DocumentChange, Insertion, KvWriterFieldId}; -use crate::{all_obkv_to_json, Error, FieldsIdsMap, Object, Result, UserError}; +use crate::update::new::document::DocumentFromVersions; +use crate::update::new::document_change::Versions; +use crate::update::new::{DocumentChange, Insertion}; +use crate::{Error, InternalError, Result, UserError}; pub struct PartialDump { iter: I, @@ -17,69 +21,81 @@ impl PartialDump { pub fn new_from_jsonlines(iter: I) -> Self { PartialDump { iter } } -} -impl<'p, I> DocumentChanges<'p> for PartialDump -where - I: IndexedParallelIterator + Clone + 'p, -{ - type Parameter = (&'p FieldsIdsMap, &'p ConcurrentAvailableIds, &'p PrimaryKey<'p>); - - /// Note for future self: - /// - the field ids map must already be valid so you must have to generate it beforehand. - /// - We should probably expose another method that generates the fields ids map from an iterator of JSON objects. - /// - We recommend sending chunks of documents in this `PartialDumpIndexer` we therefore need to create a custom take_while_size method (that doesn't drop items). 
- fn document_changes( + pub fn into_changes<'index>( self, - _fields_ids_map: &mut FieldsIdsMap, - param: Self::Parameter, - ) -> Result< - impl IndexedParallelIterator>> - + Clone - + 'p, - > { - let (fields_ids_map, concurrent_available_ids, primary_key) = param; - - Ok(self.iter.try_map_try_init( - || Ok(()), - |_, object| { - let docid = match concurrent_available_ids.next() { - Some(id) => id, - None => return Err(Error::UserError(UserError::DocumentLimitReached)), - }; - - let mut writer = KvWriterFieldId::memory(); - object.iter().for_each(|(key, value)| { - let key = fields_ids_map.id(key).unwrap(); - /// TODO better error management - let value = serde_json::to_vec(&value).unwrap(); - /// TODO it is not ordered - writer.insert(key, value).unwrap(); - }); - - let document = writer.into_boxed(); - let external_docid = match primary_key.document_id(&document, fields_ids_map)? { - Ok(document_id) => Ok(document_id), - Err(DocumentIdExtractionError::InvalidDocumentId(user_error)) => { - Err(user_error) - } - Err(DocumentIdExtractionError::MissingDocumentId) => { - Err(UserError::MissingDocumentId { - primary_key: primary_key.name().to_string(), - document: all_obkv_to_json(&document, fields_ids_map)?, - }) - } - Err(DocumentIdExtractionError::TooManyDocumentIds(_)) => { - Err(UserError::TooManyDocumentIds { - primary_key: primary_key.name().to_string(), - document: all_obkv_to_json(&document, fields_ids_map)?, - }) - } - }?; - - let insertion = Insertion::create(docid, external_docid, document); - Ok(DocumentChange::Insertion(insertion)) - }, - )) + concurrent_available_ids: &'index ConcurrentAvailableIds, + primary_key: &'index PrimaryKey, + ) -> PartialDumpChanges<'index, I> { + /// Note for future self: + /// - We recommend sending chunks of documents in this `PartialDumpIndexer` we therefore need to create a custom take_while_size method (that doesn't drop items). 
+ PartialDumpChanges { iter: self.iter, concurrent_available_ids, primary_key } + } +} + +pub struct PartialDumpChanges<'doc, I> { + iter: I, + concurrent_available_ids: &'doc ConcurrentAvailableIds, + primary_key: &'doc PrimaryKey<'doc>, +} + +impl<'index, Iter> DocumentChanges<'index> for PartialDumpChanges<'index, Iter> +where + Iter: IndexedParallelIterator> + Clone + Sync + 'index, +{ + type Item = Box; + + fn iter(&self) -> impl IndexedParallelIterator { + self.iter.clone() + } + + fn item_to_document_change<'doc, T: MostlySend + 'doc>( + &'doc self, + context: &'doc DocumentChangeContext, + document: Self::Item, + ) -> Result> + where + 'index: 'doc, + { + let doc_alloc = &context.doc_alloc; + let docid = match self.concurrent_available_ids.next() { + Some(id) => id, + None => return Err(Error::UserError(UserError::DocumentLimitReached)), + }; + + let mut fields_ids_map = context.new_fields_ids_map.borrow_mut(); + let fields_ids_map = fields_ids_map.deref_mut(); + + let res = document + .deserialize_map(DocumentVisitor::new(fields_ids_map, self.primary_key, &doc_alloc)) + .map_err(UserError::SerdeJson)?; + + let external_document_id = match res { + Ok(document_id) => Ok(document_id), + Err(DocumentIdExtractionError::InvalidDocumentId(e)) => Err(e), + Err(DocumentIdExtractionError::MissingDocumentId) => { + Err(UserError::MissingDocumentId { + primary_key: self.primary_key.name().to_string(), + document: serde_json::from_str(document.get()).unwrap(), + }) + } + Err(DocumentIdExtractionError::TooManyDocumentIds(_)) => { + Err(UserError::TooManyDocumentIds { + primary_key: self.primary_key.name().to_string(), + document: serde_json::from_str(document.get()).unwrap(), + }) + } + }?; + let document = doc_alloc.alloc_str(document.get()); + let document: &RawValue = unsafe { std::mem::transmute(document) }; + + let document = raw_collections::RawMap::from_raw_value(document, doc_alloc) + .map_err(InternalError::SerdeJson)?; + + let document = document.into_bump_slice(); + let document = DocumentFromVersions::new(Versions::Single(document)); + + let insertion = Insertion::create(docid, external_document_id.to_owned(), document); + Ok(DocumentChange::Insertion(insertion)) } } diff --git a/milli/src/update/new/indexer/update_by_function.rs b/milli/src/update/new/indexer/update_by_function.rs index d6d532433..9bff15b5c 100644 --- a/milli/src/update/new/indexer/update_by_function.rs +++ b/milli/src/update/new/indexer/update_by_function.rs @@ -1,25 +1,33 @@ -use std::sync::Arc; +use rayon::iter::IntoParallelIterator; -use rayon::iter::{IndexedParallelIterator, IntoParallelIterator, ParallelIterator}; - -use super::DocumentChanges; -use crate::update::new::DocumentChange; -use crate::{Error, FieldsIdsMap, Result}; +use super::document_changes::{DocumentChangeContext, DocumentChanges}; +use crate::Result; pub struct UpdateByFunction; -impl<'p> DocumentChanges<'p> for UpdateByFunction { - type Parameter = (); - - fn document_changes( - self, - _fields_ids_map: &mut FieldsIdsMap, - _param: Self::Parameter, - ) -> Result< - impl IndexedParallelIterator>> - + Clone - + 'p, - > { - Ok((0..100).into_par_iter().map(|_| todo!())) +impl UpdateByFunction { + pub fn into_changes(self) -> UpdateByFunctionChanges { + UpdateByFunctionChanges + } +} + +pub struct UpdateByFunctionChanges; + +impl<'index> DocumentChanges<'index> for UpdateByFunctionChanges { + type Item = u32; + + fn iter(&self) -> impl rayon::prelude::IndexedParallelIterator { + (0..100).into_par_iter() + } + + fn item_to_document_change<'doc, T: 
super::document_changes::MostlySend + 'doc>( + &self, + _context: &'doc DocumentChangeContext, + _item: Self::Item, + ) -> Result> + where + 'index: 'doc, + { + todo!() } } diff --git a/milli/src/update/new/merger.rs b/milli/src/update/new/merger.rs index 9751be66c..524608801 100644 --- a/milli/src/update/new/merger.rs +++ b/milli/src/update/new/merger.rs @@ -3,10 +3,10 @@ use std::io::{self}; use bincode::ErrorKind; use grenad::Merger; +use hashbrown::HashSet; use heed::types::Bytes; use heed::{Database, RoTxn}; use roaring::RoaringBitmap; -use std::collections::HashSet; use super::channel::*; use super::extract::FacetKind; @@ -149,17 +149,8 @@ pub fn merge_grenad_entries( let current = index.documents.remap_data_type::().get(rtxn, &docid)?; let current: Option<&KvReaderFieldId> = current.map(Into::into); let change = match current { - Some(current) => DocumentChange::Update(Update::create( - docid, - external_id, - current.boxed(), - document, - )), - None => DocumentChange::Insertion(Insertion::create( - docid, - external_id, - document, - )), + Some(current) => DocumentChange::Update(todo!()), + None => DocumentChange::Insertion(todo!()), }; geo_extractor.manage_change(&mut global_fields_ids_map, &change)?; } @@ -174,12 +165,7 @@ pub fn merge_grenad_entries( sender.documents().delete(docid, external_id.clone()).unwrap(); if let Some(geo_extractor) = geo_extractor.as_mut() { - let current = index.document(rtxn, docid)?; - let change = DocumentChange::Deletion(Deletion::create( - docid, - external_id, - current.boxed(), - )); + let change = DocumentChange::Deletion(Deletion::create(docid, todo!())); geo_extractor.manage_change(&mut global_fields_ids_map, &change)?; } } diff --git a/milli/src/update/new/mod.rs b/milli/src/update/new/mod.rs index 4a83529dc..37ccc75cd 100644 --- a/milli/src/update/new/mod.rs +++ b/milli/src/update/new/mod.rs @@ -5,6 +5,7 @@ use super::del_add::DelAdd; use crate::FieldId; mod channel; +pub mod document; mod document_change; mod extract; pub mod indexer; From 6028d6ba437c0c2bec106afd36732e7a8ef4ef2c Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 10 Oct 2024 22:42:37 +0200 Subject: [PATCH 129/247] Remove somme warnings --- index-scheduler/src/batch.rs | 3 +-- milli/src/update/new/document.rs | 10 +--------- .../new/extract/faceted/extract_facets.rs | 18 +++++++----------- milli/src/update/new/extract/faceted/mod.rs | 2 +- .../extract/searchable/extract_word_docids.rs | 10 +--------- .../extract_word_pair_proximity_docids.rs | 2 -- milli/src/update/new/extract/searchable/mod.rs | 5 ++--- .../extract/searchable/tokenize_document.rs | 3 +-- milli/src/update/new/indexer/de.rs | 6 +++--- .../src/update/new/indexer/document_changes.rs | 18 +++--------------- .../update/new/indexer/document_deletion.rs | 5 +---- milli/src/update/new/indexer/mod.rs | 14 ++++++-------- milli/src/update/new/merger.rs | 10 ++++------ 13 files changed, 31 insertions(+), 75 deletions(-) diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs index 446efd0c4..2bd20b6e8 100644 --- a/index-scheduler/src/batch.rs +++ b/index-scheduler/src/batch.rs @@ -30,7 +30,6 @@ use meilisearch_types::error::Code; use meilisearch_types::heed::{RoTxn, RwTxn}; use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader}; use meilisearch_types::milli::heed::CompactionOption; -use meilisearch_types::milli::update::new::indexer::document_changes::DocumentChanges; use meilisearch_types::milli::update::new::indexer::{self, retrieve_or_guess_primary_key}; use 
meilisearch_types::milli::update::{ IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings as MilliSettings, @@ -1252,7 +1251,7 @@ impl IndexScheduler { mut tasks, } => { let started_processing_at = std::time::Instant::now(); - let mut primary_key_has_been_set = false; + let primary_key_has_been_set = false; let must_stop_processing = self.must_stop_processing.clone(); let indexer_config = self.index_mapper.indexer_config(); // TODO: at some point, for better efficiency we might want to reuse the bumpalo for successive batches. diff --git a/milli/src/update/new/document.rs b/milli/src/update/new/document.rs index 96d0e9cca..335e2c327 100644 --- a/milli/src/update/new/document.rs +++ b/milli/src/update/new/document.rs @@ -7,7 +7,7 @@ use super::document_change::{Entry, Versions}; use super::{KvReaderFieldId, KvWriterFieldId}; use crate::documents::FieldIdMapper; use crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME; -use crate::{DocumentId, FieldId, Index, InternalError, Result}; +use crate::{DocumentId, Index, InternalError, Result}; /// A view into a document that can represent either the current version from the DB, /// the update data from payload or other means, or the merged updated version. @@ -66,14 +66,6 @@ impl<'t, Mapper: FieldIdMapper> DocumentFromDb<'t, Mapper> { reader.map(|reader| Self { fields_ids_map: db_fields_ids_map, content: reader }) }) } - - fn field_from_fid(&self, fid: FieldId) -> Result> { - Ok(self - .content - .get(fid) - .map(|v| serde_json::from_slice(v).map_err(InternalError::SerdeJson)) - .transpose()?) - } } #[derive(Clone, Copy)] diff --git a/milli/src/update/new/extract/faceted/extract_facets.rs b/milli/src/update/new/extract/faceted/extract_facets.rs index a3f05ce0e..6ae1b3124 100644 --- a/milli/src/update/new/extract/faceted/extract_facets.rs +++ b/milli/src/update/new/extract/faceted/extract_facets.rs @@ -55,11 +55,7 @@ impl<'extractor> Extractor<'extractor> for FacetedExtractorData<'extractor> { change: DocumentChange, context: &crate::update::new::indexer::document_changes::DocumentChangeContext, ) -> Result<()> { - FacetedDocidsExtractor::extract_document_change( - &context, - self.attributes_to_extract, - change, - ) + FacetedDocidsExtractor::extract_document_change(context, self.attributes_to_extract, change) } } @@ -162,7 +158,7 @@ impl FacetedDocidsExtractor { // key: fid buffer.push(FacetKind::Exists as u8); buffer.extend_from_slice(&fid.to_be_bytes()); - cache_fn(cached_sorter, &*buffer, docid).map_err(Into::into)?; + cache_fn(cached_sorter, &buffer, docid).map_err(Into::into)?; match value { // Number @@ -178,7 +174,7 @@ impl FacetedDocidsExtractor { buffer.extend_from_slice(&ordered); buffer.extend_from_slice(&n.to_be_bytes()); - cache_fn(cached_sorter, &*buffer, docid).map_err(Into::into) + cache_fn(cached_sorter, &buffer, docid).map_err(Into::into) } else { Ok(()) } @@ -192,7 +188,7 @@ impl FacetedDocidsExtractor { buffer.extend_from_slice(&fid.to_be_bytes()); buffer.push(0); // level 0 buffer.extend_from_slice(truncated.as_bytes()); - cache_fn(cached_sorter, &*buffer, docid).map_err(Into::into) + cache_fn(cached_sorter, &buffer, docid).map_err(Into::into) } // Null // key: fid @@ -200,7 +196,7 @@ impl FacetedDocidsExtractor { buffer.clear(); buffer.push(FacetKind::Null as u8); buffer.extend_from_slice(&fid.to_be_bytes()); - cache_fn(cached_sorter, &*buffer, docid).map_err(Into::into) + cache_fn(cached_sorter, &buffer, docid).map_err(Into::into) } // Empty // key: fid @@ -208,13 +204,13 @@ impl FacetedDocidsExtractor { 
buffer.clear(); buffer.push(FacetKind::Empty as u8); buffer.extend_from_slice(&fid.to_be_bytes()); - cache_fn(cached_sorter, &*buffer, docid).map_err(Into::into) + cache_fn(cached_sorter, &buffer, docid).map_err(Into::into) } Value::Object(o) if o.is_empty() => { buffer.clear(); buffer.push(FacetKind::Empty as u8); buffer.extend_from_slice(&fid.to_be_bytes()); - cache_fn(cached_sorter, &*buffer, docid).map_err(Into::into) + cache_fn(cached_sorter, &buffer, docid).map_err(Into::into) } // Otherwise, do nothing /// TODO: What about Value::Bool? diff --git a/milli/src/update/new/extract/faceted/mod.rs b/milli/src/update/new/extract/faceted/mod.rs index 65e90cdf4..bfe8efd03 100644 --- a/milli/src/update/new/extract/faceted/mod.rs +++ b/milli/src/update/new/extract/faceted/mod.rs @@ -27,7 +27,7 @@ impl From for FacetKind { } impl FacetKind { - pub fn extract_from_key<'k>(key: &'k [u8]) -> (FacetKind, &'k [u8]) { + pub fn extract_from_key(key: &[u8]) -> (FacetKind, &[u8]) { debug_assert!(key.len() > 3); (FacetKind::from(key[0]), &key[1..]) } diff --git a/milli/src/update/new/extract/searchable/extract_word_docids.rs b/milli/src/update/new/extract/searchable/extract_word_docids.rs index 82bb0ec86..fd74cc8ce 100644 --- a/milli/src/update/new/extract/searchable/extract_word_docids.rs +++ b/milli/src/update/new/extract/searchable/extract_word_docids.rs @@ -3,29 +3,21 @@ use std::collections::HashMap; use std::fs::File; use std::num::NonZero; use std::ops::DerefMut as _; -use std::sync::Arc; use bumpalo::Bump; use grenad::{Merger, MergerBuilder}; use heed::RoTxn; -use rayon::iter::IntoParallelIterator; use super::tokenize_document::{tokenizer_builder, DocumentTokenizer}; -use super::SearchableExtractor; -use crate::update::new::document::Document; use crate::update::new::extract::cache::CboCachedSorter; use crate::update::new::extract::perm_json_p::contained_in; use crate::update::new::indexer::document_changes::{ for_each_document_change, DocumentChangeContext, DocumentChanges, Extractor, FullySend, IndexingContext, ThreadLocal, }; -use crate::update::new::parallel_iterator_ext::ParallelIteratorExt; use crate::update::new::DocumentChange; use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps}; -use crate::{ - bucketed_position, DocumentId, Error, FieldId, GlobalFieldsIdsMap, Index, Result, - MAX_POSITION_PER_ATTRIBUTE, -}; +use crate::{bucketed_position, DocumentId, FieldId, Index, Result, MAX_POSITION_PER_ATTRIBUTE}; const MAX_COUNTED_WORDS: usize = 30; diff --git a/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs b/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs index d47ab606c..86ede5b14 100644 --- a/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs +++ b/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs @@ -2,9 +2,7 @@ use std::cell::RefCell; use std::collections::VecDeque; use std::rc::Rc; -use bumpalo::Bump; use heed::RoTxn; -use obkv::KvReader; use super::tokenize_document::DocumentTokenizer; use super::SearchableExtractor; diff --git a/milli/src/update/new/extract/searchable/mod.rs b/milli/src/update/new/extract/searchable/mod.rs index 758b3b6a1..1edeec8b4 100644 --- a/milli/src/update/new/extract/searchable/mod.rs +++ b/milli/src/update/new/extract/searchable/mod.rs @@ -5,7 +5,6 @@ mod tokenize_document; use std::cell::RefCell; use std::fs::File; use std::marker::PhantomData; -use std::ops::DerefMut; use bumpalo::Bump; pub use 
extract_word_docids::{WordDocidsExtractors, WordDocidsMergers}; @@ -23,7 +22,7 @@ use crate::update::new::indexer::document_changes::{ }; use crate::update::new::DocumentChange; use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps}; -use crate::{GlobalFieldsIdsMap, Index, Result, MAX_POSITION_PER_ATTRIBUTE}; +use crate::{Index, Result, MAX_POSITION_PER_ATTRIBUTE}; pub struct SearchableExtractorData<'extractor, EX: SearchableExtractor> { tokenizer: &'extractor DocumentTokenizer<'extractor>, @@ -120,7 +119,7 @@ pub trait SearchableExtractor: Sized + Sync { indexing_context, extractor_allocs, &datastore, - ); + )?; } { let mut builder = grenad::MergerBuilder::new(MergeDeladdCboRoaringBitmaps); diff --git a/milli/src/update/new/extract/searchable/tokenize_document.rs b/milli/src/update/new/extract/searchable/tokenize_document.rs index 71585c8d2..b8fd24f1b 100644 --- a/milli/src/update/new/extract/searchable/tokenize_document.rs +++ b/milli/src/update/new/extract/searchable/tokenize_document.rs @@ -8,7 +8,6 @@ use crate::update::new::document::Document; use crate::update::new::extract::perm_json_p::{ seek_leaf_values_in_array, seek_leaf_values_in_object, select_field, }; -use crate::update::new::KvReaderFieldId; use crate::{ FieldId, GlobalFieldsIdsMap, InternalError, LocalizedAttributesRule, Result, UserError, MAX_WORD_LENGTH, @@ -172,7 +171,7 @@ mod test { use bumpalo::Bump; use charabia::TokenizerBuilder; use meili_snap::snapshot; - use obkv::KvReader; + use raw_collections::RawMap; use serde_json::json; use serde_json::value::RawValue; diff --git a/milli/src/update/new/indexer/de.rs b/milli/src/update/new/indexer/de.rs index 749588c86..7976433b9 100644 --- a/milli/src/update/new/indexer/de.rs +++ b/milli/src/update/new/indexer/de.rs @@ -45,7 +45,7 @@ impl<'de, 'p, 'indexer: 'de, Mapper: MutFieldIdMapper> serde::de::Visitor<'de> let fid = fid.unwrap(); match self.primary_key { - PrimaryKey::Flat { name, field_id } => { + PrimaryKey::Flat { name: _, field_id } => { let value: &'de RawValue = map.next_value()?; if fid == *field_id { let value = match value @@ -145,8 +145,8 @@ impl<'de, 'indexer: 'de> serde::de::Visitor<'de> for DocumentIdVisitor<'indexer> { use std::fmt::Write as _; - let mut out = bumpalo::collections::String::new_in(&self.0); - write!(&mut out, "{v}"); + let mut out = bumpalo::collections::String::new_in(self.0); + write!(&mut out, "{v}").unwrap(); Ok(Ok(out.into_bump_str())) } diff --git a/milli/src/update/new/indexer/document_changes.rs b/milli/src/update/new/indexer/document_changes.rs index 8bab9903f..1dd0832f5 100644 --- a/milli/src/update/new/indexer/document_changes.rs +++ b/milli/src/update/new/indexer/document_changes.rs @@ -76,20 +76,6 @@ impl MostlySendWrapper { Self(t) } - fn new_send(t: T) -> Self - where - T: Send, - { - Self(t) - } - - fn get(&self) -> T - where - T: Copy, - { - self.0 - } - fn as_ref(&self) -> &T { &self.0 } @@ -111,6 +97,7 @@ impl MostlySendWrapper { unsafe impl Send for MostlySendWrapper {} /// A wrapper around [`thread_local::ThreadLocal`] that accepts [`MostlySend`] `T`s. 
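// A minimal, standalone sketch of the pattern the ThreadLocal/MostlySend machinery in this
// hunk relies on: values that are not `Send` (they hold `RefCell`/`Rc`) are confined to the
// worker thread that created them, so a wrapper may assert `Send` in order to store them in a
// `Send`-requiring registry. Everything below is illustrative and simplified, not the crate's
// real items.
use std::cell::RefCell;

/// Promise that the value is only ever accessed from the thread that created it.
unsafe trait MostlySend {}
unsafe impl MostlySend for RefCell<Vec<u32>> {}

struct MostlySendWrapper<T: MostlySend>(T);

// SAFETY: sound only because each wrapped value stays on a single worker thread and is merely
// collected (moved, never aliased) when the thread-local registry is drained.
unsafe impl<T: MostlySend> Send for MostlySendWrapper<T> {}

fn main() {
    let local = MostlySendWrapper(RefCell::new(vec![1, 2, 3]));
    local.0.borrow_mut().push(4);
    assert_eq!(local.0.borrow().len(), 4);
}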
+#[derive(Default)] pub struct ThreadLocal { inner: thread_local::ThreadLocal>, // FIXME: this should be necessary @@ -235,6 +222,7 @@ impl< T: MostlySend, > DocumentChangeContext<'doc, 'extractor, 'fid, 'indexer, T> { + #[allow(clippy::too_many_arguments)] pub fn new( index: &'indexer Index, db_fields_ids_map: &'indexer FieldsIdsMap, @@ -252,7 +240,7 @@ impl< doc_allocs.get_or(|| FullySend(Cell::new(Bump::with_capacity(1024 * 1024 * 1024)))); let doc_alloc = doc_alloc.0.take(); let fields_ids_map = fields_ids_map_store - .get_or(|| RefCell::new(GlobalFieldsIdsMap::new(&new_fields_ids_map)).into()); + .get_or(|| RefCell::new(GlobalFieldsIdsMap::new(new_fields_ids_map)).into()); let fields_ids_map = &fields_ids_map.0; let extractor_alloc = extractor_allocs.get_or_default(); diff --git a/milli/src/update/new/indexer/document_deletion.rs b/milli/src/update/new/indexer/document_deletion.rs index cafc59221..00fe6baee 100644 --- a/milli/src/update/new/indexer/document_deletion.rs +++ b/milli/src/update/new/indexer/document_deletion.rs @@ -6,10 +6,10 @@ use roaring::RoaringBitmap; use super::document_changes::{DocumentChangeContext, DocumentChanges, MostlySend}; use crate::documents::PrimaryKey; use crate::index::db_name::EXTERNAL_DOCUMENTS_IDS; -use crate::update::new::parallel_iterator_ext::ParallelIteratorExt as _; use crate::update::new::{Deletion, DocumentChange}; use crate::{DocumentId, InternalError, Result}; +#[derive(Default)] pub struct DocumentDeletion { pub to_delete: RoaringBitmap, } @@ -177,8 +177,5 @@ mod test { alloc.get_mut().reset(); } } - drop(deletion_tracker); - drop(changes); - drop(rtxn); } } diff --git a/milli/src/update/new/indexer/mod.rs b/milli/src/update/new/indexer/mod.rs index 673cd402e..4592feb43 100644 --- a/milli/src/update/new/indexer/mod.rs +++ b/milli/src/update/new/indexer/mod.rs @@ -1,5 +1,5 @@ use std::cell::RefCell; -use std::sync::{Arc, RwLock}; +use std::sync::RwLock; use std::thread::{self, Builder}; use big_s::S; @@ -11,27 +11,25 @@ pub use document_deletion::DocumentDeletion; pub use document_operation::DocumentOperation; use heed::{RoTxn, RwTxn}; pub use partial_dump::PartialDump; -use rayon::iter::{IndexedParallelIterator, IntoParallelIterator}; use rayon::ThreadPool; pub use update_by_function::UpdateByFunction; use super::channel::*; use super::document::write_to_obkv; -use super::document_change::{Deletion, DocumentChange, Insertion, Update}; +use super::document_change::DocumentChange; use super::extract::*; use super::merger::{merge_grenad_entries, FacetFieldIdsDelta}; use super::word_fst_builder::PrefixDelta; use super::words_prefix_docids::{ compute_word_prefix_docids, compute_word_prefix_fid_docids, compute_word_prefix_position_docids, }; -use super::{extract, StdResult, TopLevelMap}; +use super::{StdResult, TopLevelMap}; use crate::documents::{PrimaryKey, DEFAULT_PRIMARY_KEY}; use crate::facet::FacetType; use crate::update::new::channel::ExtractorSender; -use crate::update::new::parallel_iterator_ext::ParallelIteratorExt; use crate::update::settings::InnerIndexSettings; use crate::update::{FacetsUpdateBulk, GrenadParameters}; -use crate::{fields_ids_map, Error, FieldsIdsMap, GlobalFieldsIdsMap, Index, Result, UserError}; +use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, Result, UserError}; mod de; pub mod document_changes; @@ -49,7 +47,7 @@ impl<'a, 'extractor> Extractor<'extractor> for DocumentExtractor<'a> { fn init_data( &self, - extractor_alloc: raw_collections::alloc::RefBump<'extractor>, + _extractor_alloc: 
raw_collections::alloc::RefBump<'extractor>, ) -> Result { Ok(FullySend(())) } @@ -271,7 +269,7 @@ where Ok(()) as Result<_> })?; - drop(indexing_context); + // required to into_inner the new_fields_ids_map drop(fields_ids_map_store); let fields_ids_map = new_fields_ids_map.into_inner().unwrap(); diff --git a/milli/src/update/new/merger.rs b/milli/src/update/new/merger.rs index 524608801..80556ced9 100644 --- a/milli/src/update/new/merger.rs +++ b/milli/src/update/new/merger.rs @@ -11,14 +11,12 @@ use roaring::RoaringBitmap; use super::channel::*; use super::extract::FacetKind; use super::word_fst_builder::{PrefixData, PrefixDelta, PrefixSettings}; -use super::{Deletion, DocumentChange, Insertion, KvReaderDelAdd, KvReaderFieldId, Update}; +use super::{Deletion, DocumentChange, KvReaderDelAdd, KvReaderFieldId}; use crate::update::del_add::DelAdd; use crate::update::new::channel::MergerOperation; use crate::update::new::word_fst_builder::WordFstBuilder; use crate::update::MergeDeladdCboRoaringBitmaps; -use crate::{ - CboRoaringBitmapCodec, Error, FieldId, GeoPoint, GlobalFieldsIdsMap, Index, Prefix, Result, -}; +use crate::{CboRoaringBitmapCodec, Error, FieldId, GeoPoint, GlobalFieldsIdsMap, Index, Result}; /// TODO We must return some infos/stats #[tracing::instrument(level = "trace", skip_all, target = "indexing::documents", name = "merge")] @@ -27,7 +25,7 @@ pub fn merge_grenad_entries( sender: MergerSender, rtxn: &RoTxn, index: &Index, - mut global_fields_ids_map: GlobalFieldsIdsMap<'_>, + global_fields_ids_map: GlobalFieldsIdsMap<'_>, ) -> Result { let mut buffer: Vec = Vec::new(); let mut documents_ids = index.documents_ids(rtxn)?; @@ -386,7 +384,7 @@ impl FacetFieldIdsDelta { } } - fn extract_key_data<'a>(&self, key: &'a [u8]) -> (FacetKind, FieldId) { + fn extract_key_data(&self, key: &[u8]) -> (FacetKind, FieldId) { let facet_kind = FacetKind::from(key[0]); let field_id = FieldId::from_be_bytes([key[1], key[2]]); (facet_kind, field_id) From 8371819114b6384c50977bc366eb69ce24a528f5 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 14 Oct 2024 10:58:37 +0200 Subject: [PATCH 130/247] Some clippy related fixes --- milli/src/update/new/indexer/document_changes.rs | 2 +- milli/src/update/new/indexer/document_operation.rs | 4 ++-- milli/src/update/new/indexer/partial_dump.rs | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/milli/src/update/new/indexer/document_changes.rs b/milli/src/update/new/indexer/document_changes.rs index 1dd0832f5..2967311df 100644 --- a/milli/src/update/new/indexer/document_changes.rs +++ b/milli/src/update/new/indexer/document_changes.rs @@ -278,7 +278,7 @@ pub trait Extractor<'extractor>: Sync { pub trait DocumentChanges<'pl // lifetime of the underlying payload >: Sync { - type Item; + type Item: Send; fn iter(&self) -> impl IndexedParallelIterator; diff --git a/milli/src/update/new/indexer/document_operation.rs b/milli/src/update/new/indexer/document_operation.rs index 7978fc46c..3fb592574 100644 --- a/milli/src/update/new/indexer/document_operation.rs +++ b/milli/src/update/new/indexer/document_operation.rs @@ -131,7 +131,7 @@ impl<'pl> DocumentOperation<'pl> { None => { let (docid, is_new) = match index .external_documents_ids() - .get(rtxn, &external_document_id)? + .get(rtxn, external_document_id)? 
{ Some(docid) => (docid, false), None => ( @@ -401,7 +401,7 @@ impl MergeChanges for MergeDocumentForUpdates { let versions = versions.into_bump_slice(); let versions = match versions { - [single] => Versions::Single(*single), + [single] => Versions::Single(single), versions => Versions::Multiple(versions), }; diff --git a/milli/src/update/new/indexer/partial_dump.rs b/milli/src/update/new/indexer/partial_dump.rs index 527f5c751..66a7d7fc8 100644 --- a/milli/src/update/new/indexer/partial_dump.rs +++ b/milli/src/update/new/indexer/partial_dump.rs @@ -67,7 +67,7 @@ where let fields_ids_map = fields_ids_map.deref_mut(); let res = document - .deserialize_map(DocumentVisitor::new(fields_ids_map, self.primary_key, &doc_alloc)) + .deserialize_map(DocumentVisitor::new(fields_ids_map, self.primary_key, doc_alloc)) .map_err(UserError::SerdeJson)?; let external_document_id = match res { From 132916f62cbba599d9ce11a678ed8c7f0435ace5 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 9 Oct 2024 11:35:45 +0200 Subject: [PATCH 131/247] Only run word pair proximity docids extraction if proximity_precision enables it --- milli/src/update/new/indexer/mod.rs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/milli/src/update/new/indexer/mod.rs b/milli/src/update/new/indexer/mod.rs index 4592feb43..e49f0ab9c 100644 --- a/milli/src/update/new/indexer/mod.rs +++ b/milli/src/update/new/indexer/mod.rs @@ -26,10 +26,11 @@ use super::words_prefix_docids::{ use super::{StdResult, TopLevelMap}; use crate::documents::{PrimaryKey, DEFAULT_PRIMARY_KEY}; use crate::facet::FacetType; +use crate::proximity::ProximityPrecision; use crate::update::new::channel::ExtractorSender; use crate::update::settings::InnerIndexSettings; use crate::update::{FacetsUpdateBulk, GrenadParameters}; -use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, Result, UserError}; +use crate::{Error, FieldsIdsMap, GlobalFieldsIdsMap, Index, Result, UserError}; mod de; pub mod document_changes; @@ -184,7 +185,11 @@ where extractor_sender.send_searchable::(fid_word_count_docids).unwrap(); } - { + // run the proximity extraction only if the precision is by word + // this works only if the settings didn't change during this transaction. 
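// A small sketch of the gate added just below: word-pair-proximity extraction is skipped
// entirely when proximity is tracked per attribute. The variant names mirror milli's
// `ProximityPrecision`, but the extraction bodies here are stand-ins.
#[derive(Debug, Default, PartialEq)]
enum ProximityPrecision {
    #[default]
    ByWord,
    ByAttribute,
}

fn run_searchable_extractions(precision: ProximityPrecision) {
    // Word docids, fid and position docids, etc. always run.
    println!("extracting word docids");
    // The proximity pairs are only meaningful (and worth their cost) at word precision.
    if precision == ProximityPrecision::ByWord {
        println!("extracting word pair proximity docids");
    }
}

fn main() {
    run_searchable_extractions(ProximityPrecision::default());
}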
+ let rtxn = index.read_txn().unwrap(); + let proximity_precision = index.proximity_precision(&rtxn)?.unwrap_or_default(); + if proximity_precision == ProximityPrecision::ByWord { let span = tracing::trace_span!(target: "indexing::documents::extract", "word_pair_proximity_docids"); let _entered = span.enter(); extract_and_send_docids::< From a2fbf2ea21139adc5a061a9856f612a9b9401fb7 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 9 Oct 2024 13:53:34 +0200 Subject: [PATCH 132/247] set updated at at the end of the indexing --- milli/src/update/new/indexer/mod.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/milli/src/update/new/indexer/mod.rs b/milli/src/update/new/indexer/mod.rs index e49f0ab9c..c634e22b6 100644 --- a/milli/src/update/new/indexer/mod.rs +++ b/milli/src/update/new/indexer/mod.rs @@ -12,6 +12,7 @@ pub use document_operation::DocumentOperation; use heed::{RoTxn, RwTxn}; pub use partial_dump::PartialDump; use rayon::ThreadPool; +use time::OffsetDateTime; pub use update_by_function::UpdateByFunction; use super::channel::*; @@ -285,6 +286,8 @@ where inner_index_settings.recompute_facets(wtxn, index)?; inner_index_settings.recompute_searchables(wtxn, index)?; + index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?; + Ok(()) } From d675e73af114bd6e70d75429356af966d2af768a Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 14 Oct 2024 11:12:10 +0200 Subject: [PATCH 133/247] Finish prefix databases --- milli/src/index.rs | 15 +++ milli/src/update/new/indexer/mod.rs | 3 + milli/src/update/new/merger.rs | 9 +- milli/src/update/new/word_fst_builder.rs | 20 ++-- milli/src/update/new/words_prefix_docids.rs | 120 +++++++++++++++++--- 5 files changed, 131 insertions(+), 36 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index 4a7f2c42b..19064e8d7 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1669,6 +1669,14 @@ impl Index { } Ok(res) } + + pub fn prefix_settings(&self, _rtxn: &RoTxn<'_>) -> Result { + Ok(PrefixSettings { + compute_prefixes: true, + max_prefix_length: 4, + prefix_count_threshold: 100, + }) + } } #[derive(Debug, Deserialize, Serialize)] @@ -1678,6 +1686,13 @@ pub struct IndexEmbeddingConfig { pub user_provided: RoaringBitmap, } +#[derive(Debug, Deserialize, Serialize)] +pub struct PrefixSettings { + pub prefix_count_threshold: u64, + pub max_prefix_length: usize, + pub compute_prefixes: bool, +} + #[derive(Serialize, Deserialize)] #[serde(transparent)] struct OffsetDateTime(#[serde(with = "time::serde::rfc3339")] time::OffsetDateTime); diff --git a/milli/src/update/new/indexer/mod.rs b/milli/src/update/new/indexer/mod.rs index c634e22b6..3de5c176e 100644 --- a/milli/src/update/new/indexer/mod.rs +++ b/milli/src/update/new/indexer/mod.rs @@ -29,6 +29,7 @@ use crate::documents::{PrimaryKey, DEFAULT_PRIMARY_KEY}; use crate::facet::FacetType; use crate::proximity::ProximityPrecision; use crate::update::new::channel::ExtractorSender; +use crate::update::new::words_prefix_docids::compute_exact_word_prefix_docids; use crate::update::settings::InnerIndexSettings; use crate::update::{FacetsUpdateBulk, GrenadParameters}; use crate::{Error, FieldsIdsMap, GlobalFieldsIdsMap, Index, Result, UserError}; @@ -301,6 +302,8 @@ fn compute_prefix_database( let PrefixDelta { modified, deleted } = prefix_delta; // Compute word prefix docids compute_word_prefix_docids(wtxn, index, &modified, &deleted)?; + // Compute exact word prefix docids + compute_exact_word_prefix_docids(wtxn, index, &modified, &deleted)?; // Compute word prefix fid docids 
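// The `prefix_settings` helper introduced above returns hard-coded values for now. A
// standalone sketch of what those three fields drive, assuming `prefix_count_threshold` is the
// minimum number of words that must share a prefix for that prefix to be kept (the struct and
// selection logic below are illustrative only).
use std::collections::BTreeMap;

struct PrefixSettings {
    prefix_count_threshold: u64,
    max_prefix_length: usize,
    compute_prefixes: bool,
}

fn select_prefixes(words: &[&str], settings: &PrefixSettings) -> Vec<String> {
    if !settings.compute_prefixes {
        return Vec::new();
    }
    // Count how many words start with each candidate prefix, up to max_prefix_length bytes.
    let mut counts: BTreeMap<String, u64> = BTreeMap::new();
    for word in words {
        for len in 1..=settings.max_prefix_length.min(word.len()) {
            if word.is_char_boundary(len) {
                *counts.entry(word[..len].to_string()).or_insert(0) += 1;
            }
        }
    }
    counts
        .into_iter()
        .filter(|(_, count)| *count >= settings.prefix_count_threshold)
        .map(|(prefix, _)| prefix)
        .collect()
}

fn main() {
    let settings =
        PrefixSettings { prefix_count_threshold: 2, max_prefix_length: 4, compute_prefixes: true };
    let words = ["hello", "help", "helm", "world"];
    // "h", "he" and "hel" are shared by three words; every prefix of "world" by only one.
    assert_eq!(select_prefixes(&words, &settings), vec!["h", "he", "hel"]);
}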
compute_word_prefix_fid_docids(wtxn, index, &modified, &deleted)?; // Compute word prefix position docids diff --git a/milli/src/update/new/merger.rs b/milli/src/update/new/merger.rs index 80556ced9..998a5d4a2 100644 --- a/milli/src/update/new/merger.rs +++ b/milli/src/update/new/merger.rs @@ -10,7 +10,7 @@ use roaring::RoaringBitmap; use super::channel::*; use super::extract::FacetKind; -use super::word_fst_builder::{PrefixData, PrefixDelta, PrefixSettings}; +use super::word_fst_builder::{PrefixData, PrefixDelta}; use super::{Deletion, DocumentChange, KvReaderDelAdd, KvReaderFieldId}; use crate::update::del_add::DelAdd; use crate::update::new::channel::MergerOperation; @@ -63,12 +63,7 @@ pub fn merge_grenad_entries( MergerOperation::WordDocidsMerger(merger) => { let words_fst = index.words_fst(rtxn)?; let mut word_fst_builder = WordFstBuilder::new(&words_fst)?; - /// TODO make this configurable - let prefix_settings = PrefixSettings { - compute_prefixes: true, - max_prefix_length: 4, - prefix_count_threshold: 100, - }; + let prefix_settings = index.prefix_settings(rtxn)?; word_fst_builder.with_prefix_settings(prefix_settings); { diff --git a/milli/src/update/new/word_fst_builder.rs b/milli/src/update/new/word_fst_builder.rs index 97cd47e73..867d3e86d 100644 --- a/milli/src/update/new/word_fst_builder.rs +++ b/milli/src/update/new/word_fst_builder.rs @@ -5,7 +5,7 @@ use memmap2::Mmap; use std::collections::HashSet; use tempfile::tempfile; -use crate::{update::del_add::DelAdd, Prefix, Result}; +use crate::{index::PrefixSettings, update::del_add::DelAdd, InternalError, Prefix, Result}; pub struct WordFstBuilder<'a> { stream: Option>, @@ -143,8 +143,10 @@ impl<'a> WordFstBuilder<'a> { ) -> Result<(Mmap, Option)> { self.drain_stream()?; - /// TODO: ugly unwrap - let words_fst_file = self.word_fst_builder.into_inner()?.into_inner().unwrap(); + let words_fst_file = + self.word_fst_builder.into_inner()?.into_inner().map_err(|_| { + InternalError::IndexingMergingKeys { process: "building-words-fst" } + })?; let words_fst_mmap = unsafe { Mmap::map(&words_fst_file)? }; let prefix_data = self @@ -156,13 +158,6 @@ impl<'a> WordFstBuilder<'a> { } } -#[derive(Debug)] -pub struct PrefixSettings { - pub prefix_count_threshold: u64, - pub max_prefix_length: usize, - pub compute_prefixes: bool, -} - pub struct PrefixData { pub prefixes_fst_mmap: Mmap, pub prefix_delta: PrefixDelta, @@ -269,8 +264,9 @@ impl PrefixFstBuilder { let op = fst::set::OpBuilder::from_iter(prefix_fsts.iter()); let mut builder = SetBuilder::new(BufWriter::new(tempfile()?))?; builder.extend_stream(op.r#union())?; - /// TODO: ugly unwrap - let prefix_fst_file = builder.into_inner()?.into_inner().unwrap(); + let prefix_fst_file = builder.into_inner()?.into_inner().map_err(|_| { + InternalError::IndexingMergingKeys { process: "building-words-prefixes-fst" } + })?; let prefix_fst_mmap = unsafe { Mmap::map(&prefix_fst_file)? 
}; let new_prefix_fst = Set::new(&prefix_fst_mmap)?; let old_prefix_fst = index.words_prefixes_fst(rtxn)?; diff --git a/milli/src/update/new/words_prefix_docids.rs b/milli/src/update/new/words_prefix_docids.rs index 32a22ba73..8795fd9a4 100644 --- a/milli/src/update/new/words_prefix_docids.rs +++ b/milli/src/update/new/words_prefix_docids.rs @@ -1,9 +1,11 @@ use std::collections::HashSet; -use heed::Database; +use hashbrown::HashMap; use heed::{types::Bytes, RwTxn}; +use heed::{BytesDecode, Database}; use roaring::RoaringBitmap; +use crate::heed_codec::StrBEU16Codec; use crate::{CboRoaringBitmapCodec, Index, Prefix, Result}; struct WordPrefixDocids { @@ -25,23 +27,10 @@ impl WordPrefixDocids { prefix_to_compute: &HashSet, prefix_to_delete: &HashSet, ) -> Result<()> { - self.delete_prefixes(wtxn, prefix_to_delete)?; + delete_prefixes(wtxn, &self.prefix_database, prefix_to_delete)?; self.recompute_modified_prefixes(wtxn, prefix_to_compute) } - #[tracing::instrument(level = "trace", skip_all, target = "indexing::prefix")] - fn delete_prefixes(&self, wtxn: &mut heed::RwTxn, prefixes: &HashSet) -> Result<()> { - // We remove all the entries that are no more required in this word prefix docids database. - for prefix in prefixes { - let prefix = prefix.as_bytes(); - if !self.prefix_database.delete(wtxn, prefix)? { - unreachable!("We tried to delete an unknown key") - } - } - - Ok(()) - } - #[tracing::instrument(level = "trace", skip_all, target = "indexing::prefix")] fn recompute_modified_prefixes( &self, @@ -65,6 +54,89 @@ impl WordPrefixDocids { } } +struct WordPrefixIntegerDocids { + database: Database, + prefix_database: Database, +} + +impl WordPrefixIntegerDocids { + fn new( + database: Database, + prefix_database: Database, + ) -> WordPrefixIntegerDocids { + WordPrefixIntegerDocids { database, prefix_database } + } + + fn execute( + self, + wtxn: &mut heed::RwTxn, + prefix_to_compute: &HashSet, + prefix_to_delete: &HashSet, + ) -> Result<()> { + delete_prefixes(wtxn, &self.prefix_database, prefix_to_delete)?; + self.recompute_modified_prefixes(wtxn, prefix_to_compute) + } + + #[tracing::instrument(level = "trace", skip_all, target = "indexing::prefix")] + fn recompute_modified_prefixes( + &self, + wtxn: &mut RwTxn, + prefixes: &HashSet, + ) -> Result<()> { + // We fetch the docids associated to the newly added word prefix fst only. + // We use a HashMap to store the docids associated to each position, may be RAM consuming. + let mut integer_docids = HashMap::new(); + let mut key_buffer = Vec::new(); + for prefix in prefixes { + let prefix = prefix.as_bytes(); + for result in self.database.prefix_iter(wtxn, prefix)? { + let (key, data) = result?; + let (_word, pos) = + StrBEU16Codec::bytes_decode(key).map_err(heed::Error::Decoding)?; + + match integer_docids.get_mut(&pos) { + Some(docids) => { + *docids |= &data; + } + None => { + integer_docids.insert(pos, data); + } + } + } + + for (pos, docids) in integer_docids.iter_mut() { + if !docids.is_empty() { + key_buffer.clear(); + key_buffer.extend_from_slice(prefix); + key_buffer.push(0); + key_buffer.extend_from_slice(&pos.to_be_bytes()); + self.prefix_database.put(wtxn, &key_buffer, &docids)?; + } + docids.clear(); + } + } + + Ok(()) + } +} + +#[tracing::instrument(level = "trace", skip_all, target = "indexing::prefix")] +fn delete_prefixes( + wtxn: &mut RwTxn, + prefix_database: &Database, + prefixes: &HashSet, +) -> Result<()> { + // We remove all the entries that are no more required in this word prefix docids database. 
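// A simplified sketch of the aggregation performed by WordPrefixIntegerDocids above: for one
// prefix, union the docids of every (word, position) entry whose word starts with that prefix,
// then re-key the result as `prefix ++ 0x00 ++ position` in big-endian, the layout written by
// the hunk. Plain in-memory maps stand in for the LMDB databases.
use std::collections::{BTreeMap, HashMap};

use roaring::RoaringBitmap;

fn compute_prefix_integer_docids(
    word_position_docids: &BTreeMap<(String, u16), RoaringBitmap>,
    prefix: &str,
) -> HashMap<Vec<u8>, RoaringBitmap> {
    // Union the docids of every word starting with `prefix`, grouped by position.
    let mut per_position: HashMap<u16, RoaringBitmap> = HashMap::new();
    for ((word, position), docids) in word_position_docids {
        if word.starts_with(prefix) {
            *per_position.entry(*position).or_default() |= docids;
        }
    }

    // Re-key as `prefix ++ 0 ++ position_be16`.
    let mut out = HashMap::new();
    for (position, docids) in per_position {
        let mut key = Vec::with_capacity(prefix.len() + 1 + 2);
        key.extend_from_slice(prefix.as_bytes());
        key.push(0);
        key.extend_from_slice(&position.to_be_bytes());
        out.insert(key, docids);
    }
    out
}

fn main() {
    let mut hello = RoaringBitmap::new();
    hello.insert(1);
    hello.insert(2);
    let mut help = RoaringBitmap::new();
    help.insert(3);

    let mut db = BTreeMap::new();
    db.insert(("hello".to_string(), 0u16), hello);
    db.insert(("help".to_string(), 0u16), help);

    let result = compute_prefix_integer_docids(&db, "hel");
    let mut expected_key = b"hel".to_vec();
    expected_key.push(0);
    expected_key.extend_from_slice(&0u16.to_be_bytes());
    assert_eq!(result[&expected_key].len(), 3);
}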
+ for prefix in prefixes { + let prefix = prefix.as_bytes(); + if !prefix_database.delete(wtxn, prefix)? { + unreachable!("We tried to delete an unknown key") + } + } + + Ok(()) +} + #[tracing::instrument(level = "trace", skip_all, target = "indexing::prefix")] pub fn compute_word_prefix_docids( wtxn: &mut RwTxn, @@ -80,13 +152,27 @@ pub fn compute_word_prefix_docids( } #[tracing::instrument(level = "trace", skip_all, target = "indexing::prefix")] -pub fn compute_word_prefix_fid_docids( +pub fn compute_exact_word_prefix_docids( wtxn: &mut RwTxn, index: &Index, prefix_to_compute: &HashSet, prefix_to_delete: &HashSet, ) -> Result<()> { WordPrefixDocids::new( + index.exact_word_docids.remap_key_type(), + index.exact_word_prefix_docids.remap_key_type(), + ) + .execute(wtxn, prefix_to_compute, prefix_to_delete) +} + +#[tracing::instrument(level = "trace", skip_all, target = "indexing::prefix")] +pub fn compute_word_prefix_fid_docids( + wtxn: &mut RwTxn, + index: &Index, + prefix_to_compute: &HashSet, + prefix_to_delete: &HashSet, +) -> Result<()> { + WordPrefixIntegerDocids::new( index.word_fid_docids.remap_key_type(), index.word_prefix_fid_docids.remap_key_type(), ) @@ -100,7 +186,7 @@ pub fn compute_word_prefix_position_docids( prefix_to_compute: &HashSet, prefix_to_delete: &HashSet, ) -> Result<()> { - WordPrefixDocids::new( + WordPrefixIntegerDocids::new( index.word_position_docids.remap_key_type(), index.word_prefix_position_docids.remap_key_type(), ) From 4e97e38177190f865db51b8986b54795e55b934d Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 9 Oct 2024 14:39:27 +0200 Subject: [PATCH 134/247] Serialize docids bitmap one time --- milli/src/update/new/channel.rs | 14 +++++++++++--- milli/src/update/new/merger.rs | 11 +---------- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/milli/src/update/new/channel.rs b/milli/src/update/new/channel.rs index bcac0fa03..8226046e6 100644 --- a/milli/src/update/new/channel.rs +++ b/milli/src/update/new/channel.rs @@ -5,6 +5,7 @@ use crossbeam_channel::{IntoIter, Receiver, SendError, Sender}; use grenad::Merger; use heed::types::Bytes; use memmap2::Mmap; +use roaring::RoaringBitmap; use super::extract::FacetKind; use super::StdResult; @@ -46,6 +47,13 @@ impl KeyValueEntry { KeyValueEntry::SmallInMemory { key_length: key.len(), data: data.into_boxed_slice() } } + pub fn from_small_key_bitmap(key: &[u8], bitmap: RoaringBitmap) -> Self { + let mut data = Vec::with_capacity(key.len() + bitmap.serialized_size()); + data.extend_from_slice(key); + bitmap.serialize_into(&mut data).unwrap(); + KeyValueEntry::SmallInMemory { key_length: key.len(), data: data.into_boxed_slice() } + } + pub fn from_large_key_value(key: &[u8], value: Mmap) -> Self { KeyValueEntry::LargeOnDisk { key: key.to_vec().into_boxed_slice(), value } } @@ -232,10 +240,10 @@ impl MergerSender { DocumentsSender(self) } - pub fn send_documents_ids(&self, bitmap: &[u8]) -> StdResult<(), SendError<()>> { - let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value( + pub fn send_documents_ids(&self, documents_ids: RoaringBitmap) -> StdResult<(), SendError<()>> { + let entry = EntryOperation::Write(KeyValueEntry::from_small_key_bitmap( DOCUMENTS_IDS_KEY.as_bytes(), - bitmap, + documents_ids, )); match self.send(WriterOperation { database: Database::Main, entry }) { Ok(()) => Ok(()), diff --git a/milli/src/update/new/merger.rs b/milli/src/update/new/merger.rs index 998a5d4a2..6183beb63 100644 --- a/milli/src/update/new/merger.rs +++ b/milli/src/update/new/merger.rs @@ 
-189,9 +189,7 @@ pub fn merge_grenad_entries( let _entered = span.enter(); // Send the documents ids unionized with the current one - /// TODO return the slice of bytes directly - serialize_bitmap_into_vec(&documents_ids, &mut buffer); - sender.send_documents_ids(&buffer).unwrap(); + sender.send_documents_ids(documents_ids).unwrap(); } // ... @@ -447,10 +445,3 @@ fn cbo_bitmap_serialize_into_vec<'b>(bitmap: &RoaringBitmap, buffer: &'b mut Vec CboRoaringBitmapCodec::serialize_into(bitmap, buffer); buffer.as_slice() } - -/// TODO Return the slice directly from the serialize_into method -fn serialize_bitmap_into_vec(bitmap: &RoaringBitmap, buffer: &mut Vec) { - buffer.clear(); - bitmap.serialize_into(buffer).unwrap(); - // buffer.as_slice() -} From a525598ad66a6f5d4bcfdef309e654d038721b1a Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 14 Oct 2024 10:56:37 +0200 Subject: [PATCH 135/247] Fix facet string indexing --- milli/src/update/new/extract/faceted/extract_facets.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/milli/src/update/new/extract/faceted/extract_facets.rs b/milli/src/update/new/extract/faceted/extract_facets.rs index 6ae1b3124..14cc28da4 100644 --- a/milli/src/update/new/extract/faceted/extract_facets.rs +++ b/milli/src/update/new/extract/faceted/extract_facets.rs @@ -182,7 +182,8 @@ impl FacetedDocidsExtractor { // String // key: fid - level - truncated_string Value::String(s) => { - let truncated = truncate_str(s); + let normalized = crate::normalize_facet(s); + let truncated = truncate_str(&normalized); buffer.clear(); buffer.push(FacetKind::String as u8); buffer.extend_from_slice(&fid.to_be_bytes()); From b4102741e613af3a909e321b75fd476f20380187 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 14 Oct 2024 14:59:40 +0200 Subject: [PATCH 136/247] Fix duplicated fields when a document is modified --- milli/src/update/new/document.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/update/new/document.rs b/milli/src/update/new/document.rs index 335e2c327..1fb31ceb8 100644 --- a/milli/src/update/new/document.rs +++ b/milli/src/update/new/document.rs @@ -142,9 +142,9 @@ impl<'d, 'doc: 'd, 't: 'd, Mapper: FieldIdMapper> Document<'d> fn iter_top_level_fields(&self) -> impl Iterator> { let mut new_doc_it = self.new_doc.iter_top_level_fields(); let mut db_it = self.db.iter().flat_map(|db| db.iter_top_level_fields()); + let mut seen_fields = BTreeSet::new(); std::iter::from_fn(move || { - let mut seen_fields = BTreeSet::new(); if let Some(next) = new_doc_it.next() { if let Ok((name, _)) = next { seen_fields.insert(name); From 7df20d8282a1da3ed182376d0493296d438ce95f Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 14 Oct 2024 15:39:33 +0200 Subject: [PATCH 137/247] Changes to primary key --- milli/src/documents/primary_key.rs | 78 +++++++++++++++++++++++++++++- 1 file changed, 77 insertions(+), 1 deletion(-) diff --git a/milli/src/documents/primary_key.rs b/milli/src/documents/primary_key.rs index 1662ed2e6..accb270c9 100644 --- a/milli/src/documents/primary_key.rs +++ b/milli/src/documents/primary_key.rs @@ -2,9 +2,13 @@ use std::borrow::Cow; use std::iter; use std::result::Result as StdResult; +use bumpalo::Bump; +use serde_json::value::RawValue; use serde_json::{from_str, Value}; -use crate::update::new::{CowStr, TopLevelMap}; +use crate::fields_ids_map::MutFieldIdMapper; +use crate::update::new::indexer::de::DeOrBumpStr; +use crate::update::new::{CowStr, KvReaderFieldId, TopLevelMap}; use crate::{FieldId, 
InternalError, Object, Result, UserError}; /// The symbol used to define levels in a nested primary key. @@ -117,6 +121,78 @@ impl<'a> PrimaryKey<'a> { } } + pub fn extract_docid_from_db<'pl, 'bump: 'pl, Mapper: FieldIdMapper>( + &self, + document: &'pl KvReaderFieldId, + db_fields_ids_map: &Mapper, + indexer: &'bump Bump, + ) -> Result> { + use serde::Deserializer as _; + + match self { + PrimaryKey::Flat { name: _, field_id } => { + let Some(document_id) = document.get(*field_id) else { + return Err(InternalError::DocumentsError( + crate::documents::Error::InvalidDocumentFormat, + ) + .into()); + }; + + let document_id: &RawValue = + serde_json::from_slice(document_id).map_err(InternalError::SerdeJson)?; + + let document_id = document_id + .deserialize_any(crate::update::new::indexer::de::DocumentIdVisitor(indexer)) + .map_err(InternalError::SerdeJson)?; + + let external_document_id = match document_id { + Ok(document_id) => Ok(document_id), + Err(_) => Err(InternalError::DocumentsError( + crate::documents::Error::InvalidDocumentFormat, + )), + }?; + + Ok(external_document_id) + } + PrimaryKey::Nested { name } => todo!(), + } + } + + pub fn extract_fields_and_docid<'pl, 'bump: 'pl, Mapper: MutFieldIdMapper>( + &self, + document: &'pl RawValue, + new_fields_ids_map: &mut Mapper, + indexer: &'bump Bump, + ) -> Result> { + use serde::Deserializer as _; + let res = document + .deserialize_map(crate::update::new::indexer::de::FieldAndDocidExtractor::new( + new_fields_ids_map, + self, + indexer, + )) + .map_err(UserError::SerdeJson)?; + + let external_document_id = match res { + Ok(document_id) => Ok(document_id), + Err(DocumentIdExtractionError::InvalidDocumentId(e)) => Err(e), + Err(DocumentIdExtractionError::MissingDocumentId) => { + Err(UserError::MissingDocumentId { + primary_key: self.name().to_string(), + document: serde_json::from_str(document.get()).unwrap(), + }) + } + Err(DocumentIdExtractionError::TooManyDocumentIds(_)) => { + Err(UserError::TooManyDocumentIds { + primary_key: self.name().to_string(), + document: serde_json::from_str(document.get()).unwrap(), + }) + } + }?; + + Ok(external_document_id) + } + /// Returns the document ID based on the primary and /// search for it recursively in zero-copy-deserialized documents. 
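// A rough sketch of the error cases a flat primary-key extraction such as the helpers above
// has to cover, using serde_json::Value instead of the zero-copy/bump path. The id validation
// rule is an assumption loosely mirroring validate_document_id_str (alphanumerics, `-`, `_`,
// or an integer rendered to text); names below are illustrative, not the crate's real types.
use serde_json::Value;

#[derive(Debug)]
enum DocidError {
    MissingDocumentId,
    InvalidDocumentId(Value),
}

fn extract_docid(document: &Value, primary_key: &str) -> Result<String, DocidError> {
    match document.get(primary_key) {
        None => Err(DocidError::MissingDocumentId),
        Some(Value::String(s))
            if !s.is_empty()
                && s.chars().all(|c| c.is_ascii_alphanumeric() || c == '-' || c == '_') =>
        {
            Ok(s.clone())
        }
        Some(Value::Number(n)) if n.is_u64() || n.is_i64() => Ok(n.to_string()),
        Some(other) => Err(DocidError::InvalidDocumentId(other.clone())),
    }
}

fn main() {
    let doc = serde_json::json!({ "id": 42, "title": "Hitchhiker's Guide" });
    assert_eq!(extract_docid(&doc, "id").unwrap(), "42");
    assert!(matches!(extract_docid(&doc, "uid"), Err(DocidError::MissingDocumentId)));
}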
pub fn document_id_from_top_level_map<'p>( From 28d92c521a03ba1be58cf7cc532229a035e0aebf Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 14 Oct 2024 15:40:42 +0200 Subject: [PATCH 138/247] External docids to &'bump str --- milli/src/update/new/document_change.rs | 32 ++++++++++++------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/milli/src/update/new/document_change.rs b/milli/src/update/new/document_change.rs index a789b32b7..63b878854 100644 --- a/milli/src/update/new/document_change.rs +++ b/milli/src/update/new/document_change.rs @@ -6,26 +6,26 @@ use crate::documents::FieldIdMapper; use crate::{DocumentId, Index, Result}; pub enum DocumentChange<'doc> { - Deletion(Deletion), + Deletion(Deletion<'doc>), Update(Update<'doc>), Insertion(Insertion<'doc>), } -pub struct Deletion { +pub struct Deletion<'doc> { docid: DocumentId, - external_document_id: String, + external_document_id: &'doc str, } pub struct Update<'doc> { docid: DocumentId, - external_document_id: String, + external_document_id: &'doc str, new: DocumentFromVersions<'doc>, has_deletion: bool, } pub struct Insertion<'doc> { docid: DocumentId, - external_document_id: String, + external_document_id: &'doc str, new: DocumentFromVersions<'doc>, } @@ -38,7 +38,7 @@ impl<'doc> DocumentChange<'doc> { } } - pub fn external_docid(&self) -> &str { + pub fn external_docid(&self) -> &'doc str { match self { DocumentChange::Deletion(deletion) => deletion.external_document_id(), DocumentChange::Update(update) => update.external_document_id(), @@ -47,8 +47,8 @@ impl<'doc> DocumentChange<'doc> { } } -impl Deletion { - pub fn create(docid: DocumentId, external_document_id: String) -> Self { +impl<'doc> Deletion<'doc> { + pub fn create(docid: DocumentId, external_document_id: &'doc str) -> Self { Self { docid, external_document_id } } @@ -56,8 +56,8 @@ impl Deletion { self.docid } - pub fn external_document_id(&self) -> &str { - &self.external_document_id + pub fn external_document_id(&self) -> &'doc str { + self.external_document_id } pub fn current<'a, Mapper: FieldIdMapper>( @@ -75,7 +75,7 @@ impl Deletion { impl<'doc> Insertion<'doc> { pub fn create( docid: DocumentId, - external_document_id: String, + external_document_id: &'doc str, new: DocumentFromVersions<'doc>, ) -> Self { Insertion { docid, external_document_id, new } @@ -85,8 +85,8 @@ impl<'doc> Insertion<'doc> { self.docid } - pub fn external_document_id(&self) -> &str { - &self.external_document_id + pub fn external_document_id(&self) -> &'doc str { + self.external_document_id } pub fn new(&self) -> DocumentFromVersions<'doc> { self.new @@ -96,7 +96,7 @@ impl<'doc> Insertion<'doc> { impl<'doc> Update<'doc> { pub fn create( docid: DocumentId, - external_document_id: String, + external_document_id: &'doc str, new: DocumentFromVersions<'doc>, has_deletion: bool, ) -> Self { @@ -107,8 +107,8 @@ impl<'doc> Update<'doc> { self.docid } - pub fn external_document_id(&self) -> &str { - &self.external_document_id + pub fn external_document_id(&self) -> &'doc str { + self.external_document_id } pub fn current<'a, Mapper: FieldIdMapper>( &self, From 6ad3f57bc16e5c27bc28ec0de310c4b93bc82a96 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 14 Oct 2024 15:41:13 +0200 Subject: [PATCH 139/247] Changes to de --- milli/src/update/new/indexer/de.rs | 170 +++++++++++++++++++++++++---- 1 file changed, 151 insertions(+), 19 deletions(-) diff --git a/milli/src/update/new/indexer/de.rs b/milli/src/update/new/indexer/de.rs index 7976433b9..9a664b5f8 100644 --- 
a/milli/src/update/new/indexer/de.rs +++ b/milli/src/update/new/indexer/de.rs @@ -1,18 +1,36 @@ use bumpalo::Bump; use serde_json::value::RawValue; -use crate::documents::{validate_document_id_str, DocumentIdExtractionError, PrimaryKey}; +use crate::documents::{ + validate_document_id_str, DocumentIdExtractionError, FieldIdMapper, PrimaryKey, +}; use crate::fields_ids_map::MutFieldIdMapper; use crate::{FieldId, UserError}; // visits a document to fill the top level fields of the field id map and retrieve the external document id. -pub struct DocumentVisitor<'p, 'indexer, Mapper: MutFieldIdMapper> { +pub struct FieldAndDocidExtractor<'p, 'indexer, Mapper: MutFieldIdMapper> { fields_ids_map: &'p mut Mapper, primary_key: &'p PrimaryKey<'p>, indexer: &'indexer Bump, } -impl<'p, 'indexer, Mapper: MutFieldIdMapper> DocumentVisitor<'p, 'indexer, Mapper> { +pub struct DocidExtractor<'p, 'indexer, Mapper: FieldIdMapper> { + fields_ids_map: &'p Mapper, + primary_key: &'p PrimaryKey<'p>, + indexer: &'indexer Bump, +} + +impl<'p, 'indexer, Mapper: FieldIdMapper> DocidExtractor<'p, 'indexer, Mapper> { + pub fn new( + fields_ids_map: &'p Mapper, + primary_key: &'p PrimaryKey<'p>, + indexer: &'indexer Bump, + ) -> Self { + Self { fields_ids_map, primary_key, indexer } + } +} + +impl<'p, 'indexer, Mapper: MutFieldIdMapper> FieldAndDocidExtractor<'p, 'indexer, Mapper> { pub fn new( fields_ids_map: &'p mut Mapper, primary_key: &'p PrimaryKey<'p>, @@ -23,9 +41,9 @@ impl<'p, 'indexer, Mapper: MutFieldIdMapper> DocumentVisitor<'p, 'indexer, Mappe } impl<'de, 'p, 'indexer: 'de, Mapper: MutFieldIdMapper> serde::de::Visitor<'de> - for DocumentVisitor<'p, 'indexer, Mapper> + for FieldAndDocidExtractor<'p, 'indexer, Mapper> { - type Value = std::result::Result<&'de str, DocumentIdExtractionError>; + type Value = std::result::Result, DocumentIdExtractionError>; fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { write!(formatter, "a map") @@ -37,7 +55,7 @@ impl<'de, 'p, 'indexer: 'de, Mapper: MutFieldIdMapper> serde::de::Visitor<'de> { let mut docid = None; while let Some((fid, fields_ids_map)) = - map.next_key_seed(FieldIdMapSeed(self.fields_ids_map))? + map.next_key_seed(MutFieldIdMapSeed(self.fields_ids_map))? { use serde::de::Deserializer as _; self.fields_ids_map = fields_ids_map; @@ -75,10 +93,83 @@ impl<'de, 'p, 'indexer: 'de, Mapper: MutFieldIdMapper> serde::de::Visitor<'de> } } -struct FieldIdMapSeed<'a, Mapper: MutFieldIdMapper>(&'a mut Mapper); +impl<'de, 'p, 'indexer: 'de, Mapper: FieldIdMapper> serde::de::Visitor<'de> + for DocidExtractor<'p, 'indexer, Mapper> +{ + type Value = std::result::Result, DocumentIdExtractionError>; + + fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(formatter, "a map") + } + + fn visit_map(self, mut map: A) -> std::result::Result + where + A: serde::de::MapAccess<'de>, + { + let mut docid = None; + while let Some(fid) = map.next_key_seed(FieldIdMapSeed(self.fields_ids_map))? 
{ + use serde::de::Deserializer as _; + + let Some(fid) = fid else { + continue; + }; + + match self.primary_key { + PrimaryKey::Flat { name: _, field_id } => { + let value: &'de RawValue = map.next_value()?; + if fid == *field_id { + let value = match value + .deserialize_any(DocumentIdVisitor(self.indexer)) + .map_err(|_err| { + DocumentIdExtractionError::InvalidDocumentId( + UserError::InvalidDocumentId { + document_id: serde_json::to_value(value).unwrap(), + }, + ) + }) { + Ok(Ok(value)) => value, + Ok(Err(err)) | Err(err) => return Ok(Err(err)), + }; + if let Some(_previous_value) = docid.replace(value) { + return Ok(Err(DocumentIdExtractionError::TooManyDocumentIds(2))); + } + } + } + PrimaryKey::Nested { name } => todo!(), + } + } + Ok(match docid { + Some(docid) => Ok(docid), + None => Err(DocumentIdExtractionError::MissingDocumentId), + }) + } +} + +pub enum DeOrBumpStr<'de, 'bump: 'de> { + De(&'de str), + Bump(&'bump str), +} + +impl<'de, 'bump: 'de> DeOrBumpStr<'de, 'bump> { + pub fn to_bump(&self, bump: &'bump Bump) -> &'bump str { + match self { + DeOrBumpStr::De(de) => bump.alloc_str(de), + DeOrBumpStr::Bump(bump) => *bump, + } + } + + pub fn to_de(&self) -> &'de str { + match self { + DeOrBumpStr::De(de) => *de, + DeOrBumpStr::Bump(bump) => *bump, + } + } +} + +struct MutFieldIdMapSeed<'a, Mapper: MutFieldIdMapper>(&'a mut Mapper); impl<'de, 'a, Mapper: MutFieldIdMapper> serde::de::DeserializeSeed<'de> - for FieldIdMapSeed<'a, Mapper> + for MutFieldIdMapSeed<'a, Mapper> { type Value = (Option, &'a mut Mapper); @@ -86,8 +177,10 @@ impl<'de, 'a, Mapper: MutFieldIdMapper> serde::de::DeserializeSeed<'de> where D: serde::Deserializer<'de>, { - struct FieldIdMapVisitor<'a, Mapper: MutFieldIdMapper>(&'a mut Mapper); - impl<'de, 'a, Mapper: MutFieldIdMapper> serde::de::Visitor<'de> for FieldIdMapVisitor<'a, Mapper> { + struct MutFieldIdMapVisitor<'a, Mapper: MutFieldIdMapper>(&'a mut Mapper); + impl<'de, 'a, Mapper: MutFieldIdMapper> serde::de::Visitor<'de> + for MutFieldIdMapVisitor<'a, Mapper> + { type Value = (Option, &'a mut Mapper); fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { @@ -107,14 +200,50 @@ impl<'de, 'a, Mapper: MutFieldIdMapper> serde::de::DeserializeSeed<'de> Ok((self.0.insert(v), self.0)) } } + deserializer.deserialize_str(MutFieldIdMapVisitor(self.0)) + } +} + +struct FieldIdMapSeed<'a, Mapper: FieldIdMapper>(&'a Mapper); + +impl<'de, 'a, Mapper: FieldIdMapper> serde::de::DeserializeSeed<'de> + for FieldIdMapSeed<'a, Mapper> +{ + type Value = Option; + + fn deserialize(self, deserializer: D) -> std::result::Result + where + D: serde::Deserializer<'de>, + { + struct FieldIdMapVisitor<'a, Mapper: FieldIdMapper>(&'a Mapper); + impl<'de, 'a, Mapper: FieldIdMapper> serde::de::Visitor<'de> for FieldIdMapVisitor<'a, Mapper> { + type Value = Option; + + fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(formatter, "expecting a string") + } + fn visit_borrowed_str(self, v: &'de str) -> std::result::Result + where + E: serde::de::Error, + { + Ok(self.0.id(v)) + } + + fn visit_str(self, v: &str) -> std::result::Result + where + E: serde::de::Error, + { + Ok(self.0.id(v)) + } + } deserializer.deserialize_str(FieldIdMapVisitor(self.0)) } } -struct DocumentIdVisitor<'indexer>(&'indexer Bump); +pub struct DocumentIdVisitor<'indexer>(pub &'indexer Bump); impl<'de, 'indexer: 'de> serde::de::Visitor<'de> for DocumentIdVisitor<'indexer> { - type Value = std::result::Result<&'de str, DocumentIdExtractionError>; + 
type Value = std::result::Result, DocumentIdExtractionError>; fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { write!(formatter, "an integer or a string") @@ -124,11 +253,13 @@ impl<'de, 'indexer: 'de> serde::de::Visitor<'de> for DocumentIdVisitor<'indexer> where E: serde::de::Error, { - Ok(validate_document_id_str(v).ok_or_else(|| { - DocumentIdExtractionError::InvalidDocumentId(UserError::InvalidDocumentId { - document_id: serde_json::Value::String(v.to_owned()), + Ok(validate_document_id_str(v) + .ok_or_else(|| { + DocumentIdExtractionError::InvalidDocumentId(UserError::InvalidDocumentId { + document_id: serde_json::Value::String(v.to_owned()), + }) }) - })) + .map(DeOrBumpStr::De)) } fn visit_str(self, v: &str) -> std::result::Result @@ -136,7 +267,8 @@ impl<'de, 'indexer: 'de> serde::de::Visitor<'de> for DocumentIdVisitor<'indexer> E: serde::de::Error, { let v = self.0.alloc_str(v); - self.visit_borrowed_str(v) + self.visit_borrowed_str(v)?; + Ok(Ok(DeOrBumpStr::Bump(v))) } fn visit_u64(self, v: u64) -> std::result::Result @@ -147,7 +279,7 @@ impl<'de, 'indexer: 'de> serde::de::Visitor<'de> for DocumentIdVisitor<'indexer> let mut out = bumpalo::collections::String::new_in(self.0); write!(&mut out, "{v}").unwrap(); - Ok(Ok(out.into_bump_str())) + Ok(Ok(DeOrBumpStr::Bump(out.into_bump_str()))) } fn visit_i64(self, v: i64) -> std::result::Result @@ -158,6 +290,6 @@ impl<'de, 'indexer: 'de> serde::de::Visitor<'de> for DocumentIdVisitor<'indexer> let mut out = bumpalo::collections::String::new_in(&self.0); write!(&mut out, "{v}"); - Ok(Ok(out.into_bump_str())) + Ok(Ok(DeOrBumpStr::Bump(out.into_bump_str()))) } } From c01ee7b732b2d96b024ac61f835c05edd5f0c0da Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 14 Oct 2024 15:41:31 +0200 Subject: [PATCH 140/247] external changes --- .../update/new/indexer/document_deletion.rs | 18 +++++----- .../update/new/indexer/document_operation.rs | 33 +++++++------------ milli/src/update/new/indexer/partial_dump.rs | 28 ++++------------ 3 files changed, 27 insertions(+), 52 deletions(-) diff --git a/milli/src/update/new/indexer/document_deletion.rs b/milli/src/update/new/indexer/document_deletion.rs index 00fe6baee..99ed4f54c 100644 --- a/milli/src/update/new/indexer/document_deletion.rs +++ b/milli/src/update/new/indexer/document_deletion.rs @@ -5,9 +5,8 @@ use roaring::RoaringBitmap; use super::document_changes::{DocumentChangeContext, DocumentChanges, MostlySend}; use crate::documents::PrimaryKey; -use crate::index::db_name::EXTERNAL_DOCUMENTS_IDS; use crate::update::new::{Deletion, DocumentChange}; -use crate::{DocumentId, InternalError, Result}; +use crate::{DocumentId, Result}; #[derive(Default)] pub struct DocumentDeletion { @@ -61,12 +60,15 @@ impl<'pl> DocumentChanges<'pl> for DocumentDeletionChanges<'pl> { 'pl: 'doc, // the payload must survive the process calls { let current = context.index.document(&context.txn, docid)?; - let new_fields_ids_map = context.new_fields_ids_map.borrow(); - let new_fields_ids_map = new_fields_ids_map.local_map(); - let external_document_id = - self.primary_key.document_id(current, new_fields_ids_map)?.map_err(|_| { - InternalError::DatabaseMissingEntry { db_name: EXTERNAL_DOCUMENTS_IDS, key: None } - })?; + + let external_document_id = self.primary_key.extract_docid_from_db( + current, + &context.db_fields_ids_map, + &context.doc_alloc, + )?; + + let external_document_id = external_document_id.to_bump(&context.doc_alloc); + Ok(DocumentChange::Deletion(Deletion::create(docid, 
external_document_id))) } } diff --git a/milli/src/update/new/indexer/document_operation.rs b/milli/src/update/new/indexer/document_operation.rs index 3fb592574..143244a6b 100644 --- a/milli/src/update/new/indexer/document_operation.rs +++ b/milli/src/update/new/indexer/document_operation.rs @@ -11,10 +11,10 @@ use super::document_changes::{DocumentChangeContext, DocumentChanges, MostlySend use crate::documents::{DocumentIdExtractionError, PrimaryKey}; use crate::update::new::document::DocumentFromVersions; use crate::update::new::document_change::Versions; -use crate::update::new::indexer::de::DocumentVisitor; +use crate::update::new::indexer::de::FieldAndDocidExtractor; use crate::update::new::{Deletion, Insertion, Update}; use crate::update::{AvailableIds, IndexDocumentsMethod}; -use crate::{DocumentId, Error, FieldsIdsMap, Index, Result, UserError}; +use crate::{external_documents_ids, DocumentId, Error, FieldsIdsMap, Index, Result, UserError}; pub struct DocumentOperation<'pl> { operations: Vec>, @@ -98,7 +98,7 @@ impl<'pl> DocumentOperation<'pl> { iter.next().transpose().map_err(UserError::SerdeJson)? { let res = document - .deserialize_map(DocumentVisitor::new( + .deserialize_map(FieldAndDocidExtractor::new( new_fields_ids_map, primary_key, indexer, @@ -122,6 +122,8 @@ impl<'pl> DocumentOperation<'pl> { } }?; + let external_document_id = external_document_id.to_de(); + let current_offset = iter.byte_offset(); let document_operation = InnerDocOp::Addition(DocumentOffset { content: &payload[previous_offset..current_offset], @@ -310,23 +312,14 @@ impl MergeChanges for MergeDocumentForReplacement { let document = DocumentFromVersions::new(Versions::Single(document)); if is_new { - Ok(DocumentChange::Insertion(Insertion::create( - docid, - external_doc.to_owned(), - document, - ))) + Ok(DocumentChange::Insertion(Insertion::create(docid, external_doc, document))) } else { - Ok(DocumentChange::Update(Update::create( - docid, - external_doc.to_owned(), - document, - true, - ))) + Ok(DocumentChange::Update(Update::create(docid, external_doc, document, true))) } } Some(InnerDocOp::Deletion) => { let deletion = if is_new { - Deletion::create(docid, external_doc.to_owned()) + Deletion::create(docid, external_doc) } else { todo!("Do that with Louis") }; @@ -373,7 +366,7 @@ impl MergeChanges for MergeDocumentForUpdates { if operations.is_empty() { let deletion = if !is_new { - Deletion::create(docid, external_docid.to_owned()) + Deletion::create(docid, external_docid) } else { todo!("Do that with Louis") }; @@ -408,15 +401,11 @@ impl MergeChanges for MergeDocumentForUpdates { let document = DocumentFromVersions::new(versions); if is_new { - Ok(DocumentChange::Insertion(Insertion::create( - docid, - external_docid.to_owned(), - document, - ))) + Ok(DocumentChange::Insertion(Insertion::create(docid, external_docid, document))) } else { Ok(DocumentChange::Update(Update::create( docid, - external_docid.to_owned(), + external_docid, document, has_deletion, ))) diff --git a/milli/src/update/new/indexer/partial_dump.rs b/milli/src/update/new/indexer/partial_dump.rs index 66a7d7fc8..4d31f600d 100644 --- a/milli/src/update/new/indexer/partial_dump.rs +++ b/milli/src/update/new/indexer/partial_dump.rs @@ -4,7 +4,7 @@ use rayon::iter::IndexedParallelIterator; use serde::Deserializer; use serde_json::value::RawValue; -use super::de::DocumentVisitor; +use super::de::FieldAndDocidExtractor; use super::document_changes::{DocumentChangeContext, DocumentChanges, MostlySend}; use 
crate::documents::{DocumentIdExtractionError, PrimaryKey}; use crate::update::concurrent_available_ids::ConcurrentAvailableIds; @@ -66,36 +66,20 @@ where let mut fields_ids_map = context.new_fields_ids_map.borrow_mut(); let fields_ids_map = fields_ids_map.deref_mut(); - let res = document - .deserialize_map(DocumentVisitor::new(fields_ids_map, self.primary_key, doc_alloc)) - .map_err(UserError::SerdeJson)?; - - let external_document_id = match res { - Ok(document_id) => Ok(document_id), - Err(DocumentIdExtractionError::InvalidDocumentId(e)) => Err(e), - Err(DocumentIdExtractionError::MissingDocumentId) => { - Err(UserError::MissingDocumentId { - primary_key: self.primary_key.name().to_string(), - document: serde_json::from_str(document.get()).unwrap(), - }) - } - Err(DocumentIdExtractionError::TooManyDocumentIds(_)) => { - Err(UserError::TooManyDocumentIds { - primary_key: self.primary_key.name().to_string(), - document: serde_json::from_str(document.get()).unwrap(), - }) - } - }?; let document = doc_alloc.alloc_str(document.get()); let document: &RawValue = unsafe { std::mem::transmute(document) }; + let external_document_id = + self.primary_key.extract_fields_and_docid(document, fields_ids_map, doc_alloc)?; + let external_document_id = external_document_id.to_de(); + let document = raw_collections::RawMap::from_raw_value(document, doc_alloc) .map_err(InternalError::SerdeJson)?; let document = document.into_bump_slice(); let document = DocumentFromVersions::new(Versions::Single(document)); - let insertion = Insertion::create(docid, external_document_id.to_owned(), document); + let insertion = Insertion::create(docid, external_document_id, document); Ok(DocumentChange::Insertion(insertion)) } } From 96658ec775297c613f917f12a7547ed0e6de6196 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 14 Oct 2024 15:41:41 +0200 Subject: [PATCH 141/247] Make de public --- milli/src/update/new/indexer/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/update/new/indexer/mod.rs b/milli/src/update/new/indexer/mod.rs index 3de5c176e..4f56b52b1 100644 --- a/milli/src/update/new/indexer/mod.rs +++ b/milli/src/update/new/indexer/mod.rs @@ -34,7 +34,7 @@ use crate::update::settings::InnerIndexSettings; use crate::update::{FacetsUpdateBulk, GrenadParameters}; use crate::{Error, FieldsIdsMap, GlobalFieldsIdsMap, Index, Result, UserError}; -mod de; +pub(crate) mod de; pub mod document_changes; mod document_deletion; mod document_operation; From 7e1dc8439bdcf1e5ae37cc178d879ea4a07e0199 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 14 Oct 2024 15:41:10 +0200 Subject: [PATCH 142/247] Introduce the new update by function --- index-scheduler/src/batch.rs | 122 +++++---- .../update/new/indexer/document_changes.rs | 9 +- .../update/new/indexer/document_deletion.rs | 4 +- .../update/new/indexer/document_operation.rs | 44 ++-- milli/src/update/new/indexer/partial_dump.rs | 4 +- .../update/new/indexer/update_by_function.rs | 235 ++++++++++++++++-- milli/src/update/new/words_prefix_docids.rs | 6 +- 7 files changed, 336 insertions(+), 88 deletions(-) diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs index 2bd20b6e8..5ce658dd9 100644 --- a/index-scheduler/src/batch.rs +++ b/index-scheduler/src/batch.rs @@ -30,7 +30,9 @@ use meilisearch_types::error::Code; use meilisearch_types::heed::{RoTxn, RwTxn}; use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader}; use meilisearch_types::milli::heed::CompactionOption; -use 
meilisearch_types::milli::update::new::indexer::{self, retrieve_or_guess_primary_key}; +use meilisearch_types::milli::update::new::indexer::{ + self, retrieve_or_guess_primary_key, UpdateByFunction, +}; use meilisearch_types::milli::update::{ IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings as MilliSettings, }; @@ -1392,7 +1394,7 @@ impl IndexScheduler { Ok(tasks) } IndexOperation::DocumentEdition { mut task, .. } => { - let (filter, context, function) = + let (filter, context, code) = if let KindWithContent::DocumentEdition { filter_expr, context, function, .. } = &task.kind @@ -1401,52 +1403,84 @@ impl IndexScheduler { } else { unreachable!() }; - let result_count = edit_documents_by_function( - index_wtxn, - filter, - context.clone(), - function, - self.index_mapper.indexer_config(), - self.must_stop_processing.clone(), - index, - ); - let (original_filter, context, function) = if let Some(Details::DocumentEdition { - original_filter, - context, - function, - .. - }) = task.details - { - (original_filter, context, function) - } else { - // In the case of a `documentEdition` the details MUST be set - unreachable!(); + + let candidates = match filter.as_ref().map(Filter::from_json) { + Some(Ok(Some(filter))) => { + filter.evaluate(index_wtxn, index).map_err(|err| match err { + milli::Error::UserError(milli::UserError::InvalidFilter(_)) => { + Error::from(err).with_custom_error_code(Code::InvalidDocumentFilter) + } + e => e.into(), + })? + } + None | Some(Ok(None)) => index.documents_ids(index_wtxn)?, + Some(Err(e)) => return Err(e.into()), }; - match result_count { - Ok((deleted_documents, edited_documents)) => { - task.status = Status::Succeeded; - task.details = Some(Details::DocumentEdition { - original_filter, - context, - function, - deleted_documents: Some(deleted_documents), - edited_documents: Some(edited_documents), - }); - } - Err(e) => { - task.status = Status::Failed; - task.details = Some(Details::DocumentEdition { - original_filter, - context, - function, - deleted_documents: Some(0), - edited_documents: Some(0), - }); - task.error = Some(e.into()); - } + let rtxn = index.read_txn()?; + let db_fields_ids_map = index.fields_ids_map(&rtxn)?; + let mut new_fields_ids_map = db_fields_ids_map.clone(); + let primary_key = + retrieve_or_guess_primary_key(&rtxn, index, &mut new_fields_ids_map, None)? + .unwrap(); + + if task.error.is_none() { + /// TODO create a pool if needed + // let pool = indexer_config.thread_pool.unwrap(); + let pool = rayon::ThreadPoolBuilder::new().build().unwrap(); + + let indexer = UpdateByFunction::new(candidates, context.clone(), code.clone()); + let document_changes = indexer.into_changes(&primary_key)?; + + indexer::index( + index_wtxn, + index, + &db_fields_ids_map, + new_fields_ids_map, + &pool, + &document_changes, + )?; + + // tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done"); } + // let (original_filter, context, function) = if let Some(Details::DocumentEdition { + // original_filter, + // context, + // function, + // .. 
+ // }) = task.details + // { + // (original_filter, context, function) + // } else { + // // In the case of a `documentEdition` the details MUST be set + // unreachable!(); + // }; + + // match result_count { + // Ok((deleted_documents, edited_documents)) => { + // task.status = Status::Succeeded; + // task.details = Some(Details::DocumentEdition { + // original_filter, + // context, + // function, + // deleted_documents: Some(deleted_documents), + // edited_documents: Some(edited_documents), + // }); + // } + // Err(e) => { + // task.status = Status::Failed; + // task.details = Some(Details::DocumentEdition { + // original_filter, + // context, + // function, + // deleted_documents: Some(0), + // edited_documents: Some(0), + // }); + // task.error = Some(e.into()); + // } + // } + Ok(vec![task]) } IndexOperation::DocumentDeletion { mut tasks, index_uid: _ } => { diff --git a/milli/src/update/new/indexer/document_changes.rs b/milli/src/update/new/indexer/document_changes.rs index 2967311df..18c7cdf02 100644 --- a/milli/src/update/new/indexer/document_changes.rs +++ b/milli/src/update/new/indexer/document_changes.rs @@ -287,7 +287,7 @@ pub trait DocumentChanges<'pl // lifetime of the underlying payload &'doc self, context: &'doc DocumentChangeContext, item: Self::Item, - ) -> Result> where 'pl: 'doc // the payload must survive the process calls + ) -> Result>> where 'pl: 'doc // the payload must survive the process calls ; } @@ -352,8 +352,11 @@ where // Clean up and reuse the document-specific allocator context.doc_alloc.reset(); - let change = - document_changes.item_to_document_change(context, item).map_err(Arc::new)?; + let Some(change) = + document_changes.item_to_document_change(context, item).map_err(Arc::new)? + else { + return Ok(()); + }; let res = extractor.process(change, context).map_err(Arc::new); diff --git a/milli/src/update/new/indexer/document_deletion.rs b/milli/src/update/new/indexer/document_deletion.rs index 99ed4f54c..7744bcf18 100644 --- a/milli/src/update/new/indexer/document_deletion.rs +++ b/milli/src/update/new/indexer/document_deletion.rs @@ -55,7 +55,7 @@ impl<'pl> DocumentChanges<'pl> for DocumentDeletionChanges<'pl> { &'doc self, context: &'doc DocumentChangeContext, docid: Self::Item, - ) -> Result> + ) -> Result>> where 'pl: 'doc, // the payload must survive the process calls { @@ -69,7 +69,7 @@ impl<'pl> DocumentChanges<'pl> for DocumentDeletionChanges<'pl> { let external_document_id = external_document_id.to_bump(&context.doc_alloc); - Ok(DocumentChange::Deletion(Deletion::create(docid, external_document_id))) + Ok(Some(DocumentChange::Deletion(Deletion::create(docid, external_document_id)))) } } diff --git a/milli/src/update/new/indexer/document_operation.rs b/milli/src/update/new/indexer/document_operation.rs index 143244a6b..fcab6773a 100644 --- a/milli/src/update/new/indexer/document_operation.rs +++ b/milli/src/update/new/indexer/document_operation.rs @@ -240,7 +240,7 @@ impl<'pl> DocumentChanges<'pl> for DocumentOperationChanges<'pl> { &'doc self, context: &'doc DocumentChangeContext, item: Self::Item, - ) -> Result> + ) -> Result>> where 'pl: 'doc, { @@ -275,7 +275,7 @@ trait MergeChanges { is_new: bool, doc_alloc: &'doc Bump, operations: &'doc [InnerDocOp], - ) -> Result>; + ) -> Result>>; } struct MergeDocumentForReplacement; @@ -301,7 +301,7 @@ impl MergeChanges for MergeDocumentForReplacement { is_new: bool, doc_alloc: &'doc Bump, operations: &'doc [InnerDocOp], - ) -> Result> { + ) -> Result>> { match operations.last() { 
Some(InnerDocOp::Addition(DocumentOffset { content })) => { let document = serde_json::from_slice(content).unwrap(); @@ -312,18 +312,27 @@ impl MergeChanges for MergeDocumentForReplacement { let document = DocumentFromVersions::new(Versions::Single(document)); if is_new { - Ok(DocumentChange::Insertion(Insertion::create(docid, external_doc, document))) + Ok(Some(DocumentChange::Insertion(Insertion::create( + docid, + external_doc, + document, + )))) } else { - Ok(DocumentChange::Update(Update::create(docid, external_doc, document, true))) + Ok(Some(DocumentChange::Update(Update::create( + docid, + external_doc, + document, + true, + )))) } } Some(InnerDocOp::Deletion) => { - let deletion = if is_new { - Deletion::create(docid, external_doc) + return if is_new { + let deletion = Deletion::create(docid, external_doc); + Ok(Some(DocumentChange::Deletion(deletion))) } else { - todo!("Do that with Louis") + Ok(None) }; - Ok(DocumentChange::Deletion(deletion)) } None => unreachable!("We must not have empty set of operations on a document"), } @@ -354,7 +363,7 @@ impl MergeChanges for MergeDocumentForUpdates { is_new: bool, doc_alloc: &'doc Bump, operations: &'doc [InnerDocOp], - ) -> Result> { + ) -> Result>> { if operations.is_empty() { unreachable!("We must not have empty set of operations on a document"); } @@ -365,13 +374,12 @@ impl MergeChanges for MergeDocumentForUpdates { let has_deletion = last_deletion.is_some(); if operations.is_empty() { - let deletion = if !is_new { - Deletion::create(docid, external_docid) + return if !is_new { + let deletion = Deletion::create(docid, external_docid); + Ok(Some(DocumentChange::Deletion(deletion))) } else { - todo!("Do that with Louis") + Ok(None) }; - - return Ok(DocumentChange::Deletion(deletion)); } let mut versions = bumpalo::collections::Vec::with_capacity_in(operations.len(), doc_alloc); @@ -401,14 +409,14 @@ impl MergeChanges for MergeDocumentForUpdates { let document = DocumentFromVersions::new(versions); if is_new { - Ok(DocumentChange::Insertion(Insertion::create(docid, external_docid, document))) + Ok(Some(DocumentChange::Insertion(Insertion::create(docid, external_docid, document)))) } else { - Ok(DocumentChange::Update(Update::create( + Ok(Some(DocumentChange::Update(Update::create( docid, external_docid, document, has_deletion, - ))) + )))) } } } diff --git a/milli/src/update/new/indexer/partial_dump.rs b/milli/src/update/new/indexer/partial_dump.rs index 4d31f600d..3b528d5e8 100644 --- a/milli/src/update/new/indexer/partial_dump.rs +++ b/milli/src/update/new/indexer/partial_dump.rs @@ -53,7 +53,7 @@ where &'doc self, context: &'doc DocumentChangeContext, document: Self::Item, - ) -> Result> + ) -> Result>> where 'index: 'doc, { @@ -80,6 +80,6 @@ where let document = DocumentFromVersions::new(Versions::Single(document)); let insertion = Insertion::create(docid, external_document_id, document); - Ok(DocumentChange::Insertion(insertion)) + Ok(Some(DocumentChange::Insertion(insertion))) } } diff --git a/milli/src/update/new/indexer/update_by_function.rs b/milli/src/update/new/indexer/update_by_function.rs index 9bff15b5c..d9b09bd21 100644 --- a/milli/src/update/new/indexer/update_by_function.rs +++ b/milli/src/update/new/indexer/update_by_function.rs @@ -1,33 +1,236 @@ -use rayon::iter::IntoParallelIterator; +use std::collections::BTreeMap; -use super::document_changes::{DocumentChangeContext, DocumentChanges}; -use crate::Result; +use raw_collections::RawMap; +use rayon::iter::{IndexedParallelIterator, IntoParallelRefIterator, 
ParallelIterator}; +use rhai::{Dynamic, Engine, OptimizationLevel, Scope, AST}; +use roaring::RoaringBitmap; -pub struct UpdateByFunction; +use super::document_changes::{DocumentChangeContext, MostlySend}; +use super::DocumentChanges; +use crate::documents::Error::InvalidDocumentFormat; +use crate::documents::PrimaryKey; +use crate::error::{FieldIdMapMissingEntry, InternalError}; +use crate::update::new::document::DocumentFromVersions; +use crate::update::new::document_change::Versions; +use crate::update::new::{Deletion, DocumentChange, KvReaderFieldId, KvWriterFieldId, Update}; +use crate::{all_obkv_to_json, Error, FieldsIdsMap, GlobalFieldsIdsMap, Object, Result, UserError}; + +pub struct UpdateByFunction { + documents: RoaringBitmap, + context: Option, + code: String, +} + +pub struct UpdateByFunctionChanges<'doc> { + primary_key: &'doc PrimaryKey<'doc>, + engine: Engine, + ast: AST, + context: Option, + // It is sad that the RoaringBitmap doesn't + // implement IndexedParallelIterator + documents: Vec, +} impl UpdateByFunction { - pub fn into_changes(self) -> UpdateByFunctionChanges { - UpdateByFunctionChanges + pub fn new(documents: RoaringBitmap, context: Option, code: String) -> Self { + UpdateByFunction { documents, context, code } + } + + pub fn into_changes<'index>( + self, + primary_key: &'index PrimaryKey, + ) -> Result> { + let Self { documents, context, code } = self; + + // Setup the security and limits of the Engine + let mut engine = Engine::new(); + engine.set_optimization_level(OptimizationLevel::Full); + engine.set_max_call_levels(1000); + // It is an arbitrary value. We need to let users define this in the settings. + engine.set_max_operations(1_000_000); + engine.set_max_variables(1000); + engine.set_max_functions(30); + engine.set_max_expr_depths(100, 1000); + engine.set_max_string_size(1024 * 1024 * 1024); // 1 GiB + engine.set_max_array_size(10_000); + engine.set_max_map_size(10_000); + + let ast = engine.compile(code).map_err(UserError::DocumentEditionCompilationError)?; + let context = match context { + Some(context) => { + Some(serde_json::from_value(context.into()).map_err(InternalError::SerdeJson)?) + } + None => None, + }; + + Ok(UpdateByFunctionChanges { + primary_key, + engine, + ast, + context, + documents: documents.into_iter().collect(), + }) } } -pub struct UpdateByFunctionChanges; - -impl<'index> DocumentChanges<'index> for UpdateByFunctionChanges { +impl<'index> DocumentChanges<'index> for UpdateByFunctionChanges<'index> { type Item = u32; - fn iter(&self) -> impl rayon::prelude::IndexedParallelIterator { - (0..100).into_par_iter() + fn iter(&self) -> impl IndexedParallelIterator { + self.documents.par_iter().copied() } - fn item_to_document_change<'doc, T: super::document_changes::MostlySend + 'doc>( + fn item_to_document_change<'doc, T: MostlySend + 'doc>( &self, - _context: &'doc DocumentChangeContext, - _item: Self::Item, - ) -> Result> + context: &'doc DocumentChangeContext, + docid: Self::Item, + ) -> Result>> where 'index: 'doc, { - todo!() + let DocumentChangeContext { + index, + db_fields_ids_map, + txn, + new_fields_ids_map, + doc_alloc, + .. + } = context; + + // safety: Both documents *must* exists in the database as + // their IDs comes from the list of documents ids. + let document = index.document(txn, docid)?; + let rhai_document = obkv_to_rhaimap(document, db_fields_ids_map)?; + let json_document = all_obkv_to_json(document, db_fields_ids_map)?; + + let document_id = self + .primary_key + .document_id(document, db_fields_ids_map)? 
+ .map_err(|_| InvalidDocumentFormat)?; + + let mut scope = Scope::new(); + if let Some(context) = self.context.as_ref().cloned() { + scope.push_constant_dynamic("context", context.clone()); + } + scope.push("doc", rhai_document); + // That's were the magic happens. We run the user script + // which edits "doc" scope variable reprensenting the document + // and ignore the output and even the type of it, i.e., Dynamic. + let _ = self + .engine + .eval_ast_with_scope::(&mut scope, &self.ast) + .map_err(UserError::DocumentEditionRuntimeError)?; + + match scope.remove::("doc") { + // If the "doc" variable has set to (), we effectively delete the document. + Some(doc) if doc.is_unit() => Ok(Some(DocumentChange::Deletion(Deletion::create( + docid, + doc_alloc.alloc_str(&document_id), + )))), + None => unreachable!("missing doc variable from the Rhai scope"), + Some(new_document) => match new_document.try_cast() { + Some(new_rhai_document) => { + let mut buffer = bumpalo::collections::Vec::new_in(doc_alloc); + serde_json::to_writer(&mut buffer, &new_rhai_document) + .map_err(InternalError::SerdeJson)?; + let raw_new_doc = serde_json::from_slice(buffer.into_bump_slice()) + .map_err(InternalError::SerdeJson)?; + + // Note: This condition is not perfect. Sometimes it detect changes + // like with floating points numbers and consider updating + // the document even if nothing actually changed. + // + // Future: Use a custom function rhai function to track changes. + // + if json_document != rhaimap_to_object(new_rhai_document) { + let mut global_fields_ids_map = new_fields_ids_map.borrow_mut(); + let new_document_id = self + .primary_key + .extract_fields_and_docid( + raw_new_doc, + &mut *global_fields_ids_map, + doc_alloc, + )? + .to_de(); + + if document_id != new_document_id { + Err(Error::UserError(UserError::DocumentEditionCannotModifyPrimaryKey)) + } else { + let raw_new_doc = RawMap::from_raw_value(raw_new_doc, doc_alloc) + .map_err(InternalError::SerdeJson)?; + let new_doc_version = DocumentFromVersions::new(Versions::Single( + raw_new_doc.into_bump_slice(), + )); + Ok(Some(DocumentChange::Update(Update::create( + docid, + new_document_id, + new_doc_version, + true, // It is like document replacement + )))) + } + } else { + Ok(None) + } + } + None => Err(Error::UserError(UserError::DocumentEditionDocumentMustBeObject)), + }, + } } } + +fn obkv_to_rhaimap(obkv: &KvReaderFieldId, fields_ids_map: &FieldsIdsMap) -> Result { + let all_keys = obkv.iter().map(|(k, _v)| k).collect::>(); + let map: Result = all_keys + .iter() + .copied() + .flat_map(|id| obkv.get(id).map(|value| (id, value))) + .map(|(id, value)| { + let name = fields_ids_map.name(id).ok_or(FieldIdMapMissingEntry::FieldId { + field_id: id, + process: "all_obkv_to_rhaimap", + })?; + let value = serde_json::from_slice(value).map_err(InternalError::SerdeJson)?; + Ok((name.into(), value)) + }) + .collect(); + + map +} + +fn rhaimap_to_object(map: rhai::Map) -> Object { + let mut output = Object::new(); + for (key, value) in map { + let value = serde_json::to_value(&value).unwrap(); + output.insert(key.into(), value); + } + output +} + +fn rhaimap_to_obkv( + map: rhai::Map, + global_fields_ids_map: &mut GlobalFieldsIdsMap, + buffer: &mut Vec, +) -> Result> { + let result: Result> = map + .keys() + .map(|key| { + global_fields_ids_map + .id_or_insert(key) + .ok_or(UserError::AttributeLimitReached) + .map_err(Error::from) + .map(|fid| (fid, key)) + }) + .collect(); + + let ordered_fields = result?; + let mut writer = 
KvWriterFieldId::memory(); + for (fid, key) in ordered_fields { + let value = map.get(key).unwrap(); + let value = serde_json::to_value(value).unwrap(); + buffer.clear(); + serde_json::to_writer(&mut *buffer, &value).unwrap(); + writer.insert(fid, &buffer)?; + } + + Ok(writer.into_boxed()) +} diff --git a/milli/src/update/new/words_prefix_docids.rs b/milli/src/update/new/words_prefix_docids.rs index 8795fd9a4..d45f6397e 100644 --- a/milli/src/update/new/words_prefix_docids.rs +++ b/milli/src/update/new/words_prefix_docids.rs @@ -1,8 +1,8 @@ use std::collections::HashSet; use hashbrown::HashMap; -use heed::{types::Bytes, RwTxn}; -use heed::{BytesDecode, Database}; +use heed::types::Bytes; +use heed::{BytesDecode, Database, RwTxn}; use roaring::RoaringBitmap; use crate::heed_codec::StrBEU16Codec; @@ -110,7 +110,7 @@ impl WordPrefixIntegerDocids { key_buffer.extend_from_slice(prefix); key_buffer.push(0); key_buffer.extend_from_slice(&pos.to_be_bytes()); - self.prefix_database.put(wtxn, &key_buffer, &docids)?; + self.prefix_database.put(wtxn, &key_buffer, docids)?; } docids.clear(); } From 52b95c4e591eafbe273c32c0b603ac6e2c381221 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 14 Oct 2024 16:48:15 +0200 Subject: [PATCH 143/247] Make sure we edit the task statuses --- index-scheduler/src/batch.rs | 72 ++++++++++--------- .../update/new/indexer/update_by_function.rs | 9 ++- 2 files changed, 41 insertions(+), 40 deletions(-) diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs index 5ce658dd9..978ef9ec6 100644 --- a/index-scheduler/src/batch.rs +++ b/index-scheduler/src/batch.rs @@ -1424,6 +1424,8 @@ impl IndexScheduler { retrieve_or_guess_primary_key(&rtxn, index, &mut new_fields_ids_map, None)? .unwrap(); + let result_count = Ok((candidates.len(), candidates.len())) as Result<_>; + if task.error.is_none() { /// TODO create a pool if needed // let pool = indexer_config.thread_pool.unwrap(); @@ -1444,42 +1446,42 @@ impl IndexScheduler { // tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done"); } - // let (original_filter, context, function) = if let Some(Details::DocumentEdition { - // original_filter, - // context, - // function, - // .. - // }) = task.details - // { - // (original_filter, context, function) - // } else { - // // In the case of a `documentEdition` the details MUST be set - // unreachable!(); - // }; + let (original_filter, context, function) = if let Some(Details::DocumentEdition { + original_filter, + context, + function, + .. 
+ }) = task.details + { + (original_filter, context, function) + } else { + // In the case of a `documentEdition` the details MUST be set + unreachable!(); + }; - // match result_count { - // Ok((deleted_documents, edited_documents)) => { - // task.status = Status::Succeeded; - // task.details = Some(Details::DocumentEdition { - // original_filter, - // context, - // function, - // deleted_documents: Some(deleted_documents), - // edited_documents: Some(edited_documents), - // }); - // } - // Err(e) => { - // task.status = Status::Failed; - // task.details = Some(Details::DocumentEdition { - // original_filter, - // context, - // function, - // deleted_documents: Some(0), - // edited_documents: Some(0), - // }); - // task.error = Some(e.into()); - // } - // } + match result_count { + Ok((deleted_documents, edited_documents)) => { + task.status = Status::Succeeded; + task.details = Some(Details::DocumentEdition { + original_filter, + context, + function, + deleted_documents: Some(deleted_documents), + edited_documents: Some(edited_documents), + }); + } + Err(e) => { + task.status = Status::Failed; + task.details = Some(Details::DocumentEdition { + original_filter, + context, + function, + deleted_documents: Some(0), + edited_documents: Some(0), + }); + task.error = Some(e.into()); + } + } Ok(vec![task]) } diff --git a/milli/src/update/new/indexer/update_by_function.rs b/milli/src/update/new/indexer/update_by_function.rs index d9b09bd21..d2e5126d0 100644 --- a/milli/src/update/new/indexer/update_by_function.rs +++ b/milli/src/update/new/indexer/update_by_function.rs @@ -113,16 +113,15 @@ impl<'index> DocumentChanges<'index> for UpdateByFunctionChanges<'index> { scope.push_constant_dynamic("context", context.clone()); } scope.push("doc", rhai_document); - // That's were the magic happens. We run the user script - // which edits "doc" scope variable reprensenting the document - // and ignore the output and even the type of it, i.e., Dynamic. + // We run the user script which edits "doc" scope variable reprensenting + // the document and ignore the output and even the type of it, i.e., Dynamic. let _ = self .engine .eval_ast_with_scope::(&mut scope, &self.ast) .map_err(UserError::DocumentEditionRuntimeError)?; match scope.remove::("doc") { - // If the "doc" variable has set to (), we effectively delete the document. + // If the "doc" variable has been set to (), we effectively delete the document. Some(doc) if doc.is_unit() => Ok(Some(DocumentChange::Deletion(Deletion::create( docid, doc_alloc.alloc_str(&document_id), @@ -142,7 +141,7 @@ impl<'index> DocumentChanges<'index> for UpdateByFunctionChanges<'index> { // // Future: Use a custom function rhai function to track changes. 
// - if json_document != rhaimap_to_object(new_rhai_document) { + if dbg!(json_document) != dbg!(rhaimap_to_object(new_rhai_document)) { let mut global_fields_ids_map = new_fields_ids_map.borrow_mut(); let new_document_id = self .primary_key From 1e81d72b5fc4624d6dc7c7eeb54dc3fed55a5698 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 14 Oct 2024 18:18:59 +0200 Subject: [PATCH 144/247] Use the fixed version of the Rhai crate --- Cargo.lock | 8 ++-- index-scheduler/src/batch.rs | 43 +------------------ milli/Cargo.toml | 2 +- .../update/new/indexer/update_by_function.rs | 2 +- 4 files changed, 6 insertions(+), 49 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 961ebab28..5cd1f3976 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4618,9 +4618,8 @@ dependencies = [ [[package]] name = "rhai" -version = "1.19.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61797318be89b1a268a018a92a7657096d83f3ecb31418b9e9c16dcbb043b702" +version = "1.20.0" +source = "git+https://github.com/rhaiscript/rhai?rev=ef3df63121d27aacd838f366f2b83fd65f20a1e4#ef3df63121d27aacd838f366f2b83fd65f20a1e4" dependencies = [ "ahash 0.8.11", "bitflags 2.6.0", @@ -4637,8 +4636,7 @@ dependencies = [ [[package]] name = "rhai_codegen" version = "2.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5a11a05ee1ce44058fa3d5961d05194fdbe3ad6b40f904af764d81b86450e6b" +source = "git+https://github.com/rhaiscript/rhai?rev=ef3df63121d27aacd838f366f2b83fd65f20a1e4#ef3df63121d27aacd838f366f2b83fd65f20a1e4" dependencies = [ "proc-macro2", "quote", diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs index 978ef9ec6..328a5aed7 100644 --- a/index-scheduler/src/batch.rs +++ b/index-scheduler/src/batch.rs @@ -1355,7 +1355,7 @@ impl IndexScheduler { } } - if !tasks.iter().all(|res| res.error.is_some()) { + if tasks.iter().any(|res| res.error.is_none()) { /// TODO create a pool if needed // let pool = indexer_config.thread_pool.unwrap(); let pool = rayon::ThreadPoolBuilder::new().build().unwrap(); @@ -1791,44 +1791,3 @@ impl IndexScheduler { Ok(content_files_to_delete) } } - -fn edit_documents_by_function<'a>( - wtxn: &mut RwTxn<'a>, - filter: &Option, - context: Option, - code: &str, - indexer_config: &IndexerConfig, - must_stop_processing: MustStopProcessing, - index: &'a Index, -) -> Result<(u64, u64)> { - let candidates = match filter.as_ref().map(Filter::from_json) { - Some(Ok(Some(filter))) => filter.evaluate(wtxn, index).map_err(|err| match err { - milli::Error::UserError(milli::UserError::InvalidFilter(_)) => { - Error::from(err).with_custom_error_code(Code::InvalidDocumentFilter) - } - e => e.into(), - })?, - None | Some(Ok(None)) => index.documents_ids(wtxn)?, - Some(Err(e)) => return Err(e.into()), - }; - - let config = IndexDocumentsConfig { - update_method: IndexDocumentsMethod::ReplaceDocuments, - ..Default::default() - }; - - let mut builder = milli::update::IndexDocuments::new( - wtxn, - index, - indexer_config, - config, - |indexing_step| tracing::debug!(update = ?indexing_step), - || must_stop_processing.get(), - )?; - - let (new_builder, count) = builder.edit_documents(&candidates, context, code)?; - builder = new_builder; - - let _ = builder.execute()?; - Ok(count.unwrap()) -} diff --git a/milli/Cargo.toml b/milli/Cargo.toml index fc522994e..46633bdec 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -81,7 +81,7 @@ hf-hub = { git = "https://github.com/dureuill/hf-hub.git", branch = "rust_tls", ] } 
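As an aside to the patches above: `UpdateByFunction` boils down to an evaluate-then-inspect pattern, compile the user's Rhai script once, expose each document to it as the `doc` scope variable, run it, then look at what `doc` became. The following minimal sketch, separate from the patch series, reproduces that pattern with the plain `rhai` crate; the engine limits, scripts, and document are illustrative values, not the ones Meilisearch uses.

```rust
use rhai::{Dynamic, Engine, Scope};

fn main() -> Result<(), Box<rhai::EvalAltResult>> {
    // A sandboxed engine; the limits mirror the idea of the patch,
    // not Meilisearch's exact values.
    let mut engine = Engine::new();
    engine.set_max_operations(1_000_000);
    engine.set_max_call_levels(100);

    // One script edits the document, the other deletes it by assigning unit.
    let edit = engine.compile(r#"doc.title = doc.title + " (edited)""#)?;
    let delete = engine.compile("doc = ()")?;

    // The document is handed to the script as the `doc` scope variable.
    let mut doc = rhai::Map::new();
    doc.insert("title".into(), "Hello".into());

    for ast in [&edit, &delete] {
        let mut scope = Scope::new();
        scope.push("doc", doc.clone());
        // The script's return value is ignored; only the final state of `doc` matters.
        let _ = engine.eval_ast_with_scope::<Dynamic>(&mut scope, ast)?;

        match scope.remove::<Dynamic>("doc") {
            // `doc = ()` signals a deletion.
            Some(d) if d.is_unit() => println!("delete the document"),
            // Otherwise the caller compares it with the original document
            // to decide between an update and a no-op.
            Some(d) => println!("new document state: {d:?}"),
            None => unreachable!("`doc` was pushed into the scope"),
        }
    }
    Ok(())
}
```

In the real indexer, the comparison against the original document and the primary-key check happen after this point, as the patch shows.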
tiktoken-rs = "0.5.9" liquid = "0.26.6" -rhai = { version = "1.19.0", features = [ +rhai = { git = "https://github.com/rhaiscript/rhai", rev = "ef3df63121d27aacd838f366f2b83fd65f20a1e4", features = [ "serde", "no_module", "no_custom_syntax", diff --git a/milli/src/update/new/indexer/update_by_function.rs b/milli/src/update/new/indexer/update_by_function.rs index d2e5126d0..6bde29b45 100644 --- a/milli/src/update/new/indexer/update_by_function.rs +++ b/milli/src/update/new/indexer/update_by_function.rs @@ -141,7 +141,7 @@ impl<'index> DocumentChanges<'index> for UpdateByFunctionChanges<'index> { // // Future: Use a custom function rhai function to track changes. // - if dbg!(json_document) != dbg!(rhaimap_to_object(new_rhai_document)) { + if json_document != rhaimap_to_object(new_rhai_document) { let mut global_fields_ids_map = new_fields_ids_map.borrow_mut(); let new_document_id = self .primary_key From 9a0e1dc37501f927aacb1337da33a0ec01659d33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 15 Oct 2024 11:20:09 +0200 Subject: [PATCH 145/247] Fix the prefix deletion --- .../update/new/indexer/update_by_function.rs | 29 ------------------- milli/src/update/new/words_prefix_docids.rs | 7 +++-- 2 files changed, 4 insertions(+), 32 deletions(-) diff --git a/milli/src/update/new/indexer/update_by_function.rs b/milli/src/update/new/indexer/update_by_function.rs index 6bde29b45..6f2914577 100644 --- a/milli/src/update/new/indexer/update_by_function.rs +++ b/milli/src/update/new/indexer/update_by_function.rs @@ -204,32 +204,3 @@ fn rhaimap_to_object(map: rhai::Map) -> Object { } output } - -fn rhaimap_to_obkv( - map: rhai::Map, - global_fields_ids_map: &mut GlobalFieldsIdsMap, - buffer: &mut Vec, -) -> Result> { - let result: Result> = map - .keys() - .map(|key| { - global_fields_ids_map - .id_or_insert(key) - .ok_or(UserError::AttributeLimitReached) - .map_err(Error::from) - .map(|fid| (fid, key)) - }) - .collect(); - - let ordered_fields = result?; - let mut writer = KvWriterFieldId::memory(); - for (fid, key) in ordered_fields { - let value = map.get(key).unwrap(); - let value = serde_json::to_value(value).unwrap(); - buffer.clear(); - serde_json::to_writer(&mut *buffer, &value).unwrap(); - writer.insert(fid, &buffer)?; - } - - Ok(writer.into_boxed()) -} diff --git a/milli/src/update/new/words_prefix_docids.rs b/milli/src/update/new/words_prefix_docids.rs index d45f6397e..38c2b1744 100644 --- a/milli/src/update/new/words_prefix_docids.rs +++ b/milli/src/update/new/words_prefix_docids.rs @@ -128,9 +128,10 @@ fn delete_prefixes( ) -> Result<()> { // We remove all the entries that are no more required in this word prefix docids database. for prefix in prefixes { - let prefix = prefix.as_bytes(); - if !prefix_database.delete(wtxn, prefix)? { - unreachable!("We tried to delete an unknown key") + let mut iter = prefix_database.prefix_iter_mut(wtxn, prefix.as_bytes())?; + while iter.next().transpose()?.is_some() { + // safety: we do not keep a reference on database entries. + unsafe { iter.del_current()? 
}; } } From c283c95f6ab6ca5e0ff4ed48ae5c239f0b0d90e0 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 15 Oct 2024 14:08:16 +0200 Subject: [PATCH 146/247] Support nested primary keys --- milli/src/documents/primary_key.rs | 35 ++- milli/src/update/new/indexer/de.rs | 337 ++++++++++++++++------------- 2 files changed, 215 insertions(+), 157 deletions(-) diff --git a/milli/src/documents/primary_key.rs b/milli/src/documents/primary_key.rs index accb270c9..904109033 100644 --- a/milli/src/documents/primary_key.rs +++ b/milli/src/documents/primary_key.rs @@ -1,5 +1,6 @@ use std::borrow::Cow; use std::iter; +use std::ops::ControlFlow; use std::result::Result as StdResult; use bumpalo::Bump; @@ -7,7 +8,7 @@ use serde_json::value::RawValue; use serde_json::{from_str, Value}; use crate::fields_ids_map::MutFieldIdMapper; -use crate::update::new::indexer::de::DeOrBumpStr; +use crate::update::new::indexer::de::{match_component, DeOrBumpStr}; use crate::update::new::{CowStr, KvReaderFieldId, TopLevelMap}; use crate::{FieldId, InternalError, Object, Result, UserError}; @@ -64,7 +65,7 @@ impl<'a> PrimaryKey<'a> { }) } - pub fn name(&self) -> &str { + pub fn name(&self) -> &'a str { match self { PrimaryKey::Flat { name, .. } => name, PrimaryKey::Nested { name } => name, @@ -154,7 +155,31 @@ impl<'a> PrimaryKey<'a> { Ok(external_document_id) } - PrimaryKey::Nested { name } => todo!(), + nested @ PrimaryKey::Nested { name: _ } => { + let mut docid = None; + for (first_level, right) in nested.possible_level_names() { + let Some(fid) = db_fields_ids_map.id(first_level) else { continue }; + + let Some(value) = document.get(fid) else { continue }; + let value: &RawValue = + serde_json::from_slice(value).map_err(InternalError::SerdeJson)?; + match match_component(first_level, right, value, indexer, &mut docid) { + ControlFlow::Continue(()) => continue, + ControlFlow::Break(Ok(_)) => { + return Err(InternalError::DocumentsError( + crate::documents::Error::InvalidDocumentFormat, + ) + .into()) + } + ControlFlow::Break(Err(err)) => { + return Err(InternalError::SerdeJson(err).into()) + } + } + } + Ok(docid.ok_or(InternalError::DocumentsError( + crate::documents::Error::InvalidDocumentFormat, + ))?) + } } } @@ -171,7 +196,7 @@ impl<'a> PrimaryKey<'a> { self, indexer, )) - .map_err(UserError::SerdeJson)?; + .map_err(UserError::SerdeJson)??; let external_document_id = match res { Ok(document_id) => Ok(document_id), @@ -234,7 +259,7 @@ impl<'a> PrimaryKey<'a> { /// Returns an `Iterator` that gives all the possible fields names the primary key /// can have depending of the first level name and depth of the objects. 
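A rough standalone sketch of why every split point of a nested primary key has to be tried: a key such as `person.id` may designate a nested object or a flat field literally named `person.id`. The sketch assumes `.` is the split symbol and works on `serde_json` values; the real code above streams through serde visitors, interns field ids, bump-allocates the result, and also reports missing or duplicated ids.

```rust
use serde_json::{json, Value};

/// Candidate (left, right) splits for a nested key, e.g. "person.id"
/// yields ("person", "id") and finally ("person.id", "").
fn possible_level_names<'a>(name: &'a str) -> impl Iterator<Item = (&'a str, &'a str)> + 'a {
    name.match_indices('.')
        .map(move |(i, _)| (&name[..i], &name[i + 1..]))
        .chain(std::iter::once((name, "")))
}

/// Follow a candidate split into the document; an empty right part means
/// the current value is the document id itself.
fn extract_docid<'a>(doc: &'a Value, name: &str) -> Option<&'a Value> {
    for (left, right) in possible_level_names(name) {
        let Some(value) = doc.get(left) else { continue };
        if right.is_empty() {
            return Some(value);
        }
        if let Some(id) = extract_docid(value, right) {
            return Some(id);
        }
    }
    None
}

fn main() {
    let nested = json!({ "person": { "id": 42 } });
    let flat = json!({ "person.id": 42 });
    assert_eq!(extract_docid(&nested, "person.id"), Some(&json!(42)));
    assert_eq!(extract_docid(&flat, "person.id"), Some(&json!(42)));
}
```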
- pub fn possible_level_names(&self) -> impl Iterator + '_ { + pub fn possible_level_names(&self) -> impl Iterator + '_ { let name = self.name(); name.match_indices(PRIMARY_KEY_SPLIT_SYMBOL) .map(move |(i, _)| (&name[..i], &name[i + PRIMARY_KEY_SPLIT_SYMBOL.len_utf8()..])) diff --git a/milli/src/update/new/indexer/de.rs b/milli/src/update/new/indexer/de.rs index 9a664b5f8..fa6b5fa76 100644 --- a/milli/src/update/new/indexer/de.rs +++ b/milli/src/update/new/indexer/de.rs @@ -1,4 +1,7 @@ +use std::ops::ControlFlow; + use bumpalo::Bump; +use serde::de::{DeserializeSeed, Deserializer as _, Visitor}; use serde_json::value::RawValue; use crate::documents::{ @@ -14,22 +17,6 @@ pub struct FieldAndDocidExtractor<'p, 'indexer, Mapper: MutFieldIdMapper> { indexer: &'indexer Bump, } -pub struct DocidExtractor<'p, 'indexer, Mapper: FieldIdMapper> { - fields_ids_map: &'p Mapper, - primary_key: &'p PrimaryKey<'p>, - indexer: &'indexer Bump, -} - -impl<'p, 'indexer, Mapper: FieldIdMapper> DocidExtractor<'p, 'indexer, Mapper> { - pub fn new( - fields_ids_map: &'p Mapper, - primary_key: &'p PrimaryKey<'p>, - indexer: &'indexer Bump, - ) -> Self { - Self { fields_ids_map, primary_key, indexer } - } -} - impl<'p, 'indexer, Mapper: MutFieldIdMapper> FieldAndDocidExtractor<'p, 'indexer, Mapper> { pub fn new( fields_ids_map: &'p mut Mapper, @@ -40,63 +27,56 @@ impl<'p, 'indexer, Mapper: MutFieldIdMapper> FieldAndDocidExtractor<'p, 'indexer } } -impl<'de, 'p, 'indexer: 'de, Mapper: MutFieldIdMapper> serde::de::Visitor<'de> +impl<'de, 'p, 'indexer: 'de, Mapper: MutFieldIdMapper> Visitor<'de> for FieldAndDocidExtractor<'p, 'indexer, Mapper> { - type Value = std::result::Result, DocumentIdExtractionError>; + type Value = + Result, DocumentIdExtractionError>, crate::UserError>; fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { write!(formatter, "a map") } - fn visit_map(mut self, mut map: A) -> std::result::Result + fn visit_map(mut self, mut map: A) -> Result where A: serde::de::MapAccess<'de>, { let mut docid = None; - while let Some((fid, fields_ids_map)) = - map.next_key_seed(MutFieldIdMapSeed(self.fields_ids_map))? - { - use serde::de::Deserializer as _; - self.fields_ids_map = fields_ids_map; - /// FIXME unwrap => too many fields - let fid = fid.unwrap(); - match self.primary_key { - PrimaryKey::Flat { name: _, field_id } => { - let value: &'de RawValue = map.next_value()?; - if fid == *field_id { - let value = match value - .deserialize_any(DocumentIdVisitor(self.indexer)) - .map_err(|_err| { - DocumentIdExtractionError::InvalidDocumentId( - UserError::InvalidDocumentId { - document_id: serde_json::to_value(value).unwrap(), - }, - ) - }) { - Ok(Ok(value)) => value, - Ok(Err(err)) | Err(err) => return Ok(Err(err)), - }; - if let Some(_previous_value) = docid.replace(value) { - return Ok(Err(DocumentIdExtractionError::TooManyDocumentIds(2))); - } - } - } - PrimaryKey::Nested { name } => todo!(), + while let Some(((level_name, right), (fid, fields_ids_map))) = + map.next_key_seed(ComponentsSeed { + name: self.primary_key.name(), + visitor: MutFieldIdMapVisitor(self.fields_ids_map), + })? 
+ { + let Some(fid) = fid else { + return Ok(Err(crate::UserError::AttributeLimitReached)); + }; + self.fields_ids_map = fields_ids_map; + + let value: &'de RawValue = map.next_value()?; + + match match_component(level_name, right, value, self.indexer, &mut docid) { + ControlFlow::Continue(()) => continue, + ControlFlow::Break(Err(err)) => return Err(serde::de::Error::custom(err)), + ControlFlow::Break(Ok(err)) => return Ok(Ok(Err(err))), } } - Ok(match docid { + + Ok(Ok(match docid { Some(docid) => Ok(docid), None => Err(DocumentIdExtractionError::MissingDocumentId), - }) + })) } } -impl<'de, 'p, 'indexer: 'de, Mapper: FieldIdMapper> serde::de::Visitor<'de> - for DocidExtractor<'p, 'indexer, Mapper> -{ - type Value = std::result::Result, DocumentIdExtractionError>; +struct NestedPrimaryKeyVisitor<'a, 'bump> { + components: &'a str, + bump: &'bump Bump, +} + +impl<'de, 'a, 'bump: 'de> Visitor<'de> for NestedPrimaryKeyVisitor<'a, 'bump> { + type Value = std::result::Result>, DocumentIdExtractionError>; fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { write!(formatter, "a map") @@ -107,142 +87,156 @@ impl<'de, 'p, 'indexer: 'de, Mapper: FieldIdMapper> serde::de::Visitor<'de> A: serde::de::MapAccess<'de>, { let mut docid = None; - while let Some(fid) = map.next_key_seed(FieldIdMapSeed(self.fields_ids_map))? { - use serde::de::Deserializer as _; + while let Some(((matched_component, right), _)) = map.next_key_seed(ComponentsSeed { + name: self.components, + visitor: serde::de::IgnoredAny, + })? { + let value: &'de RawValue = map.next_value()?; - let Some(fid) = fid else { - continue; - }; - - match self.primary_key { - PrimaryKey::Flat { name: _, field_id } => { - let value: &'de RawValue = map.next_value()?; - if fid == *field_id { - let value = match value - .deserialize_any(DocumentIdVisitor(self.indexer)) - .map_err(|_err| { - DocumentIdExtractionError::InvalidDocumentId( - UserError::InvalidDocumentId { - document_id: serde_json::to_value(value).unwrap(), - }, - ) - }) { - Ok(Ok(value)) => value, - Ok(Err(err)) | Err(err) => return Ok(Err(err)), - }; - if let Some(_previous_value) = docid.replace(value) { - return Ok(Err(DocumentIdExtractionError::TooManyDocumentIds(2))); - } - } - } - PrimaryKey::Nested { name } => todo!(), + match match_component(matched_component, right, value, self.bump, &mut docid) { + ControlFlow::Continue(()) => continue, + ControlFlow::Break(Err(err)) => return Err(serde::de::Error::custom(err)), + ControlFlow::Break(Ok(err)) => return Ok(Err(err)), } } - Ok(match docid { - Some(docid) => Ok(docid), - None => Err(DocumentIdExtractionError::MissingDocumentId), - }) + Ok(Ok(docid)) } } +/// Either a `&'de str` or a `&'bump str`. pub enum DeOrBumpStr<'de, 'bump: 'de> { + /// Lifetime of the deserializer De(&'de str), + /// Lifetime of the allocator Bump(&'bump str), } impl<'de, 'bump: 'de> DeOrBumpStr<'de, 'bump> { + /// Returns a `&'bump str`, possibly allocating to extend its lifetime. pub fn to_bump(&self, bump: &'bump Bump) -> &'bump str { match self { DeOrBumpStr::De(de) => bump.alloc_str(de), - DeOrBumpStr::Bump(bump) => *bump, + DeOrBumpStr::Bump(bump) => bump, } } + /// Returns a `&'de str`. + /// + /// This function never allocates because `'bump: 'de`. 
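The idea behind `DeOrBumpStr`, reduced to a standalone toy type: hold a string borrowed from the input payload when possible, and copy it into the per-document arena only when a longer lifetime is needed. Names here are made up for the sketch; the real type also offers the allocation-free `to_de` accessor.

```rust
use bumpalo::Bump;

/// A string borrowed from the input payload ('de) or already owned by the arena ('bump).
enum MaybeArenaStr<'de, 'bump: 'de> {
    De(&'de str),
    Bump(&'bump str),
}

impl<'de, 'bump: 'de> MaybeArenaStr<'de, 'bump> {
    /// Extend the lifetime to the arena's, copying only if the string
    /// still borrows from the shorter-lived input.
    fn to_bump(&self, bump: &'bump Bump) -> &'bump str {
        match self {
            Self::De(s) => bump.alloc_str(s),
            Self::Bump(s) => *s,
        }
    }
}

fn main() {
    let bump = Bump::new();
    let payload = String::from(r#"{"id":"doc-1"}"#);
    // Pretend the id was borrowed straight out of the payload during deserialization.
    let id = MaybeArenaStr::De(&payload[7..12]);
    // The payload buffer can be dropped once the id has been moved into the arena.
    let id_in_arena: &str = id.to_bump(&bump);
    drop(payload);
    assert_eq!(id_in_arena, "doc-1");
}
```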
pub fn to_de(&self) -> &'de str { match self { - DeOrBumpStr::De(de) => *de, - DeOrBumpStr::Bump(bump) => *bump, + DeOrBumpStr::De(de) => de, + DeOrBumpStr::Bump(bump) => bump, } } } -struct MutFieldIdMapSeed<'a, Mapper: MutFieldIdMapper>(&'a mut Mapper); +struct ComponentsSeed<'a, V> { + name: &'a str, + visitor: V, +} -impl<'de, 'a, Mapper: MutFieldIdMapper> serde::de::DeserializeSeed<'de> - for MutFieldIdMapSeed<'a, Mapper> -{ +impl<'de, 'a, V: Visitor<'de>> DeserializeSeed<'de> for ComponentsSeed<'a, V> { + type Value = ((&'a str, &'a str), V::Value); + + fn deserialize(self, deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + struct ComponentsSeedVisitor<'a, V> { + name: &'a str, + visitor: V, + } + + impl<'a, V> ComponentsSeedVisitor<'a, V> { + fn match_str(&self, v: &str) -> (&'a str, &'a str) { + let p = PrimaryKey::Nested { name: self.name }; + for (name, right) in p.possible_level_names() { + if name == v { + return (name, right); + } + } + ("", self.name) + } + } + + impl<'de, 'a, V: Visitor<'de>> Visitor<'de> for ComponentsSeedVisitor<'a, V> { + type Value = ((&'a str, &'a str), V::Value); + fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(formatter, "expecting a string") + } + fn visit_borrowed_str(self, v: &'de str) -> std::result::Result + where + E: serde::de::Error, + { + let matched = self.match_str(v); + let inner = self.visitor.visit_borrowed_str(v)?; + Ok((matched, inner)) + } + + fn visit_str(self, v: &str) -> std::result::Result + where + E: serde::de::Error, + { + let matched = self.match_str(v); + let inner = self.visitor.visit_str(v)?; + + Ok((matched, inner)) + } + } + deserializer + .deserialize_str(ComponentsSeedVisitor { name: self.name, visitor: self.visitor }) + } +} + +struct MutFieldIdMapVisitor<'a, Mapper: MutFieldIdMapper>(&'a mut Mapper); + +impl<'de, 'a, Mapper: MutFieldIdMapper> Visitor<'de> for MutFieldIdMapVisitor<'a, Mapper> { type Value = (Option, &'a mut Mapper); - fn deserialize(self, deserializer: D) -> std::result::Result + fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(formatter, "expecting a string") + } + fn visit_borrowed_str(self, v: &'de str) -> std::result::Result where - D: serde::Deserializer<'de>, + E: serde::de::Error, { - struct MutFieldIdMapVisitor<'a, Mapper: MutFieldIdMapper>(&'a mut Mapper); - impl<'de, 'a, Mapper: MutFieldIdMapper> serde::de::Visitor<'de> - for MutFieldIdMapVisitor<'a, Mapper> - { - type Value = (Option, &'a mut Mapper); + Ok((self.0.insert(v), self.0)) + } - fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { - write!(formatter, "expecting a string") - } - fn visit_borrowed_str(self, v: &'de str) -> std::result::Result - where - E: serde::de::Error, - { - Ok((self.0.insert(v), self.0)) - } - - fn visit_str(self, v: &str) -> std::result::Result - where - E: serde::de::Error, - { - Ok((self.0.insert(v), self.0)) - } - } - deserializer.deserialize_str(MutFieldIdMapVisitor(self.0)) + fn visit_str(self, v: &str) -> std::result::Result + where + E: serde::de::Error, + { + Ok((self.0.insert(v), self.0)) } } -struct FieldIdMapSeed<'a, Mapper: FieldIdMapper>(&'a Mapper); +pub struct FieldIdMapVisitor<'a, Mapper: FieldIdMapper>(pub &'a Mapper); -impl<'de, 'a, Mapper: FieldIdMapper> serde::de::DeserializeSeed<'de> - for FieldIdMapSeed<'a, Mapper> -{ +impl<'de, 'a, Mapper: FieldIdMapper> Visitor<'de> for FieldIdMapVisitor<'a, Mapper> { type Value = Option; - fn deserialize(self, deserializer: D) 
-> std::result::Result + fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(formatter, "expecting a string") + } + fn visit_borrowed_str(self, v: &'de str) -> std::result::Result where - D: serde::Deserializer<'de>, + E: serde::de::Error, { - struct FieldIdMapVisitor<'a, Mapper: FieldIdMapper>(&'a Mapper); - impl<'de, 'a, Mapper: FieldIdMapper> serde::de::Visitor<'de> for FieldIdMapVisitor<'a, Mapper> { - type Value = Option; + Ok(self.0.id(v)) + } - fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { - write!(formatter, "expecting a string") - } - fn visit_borrowed_str(self, v: &'de str) -> std::result::Result - where - E: serde::de::Error, - { - Ok(self.0.id(v)) - } - - fn visit_str(self, v: &str) -> std::result::Result - where - E: serde::de::Error, - { - Ok(self.0.id(v)) - } - } - deserializer.deserialize_str(FieldIdMapVisitor(self.0)) + fn visit_str(self, v: &str) -> std::result::Result + where + E: serde::de::Error, + { + Ok(self.0.id(v)) } } - pub struct DocumentIdVisitor<'indexer>(pub &'indexer Bump); -impl<'de, 'indexer: 'de> serde::de::Visitor<'de> for DocumentIdVisitor<'indexer> { +impl<'de, 'indexer: 'de> Visitor<'de> for DocumentIdVisitor<'indexer> { type Value = std::result::Result, DocumentIdExtractionError>; fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { @@ -262,13 +256,15 @@ impl<'de, 'indexer: 'de> serde::de::Visitor<'de> for DocumentIdVisitor<'indexer> .map(DeOrBumpStr::De)) } - fn visit_str(self, v: &str) -> std::result::Result + fn visit_str(self, v: &str) -> Result where E: serde::de::Error, { let v = self.0.alloc_str(v); - self.visit_borrowed_str(v)?; - Ok(Ok(DeOrBumpStr::Bump(v))) + Ok(match self.visit_borrowed_str(v)? { + Ok(_) => Ok(DeOrBumpStr::Bump(v)), + Err(err) => Err(err), + }) } fn visit_u64(self, v: u64) -> std::result::Result @@ -288,8 +284,45 @@ impl<'de, 'indexer: 'de> serde::de::Visitor<'de> for DocumentIdVisitor<'indexer> { use std::fmt::Write as _; - let mut out = bumpalo::collections::String::new_in(&self.0); - write!(&mut out, "{v}"); + let mut out = bumpalo::collections::String::new_in(self.0); + write!(&mut out, "{v}").unwrap(); Ok(Ok(DeOrBumpStr::Bump(out.into_bump_str()))) } } + +pub fn match_component<'de, 'indexer: 'de>( + first_level_name: &str, + right: &str, + value: &'de RawValue, + bump: &'indexer Bump, + docid: &mut Option>, +) -> ControlFlow, ()> { + if first_level_name.is_empty() { + return ControlFlow::Continue(()); + } + + let value = if right.is_empty() { + match value.deserialize_any(DocumentIdVisitor(bump)).map_err(|_err| { + DocumentIdExtractionError::InvalidDocumentId(UserError::InvalidDocumentId { + document_id: serde_json::to_value(value).unwrap(), + }) + }) { + Ok(Ok(value)) => value, + Ok(Err(err)) | Err(err) => return ControlFlow::Break(Ok(err)), + } + } else { + // if right is not empty, recursively extract right components from value + let res = value.deserialize_map(NestedPrimaryKeyVisitor { components: right, bump }); + match res { + Ok(Ok(Some(value))) => value, + Ok(Ok(None)) => return ControlFlow::Continue(()), + Ok(Err(err)) => return ControlFlow::Break(Ok(err)), + Err(err) if err.is_data() => return ControlFlow::Continue(()), // we expected the field to be a map, but it was not and that's OK. 
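The `ControlFlow<Result<…>, ()>` convention used by `match_component`, shown here with simplified types: `Continue(())` means the field is not part of the primary key and scanning goes on, `Break(Ok(_))` carries an id-level problem such as a duplicated primary key, `Break(Err(_))` carries a lower-level parse failure, and a successfully found id travels through the `&mut Option` slot. This is an illustrative sketch, not the milli types.

```rust
use std::num::ParseIntError;
use std::ops::ControlFlow;

/// Continue: field is irrelevant. Break(Ok(_)): an id-level problem.
/// Break(Err(_)): a parse failure. A found id is written through `slot`.
fn match_field(
    key: &str,
    value: &str,
    slot: &mut Option<u64>,
) -> ControlFlow<Result<&'static str, ParseIntError>, ()> {
    if key != "id" {
        return ControlFlow::Continue(());
    }
    let id = match value.parse::<u64>() {
        Ok(id) => id,
        Err(e) => return ControlFlow::Break(Err(e)),
    };
    if slot.replace(id).is_some() {
        // Seeing the primary key twice is reported as an id-level error,
        // not as a parse failure.
        return ControlFlow::Break(Ok("too many document ids"));
    }
    ControlFlow::Continue(())
}

fn main() {
    let mut docid = None;
    for (key, value) in [("title", "Hello"), ("id", "42"), ("body", "...")] {
        match match_field(key, value, &mut docid) {
            ControlFlow::Continue(()) => continue,
            ControlFlow::Break(outcome) => panic!("stopped early: {outcome:?}"),
        }
    }
    assert_eq!(docid, Some(42));
}
```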
+ Err(err) => return ControlFlow::Break(Err(err)), + } + }; + if let Some(_previous_value) = docid.replace(value) { + return ControlFlow::Break(Ok(DocumentIdExtractionError::TooManyDocumentIds(2))); + } + ControlFlow::Continue(()) +} From 152683083b20b8ef388f7409f79151ccd1e30d25 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 15 Oct 2024 14:08:24 +0200 Subject: [PATCH 147/247] Change document operation to use method in primary key --- .../update/new/indexer/document_operation.rs | 35 ++++--------------- 1 file changed, 7 insertions(+), 28 deletions(-) diff --git a/milli/src/update/new/indexer/document_operation.rs b/milli/src/update/new/indexer/document_operation.rs index fcab6773a..57ec46a41 100644 --- a/milli/src/update/new/indexer/document_operation.rs +++ b/milli/src/update/new/indexer/document_operation.rs @@ -8,13 +8,12 @@ use IndexDocumentsMethod as Idm; use super::super::document_change::DocumentChange; use super::document_changes::{DocumentChangeContext, DocumentChanges, MostlySend}; -use crate::documents::{DocumentIdExtractionError, PrimaryKey}; +use crate::documents::PrimaryKey; use crate::update::new::document::DocumentFromVersions; use crate::update::new::document_change::Versions; -use crate::update::new::indexer::de::FieldAndDocidExtractor; use crate::update::new::{Deletion, Insertion, Update}; use crate::update::{AvailableIds, IndexDocumentsMethod}; -use crate::{external_documents_ids, DocumentId, Error, FieldsIdsMap, Index, Result, UserError}; +use crate::{DocumentId, Error, FieldsIdsMap, Index, Result, UserError}; pub struct DocumentOperation<'pl> { operations: Vec>, @@ -77,7 +76,6 @@ impl<'pl> DocumentOperation<'pl> { primary_key: &PrimaryKey, new_fields_ids_map: &mut FieldsIdsMap, ) -> Result> { - use serde::de::Deserializer; // will contain nodes from the intermediate hashmap let document_changes_alloc = Bump::with_capacity(1024 * 1024 * 1024); // 1 MiB @@ -97,30 +95,11 @@ impl<'pl> DocumentOperation<'pl> { while let Some(document) = iter.next().transpose().map_err(UserError::SerdeJson)? 
{ - let res = document - .deserialize_map(FieldAndDocidExtractor::new( - new_fields_ids_map, - primary_key, - indexer, - )) - .map_err(UserError::SerdeJson)?; - - let external_document_id = match res { - Ok(document_id) => Ok(document_id), - Err(DocumentIdExtractionError::InvalidDocumentId(e)) => Err(e), - Err(DocumentIdExtractionError::MissingDocumentId) => { - Err(UserError::MissingDocumentId { - primary_key: primary_key.name().to_string(), - document: serde_json::from_str(document.get()).unwrap(), - }) - } - Err(DocumentIdExtractionError::TooManyDocumentIds(_)) => { - Err(UserError::TooManyDocumentIds { - primary_key: primary_key.name().to_string(), - document: serde_json::from_str(document.get()).unwrap(), - }) - } - }?; + let external_document_id = primary_key.extract_fields_and_docid( + document, + new_fields_ids_map, + indexer, + )?; let external_document_id = external_document_id.to_de(); From 017757004eed25d1b6a94d6310e07193a5991577 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 16 Oct 2024 09:26:18 +0200 Subject: [PATCH 148/247] Add PrimaryKey::new_or_insert --- milli/src/documents/primary_key.rs | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/milli/src/documents/primary_key.rs b/milli/src/documents/primary_key.rs index 904109033..79fd07048 100644 --- a/milli/src/documents/primary_key.rs +++ b/milli/src/documents/primary_key.rs @@ -65,6 +65,18 @@ impl<'a> PrimaryKey<'a> { }) } + pub fn new_or_insert( + path: &'a str, + fields: &mut impl MutFieldIdMapper, + ) -> StdResult { + Ok(if path.contains(PRIMARY_KEY_SPLIT_SYMBOL) { + Self::Nested { name: path } + } else { + let field_id = fields.insert(path).ok_or(UserError::AttributeLimitReached)?; + Self::Flat { name: path, field_id } + }) + } + pub fn name(&self) -> &'a str { match self { PrimaryKey::Flat { name, .. } => name, From f9a6c624a715c66ea0e79efb68c80505b6e22338 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 16 Oct 2024 09:27:00 +0200 Subject: [PATCH 149/247] Put primary key, and use provided key in operation --- milli/src/update/new/indexer/mod.rs | 78 +++++++++++++++++++++-------- 1 file changed, 58 insertions(+), 20 deletions(-) diff --git a/milli/src/update/new/indexer/mod.rs b/milli/src/update/new/indexer/mod.rs index 4f56b52b1..5f0face8b 100644 --- a/milli/src/update/new/indexer/mod.rs +++ b/milli/src/update/new/indexer/mod.rs @@ -106,6 +106,7 @@ pub fn index<'pl, 'indexer, 'index, DC>( index: &'index Index, db_fields_ids_map: &'indexer FieldsIdsMap, new_fields_ids_map: FieldsIdsMap, + new_primary_key: Option>, pool: &ThreadPool, document_changes: &DC, ) -> Result<()> @@ -282,6 +283,10 @@ where let fields_ids_map = new_fields_ids_map.into_inner().unwrap(); index.put_fields_ids_map(wtxn, &fields_ids_map)?; + if let Some(new_primary_key) = new_primary_key { + index.put_primary_key(wtxn, new_primary_key.name())?; + } + // used to update the localized and weighted maps while sharing the update code with the settings pipeline. let mut inner_index_settings = InnerIndexSettings::from_index(index, wtxn)?; inner_index_settings.recompute_facets(wtxn, index)?; @@ -365,23 +370,48 @@ fn extract_and_send_docids< Ok(()) } -/// Returns the primary key *field id* that has already been set for this index or the -/// one we will guess by searching for the first key that contains "id" as a substring. 
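A pure-function sketch of the decision table implemented by the rewritten `retrieve_or_guess_primary_key`: keep the primary key already declared in the index (replacing it is only allowed while the index is empty), otherwise use the one given by the operation, otherwise guess it from the first document and fail on zero or several candidates. The guessing rule used here, a field name containing `id`, is a simplification; the exact heuristic and error types live in milli.

```rust
#[derive(Debug, PartialEq)]
enum PkError {
    CannotBeChanged(String),
    NoCandidateFound,
    MultipleCandidatesFound(Vec<String>),
}

/// Returns the primary key to use and whether it has to be written back to the index.
fn resolve_primary_key(
    pk_in_db: Option<&str>,
    pk_from_op: Option<&str>,
    index_is_empty: bool,
    first_document_keys: &[&str],
) -> Result<(String, bool), PkError> {
    match (pk_in_db, pk_from_op) {
        // An already-declared primary key may only be replaced while the index is empty.
        (Some(db), Some(op)) if db != op => {
            if index_is_empty {
                Ok((op.to_string(), true))
            } else {
                Err(PkError::CannotBeChanged(db.to_string()))
            }
        }
        (Some(db), _) => Ok((db.to_string(), false)),
        (None, Some(op)) => Ok((op.to_string(), true)),
        // No primary key anywhere: guess it from the first document.
        (None, None) => {
            let mut guesses: Vec<String> = first_document_keys
                .iter()
                .filter(|key| key.to_lowercase().contains("id"))
                .map(|key| key.to_string())
                .collect();
            guesses.sort_unstable();
            match guesses.as_slice() {
                [] => Err(PkError::NoCandidateFound),
                [one] => Ok((one.clone(), true)),
                several => Err(PkError::MultipleCandidatesFound(several.to_vec())),
            }
        }
    }
}

fn main() {
    // Guessing succeeds with exactly one candidate.
    assert_eq!(
        resolve_primary_key(None, None, true, &["title", "product_id"]),
        Ok(("product_id".to_string(), true))
    );
    // Changing the primary key of a non-empty index is refused.
    assert_eq!(
        resolve_primary_key(Some("id"), Some("uid"), false, &[]),
        Err(PkError::CannotBeChanged("id".to_string()))
    );
}
```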
+/// Returns the primary key that has already been set for this index or the +/// one we will guess by searching for the first key that contains "id" as a substring, +/// and whether the primary key changed /// TODO move this elsewhere pub fn retrieve_or_guess_primary_key<'a>( rtxn: &'a RoTxn<'a>, index: &Index, - fields_ids_map: &mut FieldsIdsMap, - first_document: Option<&'a TopLevelMap<'_>>, -) -> Result, UserError>> { - match index.primary_key(rtxn)? { - Some(primary_key) => match PrimaryKey::new(primary_key, fields_ids_map) { - Some(primary_key) => Ok(Ok(primary_key)), - None => unreachable!("Why is the primary key not in the fidmap?"), - }, - None => { + new_fields_ids_map: &mut FieldsIdsMap, + primary_key_from_op: Option<&'a str>, + first_document: Option<&'a TopLevelMap<'a>>, +) -> Result, bool), UserError>> { + // make sure that we have a declared primary key, either fetching it from the index or attempting to guess it. + + // do we have an existing declared primary key? + let (primary_key, has_changed) = if let Some(primary_key_from_db) = index.primary_key(rtxn)? { + // did we request a primary key in the operation? + match primary_key_from_op { + // we did, and it is different from the DB one + Some(primary_key_from_op) if primary_key_from_op != primary_key_from_db => { + // is the index empty? + if index.number_of_documents(rtxn)? == 0 { + // change primary key + (primary_key_from_op, true) + } else { + return Ok(Err(UserError::PrimaryKeyCannotBeChanged( + primary_key_from_db.to_string(), + ))); + } + } + _ => (primary_key_from_db, false), + } + } else { + // no primary key in the DB => let's set one + // did we request a primary key in the operation? + let primary_key = if let Some(primary_key_from_op) = primary_key_from_op { + // set primary key from operation + primary_key_from_op + } else { + // guess primary key let first_document = match first_document { Some(document) => document, + // previous indexer when no pk is set + we send an empty payload => index_primary_key_no_candidate_found None => return Ok(Err(UserError::NoPrimaryKeyCandidateFound)), }; @@ -395,18 +425,26 @@ pub fn retrieve_or_guess_primary_key<'a>( guesses.sort_unstable(); match guesses.as_slice() { - [] => Ok(Err(UserError::NoPrimaryKeyCandidateFound)), + [] => return Ok(Err(UserError::NoPrimaryKeyCandidateFound)), [name] => { tracing::info!("Primary key was not specified in index. 
Inferred to '{name}'"); - match fields_ids_map.insert(name) { - Some(field_id) => Ok(Ok(PrimaryKey::Flat { name, field_id })), - None => Ok(Err(UserError::AttributeLimitReached)), - } + *name + } + multiple => { + return Ok(Err(UserError::MultiplePrimaryKeyCandidatesFound { + candidates: multiple + .iter() + .map(|candidate| candidate.to_string()) + .collect(), + })) } - multiple => Ok(Err(UserError::MultiplePrimaryKeyCandidatesFound { - candidates: multiple.iter().map(|candidate| candidate.to_string()).collect(), - })), } - } + }; + (primary_key, true) + }; + + match PrimaryKey::new_or_insert(primary_key, new_fields_ids_map) { + Ok(primary_key) => Ok(Ok((primary_key, has_changed))), + Err(err) => Ok(Err(err)), } } From 198238687fcbfbd6ce3a247f54d41a1b820180d7 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 16 Oct 2024 09:27:18 +0200 Subject: [PATCH 150/247] Guess and retrieve primary key correctly in batch --- index-scheduler/src/batch.rs | 72 ++++++++++++++++++++++++------------ 1 file changed, 48 insertions(+), 24 deletions(-) diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs index 328a5aed7..14bbcfe53 100644 --- a/index-scheduler/src/batch.rs +++ b/index-scheduler/src/batch.rs @@ -28,7 +28,7 @@ use bumpalo::Bump; use dump::IndexMetadata; use meilisearch_types::error::Code; use meilisearch_types::heed::{RoTxn, RwTxn}; -use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader}; +use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader, PrimaryKey}; use meilisearch_types::milli::heed::CompactionOption; use meilisearch_types::milli::update::new::indexer::{ self, retrieve_or_guess_primary_key, UpdateByFunction, @@ -1253,7 +1253,6 @@ impl IndexScheduler { mut tasks, } => { let started_processing_at = std::time::Instant::now(); - let primary_key_has_been_set = false; let must_stop_processing = self.must_stop_processing.clone(); let indexer_config = self.index_mapper.indexer_config(); // TODO: at some point, for better efficiency we might want to reuse the bumpalo for successive batches. @@ -1261,7 +1260,6 @@ impl IndexScheduler { // to a fresh thread. /// TODO manage errors correctly - let rtxn = index.read_txn()?; let first_addition_uuid = operations .iter() .find_map(|op| match op { @@ -1281,6 +1279,7 @@ impl IndexScheduler { } } + let rtxn = index.read_txn()?; let db_fields_ids_map = index.fields_ids_map(&rtxn)?; let mut new_fields_ids_map = db_fields_ids_map.clone(); @@ -1292,13 +1291,14 @@ impl IndexScheduler { None => None, }; - let primary_key = retrieve_or_guess_primary_key( + let (primary_key, primary_key_has_been_set) = retrieve_or_guess_primary_key( &rtxn, index, &mut new_fields_ids_map, + primary_key.as_deref(), first_document.as_ref(), )? - .unwrap(); + .map_err(milli::Error::from)?; let mut content_files_iter = content_files.iter(); let mut indexer = indexer::DocumentOperation::new(method); @@ -1373,6 +1373,7 @@ impl IndexScheduler { index, &db_fields_ids_map, new_fields_ids_map, + primary_key_has_been_set.then_some(primary_key), &pool, &document_changes, )?; @@ -1417,12 +1418,40 @@ impl IndexScheduler { Some(Err(e)) => return Err(e.into()), }; + let (original_filter, context, function) = if let Some(Details::DocumentEdition { + original_filter, + context, + function, + .. 
+ }) = task.details + { + (original_filter, context, function) + } else { + // In the case of a `documentEdition` the details MUST be set + unreachable!(); + }; + + if candidates.is_empty() { + task.status = Status::Succeeded; + task.details = Some(Details::DocumentEdition { + original_filter, + context, + function, + deleted_documents: Some(0), + edited_documents: Some(0), + }); + + return Ok(vec![task]); + } + let rtxn = index.read_txn()?; let db_fields_ids_map = index.fields_ids_map(&rtxn)?; let mut new_fields_ids_map = db_fields_ids_map.clone(); - let primary_key = - retrieve_or_guess_primary_key(&rtxn, index, &mut new_fields_ids_map, None)? - .unwrap(); + // candidates not empty => index not empty => a primary key is set + let primary_key = index.primary_key(&rtxn)?.unwrap(); + + let primary_key = PrimaryKey::new_or_insert(primary_key, &mut new_fields_ids_map) + .map_err(milli::Error::from)?; let result_count = Ok((candidates.len(), candidates.len())) as Result<_>; @@ -1439,6 +1468,7 @@ impl IndexScheduler { index, &db_fields_ids_map, new_fields_ids_map, + None, // cannot change primary key in DocumentEdition &pool, &document_changes, )?; @@ -1446,19 +1476,6 @@ impl IndexScheduler { // tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done"); } - let (original_filter, context, function) = if let Some(Details::DocumentEdition { - original_filter, - context, - function, - .. - }) = task.details - { - (original_filter, context, function) - } else { - // In the case of a `documentEdition` the details MUST be set - unreachable!(); - }; - match result_count { Ok((deleted_documents, edited_documents)) => { task.status = Status::Succeeded; @@ -1559,13 +1576,19 @@ impl IndexScheduler { } } + if to_delete.is_empty() { + return Ok(tasks); + } + let rtxn = index.read_txn()?; let db_fields_ids_map = index.fields_ids_map(&rtxn)?; let mut new_fields_ids_map = db_fields_ids_map.clone(); - let primary_key = - retrieve_or_guess_primary_key(&rtxn, index, &mut new_fields_ids_map, None)? 
- .unwrap(); + // to_delete not empty => index not empty => primary key set + let primary_key = index.primary_key(&rtxn)?.unwrap(); + + let primary_key = PrimaryKey::new_or_insert(primary_key, &mut new_fields_ids_map) + .map_err(milli::Error::from)?; if !tasks.iter().all(|res| res.error.is_some()) { /// TODO create a pool if needed @@ -1581,6 +1604,7 @@ impl IndexScheduler { index, &db_fields_ids_map, new_fields_ids_map, + None, // document deletion never changes primary key &pool, &document_changes, )?; From c75de1f39151cf88f996e343cdea899bd460fee9 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 16 Oct 2024 11:18:59 +0200 Subject: [PATCH 151/247] Remove TODO --- milli/src/update/new/indexer/document_deletion.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/milli/src/update/new/indexer/document_deletion.rs b/milli/src/update/new/indexer/document_deletion.rs index 7744bcf18..a9628f419 100644 --- a/milli/src/update/new/indexer/document_deletion.rs +++ b/milli/src/update/new/indexer/document_deletion.rs @@ -73,8 +73,6 @@ impl<'pl> DocumentChanges<'pl> for DocumentDeletionChanges<'pl> { } } -// TODO: implement Allocator for Ref<'bump, Bump> - #[cfg(test)] mod test { use std::cell::RefCell; From 86a0097311803394fc6a5bc01b820b617deaf5bc Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 15 Oct 2024 12:04:20 +0200 Subject: [PATCH 152/247] Use bumpalo in word docids --- .../extract/searchable/extract_word_docids.rs | 58 +++++++++++-------- 1 file changed, 34 insertions(+), 24 deletions(-) diff --git a/milli/src/update/new/extract/searchable/extract_word_docids.rs b/milli/src/update/new/extract/searchable/extract_word_docids.rs index fd74cc8ce..66f7ae5e8 100644 --- a/milli/src/update/new/extract/searchable/extract_word_docids.rs +++ b/milli/src/update/new/extract/searchable/extract_word_docids.rs @@ -1,9 +1,11 @@ use std::cell::RefCell; use std::collections::HashMap; use std::fs::File; +use std::mem::size_of; use std::num::NonZero; use std::ops::DerefMut as _; +use bumpalo::collections::vec::Vec as BumpVec; use bumpalo::Bump; use grenad::{Merger, MergerBuilder}; use heed::RoTxn; @@ -113,30 +115,33 @@ impl WordDocidsCachedSorters { word: &str, exact: bool, docid: u32, - buffer: &mut Vec, + bump: &Bump, ) -> Result<()> { - let key = word.as_bytes(); + let word_bytes = word.as_bytes(); if exact { - self.exact_word_docids.insert_add_u32(key, docid)?; + self.exact_word_docids.insert_add_u32(word_bytes, docid)?; } else { - self.word_docids.insert_add_u32(key, docid)?; + self.word_docids.insert_add_u32(word_bytes, docid)?; } + let buffer_size = word_bytes.len() + 1 + size_of::(); + let mut buffer = BumpVec::with_capacity_in(buffer_size, bump); + buffer.clear(); - buffer.extend_from_slice(word.as_bytes()); + buffer.extend_from_slice(word_bytes); buffer.push(0); buffer.extend_from_slice(&field_id.to_be_bytes()); - self.word_fid_docids.insert_add_u32(buffer, docid)?; + self.word_fid_docids.insert_add_u32(&buffer, docid)?; let position = bucketed_position(position); buffer.clear(); - buffer.extend_from_slice(word.as_bytes()); + buffer.extend_from_slice(word_bytes); buffer.push(0); buffer.extend_from_slice(&position.to_be_bytes()); - self.word_position_docids.insert_add_u32(buffer, docid)?; + self.word_position_docids.insert_add_u32(&buffer, docid)?; if self.current_docid.map_or(false, |id| docid != id) { - self.flush_fid_word_count(buffer)?; + self.flush_fid_word_count(&mut buffer)?; } self.fid_word_count @@ -155,30 +160,33 @@ impl WordDocidsCachedSorters { word: &str, exact: bool, docid: u32, 
- buffer: &mut Vec, + bump: &Bump, ) -> Result<()> { - let key = word.as_bytes(); + let word_bytes = word.as_bytes(); if exact { - self.exact_word_docids.insert_del_u32(key, docid)?; + self.exact_word_docids.insert_del_u32(word_bytes, docid)?; } else { - self.word_docids.insert_del_u32(key, docid)?; + self.word_docids.insert_del_u32(word_bytes, docid)?; } + let buffer_size = word_bytes.len() + 1 + size_of::(); + let mut buffer = BumpVec::with_capacity_in(buffer_size, bump); + buffer.clear(); - buffer.extend_from_slice(word.as_bytes()); + buffer.extend_from_slice(word_bytes); buffer.push(0); buffer.extend_from_slice(&field_id.to_be_bytes()); - self.word_fid_docids.insert_del_u32(buffer, docid)?; + self.word_fid_docids.insert_del_u32(&buffer, docid)?; let position = bucketed_position(position); buffer.clear(); - buffer.extend_from_slice(word.as_bytes()); + buffer.extend_from_slice(word_bytes); buffer.push(0); buffer.extend_from_slice(&position.to_be_bytes()); - self.word_position_docids.insert_del_u32(buffer, docid)?; + self.word_position_docids.insert_del_u32(&buffer, docid)?; if self.current_docid.map_or(false, |id| docid != id) { - self.flush_fid_word_count(buffer)?; + self.flush_fid_word_count(&mut buffer)?; } self.fid_word_count @@ -190,7 +198,7 @@ impl WordDocidsCachedSorters { Ok(()) } - fn flush_fid_word_count(&mut self, buffer: &mut Vec) -> Result<()> { + fn flush_fid_word_count(&mut self, buffer: &mut BumpVec) -> Result<()> { for (fid, (current_count, new_count)) in self.fid_word_count.drain() { if current_count != new_count { if current_count <= MAX_COUNTED_WORDS { @@ -415,11 +423,11 @@ impl WordDocidsExtractors { let cached_sorter = cached_sorter.deref_mut(); let mut new_fields_ids_map = context.new_fields_ids_map.borrow_mut(); let new_fields_ids_map = new_fields_ids_map.deref_mut(); + let doc_alloc = &context.doc_alloc; let exact_attributes = index.exact_attributes(rtxn)?; let is_exact_attribute = |fname: &str| exact_attributes.iter().any(|attr| contained_in(fname, attr)); - let mut buffer = Vec::new(); match document_change { DocumentChange::Deletion(inner) => { let mut token_fn = |fname: &str, fid, pos, word: &str| { @@ -430,7 +438,7 @@ impl WordDocidsExtractors { word, is_exact_attribute(fname), inner.docid(), - &mut buffer, + doc_alloc, ) .map_err(crate::Error::from) }; @@ -449,7 +457,7 @@ impl WordDocidsExtractors { word, is_exact_attribute(fname), inner.docid(), - &mut buffer, + doc_alloc, ) .map_err(crate::Error::from) }; @@ -467,7 +475,7 @@ impl WordDocidsExtractors { word, is_exact_attribute(fname), inner.docid(), - &mut buffer, + doc_alloc, ) .map_err(crate::Error::from) }; @@ -486,7 +494,7 @@ impl WordDocidsExtractors { word, is_exact_attribute(fname), inner.docid(), - &mut buffer, + doc_alloc, ) .map_err(crate::Error::from) }; @@ -498,6 +506,8 @@ impl WordDocidsExtractors { } } + let buffer_size = size_of::(); + let mut buffer = BumpVec::with_capacity_in(buffer_size, &context.doc_alloc); cached_sorter.flush_fid_word_count(&mut buffer) } From 0647f75e6b6edba4154767133cd355dcd3f13095 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 16 Oct 2024 17:36:41 +0200 Subject: [PATCH 153/247] Add borrow_mut_or_yield extension method --- .../new/extract/faceted/extract_facets.rs | 6 +-- .../extract/searchable/extract_word_docids.rs | 6 +-- .../extract_word_pair_proximity_docids.rs | 6 +-- .../update/new/indexer/document_changes.rs | 47 ++++++++++++++++++- milli/src/update/new/indexer/mod.rs | 5 +- milli/src/update/new/indexer/partial_dump.rs | 4 +- 
.../update/new/indexer/update_by_function.rs | 4 +- 7 files changed, 61 insertions(+), 17 deletions(-) diff --git a/milli/src/update/new/extract/faceted/extract_facets.rs b/milli/src/update/new/extract/faceted/extract_facets.rs index 14cc28da4..82f80c7b5 100644 --- a/milli/src/update/new/extract/faceted/extract_facets.rs +++ b/milli/src/update/new/extract/faceted/extract_facets.rs @@ -17,7 +17,7 @@ use crate::facet::value_encoding::f64_into_bytes; use crate::update::new::extract::DocidsExtractor; use crate::update::new::indexer::document_changes::{ for_each_document_change, DocumentChangeContext, DocumentChanges, Extractor, FullySend, - IndexingContext, ThreadLocal, + IndexingContext, RefCellExt, ThreadLocal, }; use crate::update::new::DocumentChange; use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps}; @@ -71,8 +71,8 @@ impl FacetedDocidsExtractor { ) -> Result<()> { let index = &context.index; let rtxn = &context.txn; - let mut new_fields_ids_map = context.new_fields_ids_map.borrow_mut(); - let mut cached_sorter = context.data.0.borrow_mut(); + let mut new_fields_ids_map = context.new_fields_ids_map.borrow_mut_or_yield(); + let mut cached_sorter = context.data.0.borrow_mut_or_yield(); match document_change { DocumentChange::Deletion(inner) => extract_document_facets( attributes_to_extract, diff --git a/milli/src/update/new/extract/searchable/extract_word_docids.rs b/milli/src/update/new/extract/searchable/extract_word_docids.rs index fd74cc8ce..5d70408bb 100644 --- a/milli/src/update/new/extract/searchable/extract_word_docids.rs +++ b/milli/src/update/new/extract/searchable/extract_word_docids.rs @@ -13,7 +13,7 @@ use crate::update::new::extract::cache::CboCachedSorter; use crate::update::new::extract::perm_json_p::contained_in; use crate::update::new::indexer::document_changes::{ for_each_document_change, DocumentChangeContext, DocumentChanges, Extractor, FullySend, - IndexingContext, ThreadLocal, + IndexingContext, RefCellExt, ThreadLocal, }; use crate::update::new::DocumentChange; use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps}; @@ -411,9 +411,9 @@ impl WordDocidsExtractors { ) -> Result<()> { let index = &context.index; let rtxn = &context.txn; - let mut cached_sorter = context.data.0.borrow_mut(); + let mut cached_sorter = context.data.0.borrow_mut_or_yield(); let cached_sorter = cached_sorter.deref_mut(); - let mut new_fields_ids_map = context.new_fields_ids_map.borrow_mut(); + let mut new_fields_ids_map = context.new_fields_ids_map.borrow_mut_or_yield(); let new_fields_ids_map = new_fields_ids_map.deref_mut(); let exact_attributes = index.exact_attributes(rtxn)?; diff --git a/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs b/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs index 86ede5b14..53e6515a9 100644 --- a/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs +++ b/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs @@ -9,7 +9,7 @@ use super::SearchableExtractor; use crate::proximity::{index_proximity, MAX_DISTANCE}; use crate::update::new::document::Document; use crate::update::new::extract::cache::CboCachedSorter; -use crate::update::new::indexer::document_changes::{DocumentChangeContext, FullySend}; +use crate::update::new::indexer::document_changes::{DocumentChangeContext, FullySend, RefCellExt}; use crate::update::new::DocumentChange; use crate::update::MergeDeladdCboRoaringBitmaps; use 
crate::{FieldId, GlobalFieldsIdsMap, Index, Result}; @@ -45,10 +45,10 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { let mut del_word_pair_proximity = bumpalo::collections::Vec::new_in(doc_alloc); let mut add_word_pair_proximity = bumpalo::collections::Vec::new_in(doc_alloc); - let mut new_fields_ids_map = context.new_fields_ids_map.borrow_mut(); + let mut new_fields_ids_map = context.new_fields_ids_map.borrow_mut_or_yield(); let new_fields_ids_map = &mut *new_fields_ids_map; - let mut cached_sorter = context.data.0.borrow_mut(); + let mut cached_sorter = context.data.0.borrow_mut_or_yield(); let cached_sorter = &mut *cached_sorter; // is a vecdequeue, and will be smol, so can stay on the heap for now diff --git a/milli/src/update/new/indexer/document_changes.rs b/milli/src/update/new/indexer/document_changes.rs index 18c7cdf02..5d9e7b3ba 100644 --- a/milli/src/update/new/indexer/document_changes.rs +++ b/milli/src/update/new/indexer/document_changes.rs @@ -1,4 +1,4 @@ -use std::cell::{Cell, RefCell}; +use std::cell::{Cell, Ref, RefCell, RefMut}; use std::sync::{Arc, RwLock}; use bumpalo::Bump; @@ -10,6 +10,49 @@ use super::super::document_change::DocumentChange; use crate::update::new::parallel_iterator_ext::ParallelIteratorExt as _; use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, Result}; +pub trait RefCellExt { + fn try_borrow_or_yield(&self) -> std::result::Result, std::cell::BorrowError>; + fn try_borrow_mut_or_yield( + &self, + ) -> std::result::Result, std::cell::BorrowMutError>; + + fn borrow_or_yield(&self) -> Ref<'_, T> { + self.try_borrow_or_yield().unwrap() + } + + fn borrow_mut_or_yield(&self) -> RefMut<'_, T> { + self.try_borrow_mut_or_yield().unwrap() + } +} + +impl RefCellExt for RefCell { + fn try_borrow_or_yield(&self) -> std::result::Result, std::cell::BorrowError> { + loop { + match self.try_borrow() { + Ok(borrow) => break Ok(borrow), + Err(error) => match rayon::yield_local() { + Some(rayon::Yield::Executed) => continue, + _ => return Err(error), + }, + } + } + } + + fn try_borrow_mut_or_yield( + &self, + ) -> std::result::Result, std::cell::BorrowMutError> { + loop { + match self.try_borrow_mut() { + Ok(borrow) => break Ok(borrow), + Err(error) => match rayon::yield_local() { + Some(rayon::Yield::Executed) => continue, + _ => return Err(error), + }, + } + } + } +} + /// A trait for types that are **not** [`Send`] only because they would then allow concurrent access to a type that is not [`Sync`]. /// /// The primary example of such a type is `&T`, with `T: !Sync`. 
@@ -245,7 +288,7 @@ impl< let fields_ids_map = &fields_ids_map.0; let extractor_alloc = extractor_allocs.get_or_default(); - let extractor_alloc = RefBump::new(extractor_alloc.0.borrow()); + let extractor_alloc = RefBump::new(extractor_alloc.0.borrow_or_yield()); let data = datastore.get_or_try(|| init_data(RefBump::clone(&extractor_alloc)))?; diff --git a/milli/src/update/new/indexer/mod.rs b/milli/src/update/new/indexer/mod.rs index 5f0face8b..29ff2685e 100644 --- a/milli/src/update/new/indexer/mod.rs +++ b/milli/src/update/new/indexer/mod.rs @@ -5,7 +5,8 @@ use std::thread::{self, Builder}; use big_s::S; use bumpalo::Bump; use document_changes::{ - for_each_document_change, DocumentChanges, Extractor, FullySend, IndexingContext, ThreadLocal, + for_each_document_change, DocumentChanges, Extractor, FullySend, IndexingContext, RefCellExt, + ThreadLocal, }; pub use document_deletion::DocumentDeletion; pub use document_operation::DocumentOperation; @@ -62,7 +63,7 @@ impl<'a, 'extractor> Extractor<'extractor> for DocumentExtractor<'a> { ) -> Result<()> { let mut document_buffer = Vec::new(); - let new_fields_ids_map = context.new_fields_ids_map.borrow(); + let new_fields_ids_map = context.new_fields_ids_map.borrow_or_yield(); let new_fields_ids_map = &*new_fields_ids_map; let new_fields_ids_map = new_fields_ids_map.local_map(); diff --git a/milli/src/update/new/indexer/partial_dump.rs b/milli/src/update/new/indexer/partial_dump.rs index 3b528d5e8..10fc95a03 100644 --- a/milli/src/update/new/indexer/partial_dump.rs +++ b/milli/src/update/new/indexer/partial_dump.rs @@ -5,7 +5,7 @@ use serde::Deserializer; use serde_json::value::RawValue; use super::de::FieldAndDocidExtractor; -use super::document_changes::{DocumentChangeContext, DocumentChanges, MostlySend}; +use super::document_changes::{DocumentChangeContext, DocumentChanges, MostlySend, RefCellExt}; use crate::documents::{DocumentIdExtractionError, PrimaryKey}; use crate::update::concurrent_available_ids::ConcurrentAvailableIds; use crate::update::new::document::DocumentFromVersions; @@ -63,7 +63,7 @@ where None => return Err(Error::UserError(UserError::DocumentLimitReached)), }; - let mut fields_ids_map = context.new_fields_ids_map.borrow_mut(); + let mut fields_ids_map = context.new_fields_ids_map.borrow_mut_or_yield(); let fields_ids_map = fields_ids_map.deref_mut(); let document = doc_alloc.alloc_str(document.get()); diff --git a/milli/src/update/new/indexer/update_by_function.rs b/milli/src/update/new/indexer/update_by_function.rs index 6f2914577..826f918a4 100644 --- a/milli/src/update/new/indexer/update_by_function.rs +++ b/milli/src/update/new/indexer/update_by_function.rs @@ -5,7 +5,7 @@ use rayon::iter::{IndexedParallelIterator, IntoParallelRefIterator, ParallelIter use rhai::{Dynamic, Engine, OptimizationLevel, Scope, AST}; use roaring::RoaringBitmap; -use super::document_changes::{DocumentChangeContext, MostlySend}; +use super::document_changes::{DocumentChangeContext, MostlySend, RefCellExt}; use super::DocumentChanges; use crate::documents::Error::InvalidDocumentFormat; use crate::documents::PrimaryKey; @@ -142,7 +142,7 @@ impl<'index> DocumentChanges<'index> for UpdateByFunctionChanges<'index> { // Future: Use a custom function rhai function to track changes. 
// if json_document != rhaimap_to_object(new_rhai_document) { - let mut global_fields_ids_map = new_fields_ids_map.borrow_mut(); + let mut global_fields_ids_map = new_fields_ids_map.borrow_mut_or_yield(); let new_document_id = self .primary_key .extract_fields_and_docid( From 07496336189c7556628c6f8876116fa9e4a88893 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 17 Oct 2024 09:30:18 +0200 Subject: [PATCH 154/247] Don't sort in parallel in sorters of the new indexer --- .../index_documents/extract/extract_docid_word_positions.rs | 1 + .../index_documents/extract/extract_facet_number_docids.rs | 1 + .../index_documents/extract/extract_facet_string_docids.rs | 4 ++++ .../extract/extract_fid_docid_facet_values.rs | 2 ++ .../index_documents/extract/extract_fid_word_count_docids.rs | 1 + .../update/index_documents/extract/extract_word_docids.rs | 3 +++ .../extract/extract_word_pair_proximity_docids.rs | 1 + .../index_documents/extract/extract_word_position_docids.rs | 1 + milli/src/update/index_documents/helpers/grenad_helpers.rs | 3 ++- milli/src/update/index_documents/transform.rs | 4 ++++ milli/src/update/new/extract/faceted/extract_facets.rs | 4 ++++ .../src/update/new/extract/searchable/extract_word_docids.rs | 5 +++++ milli/src/update/new/extract/searchable/mod.rs | 1 + milli/src/update/word_prefix_docids.rs | 1 + milli/src/update/words_prefix_integer_docids.rs | 1 + 15 files changed, 32 insertions(+), 1 deletion(-) diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index 716e4dd6b..b1e6f24be 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -40,6 +40,7 @@ pub fn extract_docid_word_positions( indexer.chunk_compression_level, indexer.max_nb_chunks, max_memory, + true, ); // initialize buffers. 
diff --git a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs index 8a5a93270..34bece989 100644 --- a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs @@ -32,6 +32,7 @@ pub fn extract_facet_number_docids( indexer.chunk_compression_level, indexer.max_nb_chunks, max_memory, + true, ); let mut buffer = Vec::new(); diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs index f7bdcbb56..e0d7e1386 100644 --- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs @@ -61,6 +61,7 @@ fn extract_facet_string_docids_document_update( indexer.chunk_compression_level, indexer.max_nb_chunks, max_memory.map(|m| m / 2), + true, ); let mut normalized_facet_string_docids_sorter = create_sorter( @@ -70,6 +71,7 @@ fn extract_facet_string_docids_document_update( indexer.chunk_compression_level, indexer.max_nb_chunks, max_memory.map(|m| m / 2), + true, ); let mut buffer = Vec::new(); @@ -149,6 +151,7 @@ fn extract_facet_string_docids_settings( indexer.chunk_compression_level, indexer.max_nb_chunks, max_memory.map(|m| m / 2), + true, ); let mut normalized_facet_string_docids_sorter = create_sorter( @@ -158,6 +161,7 @@ fn extract_facet_string_docids_settings( indexer.chunk_compression_level, indexer.max_nb_chunks, max_memory.map(|m| m / 2), + true, ); let mut buffer = Vec::new(); diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index f7f447ca9..047669521 100644 --- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -53,6 +53,7 @@ pub fn extract_fid_docid_facet_values( indexer.chunk_compression_level, indexer.max_nb_chunks, max_memory.map(|m| m / 2), + true, ); let mut fid_docid_facet_strings_sorter = create_sorter( @@ -62,6 +63,7 @@ pub fn extract_fid_docid_facet_values( indexer.chunk_compression_level, indexer.max_nb_chunks, max_memory.map(|m| m / 2), + true, ); // The tuples represents the Del and Add side for a bitmap diff --git a/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs index 784de5d94..5739a5e15 100644 --- a/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs +++ b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs @@ -35,6 +35,7 @@ pub fn extract_fid_word_count_docids( indexer.chunk_compression_level, indexer.max_nb_chunks, max_memory, + true, ); let mut key_buffer = Vec::new(); diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs index 70db9d759..829da768c 100644 --- a/milli/src/update/index_documents/extract/extract_word_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_docids.rs @@ -44,6 +44,7 @@ pub fn extract_word_docids( indexer.chunk_compression_level, indexer.max_nb_chunks, max_memory.map(|m| m / 3), + true, ); let mut key_buffer = Vec::new(); let mut del_words = BTreeSet::new(); @@ -98,6 +99,7 @@ pub fn extract_word_docids( 
indexer.chunk_compression_level, indexer.max_nb_chunks, max_memory.map(|m| m / 3), + true, ); let mut exact_word_docids_sorter = create_sorter( @@ -107,6 +109,7 @@ pub fn extract_word_docids( indexer.chunk_compression_level, indexer.max_nb_chunks, max_memory.map(|m| m / 3), + true, ); let mut iter = word_fid_docids_sorter.into_stream_merger_iter()?; diff --git a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs index 705a5c96f..6194da23d 100644 --- a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs @@ -49,6 +49,7 @@ pub fn extract_word_pair_proximity_docids( indexer.chunk_compression_level, indexer.max_nb_chunks, max_memory.map(|m| m / MAX_DISTANCE as usize), + true, ) }) .collect(); diff --git a/milli/src/update/index_documents/extract/extract_word_position_docids.rs b/milli/src/update/index_documents/extract/extract_word_position_docids.rs index bee510bfb..f870fbe1b 100644 --- a/milli/src/update/index_documents/extract/extract_word_position_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_position_docids.rs @@ -33,6 +33,7 @@ pub fn extract_word_position_docids( indexer.chunk_compression_level, indexer.max_nb_chunks, max_memory, + true, ); let mut del_word_positions: BTreeSet<(u16, Vec)> = BTreeSet::new(); diff --git a/milli/src/update/index_documents/helpers/grenad_helpers.rs b/milli/src/update/index_documents/helpers/grenad_helpers.rs index 1f8f7eddf..220567208 100644 --- a/milli/src/update/index_documents/helpers/grenad_helpers.rs +++ b/milli/src/update/index_documents/helpers/grenad_helpers.rs @@ -37,6 +37,7 @@ pub fn create_sorter( chunk_compression_level: Option, max_nb_chunks: Option, max_memory: Option, + sort_in_parallel: bool, ) -> grenad::Sorter { let mut builder = grenad::Sorter::builder(merge); builder.chunk_compression_type(chunk_compression_type); @@ -51,7 +52,7 @@ pub fn create_sorter( builder.allow_realloc(false); } builder.sort_algorithm(sort_algorithm); - builder.sort_in_parallel(true); + builder.sort_in_parallel(sort_in_parallel); builder.build() } diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 65007aa32..84135ff24 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -127,6 +127,7 @@ impl<'a, 'i> Transform<'a, 'i> { indexer_settings.chunk_compression_level, indexer_settings.max_nb_chunks, indexer_settings.max_memory.map(|mem| mem / 2), + true, ); // We initialize the sorter with the user indexing settings. 
@@ -137,6 +138,7 @@ impl<'a, 'i> Transform<'a, 'i> { indexer_settings.chunk_compression_level, indexer_settings.max_nb_chunks, indexer_settings.max_memory.map(|mem| mem / 2), + true, ); let documents_ids = index.documents_ids(wtxn)?; @@ -988,6 +990,7 @@ impl<'a, 'i> Transform<'a, 'i> { self.indexer_settings.chunk_compression_level, self.indexer_settings.max_nb_chunks, self.indexer_settings.max_memory.map(|mem| mem / 2), + true, )) } else { None @@ -1030,6 +1033,7 @@ impl<'a, 'i> Transform<'a, 'i> { self.indexer_settings.chunk_compression_level, self.indexer_settings.max_nb_chunks, self.indexer_settings.max_memory.map(|mem| mem / 2), + true, )) } else { None diff --git a/milli/src/update/new/extract/faceted/extract_facets.rs b/milli/src/update/new/extract/faceted/extract_facets.rs index 82f80c7b5..9f3ed18d8 100644 --- a/milli/src/update/new/extract/faceted/extract_facets.rs +++ b/milli/src/update/new/extract/faceted/extract_facets.rs @@ -46,6 +46,10 @@ impl<'extractor> Extractor<'extractor> for FacetedExtractorData<'extractor> { self.grenad_parameters.chunk_compression_level, self.grenad_parameters.max_nb_chunks, self.max_memory, + // *NOTE*: this must not be set to true: + // 1. we're already using max parallelism in the pool, so it wouldn't help + // 2. it creates correctness issues if it causes to yield a borrow-mut wielding task + false, ), )))) } diff --git a/milli/src/update/new/extract/searchable/extract_word_docids.rs b/milli/src/update/new/extract/searchable/extract_word_docids.rs index 5d70408bb..c76ab49d0 100644 --- a/milli/src/update/new/extract/searchable/extract_word_docids.rs +++ b/milli/src/update/new/extract/searchable/extract_word_docids.rs @@ -48,6 +48,7 @@ impl WordDocidsCachedSorters { indexer.chunk_compression_level, indexer.max_nb_chunks, max_memory, + false, ), ); let word_docids = CboCachedSorter::new( @@ -59,6 +60,7 @@ impl WordDocidsCachedSorters { indexer.chunk_compression_level, indexer.max_nb_chunks, max_memory, + false, ), ); let exact_word_docids = CboCachedSorter::new( @@ -70,6 +72,7 @@ impl WordDocidsCachedSorters { indexer.chunk_compression_level, indexer.max_nb_chunks, max_memory, + false, ), ); let word_position_docids = CboCachedSorter::new( @@ -81,6 +84,7 @@ impl WordDocidsCachedSorters { indexer.chunk_compression_level, indexer.max_nb_chunks, max_memory, + false, ), ); let fid_word_count_docids = CboCachedSorter::new( @@ -92,6 +96,7 @@ impl WordDocidsCachedSorters { indexer.chunk_compression_level, indexer.max_nb_chunks, max_memory, + false, ), ); diff --git a/milli/src/update/new/extract/searchable/mod.rs b/milli/src/update/new/extract/searchable/mod.rs index 1edeec8b4..8934ee892 100644 --- a/milli/src/update/new/extract/searchable/mod.rs +++ b/milli/src/update/new/extract/searchable/mod.rs @@ -50,6 +50,7 @@ impl<'extractor, EX: SearchableExtractor + Sync> Extractor<'extractor> self.grenad_parameters.chunk_compression_level, self.grenad_parameters.max_nb_chunks, self.max_memory, + false, ), )))) } diff --git a/milli/src/update/word_prefix_docids.rs b/milli/src/update/word_prefix_docids.rs index f683146cf..d129d485e 100644 --- a/milli/src/update/word_prefix_docids.rs +++ b/milli/src/update/word_prefix_docids.rs @@ -60,6 +60,7 @@ impl<'t, 'i> WordPrefixDocids<'t, 'i> { self.chunk_compression_level, self.max_nb_chunks, self.max_memory, + true, ); if !common_prefix_fst_words.is_empty() { diff --git a/milli/src/update/words_prefix_integer_docids.rs b/milli/src/update/words_prefix_integer_docids.rs index 28b9b1523..ff974b797 100644 --- 
a/milli/src/update/words_prefix_integer_docids.rs +++ b/milli/src/update/words_prefix_integer_docids.rs @@ -65,6 +65,7 @@ impl<'t, 'i> WordPrefixIntegerDocids<'t, 'i> { self.chunk_compression_level, self.max_nb_chunks, self.max_memory, + true, ); if !common_prefix_fst_words.is_empty() { From c1fcb2ebc6d27097c9faa39d917ceb34efa9d2f4 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 17 Oct 2024 09:43:11 +0200 Subject: [PATCH 155/247] add some warning --- .../update/new/indexer/document_changes.rs | 25 +++++++++++++------ 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/milli/src/update/new/indexer/document_changes.rs b/milli/src/update/new/indexer/document_changes.rs index 5d9e7b3ba..423ddbdcc 100644 --- a/milli/src/update/new/indexer/document_changes.rs +++ b/milli/src/update/new/indexer/document_changes.rs @@ -27,13 +27,17 @@ pub trait RefCellExt { impl RefCellExt for RefCell { fn try_borrow_or_yield(&self) -> std::result::Result, std::cell::BorrowError> { + /// TODO: move this trait and impl elsewhere loop { match self.try_borrow() { Ok(borrow) => break Ok(borrow), - Err(error) => match rayon::yield_local() { - Some(rayon::Yield::Executed) => continue, - _ => return Err(error), - }, + Err(error) => { + tracing::warn!("dynamic borrow failed, yielding to local tasks"); + match rayon::yield_local() { + Some(rayon::Yield::Executed) => continue, + _ => return Err(error), + } + } } } } @@ -44,10 +48,14 @@ impl RefCellExt for RefCell { loop { match self.try_borrow_mut() { Ok(borrow) => break Ok(borrow), - Err(error) => match rayon::yield_local() { - Some(rayon::Yield::Executed) => continue, - _ => return Err(error), - }, + Err(error) => { + tracing::warn!("dynamic borrow failed, yielding to local tasks"); + + match rayon::yield_local() { + Some(rayon::Yield::Executed) => continue, + _ => return Err(error), + } + } } } } @@ -168,6 +176,7 @@ impl ThreadLocal { where F: FnOnce() -> T, { + /// TODO: move ThreadLocal, MostlySend, FullySend to a dedicated file self.inner.get_or(|| unsafe { MostlySendWrapper::new(create()) }).as_ref() } From cd378e5bd2667621088c07c29183329a0aaa343f Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 16 Oct 2024 15:44:04 +0200 Subject: [PATCH 156/247] Add chunking --- .../new/extract/faceted/extract_facets.rs | 16 +++-- .../extract/searchable/extract_word_docids.rs | 12 ++-- .../src/update/new/extract/searchable/mod.rs | 12 ++-- .../update/new/indexer/document_changes.rs | 23 +++---- .../update/new/indexer/document_deletion.rs | 25 +++++--- .../update/new/indexer/document_operation.rs | 12 ++-- milli/src/update/new/indexer/mod.rs | 62 ++++++++++--------- milli/src/update/new/indexer/partial_dump.rs | 9 ++- .../update/new/indexer/update_by_function.rs | 20 +++--- 9 files changed, 115 insertions(+), 76 deletions(-) diff --git a/milli/src/update/new/extract/faceted/extract_facets.rs b/milli/src/update/new/extract/faceted/extract_facets.rs index 9f3ed18d8..9fae1839e 100644 --- a/milli/src/update/new/extract/faceted/extract_facets.rs +++ b/milli/src/update/new/extract/faceted/extract_facets.rs @@ -54,12 +54,20 @@ impl<'extractor> Extractor<'extractor> for FacetedExtractorData<'extractor> { )))) } - fn process( + fn process<'doc>( &self, - change: DocumentChange, - context: &crate::update::new::indexer::document_changes::DocumentChangeContext, + changes: impl Iterator>>, + context: &DocumentChangeContext, ) -> Result<()> { - FacetedDocidsExtractor::extract_document_change(context, self.attributes_to_extract, change) + for change in changes { + let 
change = change?; + FacetedDocidsExtractor::extract_document_change( + context, + self.attributes_to_extract, + change, + )? + } + Ok(()) } } diff --git a/milli/src/update/new/extract/searchable/extract_word_docids.rs b/milli/src/update/new/extract/searchable/extract_word_docids.rs index c76ab49d0..5eb9692d6 100644 --- a/milli/src/update/new/extract/searchable/extract_word_docids.rs +++ b/milli/src/update/new/extract/searchable/extract_word_docids.rs @@ -325,12 +325,16 @@ impl<'extractor> Extractor<'extractor> for WordDocidsExtractorData<'extractor> { )))) } - fn process( + fn process<'doc>( &self, - change: DocumentChange, - context: &crate::update::new::indexer::document_changes::DocumentChangeContext, + changes: impl Iterator>>, + context: &DocumentChangeContext, ) -> Result<()> { - WordDocidsExtractors::extract_document_change(context, self.tokenizer, change) + for change in changes { + let change = change?; + WordDocidsExtractors::extract_document_change(context, self.tokenizer, change)?; + } + Ok(()) } } diff --git a/milli/src/update/new/extract/searchable/mod.rs b/milli/src/update/new/extract/searchable/mod.rs index 8934ee892..dc429b1ba 100644 --- a/milli/src/update/new/extract/searchable/mod.rs +++ b/milli/src/update/new/extract/searchable/mod.rs @@ -55,12 +55,16 @@ impl<'extractor, EX: SearchableExtractor + Sync> Extractor<'extractor> )))) } - fn process( + fn process<'doc>( &self, - change: DocumentChange, - context: &crate::update::new::indexer::document_changes::DocumentChangeContext, + changes: impl Iterator>>, + context: &DocumentChangeContext, ) -> Result<()> { - EX::extract_document_change(context, self.tokenizer, change) + for change in changes { + let change = change?; + EX::extract_document_change(context, self.tokenizer, change)?; + } + Ok(()) } } diff --git a/milli/src/update/new/indexer/document_changes.rs b/milli/src/update/new/indexer/document_changes.rs index 423ddbdcc..91c65a6d1 100644 --- a/milli/src/update/new/indexer/document_changes.rs +++ b/milli/src/update/new/indexer/document_changes.rs @@ -323,7 +323,7 @@ pub trait Extractor<'extractor>: Sync { fn process<'doc>( &'doc self, - change: DocumentChange<'doc>, + changes: impl Iterator>>, context: &'doc DocumentChangeContext, ) -> Result<()>; } @@ -332,13 +332,13 @@ pub trait DocumentChanges<'pl // lifetime of the underlying payload >: Sync { type Item: Send; - fn iter(&self) -> impl IndexedParallelIterator; + fn iter(&self, chunk_size: usize) -> impl IndexedParallelIterator>; fn item_to_document_change<'doc, // lifetime of a single `process` call T: MostlySend>( &'doc self, context: &'doc DocumentChangeContext, - item: Self::Item, + item: &'doc Self::Item, ) -> Result>> where 'pl: 'doc // the payload must survive the process calls ; } @@ -356,6 +356,8 @@ pub struct IndexingContext< pub fields_ids_map_store: &'indexer ThreadLocal>>>, } +const CHUNK_SIZE: usize = 100; + pub fn for_each_document_change< 'pl, // covariant lifetime of the underlying payload 'extractor, // invariant lifetime of extractor_alloc @@ -386,7 +388,7 @@ where extractor_alloc.0.get_mut().reset(); } - let pi = document_changes.iter(); + let pi = document_changes.iter(CHUNK_SIZE); pi.try_arc_for_each_try_init( || { DocumentChangeContext::new( @@ -400,17 +402,16 @@ where move |index_alloc| extractor.init_data(index_alloc), ) }, - |context, item| { + |context, items| { // Clean up and reuse the document-specific allocator context.doc_alloc.reset(); - let Some(change) = - document_changes.item_to_document_change(context, item).map_err(Arc::new)? 
- else { - return Ok(()); - }; + let items = items.as_ref(); + let changes = items.iter().filter_map(|item| { + document_changes.item_to_document_change(context, item).transpose() + }); - let res = extractor.process(change, context).map_err(Arc::new); + let res = extractor.process(changes, context).map_err(Arc::new); // send back the doc_alloc in the pool context.doc_allocs.get_or_default().0.set(std::mem::take(&mut context.doc_alloc)); diff --git a/milli/src/update/new/indexer/document_deletion.rs b/milli/src/update/new/indexer/document_deletion.rs index a9628f419..bbd2b11ac 100644 --- a/milli/src/update/new/indexer/document_deletion.rs +++ b/milli/src/update/new/indexer/document_deletion.rs @@ -1,6 +1,7 @@ use bumpalo::collections::CollectIn; use bumpalo::Bump; -use rayon::iter::{IntoParallelIterator, ParallelIterator as _}; +use rayon::iter::IndexedParallelIterator; +use rayon::slice::ParallelSlice as _; use roaring::RoaringBitmap; use super::document_changes::{DocumentChangeContext, DocumentChanges, MostlySend}; @@ -44,8 +45,11 @@ pub struct DocumentDeletionChanges<'indexer> { impl<'pl> DocumentChanges<'pl> for DocumentDeletionChanges<'pl> { type Item = DocumentId; - fn iter(&self) -> impl rayon::prelude::IndexedParallelIterator { - self.to_delete.into_par_iter().copied() + fn iter( + &self, + chunk_size: usize, + ) -> impl IndexedParallelIterator> { + self.to_delete.par_chunks(chunk_size) } fn item_to_document_change< @@ -54,12 +58,12 @@ impl<'pl> DocumentChanges<'pl> for DocumentDeletionChanges<'pl> { >( &'doc self, context: &'doc DocumentChangeContext, - docid: Self::Item, + docid: &'doc Self::Item, ) -> Result>> where 'pl: 'doc, // the payload must survive the process calls { - let current = context.index.document(&context.txn, docid)?; + let current = context.index.document(&context.txn, *docid)?; let external_document_id = self.primary_key.extract_docid_from_db( current, @@ -69,7 +73,7 @@ impl<'pl> DocumentChanges<'pl> for DocumentDeletionChanges<'pl> { let external_document_id = external_document_id.to_bump(&context.doc_alloc); - Ok(Some(DocumentChange::Deletion(Deletion::create(docid, external_document_id)))) + Ok(Some(DocumentChange::Deletion(Deletion::create(*docid, external_document_id)))) } } @@ -118,12 +122,15 @@ mod test { Ok(DeletionWithData { deleted }) } - fn process( + fn process<'doc>( &self, - change: DocumentChange, + changes: impl Iterator>>, context: &DocumentChangeContext, ) -> crate::Result<()> { - context.data.deleted.borrow_mut().insert(change.docid()); + for change in changes { + let change = change?; + context.data.deleted.borrow_mut().insert(change.docid()); + } Ok(()) } } diff --git a/milli/src/update/new/indexer/document_operation.rs b/milli/src/update/new/indexer/document_operation.rs index 57ec46a41..ee4517e20 100644 --- a/milli/src/update/new/indexer/document_operation.rs +++ b/milli/src/update/new/indexer/document_operation.rs @@ -3,6 +3,7 @@ use bumpalo::Bump; use heed::RoTxn; use memmap2::Mmap; use rayon::iter::IntoParallelIterator; +use rayon::slice::ParallelSlice; use serde_json::value::RawValue; use IndexDocumentsMethod as Idm; @@ -209,16 +210,19 @@ impl<'pl> DocumentOperation<'pl> { } impl<'pl> DocumentChanges<'pl> for DocumentOperationChanges<'pl> { - type Item = &'pl (&'pl str, ((u32, bool), &'pl [InnerDocOp<'pl>])); + type Item = (&'pl str, ((u32, bool), &'pl [InnerDocOp<'pl>])); - fn iter(&self) -> impl rayon::prelude::IndexedParallelIterator { - self.docids_version_offsets.into_par_iter() + fn iter( + &self, + chunk_size: usize, + ) -> 
impl rayon::prelude::IndexedParallelIterator> { + self.docids_version_offsets.par_chunks(chunk_size) } fn item_to_document_change<'doc, T: MostlySend + 'doc>( &'doc self, context: &'doc DocumentChangeContext, - item: Self::Item, + item: &'doc Self::Item, ) -> Result>> where 'pl: 'doc, diff --git a/milli/src/update/new/indexer/mod.rs b/milli/src/update/new/indexer/mod.rs index 29ff2685e..d4e6ca6a6 100644 --- a/milli/src/update/new/indexer/mod.rs +++ b/milli/src/update/new/indexer/mod.rs @@ -5,8 +5,8 @@ use std::thread::{self, Builder}; use big_s::S; use bumpalo::Bump; use document_changes::{ - for_each_document_change, DocumentChanges, Extractor, FullySend, IndexingContext, RefCellExt, - ThreadLocal, + for_each_document_change, DocumentChangeContext, DocumentChanges, Extractor, FullySend, + IndexingContext, RefCellExt, ThreadLocal, }; pub use document_deletion::DocumentDeletion; pub use document_operation::DocumentOperation; @@ -33,7 +33,7 @@ use crate::update::new::channel::ExtractorSender; use crate::update::new::words_prefix_docids::compute_exact_word_prefix_docids; use crate::update::settings::InnerIndexSettings; use crate::update::{FacetsUpdateBulk, GrenadParameters}; -use crate::{Error, FieldsIdsMap, GlobalFieldsIdsMap, Index, Result, UserError}; +use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, Result, UserError}; pub(crate) mod de; pub mod document_changes; @@ -56,10 +56,10 @@ impl<'a, 'extractor> Extractor<'extractor> for DocumentExtractor<'a> { Ok(FullySend(())) } - fn process( + fn process<'doc>( &self, - change: DocumentChange, - context: &document_changes::DocumentChangeContext, + changes: impl Iterator>>, + context: &DocumentChangeContext, ) -> Result<()> { let mut document_buffer = Vec::new(); @@ -67,32 +67,36 @@ impl<'a, 'extractor> Extractor<'extractor> for DocumentExtractor<'a> { let new_fields_ids_map = &*new_fields_ids_map; let new_fields_ids_map = new_fields_ids_map.local_map(); - let external_docid = change.external_docid().to_owned(); + for change in changes { + let change = change?; + let external_docid = change.external_docid().to_owned(); - // document but we need to create a function that collects and compresses documents. - match change { - DocumentChange::Deletion(deletion) => { - let docid = deletion.docid(); - self.document_sender.delete(docid, external_docid).unwrap(); - } - /// TODO: change NONE by SOME(vector) when implemented - DocumentChange::Update(update) => { - let docid = update.docid(); - let content = - update.new(&context.txn, context.index, &context.db_fields_ids_map)?; - let content = - write_to_obkv(&content, None, new_fields_ids_map, &mut document_buffer)?; - self.document_sender.insert(docid, external_docid, content.boxed()).unwrap(); - } - DocumentChange::Insertion(insertion) => { - let docid = insertion.docid(); - let content = insertion.new(); - let content = - write_to_obkv(&content, None, new_fields_ids_map, &mut document_buffer)?; - self.document_sender.insert(docid, external_docid, content.boxed()).unwrap(); - // extracted_dictionary_sender.send(self, dictionary: &[u8]); + // document but we need to create a function that collects and compresses documents. 
+ match change { + DocumentChange::Deletion(deletion) => { + let docid = deletion.docid(); + self.document_sender.delete(docid, external_docid).unwrap(); + } + /// TODO: change NONE by SOME(vector) when implemented + DocumentChange::Update(update) => { + let docid = update.docid(); + let content = + update.new(&context.txn, context.index, &context.db_fields_ids_map)?; + let content = + write_to_obkv(&content, None, new_fields_ids_map, &mut document_buffer)?; + self.document_sender.insert(docid, external_docid, content.boxed()).unwrap(); + } + DocumentChange::Insertion(insertion) => { + let docid = insertion.docid(); + let content = insertion.new(); + let content = + write_to_obkv(&content, None, new_fields_ids_map, &mut document_buffer)?; + self.document_sender.insert(docid, external_docid, content.boxed()).unwrap(); + // extracted_dictionary_sender.send(self, dictionary: &[u8]); + } } } + Ok(()) } } diff --git a/milli/src/update/new/indexer/partial_dump.rs b/milli/src/update/new/indexer/partial_dump.rs index 10fc95a03..470dbc9d5 100644 --- a/milli/src/update/new/indexer/partial_dump.rs +++ b/milli/src/update/new/indexer/partial_dump.rs @@ -45,14 +45,17 @@ where { type Item = Box; - fn iter(&self) -> impl IndexedParallelIterator { - self.iter.clone() + fn iter( + &self, + chunk_size: usize, + ) -> impl IndexedParallelIterator> { + self.iter.clone().chunks(chunk_size) } fn item_to_document_change<'doc, T: MostlySend + 'doc>( &'doc self, context: &'doc DocumentChangeContext, - document: Self::Item, + document: &'doc Self::Item, ) -> Result>> where 'index: 'doc, diff --git a/milli/src/update/new/indexer/update_by_function.rs b/milli/src/update/new/indexer/update_by_function.rs index 826f918a4..4c65edcc3 100644 --- a/milli/src/update/new/indexer/update_by_function.rs +++ b/milli/src/update/new/indexer/update_by_function.rs @@ -1,7 +1,6 @@ -use std::collections::BTreeMap; - use raw_collections::RawMap; -use rayon::iter::{IndexedParallelIterator, IntoParallelRefIterator, ParallelIterator}; +use rayon::iter::IndexedParallelIterator; +use rayon::slice::ParallelSlice as _; use rhai::{Dynamic, Engine, OptimizationLevel, Scope, AST}; use roaring::RoaringBitmap; @@ -12,8 +11,8 @@ use crate::documents::PrimaryKey; use crate::error::{FieldIdMapMissingEntry, InternalError}; use crate::update::new::document::DocumentFromVersions; use crate::update::new::document_change::Versions; -use crate::update::new::{Deletion, DocumentChange, KvReaderFieldId, KvWriterFieldId, Update}; -use crate::{all_obkv_to_json, Error, FieldsIdsMap, GlobalFieldsIdsMap, Object, Result, UserError}; +use crate::update::new::{Deletion, DocumentChange, KvReaderFieldId, Update}; +use crate::{all_obkv_to_json, Error, FieldsIdsMap, Object, Result, UserError}; pub struct UpdateByFunction { documents: RoaringBitmap, @@ -76,14 +75,17 @@ impl UpdateByFunction { impl<'index> DocumentChanges<'index> for UpdateByFunctionChanges<'index> { type Item = u32; - fn iter(&self) -> impl IndexedParallelIterator { - self.documents.par_iter().copied() + fn iter( + &self, + chunk_size: usize, + ) -> impl IndexedParallelIterator> { + self.documents.as_slice().par_chunks(chunk_size) } fn item_to_document_change<'doc, T: MostlySend + 'doc>( &self, context: &'doc DocumentChangeContext, - docid: Self::Item, + docid: &'doc Self::Item, ) -> Result>> where 'index: 'doc, @@ -97,6 +99,8 @@ impl<'index> DocumentChanges<'index> for UpdateByFunctionChanges<'index> { .. 
} = context;
 
+        let docid = *docid;
+
         // safety: Both documents *must* exists in the database as
         // their IDs comes from the list of documents ids.
         let document = index.document(txn, docid)?;

From 60cc09abec465f63aab239ca0edb2a4e1b5799db Mon Sep 17 00:00:00 2001
From: ManyTheFish 
Date: Mon, 21 Oct 2024 09:28:49 +0200
Subject: [PATCH 157/247] Implement facet search extraction

---
 milli/src/update/new/channel.rs              |  54 ++++
 milli/src/update/new/facet_search_builder.rs | 275 +++++++++++++++++++
 milli/src/update/new/fst_merger_builder.rs   | 155 +++++++++++
 milli/src/update/new/merger.rs               |  18 +-
 milli/src/update/new/mod.rs                  |   2 +
 milli/src/update/new/word_fst_builder.rs     | 127 ++-------
 6 files changed, 521 insertions(+), 110 deletions(-)
 create mode 100644 milli/src/update/new/facet_search_builder.rs
 create mode 100644 milli/src/update/new/fst_merger_builder.rs

diff --git a/milli/src/update/new/channel.rs b/milli/src/update/new/channel.rs
index 8226046e6..d63180ba1 100644
--- a/milli/src/update/new/channel.rs
+++ b/milli/src/update/new/channel.rs
@@ -144,6 +144,8 @@ pub enum Database {
     FacetIdExistsDocids,
     FacetIdF64NumberDocids,
     FacetIdStringDocids,
+    FacetIdNormalizedStringStrings,
+    FacetIdStringFst,
 }
 
 impl Database {
@@ -163,6 +165,10 @@ impl Database {
             Database::FacetIdExistsDocids => index.facet_id_exists_docids.remap_types(),
             Database::FacetIdF64NumberDocids => index.facet_id_f64_docids.remap_types(),
             Database::FacetIdStringDocids => index.facet_id_string_docids.remap_types(),
+            Database::FacetIdNormalizedStringStrings => {
+                index.facet_id_normalized_string_strings.remap_types()
+            }
+            Database::FacetIdStringFst => index.facet_id_string_fst.remap_types(),
         }
     }
 }
@@ -240,6 +246,10 @@ impl MergerSender {
         DocumentsSender(self)
     }
 
+    pub fn facet_searchable(&self) -> FacetSearchableSender<'_> {
+        FacetSearchableSender { sender: self }
+    }
+
     pub fn send_documents_ids(&self, documents_ids: RoaringBitmap) -> StdResult<(), SendError<()>> {
         let entry = EntryOperation::Write(KeyValueEntry::from_small_key_bitmap(
             DOCUMENTS_IDS_KEY.as_bytes(),
@@ -445,6 +455,50 @@ impl DocidsSender for FacetDocidsSender<'_> {
     }
 }
 
+pub struct FacetSearchableSender<'a> {
+    sender: &'a MergerSender,
+}
+
+impl FacetSearchableSender<'_> {
+    pub fn write_facet(&self, key: &[u8], value: &[u8]) -> StdResult<(), SendError<()>> {
+        let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value(key, value));
+        match self
+            .sender
+            .send(WriterOperation { database: Database::FacetIdNormalizedStringStrings, entry })
+        {
+            Ok(()) => Ok(()),
+            Err(SendError(_)) => Err(SendError(())),
+        }
+    }
+
+    pub fn delete_facet(&self, key: &[u8]) -> StdResult<(), SendError<()>> {
+        let entry = EntryOperation::Delete(KeyEntry::from_key(key));
+        match self
+            .sender
+            .send(WriterOperation { database: Database::FacetIdNormalizedStringStrings, entry })
+        {
+            Ok(()) => Ok(()),
+            Err(SendError(_)) => Err(SendError(())),
+        }
+    }
+
+    pub fn write_fst(&self, key: &[u8], value: Mmap) -> StdResult<(), SendError<()>> {
+        let entry = EntryOperation::Write(KeyValueEntry::from_large_key_value(key, value));
+        match self.sender.send(WriterOperation { database: Database::FacetIdStringFst, entry }) {
+            Ok(()) => Ok(()),
+            Err(SendError(_)) => Err(SendError(())),
+        }
+    }
+
+    pub fn delete_fst(&self, key: &[u8]) -> StdResult<(), SendError<()>> {
+        let entry = EntryOperation::Delete(KeyEntry::from_key(key));
+        match self.sender.send(WriterOperation { database: Database::FacetIdStringFst, entry }) {
+            Ok(()) => Ok(()),
+            Err(SendError(_)) => Err(SendError(())),
+ } + } +} + pub struct DocumentsSender<'a>(&'a MergerSender); impl DocumentsSender<'_> { diff --git a/milli/src/update/new/facet_search_builder.rs b/milli/src/update/new/facet_search_builder.rs new file mode 100644 index 000000000..4602b5a30 --- /dev/null +++ b/milli/src/update/new/facet_search_builder.rs @@ -0,0 +1,275 @@ +use std::collections::{BTreeSet, HashMap}; + +use charabia::{normalizer::NormalizerOption, Language, Normalize, StrDetection, Token}; +use grenad::Sorter; +use heed::{ + types::{Bytes, SerdeJson}, + BytesDecode, BytesEncode, RoTxn, +}; + +use crate::{ + heed_codec::{ + facet::{FacetGroupKey, FacetGroupKeyCodec}, + StrRefCodec, + }, + update::{ + create_sorter, + del_add::{DelAdd, KvWriterDelAdd}, + MergeDeladdBtreesetString, + }, + BEU16StrCodec, FieldId, GlobalFieldsIdsMap, Index, LocalizedAttributesRule, Result, + MAX_FACET_VALUE_LENGTH, +}; + +use super::{ + channel::FacetSearchableSender, extract::FacetKind, fst_merger_builder::FstMergerBuilder, + KvReaderDelAdd, +}; + +pub struct FacetSearchBuilder<'indexer> { + registered_facets: HashMap, + normalized_facet_string_docids_sorter: Sorter, + global_fields_ids_map: GlobalFieldsIdsMap<'indexer>, + localized_attributes_rules: Vec, + // Buffered data below + buffer: Vec, + localized_field_ids: HashMap>>, +} + +impl<'indexer> FacetSearchBuilder<'indexer> { + pub fn new( + global_fields_ids_map: GlobalFieldsIdsMap<'indexer>, + localized_attributes_rules: Vec, + ) -> Self { + let registered_facets = HashMap::new(); + let normalized_facet_string_docids_sorter = create_sorter( + grenad::SortAlgorithm::Stable, + MergeDeladdBtreesetString, + grenad::CompressionType::None, + None, + None, + Some(0), + ); + + Self { + registered_facets, + normalized_facet_string_docids_sorter, + buffer: Vec::new(), + global_fields_ids_map, + localized_attributes_rules, + localized_field_ids: HashMap::new(), + } + } + + fn extract_key_data<'k>(&self, key: &'k [u8]) -> Result>> { + match FacetKind::from(key[0]) { + // Only strings are searchable + FacetKind::String => Ok(Some( + FacetGroupKeyCodec::::bytes_decode(&key[1..]) + .map_err(heed::Error::Encoding)?, + )), + _ => Ok(None), + } + } + + pub fn register_from_key(&mut self, deladd: DelAdd, facet_key: &[u8]) -> Result<()> { + let Some(FacetGroupKey { field_id, level: _level, left_bound }) = + self.extract_key_data(facet_key)? + else { + return Ok(()); + }; + + if deladd == DelAdd::Addition { + self.registered_facets.entry(field_id).and_modify(|count| *count += 1).or_insert(1); + } + + let locales = self.locales(field_id); + let hyper_normalized_value = normalize_facet_string(left_bound, locales.as_deref()); + + let set = BTreeSet::from_iter(std::iter::once(left_bound)); + + // as the facet string is the same, we can put the deletion and addition in the same obkv. 
+ self.buffer.clear(); + let mut obkv = KvWriterDelAdd::new(&mut self.buffer); + let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?; + obkv.insert(deladd, val)?; + obkv.finish()?; + + let key: (u16, &str) = (field_id, hyper_normalized_value.as_ref()); + let key_bytes = BEU16StrCodec::bytes_encode(&key).map_err(heed::Error::Encoding)?; + self.normalized_facet_string_docids_sorter.insert(key_bytes, &self.buffer)?; + + Ok(()) + } + + fn locales(&mut self, field_id: FieldId) -> Option<&[Language]> { + if self.localized_field_ids.get(&field_id).is_none() { + let Some(field_name) = self.global_fields_ids_map.name(field_id) else { + unreachable!("Field id {} not found in the global fields ids map", field_id); + }; + + let locales = self + .localized_attributes_rules + .iter() + .find(|rule| rule.match_str(field_name)) + .map(|rule| rule.locales.clone()); + + self.localized_field_ids.insert(field_id, locales); + } + + self.localized_field_ids.get(&field_id).unwrap().as_deref() + } + + #[tracing::instrument(level = "trace", skip_all, target = "indexing::facet_fst")] + pub fn merge_and_send( + self, + index: &Index, + rtxn: &RoTxn<'_>, + sender: FacetSearchableSender, + ) -> Result<()> { + let reader = self.normalized_facet_string_docids_sorter.into_reader_cursors()?; + let mut builder = grenad::MergerBuilder::new(MergeDeladdBtreesetString); + builder.extend(reader); + + let database = index.facet_id_normalized_string_strings.remap_types::(); + + let mut merger_iter = builder.build().into_stream_merger_iter()?; + let mut current_field_id = None; + let mut fst; + let mut fst_merger_builder: Option = None; + while let Some((key, deladd)) = merger_iter.next()? { + let (field_id, normalized_facet_string) = + BEU16StrCodec::bytes_decode(&key).map_err(heed::Error::Encoding)?; + + if current_field_id != Some(field_id) { + if let Some(fst_merger_builder) = fst_merger_builder { + // send the previous fst to the channel + let mmap = fst_merger_builder.build(&mut callback)?; + sender.write_fst(&field_id.to_be_bytes(), mmap).unwrap(); + } + + println!("getting fst for field_id: {}", field_id); + fst = index.facet_id_string_fst.get(rtxn, &field_id)?; + fst_merger_builder = Some(FstMergerBuilder::new(fst.as_ref())?); + current_field_id = Some(field_id); + } + + let current = database.get(rtxn, key)?; + let deladd: &KvReaderDelAdd = deladd.into(); + let del = deladd.get(DelAdd::Deletion); + let add = deladd.get(DelAdd::Addition); + + match merge_btreesets(current, del, add)? 
{ + Operation::Write(value) => { + match fst_merger_builder.as_mut() { + Some(fst_merger_builder) => { + fst_merger_builder.register( + DelAdd::Addition, + normalized_facet_string.as_bytes(), + &mut callback, + )?; + } + None => unreachable!(), + } + let key = (field_id, normalized_facet_string); + let key_bytes = + BEU16StrCodec::bytes_encode(&key).map_err(heed::Error::Encoding)?; + sender.write_facet(&key_bytes, &value).unwrap(); + } + Operation::Delete => { + match fst_merger_builder.as_mut() { + Some(fst_merger_builder) => { + fst_merger_builder.register( + DelAdd::Deletion, + normalized_facet_string.as_bytes(), + &mut callback, + )?; + } + None => unreachable!(), + } + let key = (field_id, normalized_facet_string); + let key_bytes = + BEU16StrCodec::bytes_encode(&key).map_err(heed::Error::Encoding)?; + sender.delete_facet(&key_bytes).unwrap(); + } + Operation::Ignore => (), + } + } + + if let (Some(field_id), Some(fst_merger_builder)) = (current_field_id, fst_merger_builder) { + let mmap = fst_merger_builder.build(&mut callback)?; + sender.write_fst(&field_id.to_be_bytes(), mmap).unwrap(); + } + + Ok(()) + } +} + +fn callback(_bytes: &[u8], _deladd: DelAdd, _is_modified: bool) -> Result<()> { + Ok(()) +} + +fn merge_btreesets<'a>( + current: Option<&[u8]>, + del: Option<&[u8]>, + add: Option<&[u8]>, +) -> Result { + let mut result: BTreeSet = match current { + Some(current) => SerdeJson::bytes_decode(current).map_err(heed::Error::Encoding)?, + None => BTreeSet::new(), + }; + if let Some(del) = del { + let del: BTreeSet = SerdeJson::bytes_decode(del).map_err(heed::Error::Encoding)?; + result = result.difference(&del).cloned().collect(); + } + if let Some(add) = add { + let add: BTreeSet = SerdeJson::bytes_decode(add).map_err(heed::Error::Encoding)?; + result.extend(add); + } + + /// TODO remove allocation + let result = SerdeJson::bytes_encode(&result).map_err(heed::Error::Encoding)?.into_owned(); + if Some(result.as_ref()) == current { + Ok(Operation::Ignore) + } else if result.is_empty() { + Ok(Operation::Delete) + } else { + Ok(Operation::Write(result)) + } +} + +/// Normalizes the facet string and truncates it to the max length. +fn normalize_facet_string(facet_string: &str, locales: Option<&[Language]>) -> String { + let options: NormalizerOption = NormalizerOption { lossy: true, ..Default::default() }; + let mut detection = StrDetection::new(facet_string, locales); + + let script = detection.script(); + // Detect the language of the facet string only if several locales are explicitly provided. 
+ let language = match locales { + Some(&[language]) => Some(language), + Some(multiple_locales) if multiple_locales.len() > 1 => detection.language(), + _ => None, + }; + + let token = Token { + lemma: std::borrow::Cow::Borrowed(facet_string), + script, + language, + ..Default::default() + }; + + // truncate the facet string to the max length + token + .normalize(&options) + .lemma + .char_indices() + .take_while(|(idx, _)| *idx < MAX_FACET_VALUE_LENGTH) + .map(|(_, c)| c) + .collect() +} + +enum Operation { + Write(Vec), + Delete, + Ignore, +} diff --git a/milli/src/update/new/fst_merger_builder.rs b/milli/src/update/new/fst_merger_builder.rs new file mode 100644 index 000000000..9fd259ce6 --- /dev/null +++ b/milli/src/update/new/fst_merger_builder.rs @@ -0,0 +1,155 @@ +use std::{fs::File, io::BufWriter}; + +use fst::{Set, SetBuilder, Streamer}; +use memmap2::Mmap; +use tempfile::tempfile; + +use crate::{update::del_add::DelAdd, InternalError, Result}; + +pub struct FstMergerBuilder<'a> { + stream: Option>, + fst_builder: SetBuilder>, + last: Option>, + inserted_words: usize, +} + +impl<'a> FstMergerBuilder<'a> { + pub fn new>(fst: Option<&'a Set>) -> Result { + Ok(Self { + stream: fst.map(|fst| fst.stream()), + fst_builder: SetBuilder::new(BufWriter::new(tempfile()?))?, + last: None, + inserted_words: 0, + }) + } + + pub fn register( + &mut self, + deladd: DelAdd, + right: &[u8], + insertion_callback: &mut impl FnMut(&[u8], DelAdd, bool) -> Result<()>, + ) -> Result<()> { + if let Some(left) = self.last.take() { + let (left_inserted, right_inserted) = + self.compare_and_insert(deladd, left.as_slice(), right, insertion_callback)?; + + // left was not inserted, so we keep it for the next iteration + if !left_inserted { + self.last = Some(left); + } + + // right was inserted, so we can stop + if right_inserted { + return Ok(()); + } + } + + if let Some(mut stream) = self.stream.take() { + while let Some(left) = stream.next() { + let (left_inserted, right_inserted) = + self.compare_and_insert(deladd, left, right, insertion_callback)?; + + // left was not inserted, so we keep it for the next iteration + if !left_inserted { + self.last = Some(left.to_vec()); + } + + // right was inserted, so we can stop + if right_inserted { + self.stream = Some(stream); + return Ok(()); + } + } + } + + // If we reach this point, it means that the stream is empty + // and we need to insert the incoming word + self.insert(right, deladd, true, insertion_callback)?; + + Ok(()) + } + + fn compare_and_insert( + &mut self, + deladd: DelAdd, + left: &[u8], + right: &[u8], + insertion_callback: &mut impl FnMut(&[u8], DelAdd, bool) -> Result<()>, + ) -> Result<(bool, bool)> { + let mut left_inserted = false; + let mut right_inserted = false; + match left.cmp(right) { + std::cmp::Ordering::Less => { + // We need to insert the last word from the current fst + self.insert(left, DelAdd::Addition, false, insertion_callback)?; + + left_inserted = true; + } + std::cmp::Ordering::Equal => { + self.insert(right, deladd, true, insertion_callback)?; + + left_inserted = true; + right_inserted = true; + } + std::cmp::Ordering::Greater => { + self.insert(right, deladd, true, insertion_callback)?; + + right_inserted = true; + } + } + + Ok((left_inserted, right_inserted)) + } + + fn insert( + &mut self, + bytes: &[u8], + deladd: DelAdd, + is_modified: bool, + insertion_callback: &mut impl FnMut(&[u8], DelAdd, bool) -> Result<()>, + ) -> Result<()> { + // Addition: We insert the word + // Deletion: We delete the word by not inserting 
it + if deladd == DelAdd::Addition { + self.inserted_words += 1; + self.fst_builder.insert(bytes)?; + } + + insertion_callback(bytes, deladd, is_modified)?; + + Ok(()) + } + + fn drain_stream( + &mut self, + insertion_callback: &mut impl FnMut(&[u8], DelAdd, bool) -> Result<()>, + ) -> Result<()> { + if let Some(last) = self.last.take() { + self.insert(last.as_slice(), DelAdd::Addition, false, insertion_callback)?; + } + + if let Some(mut stream) = self.stream.take() { + while let Some(current) = stream.next() { + self.insert(current, DelAdd::Addition, false, insertion_callback)?; + } + } + + Ok(()) + } + + pub fn build( + mut self, + insertion_callback: &mut impl FnMut(&[u8], DelAdd, bool) -> Result<()>, + ) -> Result { + self.drain_stream(insertion_callback)?; + + let fst_file = self + .fst_builder + .into_inner()? + .into_inner() + .map_err(|_| InternalError::IndexingMergingKeys { process: "building-fst" })?; + let fst_mmap = unsafe { Mmap::map(&fst_file)? }; + + Ok(fst_mmap) + } +} diff --git a/milli/src/update/new/merger.rs b/milli/src/update/new/merger.rs index 6183beb63..740b215e2 100644 --- a/milli/src/update/new/merger.rs +++ b/milli/src/update/new/merger.rs @@ -10,13 +10,17 @@ use roaring::RoaringBitmap; use super::channel::*; use super::extract::FacetKind; +use super::facet_search_builder::FacetSearchBuilder; use super::word_fst_builder::{PrefixData, PrefixDelta}; use super::{Deletion, DocumentChange, KvReaderDelAdd, KvReaderFieldId}; use crate::update::del_add::DelAdd; use crate::update::new::channel::MergerOperation; use crate::update::new::word_fst_builder::WordFstBuilder; use crate::update::MergeDeladdCboRoaringBitmaps; -use crate::{CboRoaringBitmapCodec, Error, FieldId, GeoPoint, GlobalFieldsIdsMap, Index, Result}; +use crate::{ + localized_attributes_rules, CboRoaringBitmapCodec, Error, FieldId, GeoPoint, + GlobalFieldsIdsMap, Index, Result, +}; /// TODO We must return some infos/stats #[tracing::instrument(level = "trace", skip_all, target = "indexing::documents", name = "merge")] @@ -170,6 +174,12 @@ pub fn merge_grenad_entries( tracing::trace_span!(target: "indexing::documents::merge", "facet_docids"); let _entered = span.enter(); let mut facet_field_ids_delta = FacetFieldIdsDelta::new(); + let localized_attributes_rules = + index.localized_attributes_rules(rtxn)?.unwrap_or_default(); + let mut facet_search_builder = FacetSearchBuilder::new( + global_fields_ids_map.clone(), + localized_attributes_rules, + ); merge_and_send_facet_docids( merger, FacetDatabases::new(index), @@ -177,9 +187,12 @@ pub fn merge_grenad_entries( &mut buffer, sender.facet_docids(), &mut facet_field_ids_delta, + &mut facet_search_builder, )?; merger_result.facet_field_ids_delta = Some(facet_field_ids_delta); + // merge and send the facet fst and the searchable facet values + facet_search_builder.merge_and_send(index, rtxn, sender.facet_searchable())?; } } } @@ -294,6 +307,7 @@ fn merge_and_send_facet_docids( buffer: &mut Vec, docids_sender: impl DocidsSender, facet_field_ids_delta: &mut FacetFieldIdsDelta, + facet_search_builder: &mut FacetSearchBuilder, ) -> Result<()> { let mut merger_iter = merger.into_stream_merger_iter().unwrap(); while let Some((key, deladd)) = merger_iter.next().unwrap() { @@ -305,11 +319,13 @@ fn merge_and_send_facet_docids( match merge_cbo_bitmaps(current, del, add)? 
{ Operation::Write(bitmap) => { facet_field_ids_delta.register_from_key(key); + facet_search_builder.register_from_key(DelAdd::Addition, key)?; let value = cbo_bitmap_serialize_into_vec(&bitmap, buffer); docids_sender.write(key, value).unwrap(); } Operation::Delete => { facet_field_ids_delta.register_from_key(key); + facet_search_builder.register_from_key(DelAdd::Deletion, key)?; docids_sender.delete(key).unwrap(); } Operation::Ignore => (), diff --git a/milli/src/update/new/mod.rs b/milli/src/update/new/mod.rs index 37ccc75cd..16a6dd092 100644 --- a/milli/src/update/new/mod.rs +++ b/milli/src/update/new/mod.rs @@ -8,6 +8,8 @@ mod channel; pub mod document; mod document_change; mod extract; +mod facet_search_builder; +mod fst_merger_builder; pub mod indexer; mod merger; mod parallel_iterator_ext; diff --git a/milli/src/update/new/word_fst_builder.rs b/milli/src/update/new/word_fst_builder.rs index 867d3e86d..834266045 100644 --- a/milli/src/update/new/word_fst_builder.rs +++ b/milli/src/update/new/word_fst_builder.rs @@ -1,4 +1,4 @@ -use std::{fs::File, io::BufWriter}; +use std::io::BufWriter; use fst::{Set, SetBuilder, Streamer}; use memmap2::Mmap; @@ -7,23 +7,19 @@ use tempfile::tempfile; use crate::{index::PrefixSettings, update::del_add::DelAdd, InternalError, Prefix, Result}; +use super::fst_merger_builder::FstMergerBuilder; + pub struct WordFstBuilder<'a> { - stream: Option>, - word_fst_builder: SetBuilder>, - last_word: Option>, + word_fst_builder: FstMergerBuilder<'a>, prefix_fst_builder: Option, - inserted_words: usize, registered_words: usize, } impl<'a> WordFstBuilder<'a> { pub fn new(words_fst: &'a Set>) -> Result { Ok(Self { - stream: Some(words_fst.stream()), - word_fst_builder: SetBuilder::new(BufWriter::new(tempfile()?))?, + word_fst_builder: FstMergerBuilder::new(Some(words_fst))?, prefix_fst_builder: None, - last_word: None, - inserted_words: 0, registered_words: 0, }) } @@ -38,100 +34,13 @@ impl<'a> WordFstBuilder<'a> { self.registered_words += 1; } - if let Some(left) = self.last_word.take() { - let (left_inserted, right_inserted) = - self.compare_and_insert(deladd, left.as_slice(), right)?; - - // left was not inserted, so we keep it for the next iteration - if !left_inserted { - self.last_word = Some(left); + self.word_fst_builder.register(deladd, right, &mut |bytes, deladd, is_modified| { + if let Some(prefix_fst_builder) = &mut self.prefix_fst_builder { + prefix_fst_builder.insert_word(bytes, deladd, is_modified) + } else { + Ok(()) } - - // right was inserted, so we can stop - if right_inserted { - return Ok(()); - } - } - - if let Some(mut stream) = self.stream.take() { - while let Some(left) = stream.next() { - let (left_inserted, right_inserted) = - self.compare_and_insert(deladd, left, right)?; - - // left was not inserted, so we keep it for the next iteration - if !left_inserted { - self.last_word = Some(left.to_vec()); - } - - // right was inserted, so we can stop - if right_inserted { - self.stream = Some(stream); - return Ok(()); - } - } - - // If we reach this point, it means that the stream is empty - // and we need to insert the incoming word - self.insert_word(right, deladd, true)?; - - self.stream = Some(stream); - } - - Ok(()) - } - - pub fn compare_and_insert( - &mut self, - deladd: DelAdd, - left: &[u8], - right: &[u8], - ) -> Result<(bool, bool)> { - let mut left_inserted = false; - let mut right_inserted = false; - match left.cmp(right) { - std::cmp::Ordering::Less => { - // We need to insert the last word from the current fst - 
self.insert_word(left, DelAdd::Addition, false)?; - - left_inserted = true; - } - std::cmp::Ordering::Equal => { - self.insert_word(right, deladd, true)?; - - left_inserted = true; - right_inserted = true; - } - std::cmp::Ordering::Greater => { - self.insert_word(right, deladd, true)?; - - right_inserted = true; - } - } - - Ok((left_inserted, right_inserted)) - } - - fn insert_word(&mut self, bytes: &[u8], deladd: DelAdd, is_modified: bool) -> Result<()> { - // Addition: We insert the word - // Deletion: We delete the word by not inserting it - if deladd == DelAdd::Addition { - self.inserted_words += 1; - self.word_fst_builder.insert(bytes)?; - } - - if let Some(prefix_fst_builder) = self.prefix_fst_builder.as_mut() { - prefix_fst_builder.insert_word(bytes, deladd, is_modified)?; - } - - Ok(()) - } - - fn drain_stream(&mut self) -> Result<()> { - if let Some(mut stream) = self.stream.take() { - while let Some(current) = stream.next() { - self.insert_word(current, DelAdd::Addition, false)?; - } - } + })?; Ok(()) } @@ -141,13 +50,13 @@ impl<'a> WordFstBuilder<'a> { index: &crate::Index, rtxn: &heed::RoTxn, ) -> Result<(Mmap, Option)> { - self.drain_stream()?; - - let words_fst_file = - self.word_fst_builder.into_inner()?.into_inner().map_err(|_| { - InternalError::IndexingMergingKeys { process: "building-words-fst" } - })?; - let words_fst_mmap = unsafe { Mmap::map(&words_fst_file)? }; + let words_fst_mmap = self.word_fst_builder.build(&mut |bytes, deladd, is_modified| { + if let Some(prefix_fst_builder) = &mut self.prefix_fst_builder { + prefix_fst_builder.insert_word(bytes, deladd, is_modified) + } else { + Ok(()) + } + })?; let prefix_data = self .prefix_fst_builder From 124b5c3df8978bb823a9f11a8eab6ba29019639b Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 21 Oct 2024 10:35:44 +0200 Subject: [PATCH 158/247] Update raw collections --- Cargo.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index 5cd1f3976..2597e1f6b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4434,7 +4434,7 @@ dependencies = [ [[package]] name = "raw-collections" version = "0.1.0" -source = "git+https://github.com/dureuill/raw-collections.git#0ecd143c1707d237e3c4d749bc685418da2fccc2" +source = "git+https://github.com/dureuill/raw-collections.git#c82184f144aef3ef7cd3992868a97b586fc94cfd" dependencies = [ "allocator-api2", "bumpalo", From 73e29ee155cf63d45101427ad50b1e765d9c4e55 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 21 Oct 2024 10:35:56 +0200 Subject: [PATCH 159/247] EmbeddingSender stub --- milli/src/update/new/channel.rs | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/milli/src/update/new/channel.rs b/milli/src/update/new/channel.rs index 8226046e6..5c206b5ba 100644 --- a/milli/src/update/new/channel.rs +++ b/milli/src/update/new/channel.rs @@ -12,6 +12,7 @@ use super::StdResult; use crate::index::main_key::{DOCUMENTS_IDS_KEY, WORDS_FST_KEY, WORDS_PREFIXES_FST_KEY}; use crate::update::new::KvReaderFieldId; use crate::update::MergeDeladdCboRoaringBitmaps; +use crate::vector::Embeddings; use crate::{DocumentId, Index}; /// The capacity of the channel is currently in number of messages. 
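The hunk below only stubs out `EmbeddingSender`, but the intended call pattern is already visible from its signatures. A minimal sketch (not part of the patch) of how a vector extractor might drive it once the `todo!()`s are filled in: `set_embeddings`, `finish_embedder`, `StdResult`, `SendError` and the `Embeddings` type come from this file and elsewhere in the patch, while the helper function, the embedder id and the docid/vector inputs are invented for illustration.

// Hedged sketch: push one embedding per document, then close the embedder.
fn send_embeddings(docids: &[DocumentId], vectors: Vec<Vec<f32>>) -> StdResult<(), SendError<()>> {
    const EMBEDDER_ID: u8 = 0; // hypothetical embedder index
    for (&docid, vector) in docids.iter().zip(vectors) {
        // `Embeddings::new`/`append` are used the same way in the OpenAI embedder code touched later in this series.
        let mut embeddings = Embeddings::new(vector.len());
        embeddings.append(vector).expect("dimension was derived from this very vector");
        EmbeddingSender::set_embeddings(docid, EMBEDDER_ID, embeddings)?;
    }
    // Tell the writer side that no more embeddings will arrive for this embedder.
    EmbeddingSender::finish_embedder(EMBEDDER_ID);
    Ok(())
}
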
@@ -489,6 +490,23 @@ impl DocumentsSender<'_> { } } +pub struct EmbeddingSender<'a>(Option<&'a Sender>); + +impl EmbeddingSender<'_> { + pub fn delete_embeddings(docid: DocumentId, embedder_id: u8) -> StdResult<(), SendError<()>> { + todo!() + } + + pub fn set_embeddings( + docid: DocumentId, + embedder_id: u8, + embeddings: Embeddings, + ) -> StdResult<(), SendError<()>> { + todo!() + } + + pub fn finish_embedder(embedder_id: u8) {} +} pub enum MergerOperation { ExactWordDocidsMerger(Merger), FidWordCountDocidsMerger(Merger), From c278024709cfaaf90f75d3ea5f04ed9e4034ec7b Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 21 Oct 2024 10:36:27 +0200 Subject: [PATCH 160/247] Add vectors field and geo field to document trait --- milli/src/update/new/document.rs | 150 +++++++++++++++--- milli/src/update/new/document_change.rs | 9 -- .../update/new/indexer/document_operation.rs | 57 ++++--- 3 files changed, 158 insertions(+), 58 deletions(-) diff --git a/milli/src/update/new/document.rs b/milli/src/update/new/document.rs index 1fb31ceb8..4948f8e31 100644 --- a/milli/src/update/new/document.rs +++ b/milli/src/update/new/document.rs @@ -1,9 +1,9 @@ use std::collections::BTreeSet; use heed::RoTxn; +use raw_collections::RawMap; use serde_json::value::RawValue; -use super::document_change::{Entry, Versions}; use super::{KvReaderFieldId, KvWriterFieldId}; use crate::documents::FieldIdMapper; use crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME; @@ -17,11 +17,26 @@ pub trait Document<'doc> { /// Iterate over all **top-level** fields of the document, returning their name and raw JSON value. /// /// - The returned values *may* contain nested fields. - /// - The `_vectors` field is **ignored** by this method, meaning it is **not returned** by this method. + /// - The `_vectors` and `_geo` fields are **ignored** by this method, meaning they are **not returned** by this method. fn iter_top_level_fields(&self) -> impl Iterator>; + + /// Returns the unparsed value of the `_vectors` field from the document data. + /// + /// This field alone is insufficient to retrieve vectors, as they may be stored in a dedicated location in the database. + /// Use a [`super::vector_document::VectorDocument`] to access the vector. + /// + /// This method is meant as a convenience for implementors of [`super::vector_document::VectorDocument`]. + fn vectors_field(&self) -> Result>; + + /// Returns the unparsed value of the `_geo` field from the document data. + /// + /// This field alone is insufficient to retrieve geo data, as they may be stored in a dedicated location in the database. + /// Use a [`super::geo_document::GeoDocument`] to access the vector. + /// + /// This method is meant as a convenience for implementors of [`super::geo_document::GeoDocument`]. 
+ fn geo_field(&self) -> Result>; } -#[derive(Clone, Copy)] pub struct DocumentFromDb<'t, Mapper: FieldIdMapper> where Mapper: FieldIdMapper, @@ -30,6 +45,14 @@ where content: &'t KvReaderFieldId, } +impl<'t, Mapper: FieldIdMapper> Clone for DocumentFromDb<'t, Mapper> { + #[inline] + fn clone(&self) -> Self { + *self + } +} +impl<'t, Mapper: FieldIdMapper> Copy for DocumentFromDb<'t, Mapper> {} + impl<'t, Mapper: FieldIdMapper> Document<'t> for DocumentFromDb<'t, Mapper> { fn iter_top_level_fields(&self) -> impl Iterator> { let mut it = self.content.iter(); @@ -53,6 +76,14 @@ impl<'t, Mapper: FieldIdMapper> Document<'t> for DocumentFromDb<'t, Mapper> { Some(res) }) } + + fn vectors_field(&self) -> Result> { + self.field(RESERVED_VECTORS_FIELD_NAME) + } + + fn geo_field(&self) -> Result> { + self.field("_geo") + } } impl<'t, Mapper: FieldIdMapper> DocumentFromDb<'t, Mapper> { @@ -66,6 +97,14 @@ impl<'t, Mapper: FieldIdMapper> DocumentFromDb<'t, Mapper> { reader.map(|reader| Self { fields_ids_map: db_fields_ids_map, content: reader }) }) } + + pub fn field(&self, name: &str) -> Result> { + let Some(fid) = self.fields_ids_map.id(name) else { + return Ok(None); + }; + let Some(value) = self.content.get(fid) else { return Ok(None) }; + Ok(Some(serde_json::from_slice(value).map_err(InternalError::SerdeJson)?)) + } } #[derive(Clone, Copy)] @@ -81,29 +120,15 @@ impl<'doc> DocumentFromVersions<'doc> { impl<'doc> Document<'doc> for DocumentFromVersions<'doc> { fn iter_top_level_fields(&self) -> impl Iterator> { - match &self.versions { - Versions::Single(version) => either::Either::Left(version.iter_top_level_fields()), - Versions::Multiple(versions) => { - let mut seen_fields = BTreeSet::new(); - let mut it = versions.iter().rev().flat_map(|version| version.iter()).copied(); - either::Either::Right(std::iter::from_fn(move || loop { - let (name, value) = it.next()?; - - if seen_fields.contains(name) { - continue; - } - seen_fields.insert(name); - return Some(Ok((name, value))); - })) - } - } + self.versions.iter_top_level_fields().map(Ok) } -} -// used in document from payload -impl<'doc> Document<'doc> for &'doc [Entry<'doc>] { - fn iter_top_level_fields(&self) -> impl Iterator>> { - self.iter().copied().map(|(k, v)| Ok((k, v))) + fn vectors_field(&self) -> Result> { + Ok(self.versions.vectors_field()) + } + + fn geo_field(&self) -> Result> { + Ok(self.versions.geo_field()) } } @@ -164,6 +189,26 @@ impl<'d, 'doc: 'd, 't: 'd, Mapper: FieldIdMapper> Document<'d> } }) } + + fn vectors_field(&self) -> Result> { + if let Some(vectors) = self.new_doc.vectors_field()? { + return Ok(Some(vectors)); + } + + let Some(db) = self.db else { return Ok(None) }; + + db.vectors_field() + } + + fn geo_field(&self) -> Result> { + if let Some(geo) = self.new_doc.geo_field()? { + return Ok(Some(geo)); + } + + let Some(db) = self.db else { return Ok(None) }; + + db.geo_field() + } } impl<'doc, D> Document<'doc> for &D @@ -173,6 +218,14 @@ where fn iter_top_level_fields(&self) -> impl Iterator> { D::iter_top_level_fields(self) } + + fn vectors_field(&self) -> Result> { + D::vectors_field(self) + } + + fn geo_field(&self) -> Result> { + D::geo_field(self) + } } /// Turn this document into an obkv, whose fields are indexed by the provided `FieldIdMapper`. 
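Since `vectors_field` and `geo_field` are now first-class accessors on the `Document` trait, a caller can reach the raw `_geo` payload without scanning `iter_top_level_fields`. A minimal sketch, assuming some type implementing the trait above; the `RawGeo` shape and the helper function are illustrative only, not the crate's geo representation.

#[derive(serde::Deserialize)]
struct RawGeo {
    lat: f64,
    lng: f64,
}

fn geo_of<'doc>(doc: &impl Document<'doc>) -> Result<Option<RawGeo>> {
    // `geo_field` hands back the *unparsed* `_geo` value, or `None` when the document has none.
    let Some(raw) = doc.geo_field()? else { return Ok(None) };
    let geo = serde_json::from_str(raw.get()).map_err(InternalError::SerdeJson)?;
    Ok(Some(geo))
}
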
@@ -245,3 +298,52 @@ where writer.finish().unwrap(); Ok(KvReaderFieldId::from_slice(document_buffer)) } + +pub type Entry<'doc> = (&'doc str, &'doc RawValue); + +#[derive(Clone, Copy)] +pub struct Versions<'doc> { + data: &'doc [Entry<'doc>], + vectors: Option<&'doc RawValue>, + geo: Option<&'doc RawValue>, +} + +impl<'doc> Versions<'doc> { + pub fn multiple( + mut versions: impl Iterator>>, + ) -> Result> { + let Some(data) = versions.next() else { return Ok(None) }; + let mut data = data?; + for future_version in versions { + let future_version = future_version?; + for (field, value) in future_version { + data.insert(field, value); + } + } + Ok(Some(Self::single(data))) + } + + pub fn single(version: RawMap<'doc>) -> Self { + let vectors_id = version.get_index(RESERVED_VECTORS_FIELD_NAME); + let geo_id = version.get_index("_geo"); + let mut data = version.into_vec(); + let geo = geo_id.map(|geo_id| data.remove(geo_id).1); + let vectors = vectors_id.map(|vectors_id| data.remove(vectors_id).1); + + let data = data.into_bump_slice(); + + Self { data, geo, vectors } + } + + pub fn iter_top_level_fields(&self) -> impl Iterator> { + self.data.iter().copied() + } + + pub fn vectors_field(&self) -> Option<&'doc RawValue> { + self.vectors + } + + pub fn geo_field(&self) -> Option<&'doc RawValue> { + self.geo + } +} diff --git a/milli/src/update/new/document_change.rs b/milli/src/update/new/document_change.rs index 63b878854..f277637d5 100644 --- a/milli/src/update/new/document_change.rs +++ b/milli/src/update/new/document_change.rs @@ -1,5 +1,4 @@ use heed::RoTxn; -use serde_json::value::RawValue; use super::document::{DocumentFromDb, DocumentFromVersions, MergedDocument}; use crate::documents::FieldIdMapper; @@ -138,11 +137,3 @@ impl<'doc> Update<'doc> { } } } - -pub type Entry<'doc> = (&'doc str, &'doc RawValue); - -#[derive(Clone, Copy)] -pub enum Versions<'doc> { - Single(&'doc [Entry<'doc>]), - Multiple(&'doc [&'doc [Entry<'doc>]]), -} diff --git a/milli/src/update/new/indexer/document_operation.rs b/milli/src/update/new/indexer/document_operation.rs index ee4517e20..007b56643 100644 --- a/milli/src/update/new/indexer/document_operation.rs +++ b/milli/src/update/new/indexer/document_operation.rs @@ -2,7 +2,6 @@ use bumpalo::collections::CollectIn; use bumpalo::Bump; use heed::RoTxn; use memmap2::Mmap; -use rayon::iter::IntoParallelIterator; use rayon::slice::ParallelSlice; use serde_json::value::RawValue; use IndexDocumentsMethod as Idm; @@ -10,8 +9,7 @@ use IndexDocumentsMethod as Idm; use super::super::document_change::DocumentChange; use super::document_changes::{DocumentChangeContext, DocumentChanges, MostlySend}; use crate::documents::PrimaryKey; -use crate::update::new::document::DocumentFromVersions; -use crate::update::new::document_change::Versions; +use crate::update::new::document::{DocumentFromVersions, Versions}; use crate::update::new::{Deletion, Insertion, Update}; use crate::update::{AvailableIds, IndexDocumentsMethod}; use crate::{DocumentId, Error, FieldsIdsMap, Index, Result, UserError}; @@ -291,8 +289,7 @@ impl MergeChanges for MergeDocumentForReplacement { let document = raw_collections::RawMap::from_raw_value(document, doc_alloc) .map_err(UserError::SerdeJson)?; - let document = document.into_bump_slice(); - let document = DocumentFromVersions::new(Versions::Single(document)); + let document = DocumentFromVersions::new(Versions::single(document)); if is_new { Ok(Some(DocumentChange::Insertion(Insertion::create( @@ -365,30 +362,40 @@ impl MergeChanges for 
MergeDocumentForUpdates { }; } - let mut versions = bumpalo::collections::Vec::with_capacity_in(operations.len(), doc_alloc); + let versions = match operations { + [single] => { + let DocumentOffset { content } = match single { + InnerDocOp::Addition(offset) => offset, + InnerDocOp::Deletion => { + unreachable!("Deletion in document operations") + } + }; + let document = serde_json::from_slice(content).unwrap(); + let document = raw_collections::RawMap::from_raw_value(document, doc_alloc) + .map_err(UserError::SerdeJson)?; - for operation in operations { - let DocumentOffset { content } = match operation { - InnerDocOp::Addition(offset) => offset, - InnerDocOp::Deletion => { - unreachable!("Deletion in document operations") - } - }; + Some(Versions::single(document)) + } + operations => { + let versions = operations.iter().map(|operation| { + let DocumentOffset { content } = match operation { + InnerDocOp::Addition(offset) => offset, + InnerDocOp::Deletion => { + unreachable!("Deletion in document operations") + } + }; - let document = serde_json::from_slice(content).unwrap(); - let document = raw_collections::RawMap::from_raw_value(document, doc_alloc) - .map_err(UserError::SerdeJson)?; - - let document = document.into_bump_slice(); - versions.push(document); - } - - let versions = versions.into_bump_slice(); - let versions = match versions { - [single] => Versions::Single(single), - versions => Versions::Multiple(versions), + let document = serde_json::from_slice(content).unwrap(); + let document = raw_collections::RawMap::from_raw_value(document, doc_alloc) + .map_err(UserError::SerdeJson)?; + Ok(document) + }); + Versions::multiple(versions)? + } }; + let Some(versions) = versions else { return Ok(None) }; + let document = DocumentFromVersions::new(versions); if is_new { From 1a3f4e719d2e21d4298bdee0eaf1125cdce2048b Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 21 Oct 2024 10:38:21 +0200 Subject: [PATCH 161/247] Vector document trait --- milli/src/update/new/mod.rs | 1 + milli/src/update/new/vector_document.rs | 134 ++++++++++++++++++++++++ 2 files changed, 135 insertions(+) create mode 100644 milli/src/update/new/vector_document.rs diff --git a/milli/src/update/new/mod.rs b/milli/src/update/new/mod.rs index 37ccc75cd..6b59b5b59 100644 --- a/milli/src/update/new/mod.rs +++ b/milli/src/update/new/mod.rs @@ -12,6 +12,7 @@ pub mod indexer; mod merger; mod parallel_iterator_ext; mod top_level_map; +pub mod vector_document; mod word_fst_builder; mod words_prefix_docids; diff --git a/milli/src/update/new/vector_document.rs b/milli/src/update/new/vector_document.rs new file mode 100644 index 000000000..375d4f2ce --- /dev/null +++ b/milli/src/update/new/vector_document.rs @@ -0,0 +1,134 @@ +use bumpalo::Bump; +use heed::RoTxn; +use raw_collections::RawMap; +use serde::Serialize; +use serde_json::value::RawValue; + +use super::document::{Document, DocumentFromDb}; +use crate::documents::FieldIdMapper; +use crate::index::IndexEmbeddingConfig; +use crate::vector::parsed_vectors::RawVectors; +use crate::vector::Embedding; +use crate::{DocumentId, Index, InternalError, Result}; + +#[derive(Serialize)] +#[serde(untagged)] +pub enum Embeddings<'doc> { + FromJson(&'doc RawValue), + FromDb(Vec), +} + +pub struct VectorEntry<'doc> { + pub has_configured_embedder: bool, + pub embeddings: Option>, + pub regenerate: bool, +} + +pub trait VectorDocument<'doc> { + fn iter_vectors(&self) -> impl Iterator)>>; + + fn vectors_for_key(&self, key: &str) -> Result>>; +} + +pub struct 
VectorDocumentFromDb<'t> { + docid: DocumentId, + embedding_config: Vec, + index: &'t Index, + vectors_field: Option>, + rtxn: &'t RoTxn<'t>, + doc_alloc: &'t Bump, +} + +impl<'t> VectorDocumentFromDb<'t> { + pub fn new( + docid: DocumentId, + index: &'t Index, + rtxn: &'t RoTxn, + db_fields_ids_map: &'t Mapper, + doc_alloc: &'t Bump, + ) -> Result { + let document = DocumentFromDb::new(docid, rtxn, index, db_fields_ids_map)?.unwrap(); + let vectors = document.vectors_field()?; + let vectors_field = match vectors { + Some(vectors) => { + Some(RawMap::from_raw_value(vectors, doc_alloc).map_err(InternalError::SerdeJson)?) + } + None => None, + }; + + let embedding_config = index.embedding_configs(rtxn)?; + + Ok(Self { docid, embedding_config, index, vectors_field, rtxn, doc_alloc }) + } + + fn entry_from_db( + &self, + embedder_id: u8, + config: &IndexEmbeddingConfig, + ) -> Result> { + let readers = self.index.arroy_readers(self.rtxn, embedder_id, config.config.quantized()); + let mut vectors = Vec::new(); + for reader in readers { + let reader = reader?; + let Some(vector) = reader.item_vector(self.rtxn, self.docid)? else { + break; + }; + + vectors.push(vector); + } + Ok(VectorEntry { + has_configured_embedder: true, + embeddings: Some(Embeddings::FromDb(vectors)), + regenerate: !config.user_provided.contains(self.docid), + }) + } +} + +impl<'t> VectorDocument<'t> for VectorDocumentFromDb<'t> { + fn iter_vectors(&self) -> impl Iterator)>> { + self.embedding_config + .iter() + .map(|config| { + let embedder_id = + self.index.embedder_category_id.get(self.rtxn, &config.name)?.unwrap(); + let entry = self.entry_from_db(embedder_id, config)?; + let config_name = self.doc_alloc.alloc_str(config.name.as_str()); + Ok((&*config_name, entry)) + }) + .chain(self.vectors_field.iter().map(|map| map.iter()).flatten().map( + |(name, value)| { + Ok(( + name.as_ref(), + entry_from_raw_value(value).map_err(InternalError::SerdeJson)?, + )) + }, + )) + } + + fn vectors_for_key(&self, key: &str) -> Result>> { + Ok(match self.index.embedder_category_id.get(self.rtxn, key)? { + Some(embedder_id) => { + let config = + self.embedding_config.iter().find(|config| config.name == key).unwrap(); + Some(self.entry_from_db(embedder_id, config)?) 
+ } + None => match self.vectors_field.as_ref().and_then(|obkv| obkv.get(key)) { + Some(embedding_from_doc) => Some( + entry_from_raw_value(embedding_from_doc).map_err(InternalError::SerdeJson)?, + ), + None => None, + }, + }) + } +} + +fn entry_from_raw_value( + value: &RawValue, +) -> std::result::Result, serde_json::Error> { + let value: RawVectors = serde_json::from_str(value.get())?; + Ok(VectorEntry { + has_configured_embedder: false, + embeddings: value.embeddings().map(|embeddings| Embeddings::FromJson(embeddings)), + regenerate: value.must_regenerate(), + }) +} From aff8ca4397c5e1447f2ade2903c5f0ac910c7c5c Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 21 Oct 2024 10:39:05 +0200 Subject: [PATCH 162/247] Add raw versions of parsed vectors --- milli/src/vector/parsed_vectors.rs | 33 ++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/milli/src/vector/parsed_vectors.rs b/milli/src/vector/parsed_vectors.rs index 8e5ccf690..526516fef 100644 --- a/milli/src/vector/parsed_vectors.rs +++ b/milli/src/vector/parsed_vectors.rs @@ -2,6 +2,7 @@ use std::collections::{BTreeMap, BTreeSet}; use deserr::{take_cf_content, DeserializeError, Deserr, Sequence}; use obkv::KvReader; +use serde_json::value::RawValue; use serde_json::{from_slice, Value}; use super::Embedding; @@ -11,6 +12,13 @@ use crate::{DocumentId, FieldId, InternalError, UserError}; pub const RESERVED_VECTORS_FIELD_NAME: &str = "_vectors"; +#[derive(serde::Serialize, serde::Deserialize, Debug)] +#[serde(untagged)] +pub enum RawVectors<'doc> { + Explicit(#[serde(borrow)] RawExplicitVectors<'doc>), + ImplicitlyUserProvided(#[serde(borrow)] &'doc RawValue), +} + #[derive(serde::Serialize, Debug)] #[serde(untagged)] pub enum Vectors { @@ -69,6 +77,22 @@ impl Vectors { } } +impl<'doc> RawVectors<'doc> { + pub fn must_regenerate(&self) -> bool { + match self { + RawVectors::ImplicitlyUserProvided(_) => false, + RawVectors::Explicit(RawExplicitVectors { regenerate, .. 
}) => *regenerate, + } + } + + pub fn embeddings(&self) -> Option<&'doc RawValue> { + match self { + RawVectors::ImplicitlyUserProvided(embeddings) => Some(embeddings), + RawVectors::Explicit(RawExplicitVectors { embeddings, regenerate: _ }) => *embeddings, + } + } +} + #[derive(serde::Serialize, Deserr, Debug)] #[serde(rename_all = "camelCase")] pub struct ExplicitVectors { @@ -78,6 +102,15 @@ pub struct ExplicitVectors { pub regenerate: bool, } +#[derive(serde::Serialize, serde::Deserialize, Debug)] +#[serde(rename_all = "camelCase")] +pub struct RawExplicitVectors<'doc> { + #[serde(borrow)] + #[serde(default)] + pub embeddings: Option<&'doc RawValue>, + pub regenerate: bool, +} + pub enum VectorState { Inline(Vectors), Manual, From 9fe5122176cb1863943b323e598e83d8e7fe81d2 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 21 Oct 2024 10:39:31 +0200 Subject: [PATCH 163/247] Fixup imports --- .../update/new/extract/searchable/tokenize_document.rs | 5 +++-- milli/src/update/new/indexer/partial_dump.rs | 10 +++------- milli/src/update/new/indexer/update_by_function.rs | 8 +++----- 3 files changed, 9 insertions(+), 14 deletions(-) diff --git a/milli/src/update/new/extract/searchable/tokenize_document.rs b/milli/src/update/new/extract/searchable/tokenize_document.rs index b8fd24f1b..5428907f8 100644 --- a/milli/src/update/new/extract/searchable/tokenize_document.rs +++ b/milli/src/update/new/extract/searchable/tokenize_document.rs @@ -171,12 +171,12 @@ mod test { use bumpalo::Bump; use charabia::TokenizerBuilder; use meili_snap::snapshot; - use raw_collections::RawMap; use serde_json::json; use serde_json::value::RawValue; use super::*; + use crate::update::new::document::{DocumentFromVersions, Versions}; use crate::FieldsIdsMap; #[test] @@ -222,7 +222,8 @@ mod test { let bump = Bump::new(); let document: &RawValue = serde_json::from_str(&document).unwrap(); let document = RawMap::from_raw_value(document, &bump).unwrap(); - let document = document.into_bump_slice(); + + let document = DocumentFromVersions::new(Versions::single(document)); document_tokenizer .tokenize_document( diff --git a/milli/src/update/new/indexer/partial_dump.rs b/milli/src/update/new/indexer/partial_dump.rs index 470dbc9d5..60cb627e9 100644 --- a/milli/src/update/new/indexer/partial_dump.rs +++ b/milli/src/update/new/indexer/partial_dump.rs @@ -1,15 +1,12 @@ use std::ops::DerefMut; use rayon::iter::IndexedParallelIterator; -use serde::Deserializer; use serde_json::value::RawValue; -use super::de::FieldAndDocidExtractor; use super::document_changes::{DocumentChangeContext, DocumentChanges, MostlySend, RefCellExt}; -use crate::documents::{DocumentIdExtractionError, PrimaryKey}; +use crate::documents::PrimaryKey; use crate::update::concurrent_available_ids::ConcurrentAvailableIds; -use crate::update::new::document::DocumentFromVersions; -use crate::update::new::document_change::Versions; +use crate::update::new::document::{DocumentFromVersions, Versions}; use crate::update::new::{DocumentChange, Insertion}; use crate::{Error, InternalError, Result, UserError}; @@ -79,8 +76,7 @@ where let document = raw_collections::RawMap::from_raw_value(document, doc_alloc) .map_err(InternalError::SerdeJson)?; - let document = document.into_bump_slice(); - let document = DocumentFromVersions::new(Versions::Single(document)); + let document = DocumentFromVersions::new(Versions::single(document)); let insertion = Insertion::create(docid, external_document_id, document); Ok(Some(DocumentChange::Insertion(insertion))) diff --git 
a/milli/src/update/new/indexer/update_by_function.rs b/milli/src/update/new/indexer/update_by_function.rs index 4c65edcc3..cff0e02fc 100644 --- a/milli/src/update/new/indexer/update_by_function.rs +++ b/milli/src/update/new/indexer/update_by_function.rs @@ -9,8 +9,7 @@ use super::DocumentChanges; use crate::documents::Error::InvalidDocumentFormat; use crate::documents::PrimaryKey; use crate::error::{FieldIdMapMissingEntry, InternalError}; -use crate::update::new::document::DocumentFromVersions; -use crate::update::new::document_change::Versions; +use crate::update::new::document::{DocumentFromVersions, Versions}; use crate::update::new::{Deletion, DocumentChange, KvReaderFieldId, Update}; use crate::{all_obkv_to_json, Error, FieldsIdsMap, Object, Result, UserError}; @@ -161,9 +160,8 @@ impl<'index> DocumentChanges<'index> for UpdateByFunctionChanges<'index> { } else { let raw_new_doc = RawMap::from_raw_value(raw_new_doc, doc_alloc) .map_err(InternalError::SerdeJson)?; - let new_doc_version = DocumentFromVersions::new(Versions::Single( - raw_new_doc.into_bump_slice(), - )); + let new_doc_version = + DocumentFromVersions::new(Versions::single(raw_new_doc)); Ok(Some(DocumentChange::Update(Update::create( docid, new_document_id, From 89243f7df0b36af5284175bbf0c546e6b4726889 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 21 Oct 2024 10:39:40 +0200 Subject: [PATCH 164/247] WIP vector extraction --- milli/src/update/new/extract/mod.rs | 1 + milli/src/update/new/indexer/mod.rs | 13 +++++++++++++ 2 files changed, 14 insertions(+) diff --git a/milli/src/update/new/extract/mod.rs b/milli/src/update/new/extract/mod.rs index 1c86d80af..5a63dccfa 100644 --- a/milli/src/update/new/extract/mod.rs +++ b/milli/src/update/new/extract/mod.rs @@ -2,6 +2,7 @@ mod cache; mod faceted; mod lru; mod searchable; +mod vectors; use std::cell::RefCell; use std::fs::File; diff --git a/milli/src/update/new/indexer/mod.rs b/milli/src/update/new/indexer/mod.rs index d4e6ca6a6..0fc7940bb 100644 --- a/milli/src/update/new/indexer/mod.rs +++ b/milli/src/update/new/indexer/mod.rs @@ -213,6 +213,19 @@ where )?; } + 'vectors: { + let span = tracing::trace_span!(target: "indexing::documents::extract", "vectors"); + let _entered = span.enter(); + + let index_embeddings = index.embedding_configs(&rtxn)?; + if index_embeddings.is_empty() { + break 'vectors; + } + for index_embedding in index_embeddings { + + } + } + { let span = tracing::trace_span!(target: "indexing::documents::extract", "FINISH"); let _entered = span.enter(); From 50de3fba7b22f34744ed9955961bf0c5fffc4a69 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 28 Oct 2024 14:07:23 +0100 Subject: [PATCH 165/247] Update raw-collections --- Cargo.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index 2597e1f6b..4c40d249e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4434,7 +4434,7 @@ dependencies = [ [[package]] name = "raw-collections" version = "0.1.0" -source = "git+https://github.com/dureuill/raw-collections.git#c82184f144aef3ef7cd3992868a97b586fc94cfd" +source = "git+https://github.com/dureuill/raw-collections.git#4ab9619207632c20f4e0c2e126d9d909cc58ef65" dependencies = [ "allocator-api2", "bumpalo", From c22dc556945835cd73af4c6f3d29d0a33a1cf1a4 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 28 Oct 2024 14:08:54 +0100 Subject: [PATCH 166/247] Add embed_chunks_ref --- milli/src/vector/hf.rs | 35 +++++++++++++++++++------ milli/src/vector/manual.rs | 19 +++++++++----- milli/src/vector/mod.rs | 38 
++++++++++++++++------------ milli/src/vector/ollama.rs | 36 ++++++++++++++++++++++---- milli/src/vector/openai.rs | 49 +++++++++++++++++++++++++---------- milli/src/vector/rest.rs | 52 +++++++++++++++++++++++++------------- 6 files changed, 163 insertions(+), 66 deletions(-) diff --git a/milli/src/vector/hf.rs b/milli/src/vector/hf.rs index dc1e7d324..ea892ca57 100644 --- a/milli/src/vector/hf.rs +++ b/milli/src/vector/hf.rs @@ -7,7 +7,7 @@ use hf_hub::{Repo, RepoType}; use tokenizers::{PaddingParams, Tokenizer}; pub use super::error::{EmbedError, Error, NewEmbedderError}; -use super::{DistributionShift, Embedding, Embeddings}; +use super::{DistributionShift, Embedding}; #[derive( Debug, @@ -139,15 +139,12 @@ impl Embedder { let embeddings = this .embed(vec!["test".into()]) .map_err(NewEmbedderError::could_not_determine_dimension)?; - this.dimensions = embeddings.first().unwrap().dimension(); + this.dimensions = embeddings.first().unwrap().len(); Ok(this) } - pub fn embed( - &self, - mut texts: Vec, - ) -> std::result::Result>, EmbedError> { + pub fn embed(&self, mut texts: Vec) -> std::result::Result, EmbedError> { let tokens = match texts.len() { 1 => vec![self .tokenizer @@ -177,13 +174,31 @@ impl Embedder { .map_err(EmbedError::tensor_shape)?; let embeddings: Vec = embeddings.to_vec2().map_err(EmbedError::tensor_shape)?; - Ok(embeddings.into_iter().map(Embeddings::from_single_embedding).collect()) + Ok(embeddings) + } + + pub fn embed_one(&self, text: &str) -> std::result::Result { + let tokens = self.tokenizer.encode(text, true).map_err(EmbedError::tokenize)?; + let token_ids = tokens.get_ids(); + let token_ids = if token_ids.len() > 512 { &token_ids[..512] } else { token_ids }; + let token_ids = + Tensor::new(token_ids, &self.model.device).map_err(EmbedError::tensor_shape)?; + let token_type_ids = token_ids.zeros_like().map_err(EmbedError::tensor_shape)?; + let embeddings = + self.model.forward(&token_ids, &token_type_ids).map_err(EmbedError::model_forward)?; + + // Apply some avg-pooling by taking the mean embedding value for all tokens (including padding) + let (n_tokens, _hidden_size) = embeddings.dims2().map_err(EmbedError::tensor_shape)?; + let embedding = (embeddings.sum(0).map_err(EmbedError::tensor_value)? 
/ (n_tokens as f64)) + .map_err(EmbedError::tensor_shape)?; + let embedding: Embedding = embedding.to_vec1().map_err(EmbedError::tensor_shape)?; + Ok(embedding) } pub fn embed_chunks( &self, text_chunks: Vec>, - ) -> std::result::Result>>, EmbedError> { + ) -> std::result::Result>, EmbedError> { text_chunks.into_iter().map(|prompts| self.embed(prompts)).collect() } @@ -211,4 +226,8 @@ impl Embedder { } }) } + + pub(crate) fn embed_chunks_ref(&self, texts: &[&str]) -> Result, EmbedError> { + texts.iter().map(|text| self.embed_one(text)).collect() + } } diff --git a/milli/src/vector/manual.rs b/milli/src/vector/manual.rs index 4cfbb0d3c..8c2ef97b2 100644 --- a/milli/src/vector/manual.rs +++ b/milli/src/vector/manual.rs @@ -1,5 +1,6 @@ use super::error::EmbedError; -use super::{DistributionShift, Embeddings}; +use super::DistributionShift; +use crate::vector::Embedding; #[derive(Debug, Clone, Copy)] pub struct Embedder { @@ -18,11 +19,13 @@ impl Embedder { Self { dimensions: options.dimensions, distribution: options.distribution } } - pub fn embed(&self, mut texts: Vec) -> Result>, EmbedError> { - let Some(text) = texts.pop() else { return Ok(Default::default()) }; - Err(EmbedError::embed_on_manual_embedder(text.chars().take(250).collect())) + pub fn embed>(&self, texts: &[S]) -> Result, EmbedError> { + texts.as_ref().iter().map(|text| self.embed_one(text)).collect() } + pub fn embed_one>(&self, text: S) -> Result { + Err(EmbedError::embed_on_manual_embedder(text.as_ref().chars().take(250).collect())) + } pub fn dimensions(&self) -> usize { self.dimensions } @@ -30,11 +33,15 @@ impl Embedder { pub fn embed_chunks( &self, text_chunks: Vec>, - ) -> Result>>, EmbedError> { - text_chunks.into_iter().map(|prompts| self.embed(prompts)).collect() + ) -> Result>, EmbedError> { + text_chunks.into_iter().map(|prompts| self.embed(&prompts)).collect() } pub fn distribution(&self) -> Option { self.distribution } + + pub(crate) fn embed_chunks_ref(&self, texts: &[&str]) -> Result, EmbedError> { + texts.iter().map(|text| self.embed_one(text)).collect() + } } diff --git a/milli/src/vector/mod.rs b/milli/src/vector/mod.rs index d52e68bbe..2e9a498c0 100644 --- a/milli/src/vector/mod.rs +++ b/milli/src/vector/mod.rs @@ -376,28 +376,20 @@ impl Embedder { /// Embed one or multiple texts. /// /// Each text can be embedded as one or multiple embeddings. 
- pub fn embed( - &self, - texts: Vec, - ) -> std::result::Result>, EmbedError> { + pub fn embed(&self, texts: Vec) -> std::result::Result, EmbedError> { match self { Embedder::HuggingFace(embedder) => embedder.embed(texts), - Embedder::OpenAi(embedder) => embedder.embed(texts), - Embedder::Ollama(embedder) => embedder.embed(texts), - Embedder::UserProvided(embedder) => embedder.embed(texts), + Embedder::OpenAi(embedder) => embedder.embed(&texts), + Embedder::Ollama(embedder) => embedder.embed(&texts), + Embedder::UserProvided(embedder) => embedder.embed(&texts), Embedder::Rest(embedder) => embedder.embed(texts), } } pub fn embed_one(&self, text: String) -> std::result::Result { - let mut embeddings = self.embed(vec![text])?; - let embeddings = embeddings.pop().ok_or_else(EmbedError::missing_embedding)?; - Ok(if embeddings.iter().nth(1).is_some() { - tracing::warn!("Ignoring embeddings past the first one in long search query"); - embeddings.iter().next().unwrap().to_vec() - } else { - embeddings.into_inner() - }) + let mut embedding = self.embed(vec![text])?; + let embedding = embedding.pop().ok_or_else(EmbedError::missing_embedding)?; + Ok(embedding) } /// Embed multiple chunks of texts. @@ -407,7 +399,7 @@ impl Embedder { &self, text_chunks: Vec>, threads: &ThreadPoolNoAbort, - ) -> std::result::Result>>, EmbedError> { + ) -> std::result::Result>, EmbedError> { match self { Embedder::HuggingFace(embedder) => embedder.embed_chunks(text_chunks), Embedder::OpenAi(embedder) => embedder.embed_chunks(text_chunks, threads), @@ -417,6 +409,20 @@ impl Embedder { } } + pub fn embed_chunks_ref( + &self, + texts: &[&str], + threads: &ThreadPoolNoAbort, + ) -> std::result::Result, EmbedError> { + match self { + Embedder::HuggingFace(embedder) => embedder.embed_chunks_ref(texts), + Embedder::OpenAi(embedder) => embedder.embed_chunks_ref(texts, threads), + Embedder::Ollama(embedder) => embedder.embed_chunks_ref(texts, threads), + Embedder::UserProvided(embedder) => embedder.embed_chunks_ref(texts), + Embedder::Rest(embedder) => embedder.embed_chunks_ref(texts, threads), + } + } + /// Indicates the preferred number of chunks to pass to [`Self::embed_chunks`] pub fn chunk_count_hint(&self) -> usize { match self { diff --git a/milli/src/vector/ollama.rs b/milli/src/vector/ollama.rs index 7d41ab4e9..65fd05416 100644 --- a/milli/src/vector/ollama.rs +++ b/milli/src/vector/ollama.rs @@ -1,9 +1,11 @@ use rayon::iter::{IntoParallelIterator as _, ParallelIterator as _}; +use rayon::slice::ParallelSlice as _; use super::error::{EmbedError, EmbedErrorKind, NewEmbedderError, NewEmbedderErrorKind}; use super::rest::{Embedder as RestEmbedder, EmbedderOptions as RestEmbedderOptions}; -use super::{DistributionShift, Embeddings}; +use super::DistributionShift; use crate::error::FaultSource; +use crate::vector::Embedding; use crate::ThreadPoolNoAbort; #[derive(Debug)] @@ -75,8 +77,11 @@ impl Embedder { Ok(Self { rest_embedder }) } - pub fn embed(&self, texts: Vec) -> Result>, EmbedError> { - match self.rest_embedder.embed(texts) { + pub fn embed + serde::Serialize>( + &self, + texts: &[S], + ) -> Result, EmbedError> { + match self.rest_embedder.embed_ref(texts) { Ok(embeddings) => Ok(embeddings), Err(EmbedError { kind: EmbedErrorKind::RestOtherStatusCode(404, error), fault: _ }) => { Err(EmbedError::ollama_model_not_found(error)) @@ -89,10 +94,31 @@ impl Embedder { &self, text_chunks: Vec>, threads: &ThreadPoolNoAbort, - ) -> Result>>, EmbedError> { + ) -> Result>, EmbedError> { threads .install(move || { - 
text_chunks.into_par_iter().map(move |chunk| self.embed(chunk)).collect() + text_chunks.into_par_iter().map(move |chunk| self.embed(&chunk)).collect() + }) + .map_err(|error| EmbedError { + kind: EmbedErrorKind::PanicInThreadPool(error), + fault: FaultSource::Bug, + })? + } + + pub(crate) fn embed_chunks_ref( + &self, + texts: &[&str], + threads: &ThreadPoolNoAbort, + ) -> Result>, EmbedError> { + threads + .install(move || { + let embeddings: Result>, _> = texts + .par_chunks(self.chunk_count_hint()) + .map(move |chunk| self.embed(chunk)) + .collect(); + + let embeddings = embeddings?; + Ok(embeddings.into_iter().flatten().collect()) }) .map_err(|error| EmbedError { kind: EmbedErrorKind::PanicInThreadPool(error), diff --git a/milli/src/vector/openai.rs b/milli/src/vector/openai.rs index 152d1fb7a..466fd1660 100644 --- a/milli/src/vector/openai.rs +++ b/milli/src/vector/openai.rs @@ -1,11 +1,13 @@ use ordered_float::OrderedFloat; use rayon::iter::{IntoParallelIterator, ParallelIterator as _}; +use rayon::slice::ParallelSlice as _; use super::error::{EmbedError, NewEmbedderError}; use super::rest::{Embedder as RestEmbedder, EmbedderOptions as RestEmbedderOptions}; -use super::{DistributionShift, Embeddings}; +use super::DistributionShift; use crate::error::FaultSource; use crate::vector::error::EmbedErrorKind; +use crate::vector::Embedding; use crate::ThreadPoolNoAbort; #[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)] @@ -206,22 +208,26 @@ impl Embedder { Ok(Self { options, rest_embedder, tokenizer }) } - pub fn embed(&self, texts: Vec) -> Result>, EmbedError> { - match self.rest_embedder.embed_ref(&texts) { + pub fn embed + serde::Serialize>( + &self, + texts: &[S], + ) -> Result, EmbedError> { + match self.rest_embedder.embed_ref(texts) { Ok(embeddings) => Ok(embeddings), Err(EmbedError { kind: EmbedErrorKind::RestBadRequest(error, _), fault: _ }) => { tracing::warn!(error=?error, "OpenAI: received `BAD_REQUEST`. Input was maybe too long, retrying on tokenized version. 
For best performance, limit the size of your document template."); - self.try_embed_tokenized(&texts) + self.try_embed_tokenized(texts) } Err(error) => Err(error), } } - fn try_embed_tokenized(&self, text: &[String]) -> Result>, EmbedError> { + fn try_embed_tokenized>(&self, text: &[S]) -> Result, EmbedError> { let mut all_embeddings = Vec::with_capacity(text.len()); for text in text { + let text = text.as_ref(); let max_token_count = self.options.embedding_model.max_token(); - let encoded = self.tokenizer.encode_ordinary(text.as_str()); + let encoded = self.tokenizer.encode_ordinary(text); let len = encoded.len(); if len < max_token_count { all_embeddings.append(&mut self.rest_embedder.embed_ref(&[text])?); @@ -229,14 +235,10 @@ impl Embedder { } let tokens = &encoded.as_slice()[0..max_token_count]; - let mut embeddings_for_prompt = Embeddings::new(self.dimensions()); let embedding = self.rest_embedder.embed_tokens(tokens)?; - embeddings_for_prompt.append(embedding.into_inner()).map_err(|got| { - EmbedError::rest_unexpected_dimension(self.dimensions(), got.len()) - })?; - all_embeddings.push(embeddings_for_prompt); + all_embeddings.push(embedding); } Ok(all_embeddings) } @@ -245,10 +247,31 @@ impl Embedder { &self, text_chunks: Vec>, threads: &ThreadPoolNoAbort, - ) -> Result>>, EmbedError> { + ) -> Result>, EmbedError> { threads .install(move || { - text_chunks.into_par_iter().map(move |chunk| self.embed(chunk)).collect() + text_chunks.into_par_iter().map(move |chunk| self.embed(&chunk)).collect() + }) + .map_err(|error| EmbedError { + kind: EmbedErrorKind::PanicInThreadPool(error), + fault: FaultSource::Bug, + })? + } + + pub(crate) fn embed_chunks_ref( + &self, + texts: &[&str], + threads: &ThreadPoolNoAbort, + ) -> Result>, EmbedError> { + threads + .install(move || { + let embeddings: Result>, _> = texts + .par_chunks(self.chunk_count_hint()) + .map(move |chunk| self.embed(chunk)) + .collect(); + + let embeddings = embeddings?; + Ok(embeddings.into_iter().flatten().collect()) }) .map_err(|error| EmbedError { kind: EmbedErrorKind::PanicInThreadPool(error), diff --git a/milli/src/vector/rest.rs b/milli/src/vector/rest.rs index 2538f2fff..dc2ab95f9 100644 --- a/milli/src/vector/rest.rs +++ b/milli/src/vector/rest.rs @@ -3,13 +3,12 @@ use std::collections::BTreeMap; use deserr::Deserr; use rand::Rng; use rayon::iter::{IntoParallelIterator as _, ParallelIterator as _}; +use rayon::slice::ParallelSlice as _; use serde::{Deserialize, Serialize}; use super::error::EmbedErrorKind; use super::json_template::ValueTemplate; -use super::{ - DistributionShift, EmbedError, Embedding, Embeddings, NewEmbedderError, REQUEST_PARALLELISM, -}; +use super::{DistributionShift, EmbedError, Embedding, NewEmbedderError, REQUEST_PARALLELISM}; use crate::error::FaultSource; use crate::ThreadPoolNoAbort; @@ -154,18 +153,18 @@ impl Embedder { Ok(Self { data, dimensions, distribution: options.distribution }) } - pub fn embed(&self, texts: Vec) -> Result>, EmbedError> { + pub fn embed(&self, texts: Vec) -> Result, EmbedError> { embed(&self.data, texts.as_slice(), texts.len(), Some(self.dimensions)) } - pub fn embed_ref(&self, texts: &[S]) -> Result>, EmbedError> + pub fn embed_ref(&self, texts: &[S]) -> Result, EmbedError> where S: AsRef + Serialize, { embed(&self.data, texts, texts.len(), Some(self.dimensions)) } - pub fn embed_tokens(&self, tokens: &[usize]) -> Result, EmbedError> { + pub fn embed_tokens(&self, tokens: &[usize]) -> Result { let mut embeddings = embed(&self.data, tokens, 1, 
Some(self.dimensions))?; // unwrap: guaranteed that embeddings.len() == 1, otherwise the previous line terminated in error Ok(embeddings.pop().unwrap()) @@ -175,7 +174,7 @@ impl Embedder { &self, text_chunks: Vec>, threads: &ThreadPoolNoAbort, - ) -> Result>>, EmbedError> { + ) -> Result>, EmbedError> { threads .install(move || { text_chunks.into_par_iter().map(move |chunk| self.embed(chunk)).collect() @@ -186,6 +185,27 @@ impl Embedder { })? } + pub(crate) fn embed_chunks_ref( + &self, + texts: &[&str], + threads: &ThreadPoolNoAbort, + ) -> Result, EmbedError> { + threads + .install(move || { + let embeddings: Result>, _> = texts + .par_chunks(self.chunk_count_hint()) + .map(move |chunk| self.embed_ref(chunk)) + .collect(); + + let embeddings = embeddings?; + Ok(embeddings.into_iter().flatten().collect()) + }) + .map_err(|error| EmbedError { + kind: EmbedErrorKind::PanicInThreadPool(error), + fault: FaultSource::Bug, + })? + } + pub fn chunk_count_hint(&self) -> usize { super::REQUEST_PARALLELISM } @@ -210,7 +230,7 @@ fn infer_dimensions(data: &EmbedderData) -> Result { let v = embed(data, ["test"].as_slice(), 1, None) .map_err(NewEmbedderError::could_not_determine_dimension)?; // unwrap: guaranteed that v.len() == 1, otherwise the previous line terminated in error - Ok(v.first().unwrap().dimension()) + Ok(v.first().unwrap().len()) } fn embed( @@ -218,7 +238,7 @@ fn embed( inputs: &[S], expected_count: usize, expected_dimension: Option, -) -> Result>, EmbedError> +) -> Result, EmbedError> where S: Serialize, { @@ -304,7 +324,7 @@ fn response_to_embedding( data: &EmbedderData, expected_count: usize, expected_dimensions: Option, -) -> Result>, EmbedError> { +) -> Result, EmbedError> { let response: serde_json::Value = response.into_json().map_err(EmbedError::rest_response_deserialization)?; @@ -316,11 +336,8 @@ fn response_to_embedding( if let Some(dimensions) = expected_dimensions { for embedding in &embeddings { - if embedding.dimension() != dimensions { - return Err(EmbedError::rest_unexpected_dimension( - dimensions, - embedding.dimension(), - )); + if embedding.len() != dimensions { + return Err(EmbedError::rest_unexpected_dimension(dimensions, embedding.len())); } } } @@ -394,7 +411,7 @@ impl Response { pub fn extract_embeddings( &self, response: serde_json::Value, - ) -> Result>, EmbedError> { + ) -> Result, EmbedError> { let extracted_values: Vec = match self.template.extract(response) { Ok(extracted_values) => extracted_values, Err(error) => { @@ -403,8 +420,7 @@ impl Response { return Err(EmbedError::rest_extraction_error(error_message)); } }; - let embeddings: Vec> = - extracted_values.into_iter().map(Embeddings::from_single_embedding).collect(); + let embeddings: Vec = extracted_values.into_iter().collect(); Ok(embeddings) } From 9e7c455a0177a829e1ff3a536951e51808e04f1f Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 28 Oct 2024 14:09:48 +0100 Subject: [PATCH 167/247] GlobalFieldIdMap manages metadata --- milli/src/fields_ids_map.rs | 1 + milli/src/fields_ids_map/global.rs | 61 ++++++--- milli/src/fields_ids_map/metadata.rs | 184 +++++++++++++++++++++++++++ 3 files changed, 228 insertions(+), 18 deletions(-) create mode 100644 milli/src/fields_ids_map/metadata.rs diff --git a/milli/src/fields_ids_map.rs b/milli/src/fields_ids_map.rs index af96f6a86..9a016e7bd 100644 --- a/milli/src/fields_ids_map.rs +++ b/milli/src/fields_ids_map.rs @@ -5,6 +5,7 @@ use serde::{Deserialize, Serialize}; use crate::FieldId; mod global; +pub mod metadata; pub use global::GlobalFieldsIdsMap; 
#[derive(Debug, Clone, Serialize, Deserialize)] diff --git a/milli/src/fields_ids_map/global.rs b/milli/src/fields_ids_map/global.rs index 40d7f389b..2b948b377 100644 --- a/milli/src/fields_ids_map/global.rs +++ b/milli/src/fields_ids_map/global.rs @@ -1,14 +1,15 @@ use std::collections::BTreeMap; use std::sync::RwLock; +use super::metadata::{FieldIdMapWithMetadata, Metadata}; use super::MutFieldIdMapper; use crate::documents::FieldIdMapper; -use crate::{FieldId, FieldsIdsMap}; +use crate::FieldId; /// A fields ids map that can be globally updated to add fields #[derive(Debug, Clone)] pub struct GlobalFieldsIdsMap<'indexing> { - global: &'indexing RwLock, + global: &'indexing RwLock, local: LocalFieldsIdsMap, } @@ -16,6 +17,7 @@ pub struct GlobalFieldsIdsMap<'indexing> { pub struct LocalFieldsIdsMap { names_ids: BTreeMap, ids_names: BTreeMap, + metadata: BTreeMap, } impl FieldIdMapper for LocalFieldsIdsMap { @@ -29,14 +31,19 @@ impl FieldIdMapper for LocalFieldsIdsMap { } impl LocalFieldsIdsMap { - fn new(global: &RwLock) -> Self { + fn new(global: &RwLock) -> Self { let global = global.read().unwrap(); - Self { names_ids: global.names_ids.clone(), ids_names: global.ids_names.clone() } + Self { + names_ids: global.as_fields_ids_map().names_ids.clone(), + ids_names: global.as_fields_ids_map().ids_names.clone(), + metadata: global.iter_id_metadata().collect(), + } } - fn insert(&mut self, name: &str, field_id: FieldId) { + fn insert(&mut self, name: &str, field_id: FieldId, metadata: Metadata) { self.names_ids.insert(name.to_owned(), field_id); self.ids_names.insert(field_id, name.to_owned()); + self.metadata.insert(field_id, metadata); } fn name(&self, id: FieldId) -> Option<&str> { @@ -46,41 +53,59 @@ impl LocalFieldsIdsMap { fn id(&self, name: &str) -> Option { self.names_ids.get(name).copied() } + + fn id_with_metadata(&self, name: &str) -> Option<(FieldId, Metadata)> { + let id = self.id(name)?; + Some((id, self.metadata(id).unwrap())) + } + + fn metadata(&self, id: FieldId) -> Option { + self.metadata.get(&id).copied() + } + + fn iter(&self) -> impl Iterator { + self.ids_names.iter().map(|(k, v)| (*k, v.as_str(), self.metadata.get(k).copied().unwrap())) + } } impl<'indexing> GlobalFieldsIdsMap<'indexing> { - pub fn new(global: &'indexing RwLock) -> Self { + pub fn new(global: &'indexing RwLock) -> Self { Self { local: LocalFieldsIdsMap::new(global), global } } /// Returns the field id related to a field name, it will create a new field id if the /// name is not already known. Returns `None` if the maximum field id as been reached. 
pub fn id_or_insert(&mut self, name: &str) -> Option { - if let Some(field_id) = self.local.id(name) { - return Some(field_id); + self.id_with_metadata_or_insert(name).map(|(fid, _meta)| fid) + } + + pub fn id_with_metadata_or_insert(&mut self, name: &str) -> Option<(FieldId, Metadata)> { + if let Some(entry) = self.local.id_with_metadata(name) { + return Some(entry); } { // optimistically lookup the global map let global = self.global.read().unwrap(); - if let Some(field_id) = global.id(name) { - self.local.insert(name, field_id); - return Some(field_id); + if let Some((field_id, metadata)) = global.id_with_metadata(name) { + self.local.insert(name, field_id, metadata); + return Some((field_id, metadata)); } } { let mut global = self.global.write().unwrap(); - if let Some(field_id) = global.id(name) { - self.local.insert(name, field_id); - return Some(field_id); + if let Some((field_id, metadata)) = global.id_with_metadata(name) { + self.local.insert(name, field_id, metadata); + return Some((field_id, metadata)); } let field_id = global.insert(name)?; - self.local.insert(name, field_id); - Some(field_id) + let metadata = global.metadata(field_id).unwrap(); + self.local.insert(name, field_id, metadata); + Some((field_id, metadata)) } } @@ -89,8 +114,8 @@ impl<'indexing> GlobalFieldsIdsMap<'indexing> { if self.local.name(id).is_none() { let global = self.global.read().unwrap(); - let name = global.name(id)?; - self.local.insert(name, id); + let (name, metadata) = global.name_with_metadata(id)?; + self.local.insert(name, id, metadata); } self.local.name(id) diff --git a/milli/src/fields_ids_map/metadata.rs b/milli/src/fields_ids_map/metadata.rs new file mode 100644 index 000000000..8198bd415 --- /dev/null +++ b/milli/src/fields_ids_map/metadata.rs @@ -0,0 +1,184 @@ +use std::collections::{BTreeMap, HashSet}; +use std::num::NonZeroU16; + +use charabia::Language; +use heed::RoTxn; + +use super::FieldsIdsMap; +use crate::{FieldId, Index, LocalizedAttributesRule, Result}; + +#[derive(Debug, Clone, Copy)] +pub struct Metadata { + pub searchable: bool, + pub filterable: bool, + pub sortable: bool, + localized_attributes_rule_id: Option, +} + +#[derive(Debug, Clone)] +pub struct FieldIdMapWithMetadata { + fields_ids_map: FieldsIdsMap, + builder: MetadataBuilder, + metadata: BTreeMap, +} + +impl FieldIdMapWithMetadata { + pub fn new(existing_fields_ids_map: FieldsIdsMap, builder: MetadataBuilder) -> Self { + let metadata = existing_fields_ids_map + .iter() + .map(|(id, name)| (id, builder.metadata_for_field(name))) + .collect(); + Self { fields_ids_map: existing_fields_ids_map, builder, metadata } + } + + pub fn as_fields_ids_map(&self) -> &FieldsIdsMap { + &self.fields_ids_map + } + + /// Returns the number of fields ids in the map. + pub fn len(&self) -> usize { + self.fields_ids_map.len() + } + + /// Returns `true` if the map is empty. + pub fn is_empty(&self) -> bool { + self.fields_ids_map.is_empty() + } + + /// Returns the field id related to a field name, it will create a new field id if the + /// name is not already known. Returns `None` if the maximum field id as been reached. + pub fn insert(&mut self, name: &str) -> Option { + let id = self.fields_ids_map.insert(name)?; + self.metadata.insert(id, self.builder.metadata_for_field(name)); + Some(id) + } + + /// Get the id of a field based on its name. 
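The lookup in `id_with_metadata_or_insert` above goes through three levels: the thread-local cache, an optimistic read lock on the shared map, and finally a write lock that inserts the missing field (re-checking first, since another writer may have inserted it in between). A self-contained sketch of that double-checked pattern with illustrative types only; the metadata handling of the real code is left out:

    use std::collections::BTreeMap;
    use std::sync::RwLock;

    fn id_or_insert(
        local: &mut BTreeMap<String, u16>,
        global: &RwLock<BTreeMap<String, u16>>,
        name: &str,
    ) -> u16 {
        // 1. cheapest path: the thread-local cache
        if let Some(&id) = local.get(name) {
            return id;
        }
        // 2. optimistic path: shared read lock
        if let Some(&id) = global.read().unwrap().get(name) {
            local.insert(name.to_owned(), id);
            return id;
        }
        // 3. slow path: write lock, re-check (folded into `entry`), then insert
        let mut global = global.write().unwrap();
        let next_id = global.len() as u16;
        let id = *global.entry(name.to_owned()).or_insert(next_id);
        local.insert(name.to_owned(), id);
        id
    }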
+ pub fn id(&self, name: &str) -> Option { + self.fields_ids_map.id(name) + } + + pub fn id_with_metadata(&self, name: &str) -> Option<(FieldId, Metadata)> { + let id = self.fields_ids_map.id(name)?; + Some((id, self.metadata(id).unwrap())) + } + + /// Get the name of a field based on its id. + pub fn name(&self, id: FieldId) -> Option<&str> { + self.fields_ids_map.name(id) + } + + /// Get the name of a field based on its id. + pub fn name_with_metadata(&self, id: FieldId) -> Option<(&str, Metadata)> { + let name = self.fields_ids_map.name(id)?; + Some((name, self.metadata(id).unwrap())) + } + + pub fn metadata(&self, id: FieldId) -> Option { + self.metadata.get(&id).copied() + } + + /// Iterate over the ids and names in the ids order. + pub fn iter(&self) -> impl Iterator { + self.fields_ids_map.iter().map(|(id, name)| (id, name, self.metadata(id).unwrap())) + } + + pub fn iter_id_metadata(&self) -> impl Iterator + '_ { + self.metadata.iter().map(|(k, v)| (*k, *v)) + } + + pub fn iter_metadata(&self) -> impl Iterator + '_ { + self.metadata.values().copied() + } + + pub fn metadata_builder(&self) -> &MetadataBuilder { + &self.builder + } +} + +impl Metadata { + pub fn locales<'rules>( + &self, + rules: &'rules [LocalizedAttributesRule], + ) -> Option<&'rules [Language]> { + let localized_attributes_rule_id = self.localized_attributes_rule_id?.get(); + let rule = rules.get((localized_attributes_rule_id - 1) as usize).unwrap(); + Some(rule.locales()) + } +} + +#[derive(Debug, Clone)] +pub struct MetadataBuilder { + searchable_attributes: Vec, + filterable_attributes: HashSet, + sortable_attributes: HashSet, + localized_attributes: Option>, +} + +impl MetadataBuilder { + pub fn from_index(index: &Index, rtxn: &RoTxn) -> Result { + let searchable_attributes = + index.searchable_fields(rtxn)?.into_iter().map(|s| s.to_string()).collect(); + let filterable_attributes = index.filterable_fields(rtxn)?; + let sortable_attributes = index.sortable_fields(rtxn)?; + let localized_attributes = index.localized_attributes_rules(rtxn)?; + + Ok(Self { + searchable_attributes, + filterable_attributes, + sortable_attributes, + localized_attributes, + }) + } + + pub fn new( + searchable_attributes: Vec, + filterable_attributes: HashSet, + sortable_attributes: HashSet, + localized_attributes: Option>, + ) -> Self { + Self { + searchable_attributes, + filterable_attributes, + sortable_attributes, + localized_attributes, + } + } + + pub fn metadata_for_field(&self, field: &str) -> Metadata { + let searchable = self + .searchable_attributes + .iter() + .any(|attribute| attribute == "*" || attribute == field); + + let filterable = self.filterable_attributes.contains(field); + + let sortable = self.sortable_attributes.contains(field); + + let localized_attributes_rule_id = self + .localized_attributes + .iter() + .map(|v| v.iter()) + .flatten() + .position(|rule| rule.match_str(field)) + .map(|id| NonZeroU16::new(id.saturating_add(1).try_into().unwrap()).unwrap()); + + Metadata { searchable, filterable, sortable, localized_attributes_rule_id } + } + + pub fn searchable_attributes(&self) -> &[String] { + self.searchable_attributes.as_slice() + } + + pub fn sortable_attributes(&self) -> &HashSet { + &self.sortable_attributes + } + + pub fn filterable_attributes(&self) -> &HashSet { + &self.filterable_attributes + } + + pub fn localized_attributes_rules(&self) -> Option<&[LocalizedAttributesRule]> { + self.localized_attributes.as_deref() + } +} From c8189e975c8c7118f910f53ce679e61a4e80304c Mon Sep 17 00:00:00 2001 From: 
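`metadata_for_field` above stores the index of the matching localized-attributes rule as `position + 1` inside a `NonZeroU16`, presumably so that `Option<NonZeroU16>` stays as small as a bare `u16`; `Metadata::locales` subtracts one again before indexing into the rules. A tiny round-trip of that encoding, with illustrative names (the real code also saturates and checks the integer conversion):

    use std::num::NonZeroU16;

    // rule position 0 is valid, so it is shifted to 1 before being stored
    fn encode_rule_id(rule_position: usize) -> NonZeroU16 {
        NonZeroU16::new((rule_position + 1) as u16).unwrap()
    }

    // the reverse shift applied when resolving the locales of a field
    fn decode_rule_id(id: NonZeroU16) -> usize {
        (id.get() - 1) as usize
    }

    fn main() {
        for position in [0usize, 1, 41] {
            assert_eq!(decode_rule_id(encode_rule_id(position)), position);
        }
    }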
Louis Dureuil Date: Mon, 28 Oct 2024 14:10:55 +0100 Subject: [PATCH 168/247] Add rendering based on document trait --- milli/src/prompt/context.rs | 20 +- milli/src/prompt/document.rs | 523 ++++++++++++++++++++++++++++++++++- milli/src/prompt/fields.rs | 192 +++++++++++-- milli/src/prompt/mod.rs | 35 ++- 4 files changed, 728 insertions(+), 42 deletions(-) diff --git a/milli/src/prompt/context.rs b/milli/src/prompt/context.rs index 7ab08301a..02258d067 100644 --- a/milli/src/prompt/context.rs +++ b/milli/src/prompt/context.rs @@ -3,23 +3,19 @@ use liquid::model::{ }; use liquid::{ObjectView, ValueView}; -use super::document::Document; -use super::fields::Fields; -use super::FieldsIdsMapWithMetadata; - #[derive(Debug, Clone)] -pub struct Context<'a> { - document: &'a Document<'a>, - fields: Fields<'a>, +pub struct Context<'a, D: ObjectView, F: ArrayView> { + document: &'a D, + fields: &'a F, } -impl<'a> Context<'a> { - pub fn new(document: &'a Document<'a>, field_id_map: &'a FieldsIdsMapWithMetadata<'a>) -> Self { - Self { document, fields: Fields::new(document, field_id_map) } +impl<'a, D: ObjectView, F: ArrayView> Context<'a, D, F> { + pub fn new(document: &'a D, fields: &'a F) -> Self { + Self { document, fields } } } -impl<'a> ObjectView for Context<'a> { +impl<'a, D: ObjectView, F: ArrayView> ObjectView for Context<'a, D, F> { fn as_value(&self) -> &dyn ValueView { self } @@ -56,7 +52,7 @@ impl<'a> ObjectView for Context<'a> { } } -impl<'a> ValueView for Context<'a> { +impl<'a, D: ObjectView, F: ArrayView> ValueView for Context<'a, D, F> { fn as_debug(&self) -> &dyn std::fmt::Debug { self } diff --git a/milli/src/prompt/document.rs b/milli/src/prompt/document.rs index a809f58ce..d6aadce65 100644 --- a/milli/src/prompt/document.rs +++ b/milli/src/prompt/document.rs @@ -1,10 +1,15 @@ use std::cell::OnceCell; use std::collections::BTreeMap; +use std::fmt::{self, Debug}; +use bumpalo::Bump; use liquid::model::{ - DisplayCow, KString, KStringCow, ObjectRender, ObjectSource, State, Value as LiquidValue, + ArrayView, DisplayCow, KString, KStringCow, ObjectRender, ObjectSource, ScalarCow, State, + Value as LiquidValue, }; use liquid::{ObjectView, ValueView}; +use raw_collections::{RawMap, RawVec}; +use serde_json::value::RawValue; use crate::update::del_add::{DelAdd, KvReaderDelAdd}; use crate::FieldsIdsMap; @@ -93,7 +98,7 @@ impl<'a> ObjectView for Document<'a> { } impl<'a> ValueView for Document<'a> { - fn as_debug(&self) -> &dyn std::fmt::Debug { + fn as_debug(&self) -> &dyn Debug { self } @@ -128,4 +133,518 @@ impl<'a> ValueView for Document<'a> { fn as_object(&self) -> Option<&dyn ObjectView> { Some(self) } + + fn is_object(&self) -> bool { + true + } +} + +/// Implementation for any type that implements the Document trait +use crate::update::new::document::Document as DocumentTrait; + +#[derive(Debug)] +pub struct ParseableDocument<'doc, D> { + document: D, + doc_alloc: &'doc Bump, +} + +impl<'doc, D> ParseableDocument<'doc, D> { + pub fn new(document: D, doc_alloc: &'doc Bump) -> Self { + Self { document, doc_alloc } + } +} + +impl<'doc, D: DocumentTrait<'doc> + Debug> ObjectView for ParseableDocument<'doc, D> { + fn as_value(&self) -> &dyn ValueView { + self + } + + fn size(&self) -> i64 { + self.document.len() as i64 + } + + fn keys<'k>(&'k self) -> Box> + 'k> { + Box::new(self.document.iter_top_level_fields().map(|res| { + let (field, _) = res.unwrap(); + KStringCow::from_ref(field) + })) + } + + fn values<'k>(&'k self) -> Box + 'k> { + 
Box::new(self.document.iter_top_level_fields().map(|res| { + let (_, value) = res.unwrap(); + ParseableValue::new_bump(value, self.doc_alloc) as _ + })) + } + + fn iter<'k>(&'k self) -> Box, &'k dyn ValueView)> + 'k> { + Box::new(self.document.iter_top_level_fields().map(|res| { + let (field, value) = res.unwrap(); + (KStringCow::from_ref(field), ParseableValue::new_bump(value, self.doc_alloc) as _) + })) + } + + fn contains_key(&self, index: &str) -> bool { + self.document.top_level_field(index).unwrap().is_some() + } + + fn get<'s>(&'s self, index: &str) -> Option<&'s dyn ValueView> { + let s = self.document.top_level_field(index).unwrap()?; + Some(ParseableValue::new_bump(s, self.doc_alloc)) + } +} + +impl<'doc, D: DocumentTrait<'doc> + Debug> ValueView for ParseableDocument<'doc, D> { + fn as_debug(&self) -> &dyn fmt::Debug { + self + } + fn render(&self) -> liquid::model::DisplayCow<'_> { + DisplayCow::Owned(Box::new(ObjectRender::new(self))) + } + + fn source(&self) -> liquid::model::DisplayCow<'_> { + DisplayCow::Owned(Box::new(ObjectSource::new(self))) + } + + fn type_name(&self) -> &'static str { + "object" + } + + fn query_state(&self, state: liquid::model::State) -> bool { + match state { + State::Truthy => true, + State::DefaultValue | State::Empty | State::Blank => false, + } + } + + fn to_kstr(&self) -> liquid::model::KStringCow<'_> { + let s = ObjectRender::new(self).to_string(); + KStringCow::from_string(s) + } + + fn to_value(&self) -> LiquidValue { + LiquidValue::Object( + self.document + .iter_top_level_fields() + .map(|res| { + let (k, v) = res.unwrap(); + (k.to_string().into(), ParseableValue::new(v, self.doc_alloc).to_value()) + }) + .collect(), + ) + } + + fn as_object(&self) -> Option<&dyn ObjectView> { + Some(self) + } + + fn is_object(&self) -> bool { + true + } +} + +#[derive(Debug)] +struct ParseableValue<'doc> { + value: raw_collections::Value<'doc>, +} + +impl<'doc> ParseableValue<'doc> { + pub fn new(value: &'doc RawValue, doc_alloc: &'doc Bump) -> Self { + let value = raw_collections::Value::from_raw_value(value, doc_alloc).unwrap(); + Self { value } + } + + pub fn new_bump(value: &'doc RawValue, doc_alloc: &'doc Bump) -> &'doc Self { + doc_alloc.alloc(Self::new(value, doc_alloc)) + } +} + +// transparent newtype for implementing ValueView +#[repr(transparent)] +#[derive(Debug)] +struct ParseableMap<'doc>(RawMap<'doc>); + +// transparent newtype for implementing ValueView +#[repr(transparent)] +#[derive(Debug)] +struct ParseableArray<'doc>(RawVec<'doc>); + +impl<'doc> ParseableMap<'doc> { + pub fn as_parseable<'a>(map: &'a RawMap<'doc>) -> &'a ParseableMap<'doc> { + // SAFETY: repr(transparent) + unsafe { &*(map as *const RawMap as *const Self) } + } +} + +impl<'doc> ParseableArray<'doc> { + pub fn as_parseable<'a>(array: &'a RawVec<'doc>) -> &'a ParseableArray<'doc> { + // SAFETY: repr(transparent) + unsafe { &*(array as *const RawVec as *const Self) } + } +} + +impl<'doc> ArrayView for ParseableArray<'doc> { + fn as_value(&self) -> &dyn ValueView { + self + } + + fn size(&self) -> i64 { + self.0.len() as _ + } + + fn values<'k>(&'k self) -> Box + 'k> { + Box::new(self.0.iter().map(|v| ParseableValue::new_bump(v, self.0.bump()) as _)) + } + + fn contains_key(&self, index: i64) -> bool { + let index = convert_index(index, self.size()); + index < self.size() && index >= 0 + } + + fn get(&self, index: i64) -> Option<&dyn ValueView> { + let index = convert_index(index, self.size()); + if index <= 0 { + return None; + } + let v = self.0.get(index as usize)?; + 
Some(ParseableValue::new_bump(v, self.0.bump())) + } +} + +impl<'doc> ValueView for ParseableArray<'doc> { + fn as_debug(&self) -> &dyn std::fmt::Debug { + self + } + + fn render(&self) -> DisplayCow<'_> { + DisplayCow::Owned(Box::new(ArrayRender { s: &self.0 })) + } + + fn source(&self) -> DisplayCow<'_> { + DisplayCow::Owned(Box::new(ArraySource { s: &self.0 })) + } + + fn type_name(&self) -> &'static str { + "array" + } + + fn query_state(&self, state: State) -> bool { + match state { + State::Truthy => true, + State::DefaultValue | State::Empty | State::Blank => self.0.is_empty(), + } + } + + fn to_kstr(&self) -> KStringCow<'_> { + let s = ArrayRender { s: &self.0 }.to_string(); + KStringCow::from_string(s) + } + + fn to_value(&self) -> LiquidValue { + LiquidValue::Array(self.values().map(|v| v.to_value()).collect()) + } + + fn is_array(&self) -> bool { + true + } + + fn as_array(&self) -> Option<&dyn ArrayView> { + Some(self as _) + } +} + +impl<'doc> ObjectView for ParseableMap<'doc> { + fn as_value(&self) -> &dyn ValueView { + self + } + + fn size(&self) -> i64 { + self.0.len() as i64 + } + + fn keys<'k>(&'k self) -> Box> + 'k> { + Box::new(self.0.keys().map(Into::into)) + } + + fn values<'k>(&'k self) -> Box + 'k> { + Box::new(self.0.values().map(|value| { + let doc_alloc = self.0.bump(); + ParseableValue::new_bump(value, doc_alloc) as _ + })) + } + + fn iter<'k>(&'k self) -> Box, &'k dyn ValueView)> + 'k> { + Box::new(self.0.iter().map(|(k, v)| { + let doc_alloc = self.0.bump(); + (k.into(), ParseableValue::new_bump(v, doc_alloc) as _) + })) + } + + fn contains_key(&self, index: &str) -> bool { + self.0.get(index).is_some() + } + + fn get<'s>(&'s self, index: &str) -> Option<&'s dyn ValueView> { + let v = self.0.get(index)?; + let doc_alloc = self.0.bump(); + let value = ParseableValue::new(v, doc_alloc); + Some(doc_alloc.alloc(value) as _) + } +} + +impl<'doc> ValueView for ParseableMap<'doc> { + fn as_debug(&self) -> &dyn std::fmt::Debug { + self + } + + fn render(&self) -> liquid::model::DisplayCow<'_> { + DisplayCow::Owned(Box::new(ObjectRender::new(self))) + } + + fn source(&self) -> liquid::model::DisplayCow<'_> { + DisplayCow::Owned(Box::new(ObjectSource::new(self))) + } + + fn type_name(&self) -> &'static str { + "object" + } + + fn query_state(&self, state: liquid::model::State) -> bool { + match state { + State::Truthy => true, + State::DefaultValue | State::Empty | State::Blank => self.0.is_empty(), + } + } + + fn to_kstr(&self) -> liquid::model::KStringCow<'_> { + let s = ObjectRender::new(self).to_string(); + KStringCow::from_string(s) + } + + fn to_value(&self) -> LiquidValue { + LiquidValue::Object( + self.0 + .iter() + .map(|(k, v)| { + (k.to_string().into(), ParseableValue::new(v, self.0.bump()).to_value()) + }) + .collect(), + ) + } + + fn as_object(&self) -> Option<&dyn ObjectView> { + Some(self) + } + + fn is_object(&self) -> bool { + true + } +} + +impl<'doc> ValueView for ParseableValue<'doc> { + fn as_debug(&self) -> &dyn Debug { + self + } + + fn render(&self) -> DisplayCow<'_> { + use raw_collections::value::Number; + use raw_collections::Value; + match &self.value { + Value::Null => LiquidValue::Nil.render(), + Value::Bool(v) => v.render(), + Value::Number(number) => match number { + Number::PosInt(x) => DisplayCow::Borrowed(x), + Number::NegInt(x) => x.render(), + Number::Finite(x) => x.render(), + }, + Value::String(s) => s.render(), + Value::Array(raw_vec) => ParseableArray::as_parseable(raw_vec).render(), + Value::Object(raw_map) => 
ParseableMap::as_parseable(raw_map).render(), + } + } + + fn source(&self) -> DisplayCow<'_> { + use raw_collections::value::Number; + use raw_collections::Value; + match &self.value { + Value::Null => LiquidValue::Nil.source(), + Value::Bool(v) => ValueView::source(v), + Value::Number(number) => match number { + Number::PosInt(x) => DisplayCow::Borrowed(x), + Number::NegInt(x) => x.source(), + Number::Finite(x) => x.source(), + }, + Value::String(s) => s.source(), + Value::Array(raw_vec) => ParseableArray::as_parseable(raw_vec).source(), + Value::Object(raw_map) => ParseableMap::as_parseable(raw_map).source(), + } + } + + fn type_name(&self) -> &'static str { + use raw_collections::value::Number; + use raw_collections::Value; + match &self.value { + Value::Null => LiquidValue::Nil.type_name(), + Value::Bool(v) => v.type_name(), + Value::Number(number) => match number { + Number::PosInt(_x) => "whole positive number", + Number::NegInt(x) => x.type_name(), + Number::Finite(x) => x.type_name(), + }, + Value::String(s) => s.type_name(), + Value::Array(_raw_vec) => "array", + Value::Object(_raw_map) => "object", + } + } + + fn query_state(&self, state: State) -> bool { + use raw_collections::Value; + match &self.value { + Value::Null => ValueView::query_state(&LiquidValue::Nil, state), + Value::Bool(v) => ValueView::query_state(v, state), + Value::Number(_number) => match state { + State::Truthy => true, + State::DefaultValue => false, + State::Empty => false, + State::Blank => false, + }, + Value::String(s) => ValueView::query_state(s, state), + Value::Array(raw_vec) => ParseableArray::as_parseable(raw_vec).query_state(state), + Value::Object(raw_map) => ParseableMap::as_parseable(raw_map).query_state(state), + } + } + + fn to_kstr(&self) -> KStringCow<'_> { + use raw_collections::Value; + match &self.value { + Value::Null => ValueView::to_kstr(&LiquidValue::Nil), + Value::Bool(v) => ValueView::to_kstr(v), + Value::Number(_number) => self.render().to_string().into(), + Value::String(s) => KStringCow::from_ref(*s), + Value::Array(raw_vec) => ParseableArray::as_parseable(raw_vec).to_kstr(), + Value::Object(raw_map) => ParseableMap::as_parseable(raw_map).to_kstr(), + } + } + + fn to_value(&self) -> LiquidValue { + use raw_collections::Value; + match &self.value { + Value::Null => LiquidValue::Nil, + Value::Bool(v) => LiquidValue::Scalar(liquid::model::ScalarCow::new(*v)), + Value::Number(number) => match number { + raw_collections::value::Number::PosInt(number) => { + let number: i64 = match (*number).try_into() { + Ok(number) => number, + Err(_) => { + return LiquidValue::Scalar(ScalarCow::new(self.render().to_string())) + } + }; + LiquidValue::Scalar(ScalarCow::new(number)) + } + raw_collections::value::Number::NegInt(number) => { + LiquidValue::Scalar(ScalarCow::new(*number)) + } + raw_collections::value::Number::Finite(number) => { + LiquidValue::Scalar(ScalarCow::new(*number)) + } + }, + Value::String(s) => LiquidValue::Scalar(liquid::model::ScalarCow::new(s.to_string())), + Value::Array(raw_vec) => ParseableArray::as_parseable(raw_vec).to_value(), + Value::Object(raw_map) => ParseableMap::as_parseable(raw_map).to_value(), + } + } + + fn as_scalar(&self) -> Option> { + use raw_collections::value::Number; + use raw_collections::Value; + match &self.value { + Value::Bool(v) => Some(liquid::model::ScalarCow::new(*v)), + Value::Number(number) => match number { + Number::PosInt(number) => { + let number: i64 = match (*number).try_into() { + Ok(number) => number, + Err(_) => return 
Some(ScalarCow::new(self.render().to_string())), + }; + Some(ScalarCow::new(number)) + } + Number::NegInt(number) => Some(ScalarCow::new(*number)), + Number::Finite(number) => Some(ScalarCow::new(*number)), + }, + Value::String(s) => Some(ScalarCow::new(*s)), + _ => None, + } + } + + fn is_scalar(&self) -> bool { + use raw_collections::Value; + match &self.value { + Value::Bool(_) | Value::Number(_) | Value::String(_) => true, + _ => false, + } + } + + fn as_array(&self) -> Option<&dyn liquid::model::ArrayView> { + if let raw_collections::Value::Array(array) = &self.value { + return Some(ParseableArray::as_parseable(array) as _); + } + None + } + + fn is_array(&self) -> bool { + matches!(&self.value, raw_collections::Value::Array(_)) + } + + fn as_object(&self) -> Option<&dyn ObjectView> { + if let raw_collections::Value::Object(object) = &self.value { + return Some(ParseableMap::as_parseable(object) as _); + } + None + } + + fn is_object(&self) -> bool { + matches!(&self.value, raw_collections::Value::Object(_)) + } + + fn is_nil(&self) -> bool { + matches!(&self.value, raw_collections::Value::Null) + } +} + +struct ArraySource<'s, 'doc> { + s: &'s RawVec<'doc>, +} + +impl<'s, 'doc> fmt::Display for ArraySource<'s, 'doc> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "[")?; + for item in self.s { + let v = ParseableValue::new(item, self.s.bump()); + write!(f, "{}, ", v.render())?; + } + write!(f, "]")?; + Ok(()) + } +} + +struct ArrayRender<'s, 'doc> { + s: &'s RawVec<'doc>, +} + +impl<'s, 'doc> fmt::Display for ArrayRender<'s, 'doc> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + for item in self.s { + let v = ParseableValue::new(item, self.s.bump()); + + write!(f, "{}", v.render())?; + } + Ok(()) + } +} + +fn convert_index(index: i64, max_size: i64) -> i64 { + if 0 <= index { + index + } else { + max_size + index + } } diff --git a/milli/src/prompt/fields.rs b/milli/src/prompt/fields.rs index 81ea88ca6..b5d86b348 100644 --- a/milli/src/prompt/fields.rs +++ b/milli/src/prompt/fields.rs @@ -1,36 +1,23 @@ +use std::cell::RefCell; +use std::fmt; + +use bumpalo::Bump; use liquid::model::{ ArrayView, DisplayCow, KStringCow, ObjectRender, ObjectSource, State, Value as LiquidValue, }; use liquid::{ObjectView, ValueView}; -use super::document::Document; use super::{FieldMetadata, FieldsIdsMapWithMetadata}; -#[derive(Debug, Clone)] -pub struct Fields<'a>(Vec>); - -impl<'a> Fields<'a> { - pub fn new(document: &'a Document<'a>, field_id_map: &'a FieldsIdsMapWithMetadata<'a>) -> Self { - Self( - std::iter::repeat(document) - .zip(field_id_map.iter()) - .map(|(document, (fid, name))| FieldValue { - document, - name, - metadata: field_id_map.metadata(fid).unwrap_or_default(), - }) - .collect(), - ) - } -} +use crate::GlobalFieldsIdsMap; #[derive(Debug, Clone, Copy)] -pub struct FieldValue<'a> { +pub struct FieldValue<'a, D: ObjectView> { name: &'a str, - document: &'a Document<'a>, + document: &'a D, metadata: FieldMetadata, } -impl<'a> ValueView for FieldValue<'a> { +impl<'a, D: ObjectView> ValueView for FieldValue<'a, D> { fn as_debug(&self) -> &dyn std::fmt::Debug { self } @@ -70,7 +57,7 @@ impl<'a> ValueView for FieldValue<'a> { } } -impl<'a> FieldValue<'a> { +impl<'a, D: ObjectView> FieldValue<'a, D> { pub fn name(&self) -> &&'a str { &self.name } @@ -88,7 +75,7 @@ impl<'a> FieldValue<'a> { } } -impl<'a> ObjectView for FieldValue<'a> { +impl<'a, D: ObjectView> ObjectView for FieldValue<'a, D> { fn as_value(&self) -> &dyn ValueView { self } @@ -127,7 
+114,42 @@ impl<'a> ObjectView for FieldValue<'a> { } } -impl<'a> ArrayView for Fields<'a> { +#[derive(Debug, Clone)] +pub struct OwnedFields<'a, D: ObjectView>(Vec>); + +#[derive(Debug)] +pub struct BorrowedFields<'a, 'map, D: ObjectView> { + document: &'a D, + field_id_map: &'a RefCell>, + doc_alloc: &'a Bump, +} + +impl<'a, D: ObjectView> OwnedFields<'a, D> { + pub fn new(document: &'a D, field_id_map: &'a FieldsIdsMapWithMetadata<'a>) -> Self { + Self( + std::iter::repeat(document) + .zip(field_id_map.iter()) + .map(|(document, (fid, name))| FieldValue { + document, + name, + metadata: field_id_map.metadata(fid).unwrap_or_default(), + }) + .collect(), + ) + } +} + +impl<'a, 'map, D: ObjectView> BorrowedFields<'a, 'map, D> { + pub fn new( + document: &'a D, + field_id_map: &'a RefCell>, + doc_alloc: &'a Bump, + ) -> Self { + Self { document, field_id_map, doc_alloc } + } +} + +impl<'a, D: ObjectView> ArrayView for OwnedFields<'a, D> { fn as_value(&self) -> &dyn ValueView { self.0.as_value() } @@ -149,7 +171,91 @@ impl<'a> ArrayView for Fields<'a> { } } -impl<'a> ValueView for Fields<'a> { +impl<'a, 'map, D: ObjectView> ArrayView for BorrowedFields<'a, 'map, D> { + fn as_value(&self) -> &dyn ValueView { + self + } + + fn size(&self) -> i64 { + self.document.size() + } + + fn values<'k>(&'k self) -> Box + 'k> { + Box::new(self.document.keys().map(|k| { + let mut field_id_map = self.field_id_map.borrow_mut(); + let (_, metadata) = field_id_map.id_with_metadata_or_insert(&k).unwrap(); + let fv = self.doc_alloc.alloc(FieldValue { + name: self.doc_alloc.alloc_str(&k), + document: self.document, + metadata: FieldMetadata { searchable: metadata.searchable }, + }); + fv as _ + })) + } + + fn contains_key(&self, index: i64) -> bool { + let index = if index >= 0 { index } else { self.size() + index }; + index >= 0 && index < self.size() + } + + fn get(&self, index: i64) -> Option<&dyn ValueView> { + let index = if index >= 0 { index } else { self.size() + index }; + let index: usize = index.try_into().ok()?; + let key = self.document.keys().nth(index)?; + let mut field_id_map = self.field_id_map.borrow_mut(); + let (_, metadata) = field_id_map.id_with_metadata_or_insert(&key)?; + let fv = self.doc_alloc.alloc(FieldValue { + name: self.doc_alloc.alloc_str(&key), + document: self.document, + metadata: FieldMetadata { searchable: metadata.searchable }, + }); + Some(fv as _) + } +} + +impl<'a, 'map, D: ObjectView> ValueView for BorrowedFields<'a, 'map, D> { + fn as_debug(&self) -> &dyn std::fmt::Debug { + self + } + + fn render(&self) -> liquid::model::DisplayCow<'_> { + DisplayCow::Owned(Box::new(ArrayRender { s: self })) + } + + fn source(&self) -> liquid::model::DisplayCow<'_> { + DisplayCow::Owned(Box::new(ArraySource { s: self })) + } + + fn type_name(&self) -> &'static str { + "array" + } + + fn query_state(&self, state: liquid::model::State) -> bool { + match state { + State::Truthy => true, + State::DefaultValue | State::Empty | State::Blank => self.document.size() == 0, + } + } + + fn to_kstr(&self) -> liquid::model::KStringCow<'_> { + let s = ArrayRender { s: self }.to_string(); + KStringCow::from_string(s) + } + + fn to_value(&self) -> LiquidValue { + LiquidValue::Array(self.values().map(|v| v.to_value()).collect()) + } + + fn as_array(&self) -> Option<&dyn ArrayView> { + Some(self) + } + + fn is_array(&self) -> bool { + true + } +} + +impl<'a, D: ObjectView> ValueView for OwnedFields<'a, D> { fn as_debug(&self) -> &dyn std::fmt::Debug { self } @@ -182,3 +288,39 @@ impl<'a> ValueView for 
Fields<'a> { Some(self) } } + +struct ArraySource<'a, 'map, D: ObjectView> { + s: &'a BorrowedFields<'a, 'map, D>, +} + +impl<'a, 'map, D: ObjectView> fmt::Display for ArraySource<'a, 'map, D> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "[")?; + for item in self.s.values() { + write!(f, "{}, ", item.render())?; + } + write!(f, "]")?; + Ok(()) + } +} + +struct ArrayRender<'a, 'map, D: ObjectView> { + s: &'a BorrowedFields<'a, 'map, D>, +} + +impl<'a, 'map, D: ObjectView> fmt::Display for ArrayRender<'a, 'map, D> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + for item in self.s.values() { + write!(f, "{}", item.render())?; + } + Ok(()) + } +} + +fn convert_index(index: i64, max_size: i64) -> i64 { + if 0 <= index { + index + } else { + max_size + index + } +} diff --git a/milli/src/prompt/mod.rs b/milli/src/prompt/mod.rs index 0a076f4f9..fd843cd3c 100644 --- a/milli/src/prompt/mod.rs +++ b/milli/src/prompt/mod.rs @@ -4,17 +4,22 @@ pub(crate) mod error; mod fields; mod template_checker; +use std::cell::RefCell; use std::collections::BTreeMap; use std::convert::TryFrom; +use std::fmt::Debug; use std::num::NonZeroUsize; use std::ops::Deref; +use bumpalo::Bump; +use document::ParseableDocument; use error::{NewPromptError, RenderPromptError}; +use fields::{BorrowedFields, OwnedFields}; use self::context::Context; use self::document::Document; use crate::update::del_add::DelAdd; -use crate::{FieldId, FieldsIdsMap}; +use crate::{FieldId, FieldsIdsMap, GlobalFieldsIdsMap}; pub struct Prompt { template: liquid::Template, @@ -109,14 +114,38 @@ impl Prompt { Ok(this) } - pub fn render( + pub fn render_document< + 'a, // lifetime of the borrow of the document + 'doc: 'a, // lifetime of the allocator, will live for an entire chunk of documents + >( + &self, + document: impl crate::update::new::document::Document<'a> + Debug, + field_id_map: &RefCell, + doc_alloc: &'doc Bump, + ) -> Result<&'doc str, RenderPromptError> { + let document = ParseableDocument::new(document, doc_alloc); + let fields = BorrowedFields::new(&document, field_id_map, doc_alloc); + let context = Context::new(&document, &fields); + let mut rendered = bumpalo::collections::Vec::with_capacity_in( + self.max_bytes.unwrap_or_else(default_max_bytes).get(), + &doc_alloc, + ); + self.template + .render_to(&mut rendered, &context) + .map_err(RenderPromptError::missing_context)?; + Ok(std::str::from_utf8(rendered.into_bump_slice()) + .expect("render can only write UTF-8 because all inputs and processing preserve utf-8")) + } + + pub fn render_kvdeladd( &self, document: &obkv::KvReaderU16, side: DelAdd, field_id_map: &FieldsIdsMapWithMetadata, ) -> Result { let document = Document::new(document, side, field_id_map); - let context = Context::new(&document, field_id_map); + let fields = OwnedFields::new(&document, field_id_map); + let context = Context::new(&document, &fields); let mut rendered = self.template.render(&context).map_err(RenderPromptError::missing_context)?; From 663deac236a5e802fad6c6e65055dbca041b65b5 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 28 Oct 2024 14:21:39 +0100 Subject: [PATCH 169/247] Slight changes index scheduler --- index-scheduler/src/batch.rs | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs index 14bbcfe53..fdf213a6b 100644 --- a/index-scheduler/src/batch.rs +++ b/index-scheduler/src/batch.rs @@ -33,13 +33,11 @@ use meilisearch_types::milli::heed::CompactionOption; use 
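The `render_document` method added to `Prompt` above builds a `ParseableDocument` and a `BorrowedFields` view over the same document, wraps them in a `Context`, and renders the Liquid template into a bump-allocated buffer. A minimal end-to-end sketch of that flow using the stock `liquid` crate directly; the nested `object!` literal stands in for the document views, and none of the names below are Meilisearch APIs:

    fn main() -> Result<(), liquid::Error> {
        let template = liquid::ParserBuilder::with_stdlib()
            .build()?
            .parse("a movie called {{doc.title}} ({{doc.year}})")?;
        // `doc` plays the role of ParseableDocument; the real context also exposes `fields`
        let globals = liquid::object!({
            "doc": { "title": "Inception", "year": 2010 }
        });
        let prompt = template.render(&globals)?;
        assert_eq!(prompt, "a movie called Inception (2010)");
        Ok(())
    }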
meilisearch_types::milli::update::new::indexer::{ self, retrieve_or_guess_primary_key, UpdateByFunction, }; -use meilisearch_types::milli::update::{ - IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings as MilliSettings, -}; +use meilisearch_types::milli::update::{IndexDocumentsMethod, Settings as MilliSettings}; use meilisearch_types::milli::vector::parsed_vectors::{ ExplicitVectors, VectorOrArrayOfVectors, RESERVED_VECTORS_FIELD_NAME, }; -use meilisearch_types::milli::{self, Filter, Object}; +use meilisearch_types::milli::{self, Filter}; use meilisearch_types::settings::{apply_settings_to_builder, Settings, Unchecked}; use meilisearch_types::tasks::{Details, IndexSwap, Kind, KindWithContent, Status, Task}; use meilisearch_types::{compression, Index, VERSION_FILE_NAME}; @@ -50,7 +48,7 @@ use uuid::Uuid; use crate::autobatcher::{self, BatchKind}; use crate::utils::{self, swap_index_uid_in_task}; -use crate::{Error, IndexScheduler, MustStopProcessing, ProcessingTasks, Result, TaskId}; +use crate::{Error, IndexScheduler, ProcessingTasks, Result, TaskId}; /// Represents a combination of tasks that can all be processed at the same time. /// From 2a9184966076e7b9c16e185273f41c7b18a088b2 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 28 Oct 2024 14:21:50 +0100 Subject: [PATCH 170/247] Remove primary key from top id map --- milli/src/documents/primary_key.rs | 44 ++---------------------------- 1 file changed, 2 insertions(+), 42 deletions(-) diff --git a/milli/src/documents/primary_key.rs b/milli/src/documents/primary_key.rs index 79fd07048..1831a2dbc 100644 --- a/milli/src/documents/primary_key.rs +++ b/milli/src/documents/primary_key.rs @@ -1,15 +1,14 @@ -use std::borrow::Cow; use std::iter; use std::ops::ControlFlow; use std::result::Result as StdResult; use bumpalo::Bump; use serde_json::value::RawValue; -use serde_json::{from_str, Value}; +use serde_json::Value; use crate::fields_ids_map::MutFieldIdMapper; use crate::update::new::indexer::de::{match_component, DeOrBumpStr}; -use crate::update::new::{CowStr, KvReaderFieldId, TopLevelMap}; +use crate::update::new::KvReaderFieldId; use crate::{FieldId, InternalError, Object, Result, UserError}; /// The symbol used to define levels in a nested primary key. @@ -230,45 +229,6 @@ impl<'a> PrimaryKey<'a> { Ok(external_document_id) } - /// Returns the document ID based on the primary and - /// search for it recursively in zero-copy-deserialized documents. - pub fn document_id_from_top_level_map<'p>( - &self, - document: &TopLevelMap<'p>, - ) -> Result, DocumentIdExtractionError>> { - fn get_docid<'p>( - document: &TopLevelMap<'p>, - primary_key: &[&str], - ) -> Result, DocumentIdExtractionError>> { - match primary_key { - [] => unreachable!("arrrgh"), // would None be ok? - [primary_key] => match document.0.get(*primary_key) { - Some(value) => match from_str::(value.get()) { - Ok(value) => Ok(Ok(CowStr(Cow::Owned(value.to_string())))), - Err(_) => match from_str(value.get()) { - Ok(document_id) => Ok(Ok(document_id)), - Err(e) => Ok(Err(DocumentIdExtractionError::InvalidDocumentId( - UserError::SerdeJson(e), - ))), - }, - }, - None => Ok(Err(DocumentIdExtractionError::MissingDocumentId)), - }, - [head, tail @ ..] 
=> match document.0.get(*head) { - Some(value) => { - let document = from_str(value.get()).map_err(InternalError::SerdeJson)?; - get_docid(&document, tail) - } - None => Ok(Err(DocumentIdExtractionError::MissingDocumentId)), - }, - } - } - - /// TODO do not allocate a vec everytime here - let primary_key: Vec<_> = self.name().split(PRIMARY_KEY_SPLIT_SYMBOL).collect(); - get_docid(document, &primary_key) - } - /// Returns an `Iterator` that gives all the possible fields names the primary key /// can have depending of the first level name and depth of the objects. pub fn possible_level_names(&self) -> impl Iterator + '_ { From 1960003805bfe712c24b16d7595cdb2dd425169f Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 28 Oct 2024 14:22:19 +0100 Subject: [PATCH 171/247] Remove some warnings --- milli/src/fields_ids_map/global.rs | 4 ---- milli/src/fields_ids_map/metadata.rs | 3 +-- milli/src/prompt/document.rs | 7 ++----- milli/src/prompt/fields.rs | 8 -------- milli/src/prompt/mod.rs | 2 +- 5 files changed, 4 insertions(+), 20 deletions(-) diff --git a/milli/src/fields_ids_map/global.rs b/milli/src/fields_ids_map/global.rs index 2b948b377..32aefbfdf 100644 --- a/milli/src/fields_ids_map/global.rs +++ b/milli/src/fields_ids_map/global.rs @@ -62,10 +62,6 @@ impl LocalFieldsIdsMap { fn metadata(&self, id: FieldId) -> Option { self.metadata.get(&id).copied() } - - fn iter(&self) -> impl Iterator { - self.ids_names.iter().map(|(k, v)| (*k, v.as_str(), self.metadata.get(k).copied().unwrap())) - } } impl<'indexing> GlobalFieldsIdsMap<'indexing> { diff --git a/milli/src/fields_ids_map/metadata.rs b/milli/src/fields_ids_map/metadata.rs index 8198bd415..54fdc7b4b 100644 --- a/milli/src/fields_ids_map/metadata.rs +++ b/milli/src/fields_ids_map/metadata.rs @@ -158,8 +158,7 @@ impl MetadataBuilder { let localized_attributes_rule_id = self .localized_attributes .iter() - .map(|v| v.iter()) - .flatten() + .flat_map(|v| v.iter()) .position(|rule| rule.match_str(field)) .map(|id| NonZeroU16::new(id.saturating_add(1).try_into().unwrap()).unwrap()); diff --git a/milli/src/prompt/document.rs b/milli/src/prompt/document.rs index d6aadce65..28c0f47af 100644 --- a/milli/src/prompt/document.rs +++ b/milli/src/prompt/document.rs @@ -520,7 +520,7 @@ impl<'doc> ValueView for ParseableValue<'doc> { Value::Null => ValueView::to_kstr(&LiquidValue::Nil), Value::Bool(v) => ValueView::to_kstr(v), Value::Number(_number) => self.render().to_string().into(), - Value::String(s) => KStringCow::from_ref(*s), + Value::String(s) => KStringCow::from_ref(s), Value::Array(raw_vec) => ParseableArray::as_parseable(raw_vec).to_kstr(), Value::Object(raw_map) => ParseableMap::as_parseable(raw_map).to_kstr(), } @@ -577,10 +577,7 @@ impl<'doc> ValueView for ParseableValue<'doc> { fn is_scalar(&self) -> bool { use raw_collections::Value; - match &self.value { - Value::Bool(_) | Value::Number(_) | Value::String(_) => true, - _ => false, - } + matches!(&self.value, Value::Bool(_) | Value::Number(_) | Value::String(_)) } fn as_array(&self) -> Option<&dyn liquid::model::ArrayView> { diff --git a/milli/src/prompt/fields.rs b/milli/src/prompt/fields.rs index b5d86b348..ab15c31b0 100644 --- a/milli/src/prompt/fields.rs +++ b/milli/src/prompt/fields.rs @@ -316,11 +316,3 @@ impl<'a, 'map, D: ObjectView> fmt::Display for ArrayRender<'a, 'map, D> { Ok(()) } } - -fn convert_index(index: i64, max_size: i64) -> i64 { - if 0 <= index { - index - } else { - max_size + index - } -} diff --git a/milli/src/prompt/mod.rs b/milli/src/prompt/mod.rs index 
fd843cd3c..bbcf054e6 100644 --- a/milli/src/prompt/mod.rs +++ b/milli/src/prompt/mod.rs @@ -128,7 +128,7 @@ impl Prompt { let context = Context::new(&document, &fields); let mut rendered = bumpalo::collections::Vec::with_capacity_in( self.max_bytes.unwrap_or_else(default_max_bytes).get(), - &doc_alloc, + doc_alloc, ); self.template .render_to(&mut rendered, &context) From af9f96e2af861b83f798d0654bd9bd27e9b700dd Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 28 Oct 2024 14:22:45 +0100 Subject: [PATCH 172/247] Update older embedding --- .../extract/extract_vector_points.rs | 27 ++++++++++--------- 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_vector_points.rs b/milli/src/update/index_documents/extract/extract_vector_points.rs index 03843fcd8..7b5bf3f40 100644 --- a/milli/src/update/index_documents/extract/extract_vector_points.rs +++ b/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -21,7 +21,7 @@ use crate::update::settings::InnerIndexSettingsDiff; use crate::vector::error::{EmbedErrorKind, PossibleEmbeddingMistakes, UnusedVectorsDistribution}; use crate::vector::parsed_vectors::{ParsedVectorsDiff, VectorState, RESERVED_VECTORS_FIELD_NAME}; use crate::vector::settings::ReindexAction; -use crate::vector::{Embedder, Embeddings}; +use crate::vector::{Embedder, Embedding}; use crate::{try_split_array_at, DocumentId, FieldId, Result, ThreadPoolNoAbort}; /// The length of the elements that are always in the buffer when inserting new values. @@ -536,9 +536,11 @@ fn extract_vector_document_diff( } // Don't give up if the old prompt was failing let old_prompt = Some(&prompt).map(|p| { - p.render(obkv, DelAdd::Deletion, old_fields_ids_map).unwrap_or_default() + p.render_kvdeladd(obkv, DelAdd::Deletion, old_fields_ids_map) + .unwrap_or_default() }); - let new_prompt = prompt.render(obkv, DelAdd::Addition, new_fields_ids_map)?; + let new_prompt = + prompt.render_kvdeladd(obkv, DelAdd::Addition, new_fields_ids_map)?; if old_prompt.as_ref() != Some(&new_prompt) { let old_prompt = old_prompt.unwrap_or_default(); tracing::trace!( @@ -570,7 +572,7 @@ fn extract_vector_document_diff( return Ok(VectorStateDelta::NoChange); } // becomes autogenerated - VectorStateDelta::NowGenerated(prompt.render( + VectorStateDelta::NowGenerated(prompt.render_kvdeladd( obkv, DelAdd::Addition, new_fields_ids_map, @@ -613,9 +615,10 @@ fn regenerate_if_prompt_changed( &FieldsIdsMapWithMetadata, ), ) -> Result { - let old_prompt = - old_prompt.render(obkv, DelAdd::Deletion, old_fields_ids_map).unwrap_or(Default::default()); - let new_prompt = new_prompt.render(obkv, DelAdd::Addition, new_fields_ids_map)?; + let old_prompt = old_prompt + .render_kvdeladd(obkv, DelAdd::Deletion, old_fields_ids_map) + .unwrap_or(Default::default()); + let new_prompt = new_prompt.render_kvdeladd(obkv, DelAdd::Addition, new_fields_ids_map)?; if new_prompt == old_prompt { return Ok(VectorStateDelta::NoChange); @@ -628,7 +631,7 @@ fn regenerate_prompt( prompt: &Prompt, new_fields_ids_map: &FieldsIdsMapWithMetadata, ) -> Result { - let prompt = prompt.render(obkv, DelAdd::Addition, new_fields_ids_map)?; + let prompt = prompt.render_kvdeladd(obkv, DelAdd::Addition, new_fields_ids_map)?; Ok(VectorStateDelta::NowGenerated(prompt)) } @@ -738,7 +741,7 @@ pub fn extract_embeddings( .flat_map(|docids| docids.iter()) .zip(chunked_embeds.iter().flat_map(|embeds| embeds.iter())) { - state_writer.insert(docid.to_be_bytes(), cast_slice(embeddings.as_inner()))?; + 
state_writer.insert(docid.to_be_bytes(), cast_slice(embeddings))?; } chunks_ids.clear(); } @@ -759,7 +762,7 @@ pub fn extract_embeddings( .flat_map(|docids| docids.iter()) .zip(chunked_embeds.iter().flat_map(|embeds| embeds.iter())) { - state_writer.insert(docid.to_be_bytes(), cast_slice(embeddings.as_inner()))?; + state_writer.insert(docid.to_be_bytes(), cast_slice(embeddings))?; } } @@ -775,7 +778,7 @@ pub fn extract_embeddings( if let Some(embeds) = embeds.first() { for (docid, embeddings) in current_chunk_ids.iter().zip(embeds.iter()) { - state_writer.insert(docid.to_be_bytes(), cast_slice(embeddings.as_inner()))?; + state_writer.insert(docid.to_be_bytes(), cast_slice(embeddings))?; } } } @@ -790,7 +793,7 @@ fn embed_chunks( possible_embedding_mistakes: &PossibleEmbeddingMistakes, unused_vectors_distribution: &UnusedVectorsDistribution, request_threads: &ThreadPoolNoAbort, -) -> Result>>> { +) -> Result>> { match embedder.embed_chunks(text_chunks, request_threads) { Ok(chunks) => Ok(chunks), Err(error) => { From bbb67ae0a8791d68c1c035c22832eb9eb7d2e295 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 28 Oct 2024 14:23:02 +0100 Subject: [PATCH 173/247] todo channel --- milli/src/update/new/channel.rs | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/milli/src/update/new/channel.rs b/milli/src/update/new/channel.rs index 5c206b5ba..657c00141 100644 --- a/milli/src/update/new/channel.rs +++ b/milli/src/update/new/channel.rs @@ -12,7 +12,7 @@ use super::StdResult; use crate::index::main_key::{DOCUMENTS_IDS_KEY, WORDS_FST_KEY, WORDS_PREFIXES_FST_KEY}; use crate::update::new::KvReaderFieldId; use crate::update::MergeDeladdCboRoaringBitmaps; -use crate::vector::Embeddings; +use crate::vector::Embedding; use crate::{DocumentId, Index}; /// The capacity of the channel is currently in number of messages. 
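In the hunks above an embedding is now a plain `Vec<f32>` (`Embedding`) rather than the grouped `Embeddings<f32>`, so it is passed to `cast_slice` directly instead of through `as_inner`. A small sketch of that byte-level view, assuming the `bytemuck` crate that provides `cast_slice`:

    // Reinterpret an embedding as raw bytes, as done before handing it to the writer.
    // Casting to u8 always succeeds because u8 has an alignment of 1 and f32 is Pod.
    fn main() {
        let embedding: Vec<f32> = vec![0.25, -1.0, 3.5];

        let bytes: &[u8] = bytemuck::cast_slice(&embedding);
        assert_eq!(bytes.len(), embedding.len() * std::mem::size_of::<f32>());

        // The reverse cast only works here because `bytes` still points into the
        // 4-byte-aligned f32 buffer; reading back from disk needs an alignment check.
        let roundtrip: &[f32] = bytemuck::cast_slice(bytes);
        assert_eq!(roundtrip, embedding.as_slice());
    }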
@@ -493,19 +493,38 @@ impl DocumentsSender<'_> { pub struct EmbeddingSender<'a>(Option<&'a Sender>); impl EmbeddingSender<'_> { - pub fn delete_embeddings(docid: DocumentId, embedder_id: u8) -> StdResult<(), SendError<()>> { + pub fn delete(&self, docid: DocumentId, embedder_id: u8) -> StdResult<(), SendError<()>> { todo!() } - pub fn set_embeddings( + pub fn set_vectors( + &self, docid: DocumentId, embedder_id: u8, - embeddings: Embeddings, + embeddings: Vec, ) -> StdResult<(), SendError<()>> { todo!() } - pub fn finish_embedder(embedder_id: u8) {} + pub fn set_vector( + &self, + docid: DocumentId, + embedder_id: u8, + embedding: Embedding, + ) -> StdResult<(), SendError<()>> { + todo!() + } + + pub fn set_user_provided( + &self, + docid: DocumentId, + embedder_id: u8, + regenerate: bool, + ) -> StdResult<(), SendError<()>> { + todo!() + } + + pub fn finish(self, embedder_id: u8) {} } pub enum MergerOperation { ExactWordDocidsMerger(Merger), From 65470e26e0d14a3b10086c0d376d96cb6e08f99b Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 28 Oct 2024 14:23:20 +0100 Subject: [PATCH 174/247] Document trait changes --- milli/src/update/new/document.rs | 110 +++++++++++++++++------- milli/src/update/new/document_change.rs | 68 +++++++++++---- 2 files changed, 129 insertions(+), 49 deletions(-) diff --git a/milli/src/update/new/document.rs b/milli/src/update/new/document.rs index 4948f8e31..be09feb5a 100644 --- a/milli/src/update/new/document.rs +++ b/milli/src/update/new/document.rs @@ -20,6 +20,14 @@ pub trait Document<'doc> { /// - The `_vectors` and `_geo` fields are **ignored** by this method, meaning they are **not returned** by this method. fn iter_top_level_fields(&self) -> impl Iterator>; + fn len(&self) -> usize; + + fn is_empty(&self) -> bool { + self.len() == 0 + } + + fn top_level_field(&self, k: &str) -> Result>; + /// Returns the unparsed value of the `_vectors` field from the document data. /// /// This field alone is insufficient to retrieve vectors, as they may be stored in a dedicated location in the database. 
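The trait additions above (`len`, `is_empty`, `top_level_field`) give callers random access to a field by name, which the Liquid `ObjectView`/`ArrayView` implementations rely on, instead of forcing a scan through `iter_top_level_fields`. A toy, self-contained illustration of that shape; it is not the crate's trait, and error handling plus the special `_vectors`/`_geo` fields are left out:

    use std::collections::BTreeMap;

    struct ToyDocument<'doc> {
        fields: BTreeMap<&'doc str, &'doc str>,
    }

    impl<'doc> ToyDocument<'doc> {
        // sequential access, as before
        fn iter_top_level_fields(&self) -> impl Iterator<Item = (&'doc str, &'doc str)> + '_ {
            self.fields.iter().map(|(k, v)| (*k, *v))
        }

        // random access by field name, used by ObjectView::get during prompt rendering
        fn top_level_field(&self, k: &str) -> Option<&'doc str> {
            self.fields.get(k).copied()
        }

        fn len(&self) -> usize {
            self.fields.len()
        }

        fn is_empty(&self) -> bool {
            self.len() == 0
        }
    }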
@@ -37,6 +45,7 @@ pub trait Document<'doc> { fn geo_field(&self) -> Result>; } +#[derive(Debug)] pub struct DocumentFromDb<'t, Mapper: FieldIdMapper> where Mapper: FieldIdMapper, @@ -84,6 +93,14 @@ impl<'t, Mapper: FieldIdMapper> Document<'t> for DocumentFromDb<'t, Mapper> { fn geo_field(&self) -> Result> { self.field("_geo") } + + fn len(&self) -> usize { + self.content.iter().count() + } + + fn top_level_field(&self, k: &str) -> Result> { + self.field(k) + } } impl<'t, Mapper: FieldIdMapper> DocumentFromDb<'t, Mapper> { @@ -107,18 +124,18 @@ impl<'t, Mapper: FieldIdMapper> DocumentFromDb<'t, Mapper> { } } -#[derive(Clone, Copy)] -pub struct DocumentFromVersions<'doc> { - versions: Versions<'doc>, +#[derive(Debug)] +pub struct DocumentFromVersions<'a, 'doc> { + versions: &'a Versions<'doc>, } -impl<'doc> DocumentFromVersions<'doc> { - pub fn new(versions: Versions<'doc>) -> Self { +impl<'a, 'doc> DocumentFromVersions<'a, 'doc> { + pub fn new(versions: &'a Versions<'doc>) -> Self { Self { versions } } } -impl<'doc> Document<'doc> for DocumentFromVersions<'doc> { +impl<'a, 'doc> Document<'doc> for DocumentFromVersions<'a, 'doc> { fn iter_top_level_fields(&self) -> impl Iterator> { self.versions.iter_top_level_fields().map(Ok) } @@ -130,16 +147,25 @@ impl<'doc> Document<'doc> for DocumentFromVersions<'doc> { fn geo_field(&self) -> Result> { Ok(self.versions.geo_field()) } + + fn len(&self) -> usize { + self.versions.len() + } + + fn top_level_field(&self, k: &str) -> Result> { + Ok(self.versions.top_level_field(k)) + } } -pub struct MergedDocument<'doc, 't, Mapper: FieldIdMapper> { - new_doc: DocumentFromVersions<'doc>, +#[derive(Debug)] +pub struct MergedDocument<'a, 'doc, 't, Mapper: FieldIdMapper> { + new_doc: DocumentFromVersions<'a, 'doc>, db: Option>, } -impl<'doc, 't, Mapper: FieldIdMapper> MergedDocument<'doc, 't, Mapper> { +impl<'a, 'doc, 't, Mapper: FieldIdMapper> MergedDocument<'a, 'doc, 't, Mapper> { pub fn new( - new_doc: DocumentFromVersions<'doc>, + new_doc: DocumentFromVersions<'a, 'doc>, db: Option>, ) -> Self { Self { new_doc, db } @@ -150,19 +176,19 @@ impl<'doc, 't, Mapper: FieldIdMapper> MergedDocument<'doc, 't, Mapper> { rtxn: &'t RoTxn, index: &'t Index, db_fields_ids_map: &'t Mapper, - new_doc: DocumentFromVersions<'doc>, + new_doc: DocumentFromVersions<'a, 'doc>, ) -> Result { let db = DocumentFromDb::new(docid, rtxn, index, db_fields_ids_map)?; Ok(Self { new_doc, db }) } - pub fn without_db(new_doc: DocumentFromVersions<'doc>) -> Self { + pub fn without_db(new_doc: DocumentFromVersions<'a, 'doc>) -> Self { Self { new_doc, db: None } } } impl<'d, 'doc: 'd, 't: 'd, Mapper: FieldIdMapper> Document<'d> - for MergedDocument<'doc, 't, Mapper> + for MergedDocument<'d, 'doc, 't, Mapper> { fn iter_top_level_fields(&self) -> impl Iterator> { let mut new_doc_it = self.new_doc.iter_top_level_fields(); @@ -209,6 +235,20 @@ impl<'d, 'doc: 'd, 't: 'd, Mapper: FieldIdMapper> Document<'d> db.geo_field() } + + fn len(&self) -> usize { + self.iter_top_level_fields().count() + } + + fn top_level_field(&self, k: &str) -> Result> { + if let Some(f) = self.new_doc.top_level_field(k)? 
{ + return Ok(Some(f)); + } + if let Some(db) = self.db { + return db.field(k); + } + Ok(None) + } } impl<'doc, D> Document<'doc> for &D @@ -226,6 +266,14 @@ where fn geo_field(&self) -> Result> { D::geo_field(self) } + + fn len(&self) -> usize { + D::len(self) + } + + fn top_level_field(&self, k: &str) -> Result> { + D::top_level_field(self, k) + } } /// Turn this document into an obkv, whose fields are indexed by the provided `FieldIdMapper`. @@ -301,11 +349,9 @@ where pub type Entry<'doc> = (&'doc str, &'doc RawValue); -#[derive(Clone, Copy)] +#[derive(Debug)] pub struct Versions<'doc> { - data: &'doc [Entry<'doc>], - vectors: Option<&'doc RawValue>, - geo: Option<&'doc RawValue>, + data: RawMap<'doc>, } impl<'doc> Versions<'doc> { @@ -324,26 +370,30 @@ impl<'doc> Versions<'doc> { } pub fn single(version: RawMap<'doc>) -> Self { - let vectors_id = version.get_index(RESERVED_VECTORS_FIELD_NAME); - let geo_id = version.get_index("_geo"); - let mut data = version.into_vec(); - let geo = geo_id.map(|geo_id| data.remove(geo_id).1); - let vectors = vectors_id.map(|vectors_id| data.remove(vectors_id).1); - - let data = data.into_bump_slice(); - - Self { data, geo, vectors } + Self { data: version } } - pub fn iter_top_level_fields(&self) -> impl Iterator> { - self.data.iter().copied() + pub fn iter_top_level_fields(&self) -> raw_collections::map::iter::Iter<'doc, '_> { + /// FIXME: ignore vectors and _geo + self.data.iter() } pub fn vectors_field(&self) -> Option<&'doc RawValue> { - self.vectors + self.data.get(RESERVED_VECTORS_FIELD_NAME) } pub fn geo_field(&self) -> Option<&'doc RawValue> { - self.geo + self.data.get("_geo") + } + + pub fn len(&self) -> usize { + self.data.len() + } + + pub fn is_empty(&self) -> bool { + self.data.is_empty() + } + pub fn top_level_field(&self, k: &str) -> Option<&'doc RawValue> { + self.data.get(k) } } diff --git a/milli/src/update/new/document_change.rs b/milli/src/update/new/document_change.rs index f277637d5..c55113b74 100644 --- a/milli/src/update/new/document_change.rs +++ b/milli/src/update/new/document_change.rs @@ -1,6 +1,8 @@ +use bumpalo::Bump; use heed::RoTxn; -use super::document::{DocumentFromDb, DocumentFromVersions, MergedDocument}; +use super::document::{DocumentFromDb, DocumentFromVersions, MergedDocument, Versions}; +use super::vector_document::{VectorDocumentFromDb, VectorDocumentFromVersions}; use crate::documents::FieldIdMapper; use crate::{DocumentId, Index, Result}; @@ -18,14 +20,14 @@ pub struct Deletion<'doc> { pub struct Update<'doc> { docid: DocumentId, external_document_id: &'doc str, - new: DocumentFromVersions<'doc>, + new: Versions<'doc>, has_deletion: bool, } pub struct Insertion<'doc> { docid: DocumentId, external_document_id: &'doc str, - new: DocumentFromVersions<'doc>, + new: Versions<'doc>, } impl<'doc> DocumentChange<'doc> { @@ -72,11 +74,7 @@ impl<'doc> Deletion<'doc> { } impl<'doc> Insertion<'doc> { - pub fn create( - docid: DocumentId, - external_document_id: &'doc str, - new: DocumentFromVersions<'doc>, - ) -> Self { + pub fn create(docid: DocumentId, external_document_id: &'doc str, new: Versions<'doc>) -> Self { Insertion { docid, external_document_id, new } } @@ -87,8 +85,15 @@ impl<'doc> Insertion<'doc> { pub fn external_document_id(&self) -> &'doc str { self.external_document_id } - pub fn new(&self) -> DocumentFromVersions<'doc> { - self.new + pub fn new(&self) -> DocumentFromVersions<'_, 'doc> { + DocumentFromVersions::new(&self.new) + } + + pub fn inserted_vectors( + &self, + doc_alloc: &'doc Bump, + ) -> 
Result>> { + VectorDocumentFromVersions::new(&self.new, doc_alloc) } } @@ -96,7 +101,7 @@ impl<'doc> Update<'doc> { pub fn create( docid: DocumentId, external_document_id: &'doc str, - new: DocumentFromVersions<'doc>, + new: Versions<'doc>, has_deletion: bool, ) -> Self { Update { docid, new, external_document_id, has_deletion } @@ -120,20 +125,45 @@ impl<'doc> Update<'doc> { )?) } - pub fn updated(&self) -> DocumentFromVersions<'doc> { - self.new - } - - pub fn new<'a, Mapper: FieldIdMapper>( + pub fn current_vectors<'a, Mapper: FieldIdMapper>( &self, rtxn: &'a RoTxn, index: &'a Index, mapper: &'a Mapper, - ) -> Result> { + doc_alloc: &'a Bump, + ) -> Result> { + Ok(VectorDocumentFromDb::new(self.docid, index, rtxn, mapper, doc_alloc)?.ok_or( + crate::error::UserError::UnknownInternalDocumentId { document_id: self.docid }, + )?) + } + + pub fn updated(&self) -> DocumentFromVersions<'_, 'doc> { + DocumentFromVersions::new(&self.new) + } + + pub fn new<'t, Mapper: FieldIdMapper>( + &self, + rtxn: &'t RoTxn, + index: &'t Index, + mapper: &'t Mapper, + ) -> Result> { if self.has_deletion { - Ok(MergedDocument::without_db(self.new)) + Ok(MergedDocument::without_db(DocumentFromVersions::new(&self.new))) } else { - MergedDocument::with_db(self.docid, rtxn, index, mapper, self.new) + MergedDocument::with_db( + self.docid, + rtxn, + index, + mapper, + DocumentFromVersions::new(&self.new), + ) } } + + pub fn updated_vectors( + &self, + doc_alloc: &'doc Bump, + ) -> Result>> { + VectorDocumentFromVersions::new(&self.new, doc_alloc) + } } From 5efd70c2518c892e5ba00ee419cc2f1839a368a7 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 28 Oct 2024 14:23:38 +0100 Subject: [PATCH 175/247] Allow random access to fields in documents --- .../extract/searchable/tokenize_document.rs | 9 +++- .../update/new/indexer/document_changes.rs | 5 +- .../update/new/indexer/document_deletion.rs | 5 +- .../update/new/indexer/document_operation.rs | 12 ++--- milli/src/update/new/indexer/mod.rs | 9 +++- milli/src/update/new/indexer/partial_dump.rs | 4 +- .../update/new/indexer/update_by_function.rs | 5 +- milli/src/update/new/vector_document.rs | 53 +++++++++++++++++-- 8 files changed, 77 insertions(+), 25 deletions(-) diff --git a/milli/src/update/new/extract/searchable/tokenize_document.rs b/milli/src/update/new/extract/searchable/tokenize_document.rs index 5428907f8..793e3a249 100644 --- a/milli/src/update/new/extract/searchable/tokenize_document.rs +++ b/milli/src/update/new/extract/searchable/tokenize_document.rs @@ -176,6 +176,7 @@ mod test { use serde_json::value::RawValue; use super::*; + use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder}; use crate::update::new::document::{DocumentFromVersions, Versions}; use crate::FieldsIdsMap; @@ -212,6 +213,11 @@ mod test { max_positions_per_attributes: 1000, }; + let fields_ids_map = FieldIdMapWithMetadata::new( + fields_ids_map, + MetadataBuilder::new(Default::default(), Default::default(), Default::default(), None), + ); + let fields_ids_map_lock = std::sync::RwLock::new(fields_ids_map); let mut global_fields_ids_map = GlobalFieldsIdsMap::new(&fields_ids_map_lock); @@ -223,7 +229,8 @@ mod test { let document: &RawValue = serde_json::from_str(&document).unwrap(); let document = RawMap::from_raw_value(document, &bump).unwrap(); - let document = DocumentFromVersions::new(Versions::single(document)); + let document = Versions::single(document); + let document = DocumentFromVersions::new(&document); document_tokenizer .tokenize_document( diff 
--git a/milli/src/update/new/indexer/document_changes.rs b/milli/src/update/new/indexer/document_changes.rs index 91c65a6d1..fd16137b9 100644 --- a/milli/src/update/new/indexer/document_changes.rs +++ b/milli/src/update/new/indexer/document_changes.rs @@ -7,6 +7,7 @@ use raw_collections::alloc::RefBump; use rayon::iter::IndexedParallelIterator; use super::super::document_change::DocumentChange; +use crate::fields_ids_map::metadata::FieldIdMapWithMetadata; use crate::update::new::parallel_iterator_ext::ParallelIteratorExt as _; use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, Result}; @@ -278,7 +279,7 @@ impl< pub fn new( index: &'indexer Index, db_fields_ids_map: &'indexer FieldsIdsMap, - new_fields_ids_map: &'fid RwLock, + new_fields_ids_map: &'fid RwLock, extractor_allocs: &'extractor ThreadLocal>>, doc_allocs: &'doc ThreadLocal>>, datastore: &'data ThreadLocal, @@ -351,7 +352,7 @@ pub struct IndexingContext< > { pub index: &'index Index, pub db_fields_ids_map: &'indexer FieldsIdsMap, - pub new_fields_ids_map: &'fid RwLock, + pub new_fields_ids_map: &'fid RwLock, pub doc_allocs: &'indexer ThreadLocal>>, pub fields_ids_map_store: &'indexer ThreadLocal>>>, } diff --git a/milli/src/update/new/indexer/document_deletion.rs b/milli/src/update/new/indexer/document_deletion.rs index bbd2b11ac..d193b65fa 100644 --- a/milli/src/update/new/indexer/document_deletion.rs +++ b/milli/src/update/new/indexer/document_deletion.rs @@ -86,6 +86,7 @@ mod test { use bumpalo::Bump; use raw_collections::alloc::RefBump; + use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder}; use crate::index::tests::TempIndex; use crate::update::new::indexer::document_changes::{ for_each_document_change, DocumentChangeContext, Extractor, IndexingContext, MostlySend, @@ -144,7 +145,9 @@ mod test { let rtxn = index.read_txn().unwrap(); let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); - let fields_ids_map = RwLock::new(db_fields_ids_map.clone()); + let metadata_builder = MetadataBuilder::from_index(&index, &rtxn).unwrap(); + let fields_ids_map = + RwLock::new(FieldIdMapWithMetadata::new(db_fields_ids_map.clone(), metadata_builder)); let fields_ids_map_store = ThreadLocal::new(); diff --git a/milli/src/update/new/indexer/document_operation.rs b/milli/src/update/new/indexer/document_operation.rs index 007b56643..bc1634d75 100644 --- a/milli/src/update/new/indexer/document_operation.rs +++ b/milli/src/update/new/indexer/document_operation.rs @@ -289,19 +289,17 @@ impl MergeChanges for MergeDocumentForReplacement { let document = raw_collections::RawMap::from_raw_value(document, doc_alloc) .map_err(UserError::SerdeJson)?; - let document = DocumentFromVersions::new(Versions::single(document)); - if is_new { Ok(Some(DocumentChange::Insertion(Insertion::create( docid, external_doc, - document, + Versions::single(document), )))) } else { Ok(Some(DocumentChange::Update(Update::create( docid, external_doc, - document, + Versions::single(document), true, )))) } @@ -396,15 +394,13 @@ impl MergeChanges for MergeDocumentForUpdates { let Some(versions) = versions else { return Ok(None) }; - let document = DocumentFromVersions::new(versions); - if is_new { - Ok(Some(DocumentChange::Insertion(Insertion::create(docid, external_docid, document)))) + Ok(Some(DocumentChange::Insertion(Insertion::create(docid, external_docid, versions)))) } else { Ok(Some(DocumentChange::Update(Update::create( docid, external_docid, - document, + versions, has_deletion, )))) } diff --git a/milli/src/update/new/indexer/mod.rs 
b/milli/src/update/new/indexer/mod.rs index 0fc7940bb..dd2506ef9 100644 --- a/milli/src/update/new/indexer/mod.rs +++ b/milli/src/update/new/indexer/mod.rs @@ -28,6 +28,7 @@ use super::words_prefix_docids::{ use super::{StdResult, TopLevelMap}; use crate::documents::{PrimaryKey, DEFAULT_PRIMARY_KEY}; use crate::facet::FacetType; +use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder}; use crate::proximity::ProximityPrecision; use crate::update::new::channel::ExtractorSender; use crate::update::new::words_prefix_docids::compute_exact_word_prefix_docids; @@ -122,6 +123,10 @@ where // This channel acts as a rendezvous point to ensure that we are one task ahead let (extractor_sender, merger_receiver) = extractors_merger_channels(4); + let metadata_builder = MetadataBuilder::from_index(index, wtxn)?; + + let new_fields_ids_map = FieldIdMapWithMetadata::new(new_fields_ids_map, metadata_builder); + let new_fields_ids_map = RwLock::new(new_fields_ids_map); let fields_ids_map_store = ThreadLocal::with_capacity(pool.current_num_threads()); @@ -298,8 +303,8 @@ where // required to into_inner the new_fields_ids_map drop(fields_ids_map_store); - let fields_ids_map = new_fields_ids_map.into_inner().unwrap(); - index.put_fields_ids_map(wtxn, &fields_ids_map)?; + let new_fields_ids_map = new_fields_ids_map.into_inner().unwrap(); + index.put_fields_ids_map(wtxn, new_fields_ids_map.as_fields_ids_map())?; if let Some(new_primary_key) = new_primary_key { index.put_primary_key(wtxn, new_primary_key.name())?; diff --git a/milli/src/update/new/indexer/partial_dump.rs b/milli/src/update/new/indexer/partial_dump.rs index 60cb627e9..3913098ec 100644 --- a/milli/src/update/new/indexer/partial_dump.rs +++ b/milli/src/update/new/indexer/partial_dump.rs @@ -76,9 +76,7 @@ where let document = raw_collections::RawMap::from_raw_value(document, doc_alloc) .map_err(InternalError::SerdeJson)?; - let document = DocumentFromVersions::new(Versions::single(document)); - - let insertion = Insertion::create(docid, external_document_id, document); + let insertion = Insertion::create(docid, external_document_id, Versions::single(document)); Ok(Some(DocumentChange::Insertion(insertion))) } } diff --git a/milli/src/update/new/indexer/update_by_function.rs b/milli/src/update/new/indexer/update_by_function.rs index cff0e02fc..b08f8c380 100644 --- a/milli/src/update/new/indexer/update_by_function.rs +++ b/milli/src/update/new/indexer/update_by_function.rs @@ -160,12 +160,11 @@ impl<'index> DocumentChanges<'index> for UpdateByFunctionChanges<'index> { } else { let raw_new_doc = RawMap::from_raw_value(raw_new_doc, doc_alloc) .map_err(InternalError::SerdeJson)?; - let new_doc_version = - DocumentFromVersions::new(Versions::single(raw_new_doc)); + Ok(Some(DocumentChange::Update(Update::create( docid, new_document_id, - new_doc_version, + Versions::single(raw_new_doc), true, // It is like document replacement )))) } diff --git a/milli/src/update/new/vector_document.rs b/milli/src/update/new/vector_document.rs index 375d4f2ce..782076716 100644 --- a/milli/src/update/new/vector_document.rs +++ b/milli/src/update/new/vector_document.rs @@ -4,12 +4,12 @@ use raw_collections::RawMap; use serde::Serialize; use serde_json::value::RawValue; -use super::document::{Document, DocumentFromDb}; +use super::document::{Document, DocumentFromDb, DocumentFromVersions, Versions}; use crate::documents::FieldIdMapper; use crate::index::IndexEmbeddingConfig; use crate::vector::parsed_vectors::RawVectors; use crate::vector::Embedding; -use 
crate::{DocumentId, Index, InternalError, Result}; +use crate::{DocumentId, Index, InternalError, Result, UserError}; #[derive(Serialize)] #[serde(untagged)] @@ -17,6 +17,15 @@ pub enum Embeddings<'doc> { FromJson(&'doc RawValue), FromDb(Vec), } +impl<'doc> Embeddings<'doc> { + pub fn into_vec(self) -> std::result::Result, serde_json::Error> { + match self { + /// FIXME: this should be a VecOrArrayOfVec + Embeddings::FromJson(value) => serde_json::from_str(value.get()), + Embeddings::FromDb(vec) => Ok(vec), + } + } +} pub struct VectorEntry<'doc> { pub has_configured_embedder: bool, @@ -46,8 +55,10 @@ impl<'t> VectorDocumentFromDb<'t> { rtxn: &'t RoTxn, db_fields_ids_map: &'t Mapper, doc_alloc: &'t Bump, - ) -> Result { - let document = DocumentFromDb::new(docid, rtxn, index, db_fields_ids_map)?.unwrap(); + ) -> Result> { + let Some(document) = DocumentFromDb::new(docid, rtxn, index, db_fields_ids_map)? else { + return Ok(None); + }; let vectors = document.vectors_field()?; let vectors_field = match vectors { Some(vectors) => { @@ -58,7 +69,7 @@ impl<'t> VectorDocumentFromDb<'t> { let embedding_config = index.embedding_configs(rtxn)?; - Ok(Self { docid, embedding_config, index, vectors_field, rtxn, doc_alloc }) + Ok(Some(Self { docid, embedding_config, index, vectors_field, rtxn, doc_alloc })) } fn entry_from_db( @@ -132,3 +143,35 @@ fn entry_from_raw_value( regenerate: value.must_regenerate(), }) } + +pub struct VectorDocumentFromVersions<'doc> { + vectors: RawMap<'doc>, +} + +impl<'doc> VectorDocumentFromVersions<'doc> { + pub fn new(versions: &Versions<'doc>, bump: &'doc Bump) -> Result> { + let document = DocumentFromVersions::new(versions); + if let Some(vectors_field) = document.vectors_field()? { + let vectors = + RawMap::from_raw_value(vectors_field, bump).map_err(UserError::SerdeJson)?; + Ok(Some(Self { vectors })) + } else { + Ok(None) + } + } +} + +impl<'doc> VectorDocument<'doc> for VectorDocumentFromVersions<'doc> { + fn iter_vectors(&self) -> impl Iterator)>> { + self.vectors.iter().map(|(embedder, vectors)| { + let vectors = entry_from_raw_value(vectors).map_err(UserError::SerdeJson)?; + Ok((embedder, vectors)) + }) + } + + fn vectors_for_key(&self, key: &str) -> Result>> { + let Some(vectors) = self.vectors.get(key) else { return Ok(None) }; + let vectors = entry_from_raw_value(vectors).map_err(UserError::SerdeJson)?; + Ok(Some(vectors)) + } +} From 9cbb2b066adc3d42b6210f187277d101679f06b0 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 28 Oct 2024 14:23:54 +0100 Subject: [PATCH 176/247] WIP vector extraction --- milli/src/update/new/extract/vectors/mod.rs | 315 ++++++++++++++++++++ 1 file changed, 315 insertions(+) create mode 100644 milli/src/update/new/extract/vectors/mod.rs diff --git a/milli/src/update/new/extract/vectors/mod.rs b/milli/src/update/new/extract/vectors/mod.rs new file mode 100644 index 000000000..87b126207 --- /dev/null +++ b/milli/src/update/new/extract/vectors/mod.rs @@ -0,0 +1,315 @@ +use crate::error::FaultSource; +use crate::prompt::Prompt; +use crate::update::new::channel::EmbeddingSender; +use crate::update::new::indexer::document_changes::{Extractor, FullySend}; +use crate::update::new::vector_document::VectorDocument; +use crate::update::new::DocumentChange; +use crate::vector::error::EmbedErrorKind; +use crate::vector::Embedder; +use crate::{DocumentId, Result, ThreadPoolNoAbort, UserError}; + +pub struct EmbeddingExtractor<'a> { + embedder: &'a Embedder, + prompt: &'a Prompt, + embedder_id: u8, + embedder_name: &'a str, + sender: &'a 
EmbeddingSender<'a>, + threads: &'a ThreadPoolNoAbort, +} + +impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { + type Data = FullySend<()>; + + fn init_data<'doc>( + &'doc self, + _extractor_alloc: raw_collections::alloc::RefBump<'extractor>, + ) -> crate::Result { + Ok(FullySend(())) + } + + fn process<'doc>( + &'doc self, + changes: impl Iterator>>, + context: &'doc crate::update::new::indexer::document_changes::DocumentChangeContext< + Self::Data, + >, + ) -> crate::Result<()> { + let embedder_name: &str = self.embedder_name; + let embedder: &Embedder = self.embedder; + let prompt: &Prompt = self.prompt; + + let mut chunks = Chunks::new( + embedder, + self.embedder_id, + embedder_name, + self.threads, + self.sender, + &context.doc_alloc, + ); + + for change in changes { + let change = change?; + match change { + DocumentChange::Deletion(deletion) => { + self.sender.delete(deletion.docid(), self.embedder_id).unwrap(); + } + DocumentChange::Update(update) => { + /// FIXME: this will force the parsing/retrieval of VectorDocument once per embedder + /// consider doing all embedders at once? + let old_vectors = update.current_vectors( + &context.txn, + context.index, + context.db_fields_ids_map, + &context.doc_alloc, + )?; + let old_vectors = old_vectors.vectors_for_key(embedder_name)?.unwrap(); + let new_vectors = update.updated_vectors(&context.doc_alloc)?; + if let Some(new_vectors) = new_vectors.as_ref().and_then(|new_vectors| { + new_vectors.vectors_for_key(embedder_name).transpose() + }) { + let new_vectors = new_vectors?; + match (old_vectors.regenerate, new_vectors.regenerate) { + (true, true) | (false, false) => todo!(), + _ => { + self.sender + .set_user_provided( + update.docid(), + self.embedder_id, + !new_vectors.regenerate, + ) + .unwrap(); + } + } + // do we have set embeddings? 
+ if let Some(embeddings) = new_vectors.embeddings { + self.sender + .set_vectors( + update.docid(), + self.embedder_id, + embeddings.into_vec().map_err(UserError::SerdeJson)?, + ) + .unwrap(); + } else if new_vectors.regenerate { + let new_rendered = prompt.render_document( + update.current( + &context.txn, + context.index, + context.db_fields_ids_map, + )?, + context.new_fields_ids_map, + &context.doc_alloc, + )?; + let old_rendered = prompt.render_document( + update.new( + &context.txn, + context.index, + context.db_fields_ids_map, + )?, + context.new_fields_ids_map, + &context.doc_alloc, + )?; + if new_rendered != old_rendered { + chunks.push(update.docid(), new_rendered)?; + } + } + } else if old_vectors.regenerate { + let old_rendered = prompt.render_document( + update.current( + &context.txn, + context.index, + context.db_fields_ids_map, + )?, + context.new_fields_ids_map, + &context.doc_alloc, + )?; + let new_rendered = prompt.render_document( + update.new(&context.txn, context.index, context.db_fields_ids_map)?, + context.new_fields_ids_map, + &context.doc_alloc, + )?; + if new_rendered != old_rendered { + chunks.push(update.docid(), new_rendered)?; + } + } + } + DocumentChange::Insertion(insertion) => { + // if no inserted vectors, then regenerate: true + no embeddings => autogenerate + let new_vectors = insertion.inserted_vectors(&context.doc_alloc)?; + if let Some(new_vectors) = new_vectors.as_ref().and_then(|new_vectors| { + new_vectors.vectors_for_key(embedder_name).transpose() + }) { + let new_vectors = new_vectors?; + self.sender + .set_user_provided( + insertion.docid(), + self.embedder_id, + !new_vectors.regenerate, + ) + .unwrap(); + if let Some(embeddings) = new_vectors.embeddings { + self.sender + .set_vectors( + insertion.docid(), + self.embedder_id, + embeddings.into_vec().map_err(UserError::SerdeJson)?, + ) + .unwrap(); + } else if new_vectors.regenerate { + let rendered = prompt.render_document( + insertion.new(), + context.new_fields_ids_map, + &context.doc_alloc, + )?; + chunks.push(insertion.docid(), rendered)?; + } + } else { + let rendered = prompt.render_document( + insertion.new(), + context.new_fields_ids_map, + &context.doc_alloc, + )?; + chunks.push(insertion.docid(), rendered)?; + } + } + } + } + + chunks.drain() + } +} + +use bumpalo::collections::Vec as BVec; +use bumpalo::Bump; + +// **Warning**: the destructor of this struct is not normally run, make sure that all its fields: +// 1. don't have side effects tied to they destructors +// 2. if allocated, are allocated inside of the bumpalo +// +// Currently this is the case as: +// 1. BVec are inside of the bumaplo +// 2. All other fields are either trivial (u8) or references. 
+struct Chunks<'a> { + texts: BVec<'a, &'a str>, + ids: BVec<'a, DocumentId>, + + embedder: &'a Embedder, + embedder_id: u8, + embedder_name: &'a str, + threads: &'a ThreadPoolNoAbort, + sender: &'a EmbeddingSender<'a>, +} + +impl<'a> Chunks<'a> { + pub fn new( + embedder: &'a Embedder, + embedder_id: u8, + embedder_name: &'a str, + threads: &'a ThreadPoolNoAbort, + sender: &'a EmbeddingSender<'a>, + doc_alloc: &'a Bump, + ) -> Self { + let capacity = embedder.prompt_count_in_chunk_hint() * embedder.chunk_count_hint(); + let texts = BVec::with_capacity_in(capacity, doc_alloc); + let ids = BVec::with_capacity_in(capacity, doc_alloc); + Self { texts, ids, embedder, threads, sender, embedder_id, embedder_name } + } + + pub fn push(&mut self, docid: DocumentId, rendered: &'a str) -> Result<()> { + if self.texts.len() < self.texts.capacity() { + self.texts.push(rendered); + self.ids.push(docid); + return Ok(()); + } + + Self::embed_chunks( + &mut self.texts, + &mut self.ids, + self.embedder, + self.embedder_id, + self.embedder_name, + self.threads, + self.sender, + ) + } + + pub fn drain(mut self) -> Result<()> { + let res = Self::embed_chunks( + &mut self.texts, + &mut self.ids, + self.embedder, + self.embedder_id, + self.embedder_name, + self.threads, + self.sender, + ); + // optimization: don't run bvec dtors as they only contain bumpalo allocated stuff + std::mem::forget(self); + res + } + + pub fn embed_chunks( + texts: &mut BVec<'a, &'a str>, + ids: &mut BVec<'a, DocumentId>, + embedder: &'a Embedder, + embedder_id: u8, + embedder_name: &str, + threads: &'a ThreadPoolNoAbort, + sender: &'a EmbeddingSender<'a>, + ) -> Result<()> { + let res = match embedder.embed_chunks_ref(texts.as_slice(), threads) { + Ok(embeddings) => { + for (docid, embedding) in ids.into_iter().zip(embeddings) { + sender.set_vector(*docid, embedder_id, embedding).unwrap(); + } + Ok(()) + } + Err(error) => { + if let FaultSource::Bug = error.fault { + Err(crate::Error::InternalError(crate::InternalError::VectorEmbeddingError( + error.into(), + ))) + } else { + let mut msg = format!( + r"While embedding documents for embedder `{embedder_name}`: {error}" + ); + + if let EmbedErrorKind::ManualEmbed(_) = &error.kind { + msg += &format!("\n- Note: `{embedder_name}` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.{embedder_name}`."); + } + + /// FIXME: reintroduce possible_embedding_mistakes and possible_embedding_mistakes + let mut hint_count = 0; + + /* + for (vector_misspelling, count) in + possible_embedding_mistakes.vector_mistakes().take(2) + { + msg += &format!("\n- Hint: try replacing `{vector_misspelling}` by `_vectors` in {count} document(s)."); + hint_count += 1; + } + + for (embedder_misspelling, count) in possible_embedding_mistakes + .embedder_mistakes(embedder_name, unused_vectors_distribution) + .take(2) + { + msg += &format!("\n- Hint: try replacing `_vectors.{embedder_misspelling}` by `_vectors.{embedder_name}` in {count} document(s)."); + hint_count += 1; + } + */ + if hint_count == 0 { + if let EmbedErrorKind::ManualEmbed(_) = &error.kind { + msg += &format!( + "\n- Hint: opt-out for a document with `_vectors.{embedder_name}: null`" + ); + } + } + + Err(crate::Error::UserError(crate::UserError::DocumentEmbeddingError(msg))) + } + } + }; + texts.clear(); + ids.clear(); + res + } +} From 7058959a4644a6ea49c482277f3bfa37f3784c71 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 28 Oct 2024 16:18:48 +0100 Subject: [PATCH 177/247] Write into documents --- 
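Note: a minimal sketch of the call pattern this patch wires up in the document
extractor (binding names such as `context`, `new_fields_ids_map` and
`document_buffer` are the ones used in the hunks below; illustration of the new
`write_to_obkv` signature only, not content to apply):

    // Merge the DB version of the document with the incoming versions,
    // merge their `_vectors` the same way, then serialize both into obkv.
    let content = update.merged(&context.txn, context.index, &context.db_fields_ids_map)?;
    let vector_content = update.merged_vectors(
        &context.txn,
        context.index,
        &context.db_fields_ids_map,
        &context.doc_alloc,
    )?;
    let content = write_to_obkv(
        &content,
        vector_content.as_ref(),      // `None` when there are no vectors to inject
        &mut new_fields_ids_map,      // may insert new field ids (e.g. `_vectors`)
        &mut document_buffer,
    )?;
    self.document_sender.insert(docid, external_docid, content.boxed()).unwrap();
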
milli/src/update/new/document.rs | 55 +++++++------ milli/src/update/new/document_change.rs | 22 ++++- .../new/extract/faceted/extract_facets.rs | 4 +- .../extract/searchable/extract_word_docids.rs | 4 +- .../extract_word_pair_proximity_docids.rs | 4 +- milli/src/update/new/extract/vectors/mod.rs | 12 ++- milli/src/update/new/indexer/mod.rs | 32 +++++--- milli/src/update/new/vector_document.rs | 81 ++++++++++++++++--- 8 files changed, 154 insertions(+), 60 deletions(-) diff --git a/milli/src/update/new/document.rs b/milli/src/update/new/document.rs index be09feb5a..0a5172d36 100644 --- a/milli/src/update/new/document.rs +++ b/milli/src/update/new/document.rs @@ -1,13 +1,14 @@ -use std::collections::BTreeSet; +use std::collections::{BTreeMap, BTreeSet}; use heed::RoTxn; use raw_collections::RawMap; use serde_json::value::RawValue; +use super::vector_document::{VectorDocument, VectorDocumentFromDb, VectorDocumentFromVersions}; use super::{KvReaderFieldId, KvWriterFieldId}; use crate::documents::FieldIdMapper; use crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME; -use crate::{DocumentId, Index, InternalError, Result}; +use crate::{DocumentId, GlobalFieldsIdsMap, Index, InternalError, Result, UserError}; /// A view into a document that can represent either the current version from the DB, /// the update data from payload or other means, or the merged updated version. @@ -69,17 +70,22 @@ impl<'t, Mapper: FieldIdMapper> Document<'t> for DocumentFromDb<'t, Mapper> { std::iter::from_fn(move || { let (fid, value) = it.next()?; - let res = (|| { - let value = - serde_json::from_slice(value).map_err(crate::InternalError::SerdeJson)?; - + let res = (|| loop { let name = self.fields_ids_map.name(fid).ok_or( InternalError::FieldIdMapMissingEntry(crate::FieldIdMapMissingEntry::FieldId { field_id: fid, process: "getting current document", }), )?; - Ok((name, value)) + + if name == RESERVED_VECTORS_FIELD_NAME || name == "_geo" { + continue; + } + + let value = + serde_json::from_slice(value).map_err(crate::InternalError::SerdeJson)?; + + return Ok((name, value)); })(); Some(res) @@ -164,13 +170,6 @@ pub struct MergedDocument<'a, 'doc, 't, Mapper: FieldIdMapper> { } impl<'a, 'doc, 't, Mapper: FieldIdMapper> MergedDocument<'a, 'doc, 't, Mapper> { - pub fn new( - new_doc: DocumentFromVersions<'a, 'doc>, - db: Option>, - ) -> Self { - Self { new_doc, db } - } - pub fn with_db( docid: DocumentId, rtxn: &'t RoTxn, @@ -287,15 +286,14 @@ where /// /// - If the document contains a top-level field that is not present in `fields_ids_map`. 
/// -pub fn write_to_obkv<'s, 'a, 'b>( +pub fn write_to_obkv<'s, 'a, 'map>( document: &'s impl Document<'s>, - vector_document: Option<()>, - fields_ids_map: &'a impl FieldIdMapper, + vector_document: Option<&'s impl VectorDocument<'s>>, + fields_ids_map: &'a mut GlobalFieldsIdsMap<'map>, mut document_buffer: &'a mut Vec, ) -> Result<&'a KvReaderFieldId> where 's: 'a, - 's: 'b, { // will be used in 'inject_vectors let vectors_value: Box; @@ -308,19 +306,21 @@ where for res in document.iter_top_level_fields() { let (field_name, value) = res?; - let field_id = fields_ids_map.id(field_name).unwrap(); + let field_id = + fields_ids_map.id_or_insert(field_name).ok_or(UserError::AttributeLimitReached)?; unordered_field_buffer.push((field_id, value)); } 'inject_vectors: { let Some(vector_document) = vector_document else { break 'inject_vectors }; - let Some(vectors_fid) = fields_ids_map.id(RESERVED_VECTORS_FIELD_NAME) else { - break 'inject_vectors; - }; - /* + let vectors_fid = fields_ids_map + .id_or_insert(RESERVED_VECTORS_FIELD_NAME) + .ok_or(UserError::AttributeLimitReached)?; + let mut vectors = BTreeMap::new(); - for (name, entry) in vector_document.iter_vectors() { + for res in vector_document.iter_vectors() { + let (name, entry) = res?; if entry.has_configured_embedder { continue; // we don't write vectors with configured embedder in documents } @@ -335,7 +335,7 @@ where } vectors_value = serde_json::value::to_raw_value(&vectors).unwrap(); - unordered_field_buffer.push((vectors_fid, &vectors_value));*/ + unordered_field_buffer.push((vectors_fid, &vectors_value)); } unordered_field_buffer.sort_by_key(|(fid, _)| *fid); @@ -373,9 +373,8 @@ impl<'doc> Versions<'doc> { Self { data: version } } - pub fn iter_top_level_fields(&self) -> raw_collections::map::iter::Iter<'doc, '_> { - /// FIXME: ignore vectors and _geo - self.data.iter() + pub fn iter_top_level_fields(&self) -> impl Iterator + '_ { + self.data.iter().filter(|(k, _)| *k != RESERVED_VECTORS_FIELD_NAME && *k != "_geo") } pub fn vectors_field(&self) -> Option<&'doc RawValue> { diff --git a/milli/src/update/new/document_change.rs b/milli/src/update/new/document_change.rs index c55113b74..bb1fc9441 100644 --- a/milli/src/update/new/document_change.rs +++ b/milli/src/update/new/document_change.rs @@ -2,7 +2,9 @@ use bumpalo::Bump; use heed::RoTxn; use super::document::{DocumentFromDb, DocumentFromVersions, MergedDocument, Versions}; -use super::vector_document::{VectorDocumentFromDb, VectorDocumentFromVersions}; +use super::vector_document::{ + MergedVectorDocument, VectorDocumentFromDb, VectorDocumentFromVersions, +}; use crate::documents::FieldIdMapper; use crate::{DocumentId, Index, Result}; @@ -85,7 +87,7 @@ impl<'doc> Insertion<'doc> { pub fn external_document_id(&self) -> &'doc str { self.external_document_id } - pub fn new(&self) -> DocumentFromVersions<'_, 'doc> { + pub fn inserted(&self) -> DocumentFromVersions<'_, 'doc> { DocumentFromVersions::new(&self.new) } @@ -141,7 +143,7 @@ impl<'doc> Update<'doc> { DocumentFromVersions::new(&self.new) } - pub fn new<'t, Mapper: FieldIdMapper>( + pub fn merged<'t, Mapper: FieldIdMapper>( &self, rtxn: &'t RoTxn, index: &'t Index, @@ -166,4 +168,18 @@ impl<'doc> Update<'doc> { ) -> Result>> { VectorDocumentFromVersions::new(&self.new, doc_alloc) } + + pub fn merged_vectors( + &self, + rtxn: &'doc RoTxn, + index: &'doc Index, + mapper: &'doc Mapper, + doc_alloc: &'doc Bump, + ) -> Result>> { + if self.has_deletion { + MergedVectorDocument::without_db(&self.new, doc_alloc) + } else { + 
MergedVectorDocument::with_db(self.docid, index, rtxn, mapper, &self.new, doc_alloc) + } + } } diff --git a/milli/src/update/new/extract/faceted/extract_facets.rs b/milli/src/update/new/extract/faceted/extract_facets.rs index 9fae1839e..f2cbad6ff 100644 --- a/milli/src/update/new/extract/faceted/extract_facets.rs +++ b/milli/src/update/new/extract/faceted/extract_facets.rs @@ -120,7 +120,7 @@ impl FacetedDocidsExtractor { extract_document_facets( attributes_to_extract, - inner.new(rtxn, index, context.db_fields_ids_map)?, + inner.merged(rtxn, index, context.db_fields_ids_map)?, new_fields_ids_map.deref_mut(), &mut |fid, value| { Self::facet_fn_with_options( @@ -136,7 +136,7 @@ impl FacetedDocidsExtractor { } DocumentChange::Insertion(inner) => extract_document_facets( attributes_to_extract, - inner.new(), + inner.inserted(), new_fields_ids_map.deref_mut(), &mut |fid, value| { Self::facet_fn_with_options( diff --git a/milli/src/update/new/extract/searchable/extract_word_docids.rs b/milli/src/update/new/extract/searchable/extract_word_docids.rs index 5eb9692d6..80f36b01d 100644 --- a/milli/src/update/new/extract/searchable/extract_word_docids.rs +++ b/milli/src/update/new/extract/searchable/extract_word_docids.rs @@ -481,7 +481,7 @@ impl WordDocidsExtractors { .map_err(crate::Error::from) }; document_tokenizer.tokenize_document( - inner.new(rtxn, index, context.db_fields_ids_map)?, + inner.merged(rtxn, index, context.db_fields_ids_map)?, new_fields_ids_map, &mut token_fn, )?; @@ -500,7 +500,7 @@ impl WordDocidsExtractors { .map_err(crate::Error::from) }; document_tokenizer.tokenize_document( - inner.new(), + inner.inserted(), new_fields_ids_map, &mut token_fn, )?; diff --git a/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs b/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs index 53e6515a9..1bd3aee36 100644 --- a/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs +++ b/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs @@ -80,7 +80,7 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { del_word_pair_proximity.push(((w1, w2), prox)); }, )?; - let document = inner.new(rtxn, index, context.db_fields_ids_map)?; + let document = inner.merged(rtxn, index, context.db_fields_ids_map)?; process_document_tokens( document, document_tokenizer, @@ -92,7 +92,7 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { )?; } DocumentChange::Insertion(inner) => { - let document = inner.new(); + let document = inner.inserted(); process_document_tokens( document, document_tokenizer, diff --git a/milli/src/update/new/extract/vectors/mod.rs b/milli/src/update/new/extract/vectors/mod.rs index 87b126207..a2762ae7a 100644 --- a/milli/src/update/new/extract/vectors/mod.rs +++ b/milli/src/update/new/extract/vectors/mod.rs @@ -100,7 +100,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { &context.doc_alloc, )?; let old_rendered = prompt.render_document( - update.new( + update.merged( &context.txn, context.index, context.db_fields_ids_map, @@ -123,7 +123,11 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { &context.doc_alloc, )?; let new_rendered = prompt.render_document( - update.new(&context.txn, context.index, context.db_fields_ids_map)?, + update.merged( + &context.txn, + context.index, + context.db_fields_ids_map, + )?, context.new_fields_ids_map, &context.doc_alloc, )?; @@ -156,7 +160,7 @@ impl<'a, 'extractor> 
Extractor<'extractor> for EmbeddingExtractor<'a> { .unwrap(); } else if new_vectors.regenerate { let rendered = prompt.render_document( - insertion.new(), + insertion.inserted(), context.new_fields_ids_map, &context.doc_alloc, )?; @@ -164,7 +168,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { } } else { let rendered = prompt.render_document( - insertion.new(), + insertion.inserted(), context.new_fields_ids_map, &context.doc_alloc, )?; diff --git a/milli/src/update/new/indexer/mod.rs b/milli/src/update/new/indexer/mod.rs index dd2506ef9..b316cbc34 100644 --- a/milli/src/update/new/indexer/mod.rs +++ b/milli/src/update/new/indexer/mod.rs @@ -64,9 +64,7 @@ impl<'a, 'extractor> Extractor<'extractor> for DocumentExtractor<'a> { ) -> Result<()> { let mut document_buffer = Vec::new(); - let new_fields_ids_map = context.new_fields_ids_map.borrow_or_yield(); - let new_fields_ids_map = &*new_fields_ids_map; - let new_fields_ids_map = new_fields_ids_map.local_map(); + let mut new_fields_ids_map = context.new_fields_ids_map.borrow_mut_or_yield(); for change in changes { let change = change?; @@ -78,20 +76,34 @@ impl<'a, 'extractor> Extractor<'extractor> for DocumentExtractor<'a> { let docid = deletion.docid(); self.document_sender.delete(docid, external_docid).unwrap(); } - /// TODO: change NONE by SOME(vector) when implemented DocumentChange::Update(update) => { let docid = update.docid(); let content = - update.new(&context.txn, context.index, &context.db_fields_ids_map)?; - let content = - write_to_obkv(&content, None, new_fields_ids_map, &mut document_buffer)?; + update.merged(&context.txn, context.index, &context.db_fields_ids_map)?; + let vector_content = update.merged_vectors( + &context.txn, + context.index, + &context.db_fields_ids_map, + &context.doc_alloc, + )?; + let content = write_to_obkv( + &content, + vector_content.as_ref(), + &mut new_fields_ids_map, + &mut document_buffer, + )?; self.document_sender.insert(docid, external_docid, content.boxed()).unwrap(); } DocumentChange::Insertion(insertion) => { let docid = insertion.docid(); - let content = insertion.new(); - let content = - write_to_obkv(&content, None, new_fields_ids_map, &mut document_buffer)?; + let content = insertion.inserted(); + let inserted_vectors = insertion.inserted_vectors(&context.doc_alloc)?; + let content = write_to_obkv( + &content, + inserted_vectors.as_ref(), + &mut new_fields_ids_map, + &mut document_buffer, + )?; self.document_sender.insert(docid, external_docid, content.boxed()).unwrap(); // extracted_dictionary_sender.send(self, dictionary: &[u8]); } diff --git a/milli/src/update/new/vector_document.rs b/milli/src/update/new/vector_document.rs index 782076716..a5519a025 100644 --- a/milli/src/update/new/vector_document.rs +++ b/milli/src/update/new/vector_document.rs @@ -1,3 +1,5 @@ +use std::collections::BTreeSet; + use bumpalo::Bump; use heed::RoTxn; use raw_collections::RawMap; @@ -106,14 +108,9 @@ impl<'t> VectorDocument<'t> for VectorDocumentFromDb<'t> { let config_name = self.doc_alloc.alloc_str(config.name.as_str()); Ok((&*config_name, entry)) }) - .chain(self.vectors_field.iter().map(|map| map.iter()).flatten().map( - |(name, value)| { - Ok(( - name.as_ref(), - entry_from_raw_value(value).map_err(InternalError::SerdeJson)?, - )) - }, - )) + .chain(self.vectors_field.iter().flat_map(|map| map.iter()).map(|(name, value)| { + Ok((name, entry_from_raw_value(value).map_err(InternalError::SerdeJson)?)) + })) } fn vectors_for_key(&self, key: &str) -> Result>> { @@ -139,7 
+136,7 @@ fn entry_from_raw_value( let value: RawVectors = serde_json::from_str(value.get())?; Ok(VectorEntry { has_configured_embedder: false, - embeddings: value.embeddings().map(|embeddings| Embeddings::FromJson(embeddings)), + embeddings: value.embeddings().map(Embeddings::FromJson), regenerate: value.must_regenerate(), }) } @@ -175,3 +172,69 @@ impl<'doc> VectorDocument<'doc> for VectorDocumentFromVersions<'doc> { Ok(Some(vectors)) } } + +pub struct MergedVectorDocument<'doc> { + new_doc: Option>, + db: Option>, +} + +impl<'doc> MergedVectorDocument<'doc> { + pub fn with_db( + docid: DocumentId, + index: &'doc Index, + rtxn: &'doc RoTxn, + db_fields_ids_map: &'doc Mapper, + versions: &Versions<'doc>, + doc_alloc: &'doc Bump, + ) -> Result> { + let db = VectorDocumentFromDb::new(docid, index, rtxn, db_fields_ids_map, doc_alloc)?; + let new_doc = VectorDocumentFromVersions::new(versions, doc_alloc)?; + Ok(if db.is_none() && new_doc.is_none() { None } else { Some(Self { new_doc, db }) }) + } + + pub fn without_db(versions: &Versions<'doc>, doc_alloc: &'doc Bump) -> Result> { + let Some(new_doc) = VectorDocumentFromVersions::new(versions, doc_alloc)? else { + return Ok(None); + }; + Ok(Some(Self { new_doc: Some(new_doc), db: None })) + } +} + +impl<'doc> VectorDocument<'doc> for MergedVectorDocument<'doc> { + fn iter_vectors(&self) -> impl Iterator)>> { + let mut new_doc_it = self.new_doc.iter().flat_map(|new_doc| new_doc.iter_vectors()); + let mut db_it = self.db.iter().flat_map(|db| db.iter_vectors()); + let mut seen_fields = BTreeSet::new(); + + std::iter::from_fn(move || { + if let Some(next) = new_doc_it.next() { + if let Ok((name, _)) = next { + seen_fields.insert(name); + } + return Some(next); + } + loop { + match db_it.next()? { + Ok((name, value)) => { + if seen_fields.contains(name) { + continue; + } + return Some(Ok((name, value))); + } + Err(err) => return Some(Err(err)), + } + } + }) + } + + fn vectors_for_key(&self, key: &str) -> Result>> { + if let Some(new_doc) = &self.new_doc { + if let Some(entry) = new_doc.vectors_for_key(key)? 
{ + return Ok(Some(entry)); + } + } + + let Some(db) = self.db.as_ref() else { return Ok(None) }; + db.vectors_for_key(key) + } +} From 1075dd34bb070df1b753d00c3385e4fbecda5f01 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 29 Oct 2024 17:43:36 +0100 Subject: [PATCH 178/247] Vectors --- index-scheduler/src/batch.rs | 9 + milli/src/update/new/channel.rs | 101 +++++--- milli/src/update/new/document.rs | 2 +- milli/src/update/new/extract/cache.rs | 2 +- milli/src/update/new/extract/mod.rs | 1 + milli/src/update/new/extract/vectors/mod.rs | 267 ++++++++++++-------- milli/src/update/new/indexer/mod.rs | 186 +++++++++++++- milli/src/update/new/merger.rs | 1 + 8 files changed, 420 insertions(+), 149 deletions(-) diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs index fdf213a6b..60393e51d 100644 --- a/index-scheduler/src/batch.rs +++ b/index-scheduler/src/batch.rs @@ -1300,6 +1300,8 @@ impl IndexScheduler { let mut content_files_iter = content_files.iter(); let mut indexer = indexer::DocumentOperation::new(method); + let embedders = index.embedding_configs(index_wtxn)?; + let embedders = self.embedders(embedders)?; for (operation, task) in operations.into_iter().zip(tasks.iter_mut()) { match operation { DocumentOperation::Add(_content_uuid) => { @@ -1374,6 +1376,7 @@ impl IndexScheduler { primary_key_has_been_set.then_some(primary_key), &pool, &document_changes, + embedders, )?; // tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done"); @@ -1460,6 +1463,8 @@ impl IndexScheduler { let indexer = UpdateByFunction::new(candidates, context.clone(), code.clone()); let document_changes = indexer.into_changes(&primary_key)?; + let embedders = index.embedding_configs(index_wtxn)?; + let embedders = self.embedders(embedders)?; indexer::index( index_wtxn, @@ -1469,6 +1474,7 @@ impl IndexScheduler { None, // cannot change primary key in DocumentEdition &pool, &document_changes, + embedders, )?; // tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done"); @@ -1596,6 +1602,8 @@ impl IndexScheduler { let mut indexer = indexer::DocumentDeletion::new(); indexer.delete_documents_by_docids(to_delete); let document_changes = indexer.into_changes(&indexer_alloc, primary_key); + let embedders = index.embedding_configs(index_wtxn)?; + let embedders = self.embedders(embedders)?; indexer::index( index_wtxn, @@ -1605,6 +1613,7 @@ impl IndexScheduler { None, // document deletion never changes primary key &pool, &document_changes, + embedders, )?; // tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done"); diff --git a/milli/src/update/new/channel.rs b/milli/src/update/new/channel.rs index 657c00141..92f692a88 100644 --- a/milli/src/update/new/channel.rs +++ b/milli/src/update/new/channel.rs @@ -3,6 +3,7 @@ use std::marker::PhantomData; use crossbeam_channel::{IntoIter, Receiver, SendError, Sender}; use grenad::Merger; +use hashbrown::HashMap; use heed::types::Bytes; use memmap2::Mmap; use roaring::RoaringBitmap; @@ -124,7 +125,32 @@ impl DocumentDeletionEntry { } } -pub struct WriterOperation { +pub enum WriterOperation { + DbOperation(DbOperation), + ArroyOperation(ArroyOperation), +} + +pub enum ArroyOperation { + /// TODO: call when deleting regular documents + DeleteVectors { + docid: DocumentId, + }, + SetVectors { + docid: DocumentId, + embedder_id: u8, + embeddings: Vec, + }, + SetVector { + docid: 
DocumentId, + embedder_id: u8, + embedding: Embedding, + }, + Finish { + user_provided: HashMap, + }, +} + +pub struct DbOperation { database: Database, entry: EntryOperation, } @@ -180,7 +206,7 @@ impl From for Database { } } -impl WriterOperation { +impl DbOperation { pub fn database(&self, index: &Index) -> heed::Database { self.database.database(index) } @@ -246,13 +272,13 @@ impl MergerSender { DOCUMENTS_IDS_KEY.as_bytes(), documents_ids, )); - match self.send(WriterOperation { database: Database::Main, entry }) { + match self.send_db_operation(DbOperation { database: Database::Main, entry }) { Ok(()) => Ok(()), Err(SendError(_)) => Err(SendError(())), } } - fn send(&self, op: WriterOperation) -> StdResult<(), SendError<()>> { + fn send_db_operation(&self, op: DbOperation) -> StdResult<(), SendError<()>> { if self.sender.is_full() { self.writer_contentious_count.set(self.writer_contentious_count.get() + 1); } @@ -260,7 +286,7 @@ impl MergerSender { self.merger_contentious_count.set(self.merger_contentious_count.get() + 1); } self.send_count.set(self.send_count.get() + 1); - match self.sender.send(op) { + match self.sender.send(WriterOperation::DbOperation(op)) { Ok(()) => Ok(()), Err(SendError(_)) => Err(SendError(())), } @@ -275,7 +301,7 @@ impl MainSender<'_> { WORDS_FST_KEY.as_bytes(), value, )); - match self.0.send(WriterOperation { database: Database::Main, entry }) { + match self.0.send_db_operation(DbOperation { database: Database::Main, entry }) { Ok(()) => Ok(()), Err(SendError(_)) => Err(SendError(())), } @@ -286,7 +312,7 @@ impl MainSender<'_> { WORDS_PREFIXES_FST_KEY.as_bytes(), value, )); - match self.0.send(WriterOperation { database: Database::Main, entry }) { + match self.0.send_db_operation(DbOperation { database: Database::Main, entry }) { Ok(()) => Ok(()), Err(SendError(_)) => Err(SendError(())), } @@ -294,7 +320,7 @@ impl MainSender<'_> { pub fn delete(&self, key: &[u8]) -> StdResult<(), SendError<()>> { let entry = EntryOperation::Delete(KeyEntry::from_key(key)); - match self.0.send(WriterOperation { database: Database::Main, entry }) { + match self.0.send_db_operation(DbOperation { database: Database::Main, entry }) { Ok(()) => Ok(()), Err(SendError(_)) => Err(SendError(())), } @@ -396,7 +422,7 @@ pub struct WordDocidsSender<'a, D> { impl DocidsSender for WordDocidsSender<'_, D> { fn write(&self, key: &[u8], value: &[u8]) -> StdResult<(), SendError<()>> { let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value(key, value)); - match self.sender.send(WriterOperation { database: D::DATABASE, entry }) { + match self.sender.send_db_operation(DbOperation { database: D::DATABASE, entry }) { Ok(()) => Ok(()), Err(SendError(_)) => Err(SendError(())), } @@ -404,7 +430,7 @@ impl DocidsSender for WordDocidsSender<'_, D> { fn delete(&self, key: &[u8]) -> StdResult<(), SendError<()>> { let entry = EntryOperation::Delete(KeyEntry::from_key(key)); - match self.sender.send(WriterOperation { database: D::DATABASE, entry }) { + match self.sender.send_db_operation(DbOperation { database: D::DATABASE, entry }) { Ok(()) => Ok(()), Err(SendError(_)) => Err(SendError(())), } @@ -429,7 +455,7 @@ impl DocidsSender for FacetDocidsSender<'_> { } _ => EntryOperation::Write(KeyValueEntry::from_small_key_value(key, value)), }; - match self.sender.send(WriterOperation { database, entry }) { + match self.sender.send_db_operation(DbOperation { database, entry }) { Ok(()) => Ok(()), Err(SendError(_)) => Err(SendError(())), } @@ -439,7 +465,7 @@ impl DocidsSender for 
FacetDocidsSender<'_> { let (facet_kind, key) = FacetKind::extract_from_key(key); let database = Database::from(facet_kind); let entry = EntryOperation::Delete(KeyEntry::from_key(key)); - match self.sender.send(WriterOperation { database, entry }) { + match self.sender.send_db_operation(DbOperation { database, entry }) { Ok(()) => Ok(()), Err(SendError(_)) => Err(SendError(())), } @@ -460,7 +486,7 @@ impl DocumentsSender<'_> { &docid.to_be_bytes(), document.as_bytes(), )); - match self.0.send(WriterOperation { database: Database::Documents, entry }) { + match self.0.send_db_operation(DbOperation { database: Database::Documents, entry }) { Ok(()) => Ok(()), Err(SendError(_)) => Err(SendError(())), }?; @@ -469,7 +495,10 @@ impl DocumentsSender<'_> { external_id.as_bytes(), &docid.to_be_bytes(), )); - match self.0.send(WriterOperation { database: Database::ExternalDocumentsIds, entry }) { + match self + .0 + .send_db_operation(DbOperation { database: Database::ExternalDocumentsIds, entry }) + { Ok(()) => Ok(()), Err(SendError(_)) => Err(SendError(())), } @@ -477,33 +506,38 @@ impl DocumentsSender<'_> { pub fn delete(&self, docid: DocumentId, external_id: String) -> StdResult<(), SendError<()>> { let entry = EntryOperation::Delete(KeyEntry::from_key(&docid.to_be_bytes())); - match self.0.send(WriterOperation { database: Database::Documents, entry }) { + match self.0.send_db_operation(DbOperation { database: Database::Documents, entry }) { Ok(()) => Ok(()), Err(SendError(_)) => Err(SendError(())), }?; let entry = EntryOperation::Delete(KeyEntry::from_key(external_id.as_bytes())); - match self.0.send(WriterOperation { database: Database::ExternalDocumentsIds, entry }) { + match self + .0 + .send_db_operation(DbOperation { database: Database::ExternalDocumentsIds, entry }) + { Ok(()) => Ok(()), Err(SendError(_)) => Err(SendError(())), } } } -pub struct EmbeddingSender<'a>(Option<&'a Sender>); +pub struct EmbeddingSender<'a>(&'a Sender); impl EmbeddingSender<'_> { - pub fn delete(&self, docid: DocumentId, embedder_id: u8) -> StdResult<(), SendError<()>> { - todo!() - } - pub fn set_vectors( &self, docid: DocumentId, embedder_id: u8, embeddings: Vec, ) -> StdResult<(), SendError<()>> { - todo!() + self.0 + .send(WriterOperation::ArroyOperation(ArroyOperation::SetVectors { + docid, + embedder_id, + embeddings, + })) + .map_err(|_| SendError(())) } pub fn set_vector( @@ -512,19 +546,24 @@ impl EmbeddingSender<'_> { embedder_id: u8, embedding: Embedding, ) -> StdResult<(), SendError<()>> { - todo!() + self.0 + .send(WriterOperation::ArroyOperation(ArroyOperation::SetVector { + docid, + embedder_id, + embedding, + })) + .map_err(|_| SendError(())) } - pub fn set_user_provided( - &self, - docid: DocumentId, - embedder_id: u8, - regenerate: bool, + /// Marks all embedders as "to be built" + pub fn finish( + self, + user_provided: HashMap, ) -> StdResult<(), SendError<()>> { - todo!() + self.0 + .send(WriterOperation::ArroyOperation(ArroyOperation::Finish { user_provided })) + .map_err(|_| SendError(())) } - - pub fn finish(self, embedder_id: u8) {} } pub enum MergerOperation { ExactWordDocidsMerger(Merger), diff --git a/milli/src/update/new/document.rs b/milli/src/update/new/document.rs index 0a5172d36..068268c4e 100644 --- a/milli/src/update/new/document.rs +++ b/milli/src/update/new/document.rs @@ -4,7 +4,7 @@ use heed::RoTxn; use raw_collections::RawMap; use serde_json::value::RawValue; -use super::vector_document::{VectorDocument, VectorDocumentFromDb, VectorDocumentFromVersions}; +use 
super::vector_document::VectorDocument; use super::{KvReaderFieldId, KvWriterFieldId}; use crate::documents::FieldIdMapper; use crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME; diff --git a/milli/src/update/new/extract/cache.rs b/milli/src/update/new/extract/cache.rs index 2fbe427f3..cbb42af8b 100644 --- a/milli/src/update/new/extract/cache.rs +++ b/milli/src/update/new/extract/cache.rs @@ -267,7 +267,7 @@ impl Stats { } } -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Default)] pub struct DelAddRoaringBitmap { pub(crate) del: Option, pub(crate) add: Option, diff --git a/milli/src/update/new/extract/mod.rs b/milli/src/update/new/extract/mod.rs index 5a63dccfa..8a18eb074 100644 --- a/milli/src/update/new/extract/mod.rs +++ b/milli/src/update/new/extract/mod.rs @@ -11,6 +11,7 @@ use bumpalo::Bump; pub use faceted::*; use grenad::Merger; pub use searchable::*; +pub use vectors::EmbeddingExtractor; use super::indexer::document_changes::{DocumentChanges, FullySend, IndexingContext, ThreadLocal}; use crate::update::{GrenadParameters, MergeDeladdCboRoaringBitmaps}; diff --git a/milli/src/update/new/extract/vectors/mod.rs b/milli/src/update/new/extract/vectors/mod.rs index a2762ae7a..96b03a25b 100644 --- a/milli/src/update/new/extract/vectors/mod.rs +++ b/milli/src/update/new/extract/vectors/mod.rs @@ -1,3 +1,10 @@ +use std::cell::RefCell; + +use bumpalo::collections::Vec as BVec; +use bumpalo::Bump; +use hashbrown::HashMap; + +use super::cache::DelAddRoaringBitmap; use crate::error::FaultSource; use crate::prompt::Prompt; use crate::update::new::channel::EmbeddingSender; @@ -5,26 +12,34 @@ use crate::update::new::indexer::document_changes::{Extractor, FullySend}; use crate::update::new::vector_document::VectorDocument; use crate::update::new::DocumentChange; use crate::vector::error::EmbedErrorKind; -use crate::vector::Embedder; -use crate::{DocumentId, Result, ThreadPoolNoAbort, UserError}; +use crate::vector::{Embedder, Embedding, EmbeddingConfigs}; +use crate::{DocumentId, InternalError, Result, ThreadPoolNoAbort, UserError}; pub struct EmbeddingExtractor<'a> { - embedder: &'a Embedder, - prompt: &'a Prompt, - embedder_id: u8, - embedder_name: &'a str, + embedders: &'a EmbeddingConfigs, sender: &'a EmbeddingSender<'a>, threads: &'a ThreadPoolNoAbort, } +impl<'a> EmbeddingExtractor<'a> { + pub fn new( + embedders: &'a EmbeddingConfigs, + sender: &'a EmbeddingSender<'a>, + threads: &'a ThreadPoolNoAbort, + ) -> Self { + Self { embedders, sender, threads } + } +} + impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { - type Data = FullySend<()>; + type Data = FullySend>>; fn init_data<'doc>( &'doc self, _extractor_alloc: raw_collections::alloc::RefBump<'extractor>, ) -> crate::Result { - Ok(FullySend(())) + /// TODO: use the extractor_alloc in the hashbrown once you merge the branch where it is no longer a RefBump + Ok(FullySend(Default::default())) } fn process<'doc>( @@ -34,63 +49,90 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { Self::Data, >, ) -> crate::Result<()> { - let embedder_name: &str = self.embedder_name; - let embedder: &Embedder = self.embedder; - let prompt: &Prompt = self.prompt; + let embedders = self.embedders.inner_as_ref(); - let mut chunks = Chunks::new( - embedder, - self.embedder_id, - embedder_name, - self.threads, - self.sender, - &context.doc_alloc, - ); + let mut all_chunks = BVec::with_capacity_in(embedders.len(), &context.doc_alloc); + for (embedder_name, (embedder, prompt, _is_quantized)) in embedders { + 
let embedder_id = + context.index.embedder_category_id.get(&context.txn, embedder_name)?.ok_or_else( + || InternalError::DatabaseMissingEntry { + db_name: "embedder_category_id", + key: None, + }, + )?; + all_chunks.push(Chunks::new( + embedder, + embedder_id, + embedder_name, + prompt, + &context.data.0, + self.threads, + self.sender, + &context.doc_alloc, + )) + } for change in changes { let change = change?; match change { - DocumentChange::Deletion(deletion) => { - self.sender.delete(deletion.docid(), self.embedder_id).unwrap(); + DocumentChange::Deletion(_deletion) => { + // handled by document sender } DocumentChange::Update(update) => { - /// FIXME: this will force the parsing/retrieval of VectorDocument once per embedder - /// consider doing all embedders at once? let old_vectors = update.current_vectors( &context.txn, context.index, context.db_fields_ids_map, &context.doc_alloc, )?; - let old_vectors = old_vectors.vectors_for_key(embedder_name)?.unwrap(); let new_vectors = update.updated_vectors(&context.doc_alloc)?; - if let Some(new_vectors) = new_vectors.as_ref().and_then(|new_vectors| { - new_vectors.vectors_for_key(embedder_name).transpose() - }) { - let new_vectors = new_vectors?; - match (old_vectors.regenerate, new_vectors.regenerate) { - (true, true) | (false, false) => todo!(), - _ => { - self.sender - .set_user_provided( - update.docid(), - self.embedder_id, - !new_vectors.regenerate, - ) - .unwrap(); + + for chunks in &mut all_chunks { + let embedder_name = chunks.embedder_name(); + let prompt = chunks.prompt(); + + let old_vectors = old_vectors.vectors_for_key(embedder_name)?.unwrap(); + if let Some(new_vectors) = new_vectors.as_ref().and_then(|new_vectors| { + new_vectors.vectors_for_key(embedder_name).transpose() + }) { + let new_vectors = new_vectors?; + match (old_vectors.regenerate, new_vectors.regenerate) { + (true, true) | (false, false) => todo!(), + _ => { + chunks.set_regenerate(update.docid(), new_vectors.regenerate); + } } - } - // do we have set embeddings? - if let Some(embeddings) = new_vectors.embeddings { - self.sender - .set_vectors( + // do we have set embeddings? 
+ if let Some(embeddings) = new_vectors.embeddings { + chunks.set_vectors( update.docid(), - self.embedder_id, embeddings.into_vec().map_err(UserError::SerdeJson)?, - ) - .unwrap(); - } else if new_vectors.regenerate { - let new_rendered = prompt.render_document( + ); + } else if new_vectors.regenerate { + let new_rendered = prompt.render_document( + update.current( + &context.txn, + context.index, + context.db_fields_ids_map, + )?, + context.new_fields_ids_map, + &context.doc_alloc, + )?; + let old_rendered = prompt.render_document( + update.merged( + &context.txn, + context.index, + context.db_fields_ids_map, + )?, + context.new_fields_ids_map, + &context.doc_alloc, + )?; + if new_rendered != old_rendered { + chunks.set_autogenerated(update.docid(), new_rendered)?; + } + } + } else if old_vectors.regenerate { + let old_rendered = prompt.render_document( update.current( &context.txn, context.index, @@ -99,7 +141,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { context.new_fields_ids_map, &context.doc_alloc, )?; - let old_rendered = prompt.render_document( + let new_rendered = prompt.render_document( update.merged( &context.txn, context.index, @@ -109,82 +151,55 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { &context.doc_alloc, )?; if new_rendered != old_rendered { - chunks.push(update.docid(), new_rendered)?; + chunks.set_autogenerated(update.docid(), new_rendered)?; } } - } else if old_vectors.regenerate { - let old_rendered = prompt.render_document( - update.current( - &context.txn, - context.index, - context.db_fields_ids_map, - )?, - context.new_fields_ids_map, - &context.doc_alloc, - )?; - let new_rendered = prompt.render_document( - update.merged( - &context.txn, - context.index, - context.db_fields_ids_map, - )?, - context.new_fields_ids_map, - &context.doc_alloc, - )?; - if new_rendered != old_rendered { - chunks.push(update.docid(), new_rendered)?; - } } } DocumentChange::Insertion(insertion) => { - // if no inserted vectors, then regenerate: true + no embeddings => autogenerate - let new_vectors = insertion.inserted_vectors(&context.doc_alloc)?; - if let Some(new_vectors) = new_vectors.as_ref().and_then(|new_vectors| { - new_vectors.vectors_for_key(embedder_name).transpose() - }) { - let new_vectors = new_vectors?; - self.sender - .set_user_provided( - insertion.docid(), - self.embedder_id, - !new_vectors.regenerate, - ) - .unwrap(); - if let Some(embeddings) = new_vectors.embeddings { - self.sender - .set_vectors( + for chunks in &mut all_chunks { + let embedder_name = chunks.embedder_name(); + let prompt = chunks.prompt(); + // if no inserted vectors, then regenerate: true + no embeddings => autogenerate + let new_vectors = insertion.inserted_vectors(&context.doc_alloc)?; + if let Some(new_vectors) = new_vectors.as_ref().and_then(|new_vectors| { + new_vectors.vectors_for_key(embedder_name).transpose() + }) { + let new_vectors = new_vectors?; + chunks.set_regenerate(insertion.docid(), new_vectors.regenerate); + if let Some(embeddings) = new_vectors.embeddings { + chunks.set_vectors( insertion.docid(), - self.embedder_id, embeddings.into_vec().map_err(UserError::SerdeJson)?, - ) - .unwrap(); - } else if new_vectors.regenerate { + ); + } else if new_vectors.regenerate { + let rendered = prompt.render_document( + insertion.inserted(), + context.new_fields_ids_map, + &context.doc_alloc, + )?; + chunks.set_autogenerated(insertion.docid(), rendered)?; + } + } else { let rendered = prompt.render_document( insertion.inserted(), 
context.new_fields_ids_map, &context.doc_alloc, )?; - chunks.push(insertion.docid(), rendered)?; + chunks.set_autogenerated(insertion.docid(), rendered)?; } - } else { - let rendered = prompt.render_document( - insertion.inserted(), - context.new_fields_ids_map, - &context.doc_alloc, - )?; - chunks.push(insertion.docid(), rendered)?; } } } } - chunks.drain() + for chunk in all_chunks { + chunk.drain()?; + } + Ok(()) } } -use bumpalo::collections::Vec as BVec; -use bumpalo::Bump; - // **Warning**: the destructor of this struct is not normally run, make sure that all its fields: // 1. don't have side effects tied to they destructors // 2. if allocated, are allocated inside of the bumpalo @@ -199,15 +214,21 @@ struct Chunks<'a> { embedder: &'a Embedder, embedder_id: u8, embedder_name: &'a str, + prompt: &'a Prompt, + + user_provided: &'a RefCell>, threads: &'a ThreadPoolNoAbort, sender: &'a EmbeddingSender<'a>, } impl<'a> Chunks<'a> { + #[allow(clippy::too_many_arguments)] pub fn new( embedder: &'a Embedder, embedder_id: u8, embedder_name: &'a str, + prompt: &'a Prompt, + user_provided: &'a RefCell>, threads: &'a ThreadPoolNoAbort, sender: &'a EmbeddingSender<'a>, doc_alloc: &'a Bump, @@ -215,10 +236,20 @@ impl<'a> Chunks<'a> { let capacity = embedder.prompt_count_in_chunk_hint() * embedder.chunk_count_hint(); let texts = BVec::with_capacity_in(capacity, doc_alloc); let ids = BVec::with_capacity_in(capacity, doc_alloc); - Self { texts, ids, embedder, threads, sender, embedder_id, embedder_name } + Self { + texts, + ids, + embedder, + prompt, + threads, + sender, + embedder_id, + embedder_name, + user_provided, + } } - pub fn push(&mut self, docid: DocumentId, rendered: &'a str) -> Result<()> { + pub fn set_autogenerated(&mut self, docid: DocumentId, rendered: &'a str) -> Result<()> { if self.texts.len() < self.texts.capacity() { self.texts.push(rendered); self.ids.push(docid); @@ -316,4 +347,28 @@ impl<'a> Chunks<'a> { ids.clear(); res } + + pub fn prompt(&self) -> &'a Prompt { + self.prompt + } + + pub fn embedder_name(&self) -> &'a str { + self.embedder_name + } + + fn set_regenerate(&self, docid: DocumentId, regenerate: bool) { + let mut user_provided = self.user_provided.borrow_mut(); + let user_provided = + user_provided.entry_ref(self.embedder_name).or_insert(Default::default()); + if regenerate { + // regenerate == !user_provided + user_provided.del.get_or_insert(Default::default()).insert(docid); + } else { + user_provided.add.get_or_insert(Default::default()).insert(docid); + } + } + + fn set_vectors(&self, docid: DocumentId, embeddings: Vec) { + self.sender.set_vectors(docid, self.embedder_id, embeddings).unwrap(); + } } diff --git a/milli/src/update/new/indexer/mod.rs b/milli/src/update/new/indexer/mod.rs index b316cbc34..d0be88e34 100644 --- a/milli/src/update/new/indexer/mod.rs +++ b/milli/src/update/new/indexer/mod.rs @@ -1,5 +1,5 @@ use std::cell::RefCell; -use std::sync::RwLock; +use std::sync::{OnceLock, RwLock}; use std::thread::{self, Builder}; use big_s::S; @@ -10,9 +10,13 @@ use document_changes::{ }; pub use document_deletion::DocumentDeletion; pub use document_operation::DocumentOperation; +use hashbrown::HashMap; use heed::{RoTxn, RwTxn}; +use itertools::{EitherOrBoth, Itertools}; pub use partial_dump::PartialDump; +use rand::SeedableRng as _; use rayon::ThreadPool; +use roaring::RoaringBitmap; use time::OffsetDateTime; pub use update_by_function::UpdateByFunction; @@ -31,10 +35,15 @@ use crate::facet::FacetType; use 
crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder}; use crate::proximity::ProximityPrecision; use crate::update::new::channel::ExtractorSender; +use crate::update::new::extract::EmbeddingExtractor; use crate::update::new::words_prefix_docids::compute_exact_word_prefix_docids; use crate::update::settings::InnerIndexSettings; use crate::update::{FacetsUpdateBulk, GrenadParameters}; -use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, Result, UserError}; +use crate::vector::{ArroyWrapper, EmbeddingConfigs}; +use crate::{ + FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, Result, ThreadPoolNoAbort, + ThreadPoolNoAbortBuilder, UserError, +}; pub(crate) mod de; pub mod document_changes; @@ -119,6 +128,7 @@ impl<'a, 'extractor> Extractor<'extractor> for DocumentExtractor<'a> { /// Give it the output of the [`Indexer::document_changes`] method and it will execute it in the [`rayon::ThreadPool`]. /// /// TODO return stats +#[allow(clippy::too_many_arguments)] // clippy: 😝 pub fn index<'pl, 'indexer, 'index, DC>( wtxn: &mut RwTxn, index: &'index Index, @@ -127,6 +137,7 @@ pub fn index<'pl, 'indexer, 'index, DC>( new_primary_key: Option>, pool: &ThreadPool, document_changes: &DC, + embedders: EmbeddingConfigs, ) -> Result<()> where DC: DocumentChanges<'pl>, @@ -153,8 +164,9 @@ where fields_ids_map_store: &fields_ids_map_store, }; - thread::scope(|s| { + thread::scope(|s| -> Result<()> { let indexer_span = tracing::Span::current(); + let embedders = &embedders; // TODO manage the errors correctly let handle = Builder::new().name(S("indexer-extractors")).spawn_scoped(s, move || { pool.in_place_scope(|_s| { @@ -238,9 +250,29 @@ where if index_embeddings.is_empty() { break 'vectors; } - for index_embedding in index_embeddings { + /// FIXME: need access to `merger_sender` + let embedding_sender = todo!(); + let extractor = EmbeddingExtractor::new(&embedders, &embedding_sender, request_threads()); + let datastore = ThreadLocal::with_capacity(pool.current_num_threads()); + for_each_document_change(document_changes, &extractor, indexing_context, &mut extractor_allocs, &datastore)?; + + + let mut user_provided = HashMap::new(); + for data in datastore { + let data = data.0.into_inner(); + for (embedder, deladd) in data.into_iter() { + let user_provided = user_provided.entry(embedder).or_insert(Default::default()); + if let Some(del) = deladd.del { + *user_provided -= del; + } + if let Some(add) = deladd.add { + *user_provided |= add; + } + } } + + embedding_sender.finish(user_provided).unwrap(); } { @@ -285,15 +317,137 @@ where ) })?; + let vector_arroy = index.vector_arroy; + let mut rng = rand::rngs::StdRng::seed_from_u64(42); + let indexer_span = tracing::Span::current(); + let arroy_writers: Result> = embedders + .inner_as_ref() + .iter() + .map(|(embedder_name, (embedder, _, was_quantized))| { + let embedder_index = index.embedder_category_id.get(wtxn, &embedder_name)?.ok_or( + InternalError::DatabaseMissingEntry { + db_name: "embedder_category_id", + key: None, + }, + )?; + + let dimensions = embedder.dimensions(); + + let writers: Vec<_> = crate::vector::arroy_db_range_for_embedder(embedder_index) + .map(|k| ArroyWrapper::new(vector_arroy, k, *was_quantized)) + .collect(); + + Ok(( + embedder_index, + (embedder_name.as_str(), embedder.as_ref(), writers, dimensions), + )) + }) + .collect(); + + let mut arroy_writers = arroy_writers?; for operation in writer_receiver { - let database = operation.database(index); - match operation.entry() { - EntryOperation::Delete(e) => { 
- if !database.delete(wtxn, e.entry())? { - unreachable!("We tried to delete an unknown key") + match operation { + WriterOperation::DbOperation(db_operation) => { + let database = db_operation.database(index); + match db_operation.entry() { + EntryOperation::Delete(e) => { + if !database.delete(wtxn, e.entry())? { + unreachable!("We tried to delete an unknown key") + } + } + EntryOperation::Write(e) => database.put(wtxn, e.key(), e.value())?, } } - EntryOperation::Write(e) => database.put(wtxn, e.key(), e.value())?, + WriterOperation::ArroyOperation(arroy_operation) => match arroy_operation { + ArroyOperation::DeleteVectors { docid } => { + for (_embedder_index, (_embedder_name, _embedder, writers, dimensions)) in + &mut arroy_writers + { + let dimensions = *dimensions; + for writer in writers { + // Uses invariant: vectors are packed in the first writers. + if !writer.del_item(wtxn, dimensions, docid)? { + break; + } + } + } + } + ArroyOperation::SetVectors { docid, embedder_id, embeddings } => { + let (_, _, writers, dimensions) = + arroy_writers.get(&embedder_id).expect("requested a missing embedder"); + for res in writers.iter().zip_longest(&embeddings) { + match res { + EitherOrBoth::Both(writer, embedding) => { + writer.add_item(wtxn, *dimensions, docid, embedding)?; + } + EitherOrBoth::Left(writer) => { + let deleted = writer.del_item(wtxn, *dimensions, docid)?; + if !deleted { + break; + } + } + EitherOrBoth::Right(_embedding) => { + let external_document_id = index + .external_id_of(wtxn, std::iter::once(docid))? + .into_iter() + .next() + .unwrap()?; + return Err(UserError::TooManyVectors( + external_document_id, + embeddings.len(), + ) + .into()); + } + } + } + } + ArroyOperation::SetVector { docid, embedder_id, embedding } => { + let (_, _, writers, dimensions) = + arroy_writers.get(&embedder_id).expect("requested a missing embedder"); + for res in writers.iter().zip_longest(std::iter::once(&embedding)) { + match res { + EitherOrBoth::Both(writer, embedding) => { + writer.add_item(wtxn, *dimensions, docid, embedding)?; + } + EitherOrBoth::Left(writer) => { + let deleted = writer.del_item(wtxn, *dimensions, docid)?; + if !deleted { + break; + } + } + EitherOrBoth::Right(_embedding) => { + unreachable!("1 vs 256 vectors") + } + } + } + } + ArroyOperation::Finish { mut user_provided } => { + let span = tracing::trace_span!(target: "indexing::vectors", parent: &indexer_span, "build"); + let _entered = span.enter(); + for (_embedder_index, (_embedder_name, _embedder, writers, dimensions)) in + &mut arroy_writers + { + let dimensions = *dimensions; + for writer in writers { + if writer.need_build(wtxn, dimensions)? { + writer.build(wtxn, &mut rng, dimensions)?; + } else if writer.is_empty(wtxn, dimensions)? 
{ + break; + } + } + } + + let mut configs = index.embedding_configs(wtxn)?; + + for config in &mut configs { + if let Some(user_provided) = user_provided.remove(&config.name) { + config.user_provided = user_provided; + } + } + + index.put_embedding_configs(wtxn, configs)?; + } + }, } } @@ -483,3 +637,15 @@ pub fn retrieve_or_guess_primary_key<'a>( Err(err) => Ok(Err(err)), } } + +fn request_threads() -> &'static ThreadPoolNoAbort { + static REQUEST_THREADS: OnceLock = OnceLock::new(); + + REQUEST_THREADS.get_or_init(|| { + ThreadPoolNoAbortBuilder::new() + .num_threads(crate::vector::REQUEST_PARALLELISM) + .thread_name(|index| format!("embedding-request-{index}")) + .build() + .unwrap() + }) +} diff --git a/milli/src/update/new/merger.rs b/milli/src/update/new/merger.rs index 6183beb63..14e947686 100644 --- a/milli/src/update/new/merger.rs +++ b/milli/src/update/new/merger.rs @@ -149,6 +149,7 @@ pub fn merge_grenad_entries( } } MergerOperation::DeleteDocument { docid, external_id } => { + /// TODO: delete vectors let span = tracing::trace_span!(target: "indexing::documents::merge", "delete_document"); let _entered = span.enter(); From 4ebedf4dc82eb08c2cf3e20cb077d4238b71d929 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 30 Oct 2024 10:06:38 +0100 Subject: [PATCH 179/247] clippy fixes --- milli/src/update/new/extract/vectors/mod.rs | 3 +-- milli/src/update/new/indexer/de.rs | 2 +- milli/src/update/new/indexer/document_operation.rs | 2 +- milli/src/update/new/indexer/mod.rs | 7 +++---- 4 files changed, 6 insertions(+), 8 deletions(-) diff --git a/milli/src/update/new/extract/vectors/mod.rs b/milli/src/update/new/extract/vectors/mod.rs index 96b03a25b..facec10f6 100644 --- a/milli/src/update/new/extract/vectors/mod.rs +++ b/milli/src/update/new/extract/vectors/mod.rs @@ -358,8 +358,7 @@ impl<'a> Chunks<'a> { fn set_regenerate(&self, docid: DocumentId, regenerate: bool) { let mut user_provided = self.user_provided.borrow_mut(); - let user_provided = - user_provided.entry_ref(self.embedder_name).or_insert(Default::default()); + let user_provided = user_provided.entry_ref(self.embedder_name).or_default(); if regenerate { // regenerate == !user_provided user_provided.del.get_or_insert(Default::default()).insert(docid); diff --git a/milli/src/update/new/indexer/de.rs b/milli/src/update/new/indexer/de.rs index fa6b5fa76..3da4fc239 100644 --- a/milli/src/update/new/indexer/de.rs +++ b/milli/src/update/new/indexer/de.rs @@ -49,7 +49,7 @@ impl<'de, 'p, 'indexer: 'de, Mapper: MutFieldIdMapper> Visitor<'de> visitor: MutFieldIdMapVisitor(self.fields_ids_map), })? 
{ - let Some(fid) = fid else { + let Some(_fid) = fid else { return Ok(Err(crate::UserError::AttributeLimitReached)); }; self.fields_ids_map = fields_ids_map; diff --git a/milli/src/update/new/indexer/document_operation.rs b/milli/src/update/new/indexer/document_operation.rs index bc1634d75..9ba74c69e 100644 --- a/milli/src/update/new/indexer/document_operation.rs +++ b/milli/src/update/new/indexer/document_operation.rs @@ -9,7 +9,7 @@ use IndexDocumentsMethod as Idm; use super::super::document_change::DocumentChange; use super::document_changes::{DocumentChangeContext, DocumentChanges, MostlySend}; use crate::documents::PrimaryKey; -use crate::update::new::document::{DocumentFromVersions, Versions}; +use crate::update::new::document::Versions; use crate::update::new::{Deletion, Insertion, Update}; use crate::update::{AvailableIds, IndexDocumentsMethod}; use crate::{DocumentId, Error, FieldsIdsMap, Index, Result, UserError}; diff --git a/milli/src/update/new/indexer/mod.rs b/milli/src/update/new/indexer/mod.rs index d0be88e34..7fba0ffa4 100644 --- a/milli/src/update/new/indexer/mod.rs +++ b/milli/src/update/new/indexer/mod.rs @@ -16,12 +16,11 @@ use itertools::{EitherOrBoth, Itertools}; pub use partial_dump::PartialDump; use rand::SeedableRng as _; use rayon::ThreadPool; -use roaring::RoaringBitmap; use time::OffsetDateTime; pub use update_by_function::UpdateByFunction; use super::channel::*; -use super::document::write_to_obkv; +use super::document::{write_to_obkv, Document}; use super::document_change::DocumentChange; use super::extract::*; use super::merger::{merge_grenad_entries, FacetFieldIdsDelta}; @@ -252,7 +251,7 @@ where } /// FIXME: need access to `merger_sender` let embedding_sender = todo!(); - let extractor = EmbeddingExtractor::new(&embedders, &embedding_sender, request_threads()); + let extractor = EmbeddingExtractor::new(embedders, &embedding_sender, request_threads()); let datastore = ThreadLocal::with_capacity(pool.current_num_threads()); for_each_document_change(document_changes, &extractor, indexing_context, &mut extractor_allocs, &datastore)?; @@ -324,7 +323,7 @@ where .inner_as_ref() .iter() .map(|(embedder_name, (embedder, _, was_quantized))| { - let embedder_index = index.embedder_category_id.get(wtxn, &embedder_name)?.ok_or( + let embedder_index = index.embedder_category_id.get(wtxn, embedder_name)?.ok_or( InternalError::DatabaseMissingEntry { db_name: "embedder_category_id", key: None, From 0f6a1dbce7813f245cd0e7ed16b28c3a4ead5c23 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 30 Oct 2024 10:06:46 +0100 Subject: [PATCH 180/247] habemus field distribution --- milli/src/update/new/indexer/mod.rs | 51 +++++++++++++++++++++++++++-- 1 file changed, 48 insertions(+), 3 deletions(-) diff --git a/milli/src/update/new/indexer/mod.rs b/milli/src/update/new/indexer/mod.rs index 7fba0ffa4..dd0ff781d 100644 --- a/milli/src/update/new/indexer/mod.rs +++ b/milli/src/update/new/indexer/mod.rs @@ -56,13 +56,13 @@ struct DocumentExtractor<'a> { } impl<'a, 'extractor> Extractor<'extractor> for DocumentExtractor<'a> { - type Data = FullySend<()>; + type Data = FullySend>>; fn init_data( &self, _extractor_alloc: raw_collections::alloc::RefBump<'extractor>, ) -> Result { - Ok(FullySend(())) + Ok(FullySend(Default::default())) } fn process<'doc>( @@ -71,6 +71,7 @@ impl<'a, 'extractor> Extractor<'extractor> for DocumentExtractor<'a> { context: &DocumentChangeContext, ) -> Result<()> { let mut document_buffer = Vec::new(); + let mut field_distribution_delta = 
context.data.0.borrow_mut(); let mut new_fields_ids_map = context.new_fields_ids_map.borrow_mut_or_yield(); @@ -82,10 +83,34 @@ impl<'a, 'extractor> Extractor<'extractor> for DocumentExtractor<'a> { match change { DocumentChange::Deletion(deletion) => { let docid = deletion.docid(); + let content = deletion.current( + &context.txn, + context.index, + &context.db_fields_ids_map, + )?; + for res in content.iter_top_level_fields() { + let (f, _) = res?; + let entry = field_distribution_delta.entry_ref(f).or_default(); + *entry -= 1; + } self.document_sender.delete(docid, external_docid).unwrap(); } DocumentChange::Update(update) => { let docid = update.docid(); + let content = + update.current(&context.txn, context.index, &context.db_fields_ids_map)?; + for res in content.iter_top_level_fields() { + let (f, _) = res?; + let entry = field_distribution_delta.entry_ref(f).or_default(); + *entry -= 1; + } + let content = update.updated(); + for res in content.iter_top_level_fields() { + let (f, _) = res?; + let entry = field_distribution_delta.entry_ref(f).or_default(); + *entry += 1; + } + let content = update.merged(&context.txn, context.index, &context.db_fields_ids_map)?; let vector_content = update.merged_vectors( @@ -105,6 +130,11 @@ impl<'a, 'extractor> Extractor<'extractor> for DocumentExtractor<'a> { DocumentChange::Insertion(insertion) => { let docid = insertion.docid(); let content = insertion.inserted(); + for res in content.iter_top_level_fields() { + let (f, _) = res?; + let entry = field_distribution_delta.entry_ref(f).or_default(); + *entry += 1; + } let inserted_vectors = insertion.inserted_vectors(&context.doc_alloc)?; let content = write_to_obkv( &content, @@ -163,9 +193,13 @@ where fields_ids_map_store: &fields_ids_map_store, }; + let mut field_distribution = index.field_distribution(wtxn)?; + thread::scope(|s| -> Result<()> { let indexer_span = tracing::Span::current(); let embedders = &embedders; + // prevent moving the field_distribution in the inner closure... + let field_distribution = &mut field_distribution; // TODO manage the errors correctly let handle = Builder::new().name(S("indexer-extractors")).spawn_scoped(s, move || { pool.in_place_scope(|_s| { @@ -178,6 +212,17 @@ where let datastore = ThreadLocal::with_capacity(pool.current_num_threads()); for_each_document_change(document_changes, &document_extractor, indexing_context, &mut extractor_allocs, &datastore)?; + for field_distribution_delta in datastore { + let field_distribution_delta = field_distribution_delta.0.into_inner(); + for (field, delta) in field_distribution_delta { + let current = field_distribution.entry(field).or_default(); + // adding the delta should never cause a negative result, as we are removing fields that previously existed. 
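+                        // Note (editorial, inferred from the extractor above): each delta is an i64 (-1 per removed
+                        // top-level field, +1 per added one), so the saturating signed add below keeps the u64 count
+                        // from wrapping below zero.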
+ *current = current.saturating_add_signed(delta); + } + } + + field_distribution.retain(|_, v| *v == 0); + document_sender.finish().unwrap(); const TEN_GIB: usize = 10 * 1024 * 1024 * 1024; @@ -479,7 +524,7 @@ where let mut inner_index_settings = InnerIndexSettings::from_index(index, wtxn)?; inner_index_settings.recompute_facets(wtxn, index)?; inner_index_settings.recompute_searchables(wtxn, index)?; - + index.put_field_distribution(wtxn, &field_distribution)?; index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?; Ok(()) From df5bc3c9fdccefe76fedc73d18da6ff2354910a4 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 30 Oct 2024 10:55:57 +0100 Subject: [PATCH 181/247] Reintroduce vector errors --- milli/src/update/new/extract/vectors/mod.rs | 108 ++++++++++++++------ milli/src/update/new/indexer/mod.rs | 2 +- milli/src/vector/error.rs | 39 +++++++ 3 files changed, 117 insertions(+), 32 deletions(-) diff --git a/milli/src/update/new/extract/vectors/mod.rs b/milli/src/update/new/extract/vectors/mod.rs index facec10f6..92c355710 100644 --- a/milli/src/update/new/extract/vectors/mod.rs +++ b/milli/src/update/new/extract/vectors/mod.rs @@ -11,13 +11,16 @@ use crate::update::new::channel::EmbeddingSender; use crate::update::new::indexer::document_changes::{Extractor, FullySend}; use crate::update::new::vector_document::VectorDocument; use crate::update::new::DocumentChange; -use crate::vector::error::EmbedErrorKind; +use crate::vector::error::{ + EmbedErrorKind, PossibleEmbeddingMistakes, UnusedVectorsDistributionBump, +}; use crate::vector::{Embedder, Embedding, EmbeddingConfigs}; -use crate::{DocumentId, InternalError, Result, ThreadPoolNoAbort, UserError}; +use crate::{DocumentId, FieldDistribution, InternalError, Result, ThreadPoolNoAbort, UserError}; pub struct EmbeddingExtractor<'a> { embedders: &'a EmbeddingConfigs, sender: &'a EmbeddingSender<'a>, + possible_embedding_mistakes: PossibleEmbeddingMistakes, threads: &'a ThreadPoolNoAbort, } @@ -25,9 +28,11 @@ impl<'a> EmbeddingExtractor<'a> { pub fn new( embedders: &'a EmbeddingConfigs, sender: &'a EmbeddingSender<'a>, + field_distribution: &'a FieldDistribution, threads: &'a ThreadPoolNoAbort, ) -> Self { - Self { embedders, sender, threads } + let possible_embedding_mistakes = PossibleEmbeddingMistakes::new(field_distribution); + Self { embedders, sender, threads, possible_embedding_mistakes } } } @@ -50,6 +55,8 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { >, ) -> crate::Result<()> { let embedders = self.embedders.inner_as_ref(); + let mut unused_vectors_distribution = + UnusedVectorsDistributionBump::new_in(&context.doc_alloc); let mut all_chunks = BVec::with_capacity_in(embedders.len(), &context.doc_alloc); for (embedder_name, (embedder, prompt, _is_quantized)) in embedders { @@ -66,6 +73,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { embedder_name, prompt, &context.data.0, + &self.possible_embedding_mistakes, self.threads, self.sender, &context.doc_alloc, @@ -87,6 +95,10 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { )?; let new_vectors = update.updated_vectors(&context.doc_alloc)?; + if let Some(new_vectors) = &new_vectors { + unused_vectors_distribution.append(new_vectors); + } + for chunks in &mut all_chunks { let embedder_name = chunks.embedder_name(); let prompt = chunks.prompt(); @@ -128,7 +140,11 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { &context.doc_alloc, )?; if new_rendered != old_rendered { - 
chunks.set_autogenerated(update.docid(), new_rendered)?; + chunks.set_autogenerated( + update.docid(), + new_rendered, + &unused_vectors_distribution, + )?; } } } else if old_vectors.regenerate { @@ -151,17 +167,25 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { &context.doc_alloc, )?; if new_rendered != old_rendered { - chunks.set_autogenerated(update.docid(), new_rendered)?; + chunks.set_autogenerated( + update.docid(), + new_rendered, + &unused_vectors_distribution, + )?; } } } } DocumentChange::Insertion(insertion) => { + let new_vectors = insertion.inserted_vectors(&context.doc_alloc)?; + if let Some(new_vectors) = &new_vectors { + unused_vectors_distribution.append(new_vectors); + } + for chunks in &mut all_chunks { let embedder_name = chunks.embedder_name(); let prompt = chunks.prompt(); // if no inserted vectors, then regenerate: true + no embeddings => autogenerate - let new_vectors = insertion.inserted_vectors(&context.doc_alloc)?; if let Some(new_vectors) = new_vectors.as_ref().and_then(|new_vectors| { new_vectors.vectors_for_key(embedder_name).transpose() }) { @@ -178,7 +202,11 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { context.new_fields_ids_map, &context.doc_alloc, )?; - chunks.set_autogenerated(insertion.docid(), rendered)?; + chunks.set_autogenerated( + insertion.docid(), + rendered, + &unused_vectors_distribution, + )?; } } else { let rendered = prompt.render_document( @@ -186,7 +214,11 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { context.new_fields_ids_map, &context.doc_alloc, )?; - chunks.set_autogenerated(insertion.docid(), rendered)?; + chunks.set_autogenerated( + insertion.docid(), + rendered, + &unused_vectors_distribution, + )?; } } } @@ -194,7 +226,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { } for chunk in all_chunks { - chunk.drain()?; + chunk.drain(&unused_vectors_distribution)?; } Ok(()) } @@ -215,7 +247,7 @@ struct Chunks<'a> { embedder_id: u8, embedder_name: &'a str, prompt: &'a Prompt, - + possible_embedding_mistakes: &'a PossibleEmbeddingMistakes, user_provided: &'a RefCell>, threads: &'a ThreadPoolNoAbort, sender: &'a EmbeddingSender<'a>, @@ -229,6 +261,7 @@ impl<'a> Chunks<'a> { embedder_name: &'a str, prompt: &'a Prompt, user_provided: &'a RefCell>, + possible_embedding_mistakes: &'a PossibleEmbeddingMistakes, threads: &'a ThreadPoolNoAbort, sender: &'a EmbeddingSender<'a>, doc_alloc: &'a Bump, @@ -241,6 +274,7 @@ impl<'a> Chunks<'a> { ids, embedder, prompt, + possible_embedding_mistakes, threads, sender, embedder_id, @@ -249,7 +283,12 @@ impl<'a> Chunks<'a> { } } - pub fn set_autogenerated(&mut self, docid: DocumentId, rendered: &'a str) -> Result<()> { + pub fn set_autogenerated( + &mut self, + docid: DocumentId, + rendered: &'a str, + unused_vectors_distribution: &UnusedVectorsDistributionBump, + ) -> Result<()> { if self.texts.len() < self.texts.capacity() { self.texts.push(rendered); self.ids.push(docid); @@ -262,18 +301,25 @@ impl<'a> Chunks<'a> { self.embedder, self.embedder_id, self.embedder_name, + self.possible_embedding_mistakes, + unused_vectors_distribution, self.threads, self.sender, ) } - pub fn drain(mut self) -> Result<()> { + pub fn drain( + mut self, + unused_vectors_distribution: &UnusedVectorsDistributionBump, + ) -> Result<()> { let res = Self::embed_chunks( &mut self.texts, &mut self.ids, self.embedder, self.embedder_id, self.embedder_name, + self.possible_embedding_mistakes, + unused_vectors_distribution, 
self.threads, self.sender, ); @@ -285,11 +331,13 @@ impl<'a> Chunks<'a> { pub fn embed_chunks( texts: &mut BVec<'a, &'a str>, ids: &mut BVec<'a, DocumentId>, - embedder: &'a Embedder, + embedder: &Embedder, embedder_id: u8, embedder_name: &str, - threads: &'a ThreadPoolNoAbort, - sender: &'a EmbeddingSender<'a>, + possible_embedding_mistakes: &PossibleEmbeddingMistakes, + unused_vectors_distribution: &UnusedVectorsDistributionBump, + threads: &ThreadPoolNoAbort, + sender: &EmbeddingSender<'a>, ) -> Result<()> { let res = match embedder.embed_chunks_ref(texts.as_slice(), threads) { Ok(embeddings) => { @@ -312,25 +360,23 @@ impl<'a> Chunks<'a> { msg += &format!("\n- Note: `{embedder_name}` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.{embedder_name}`."); } - /// FIXME: reintroduce possible_embedding_mistakes and possible_embedding_mistakes let mut hint_count = 0; - /* - for (vector_misspelling, count) in - possible_embedding_mistakes.vector_mistakes().take(2) - { - msg += &format!("\n- Hint: try replacing `{vector_misspelling}` by `_vectors` in {count} document(s)."); - hint_count += 1; - } + for (vector_misspelling, count) in + possible_embedding_mistakes.vector_mistakes().take(2) + { + msg += &format!("\n- Hint: try replacing `{vector_misspelling}` by `_vectors` in {count} document(s)."); + hint_count += 1; + } + + for (embedder_misspelling, count) in possible_embedding_mistakes + .embedder_mistakes_bump(embedder_name, unused_vectors_distribution) + .take(2) + { + msg += &format!("\n- Hint: try replacing `_vectors.{embedder_misspelling}` by `_vectors.{embedder_name}` in {count} document(s)."); + hint_count += 1; + } - for (embedder_misspelling, count) in possible_embedding_mistakes - .embedder_mistakes(embedder_name, unused_vectors_distribution) - .take(2) - { - msg += &format!("\n- Hint: try replacing `_vectors.{embedder_misspelling}` by `_vectors.{embedder_name}` in {count} document(s)."); - hint_count += 1; - } - */ if hint_count == 0 { if let EmbedErrorKind::ManualEmbed(_) = &error.kind { msg += &format!( diff --git a/milli/src/update/new/indexer/mod.rs b/milli/src/update/new/indexer/mod.rs index dd0ff781d..f6a50ef1c 100644 --- a/milli/src/update/new/indexer/mod.rs +++ b/milli/src/update/new/indexer/mod.rs @@ -296,7 +296,7 @@ where } /// FIXME: need access to `merger_sender` let embedding_sender = todo!(); - let extractor = EmbeddingExtractor::new(embedders, &embedding_sender, request_threads()); + let extractor = EmbeddingExtractor::new(embedders, &embedding_sender, &field_distribution, request_threads()); let datastore = ThreadLocal::with_capacity(pool.current_num_threads()); for_each_document_change(document_changes, &extractor, indexing_context, &mut extractor_allocs, &datastore)?; diff --git a/milli/src/vector/error.rs b/milli/src/vector/error.rs index 3c8cb4b06..d5e0697d6 100644 --- a/milli/src/vector/error.rs +++ b/milli/src/vector/error.rs @@ -1,11 +1,13 @@ use std::collections::BTreeMap; use std::path::PathBuf; +use bumpalo::Bump; use hf_hub::api::sync::ApiError; use super::parsed_vectors::ParsedVectorsDiff; use super::rest::ConfigurationSource; use crate::error::FaultSource; +use crate::update::new::vector_document::VectorDocument; use crate::{FieldDistribution, PanicCatched}; #[derive(Debug, thiserror::Error)] @@ -417,6 +419,23 @@ impl PossibleEmbeddingMistakes { } }) } + + pub fn embedder_mistakes_bump<'a, 'doc: 'a>( + &'a self, + embedder_name: &'a str, + unused_vectors_distribution: &'a UnusedVectorsDistributionBump<'doc>, + ) 
-> impl Iterator + 'a { + let builder = levenshtein_automata::LevenshteinAutomatonBuilder::new(2, true); + let automata = builder.build_dfa(embedder_name); + + unused_vectors_distribution.0.iter().filter_map(move |(field, count)| { + match automata.eval(field) { + levenshtein_automata::Distance::Exact(0) => None, + levenshtein_automata::Distance::Exact(_) => Some((*field, *count)), + levenshtein_automata::Distance::AtLeast(_) => None, + } + }) + } } #[derive(Default)] @@ -433,3 +452,23 @@ impl UnusedVectorsDistribution { } } } + +pub struct UnusedVectorsDistributionBump<'doc>( + hashbrown::HashMap<&'doc str, u64, hashbrown::hash_map::DefaultHashBuilder, &'doc Bump>, +); + +impl<'doc> UnusedVectorsDistributionBump<'doc> { + pub fn new_in(doc_alloc: &'doc Bump) -> Self { + Self(hashbrown::HashMap::new_in(doc_alloc)) + } + + pub fn append(&mut self, vectors: &impl VectorDocument<'doc>) -> Result<(), crate::Error> { + for res in vectors.iter_vectors() { + let (embedder_name, entry) = res?; + if !entry.has_configured_embedder { + *self.0.entry(embedder_name).or_default() += 1; + } + } + Ok(()) + } +} From c9082130c85252da6ff08eedd2efeb158a00a19f Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 30 Oct 2024 13:50:51 +0100 Subject: [PATCH 182/247] support vectors or array of vectors --- milli/src/update/new/document_change.rs | 14 +- milli/src/update/new/extract/vectors/mod.rs | 21 +- milli/src/update/new/indexer/de.rs | 291 ++++++++++++++++++++ milli/src/update/new/vector_document.rs | 98 +++++-- milli/src/vector/mod.rs | 4 + milli/src/vector/parsed_vectors.rs | 1 - 6 files changed, 401 insertions(+), 28 deletions(-) diff --git a/milli/src/update/new/document_change.rs b/milli/src/update/new/document_change.rs index bb1fc9441..4a61c110d 100644 --- a/milli/src/update/new/document_change.rs +++ b/milli/src/update/new/document_change.rs @@ -6,6 +6,7 @@ use super::vector_document::{ MergedVectorDocument, VectorDocumentFromDb, VectorDocumentFromVersions, }; use crate::documents::FieldIdMapper; +use crate::vector::EmbeddingConfigs; use crate::{DocumentId, Index, Result}; pub enum DocumentChange<'doc> { @@ -94,8 +95,9 @@ impl<'doc> Insertion<'doc> { pub fn inserted_vectors( &self, doc_alloc: &'doc Bump, + embedders: &'doc EmbeddingConfigs, ) -> Result>> { - VectorDocumentFromVersions::new(&self.new, doc_alloc) + VectorDocumentFromVersions::new(&self.new, doc_alloc, embedders) } } @@ -165,8 +167,9 @@ impl<'doc> Update<'doc> { pub fn updated_vectors( &self, doc_alloc: &'doc Bump, + embedders: &'doc EmbeddingConfigs, ) -> Result>> { - VectorDocumentFromVersions::new(&self.new, doc_alloc) + VectorDocumentFromVersions::new(&self.new, doc_alloc, embedders) } pub fn merged_vectors( @@ -175,11 +178,14 @@ impl<'doc> Update<'doc> { index: &'doc Index, mapper: &'doc Mapper, doc_alloc: &'doc Bump, + embedders: &'doc EmbeddingConfigs, ) -> Result>> { if self.has_deletion { - MergedVectorDocument::without_db(&self.new, doc_alloc) + MergedVectorDocument::without_db(&self.new, doc_alloc, embedders) } else { - MergedVectorDocument::with_db(self.docid, index, rtxn, mapper, &self.new, doc_alloc) + MergedVectorDocument::with_db( + self.docid, index, rtxn, mapper, &self.new, doc_alloc, embedders, + ) } } } diff --git a/milli/src/update/new/extract/vectors/mod.rs b/milli/src/update/new/extract/vectors/mod.rs index 92c355710..70bd4d42d 100644 --- a/milli/src/update/new/extract/vectors/mod.rs +++ b/milli/src/update/new/extract/vectors/mod.rs @@ -93,7 +93,7 @@ impl<'a, 'extractor> Extractor<'extractor> for 
EmbeddingExtractor<'a> { context.db_fields_ids_map, &context.doc_alloc, )?; - let new_vectors = update.updated_vectors(&context.doc_alloc)?; + let new_vectors = update.updated_vectors(&context.doc_alloc, self.embedders)?; if let Some(new_vectors) = &new_vectors { unused_vectors_distribution.append(new_vectors); @@ -118,7 +118,12 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { if let Some(embeddings) = new_vectors.embeddings { chunks.set_vectors( update.docid(), - embeddings.into_vec().map_err(UserError::SerdeJson)?, + embeddings + .into_vec(&context.doc_alloc, embedder_name) + .map_err(|error| UserError::InvalidVectorsEmbedderConf { + document_id: update.external_document_id().to_string(), + error, + })?, ); } else if new_vectors.regenerate { let new_rendered = prompt.render_document( @@ -177,7 +182,8 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { } } DocumentChange::Insertion(insertion) => { - let new_vectors = insertion.inserted_vectors(&context.doc_alloc)?; + let new_vectors = + insertion.inserted_vectors(&context.doc_alloc, self.embedders)?; if let Some(new_vectors) = &new_vectors { unused_vectors_distribution.append(new_vectors); } @@ -194,7 +200,14 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { if let Some(embeddings) = new_vectors.embeddings { chunks.set_vectors( insertion.docid(), - embeddings.into_vec().map_err(UserError::SerdeJson)?, + embeddings + .into_vec(&context.doc_alloc, embedder_name) + .map_err(|error| UserError::InvalidVectorsEmbedderConf { + document_id: insertion + .external_document_id() + .to_string(), + error, + })?, ); } else if new_vectors.regenerate { let rendered = prompt.render_document( diff --git a/milli/src/update/new/indexer/de.rs b/milli/src/update/new/indexer/de.rs index 3da4fc239..94ab4c2c1 100644 --- a/milli/src/update/new/indexer/de.rs +++ b/milli/src/update/new/indexer/de.rs @@ -326,3 +326,294 @@ pub fn match_component<'de, 'indexer: 'de>( } ControlFlow::Continue(()) } + +pub struct DeserrRawValue<'a> { + value: &'a RawValue, + alloc: &'a Bump, +} + +impl<'a> DeserrRawValue<'a> { + pub fn new_in(value: &'a RawValue, alloc: &'a Bump) -> Self { + Self { value, alloc } + } +} + +pub struct DeserrRawVec<'a> { + vec: raw_collections::RawVec<'a>, + alloc: &'a Bump, +} + +impl<'a> deserr::Sequence for DeserrRawVec<'a> { + type Value = DeserrRawValue<'a>; + + type Iter = DeserrRawVecIter<'a>; + + fn len(&self) -> usize { + self.vec.len() + } + + fn into_iter(self) -> Self::Iter { + DeserrRawVecIter { it: self.vec.into_iter(), alloc: self.alloc } + } +} + +pub struct DeserrRawVecIter<'a> { + it: raw_collections::vec::iter::IntoIter<'a>, + alloc: &'a Bump, +} + +impl<'a> Iterator for DeserrRawVecIter<'a> { + type Item = DeserrRawValue<'a>; + + fn next(&mut self) -> Option { + let next = self.it.next()?; + Some(DeserrRawValue { value: next, alloc: self.alloc }) + } +} + +pub struct DeserrRawMap<'a> { + map: raw_collections::RawMap<'a>, + alloc: &'a Bump, +} + +impl<'a> deserr::Map for DeserrRawMap<'a> { + type Value = DeserrRawValue<'a>; + + type Iter = DeserrRawMapIter<'a>; + + fn len(&self) -> usize { + self.map.len() + } + + fn remove(&mut self, _key: &str) -> Option { + unimplemented!() + } + + fn into_iter(self) -> Self::Iter { + DeserrRawMapIter { it: self.map.into_iter(), alloc: self.alloc } + } +} + +pub struct DeserrRawMapIter<'a> { + it: raw_collections::map::iter::IntoIter<'a>, + alloc: &'a Bump, +} + +impl<'a> Iterator for DeserrRawMapIter<'a> { + type Item = (String, 
DeserrRawValue<'a>); + + fn next(&mut self) -> Option { + let (name, value) = self.it.next()?; + Some((name.to_string(), DeserrRawValue { value, alloc: self.alloc })) + } +} + +impl<'a> deserr::IntoValue for DeserrRawValue<'a> { + type Sequence = DeserrRawVec<'a>; + + type Map = DeserrRawMap<'a>; + + fn kind(&self) -> deserr::ValueKind { + self.value.deserialize_any(DeserrKindVisitor).unwrap() + } + + fn into_value(self) -> deserr::Value { + self.value.deserialize_any(DeserrRawValueVisitor { alloc: self.alloc }).unwrap() + } +} + +pub struct DeserrKindVisitor; + +impl<'de> Visitor<'de> for DeserrKindVisitor { + type Value = deserr::ValueKind; + + fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(formatter, "any value") + } + + fn visit_bool(self, _v: bool) -> Result + where + E: serde::de::Error, + { + Ok(deserr::ValueKind::Boolean) + } + + fn visit_i64(self, _v: i64) -> Result + where + E: serde::de::Error, + { + Ok(deserr::ValueKind::NegativeInteger) + } + + fn visit_u64(self, _v: u64) -> Result + where + E: serde::de::Error, + { + Ok(deserr::ValueKind::Integer) + } + + fn visit_f64(self, _v: f64) -> Result + where + E: serde::de::Error, + { + Ok(deserr::ValueKind::Float) + } + + fn visit_str(self, _v: &str) -> Result + where + E: serde::de::Error, + { + Ok(deserr::ValueKind::String) + } + + fn visit_none(self) -> Result + where + E: serde::de::Error, + { + Ok(deserr::ValueKind::Null) + } + + fn visit_some(self, deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + deserializer.deserialize_any(self) + } + + fn visit_unit(self) -> Result + where + E: serde::de::Error, + { + Ok(deserr::ValueKind::Null) + } + + fn visit_newtype_struct(self, deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + deserializer.deserialize_any(self) + } + + fn visit_seq(self, _seq: A) -> Result + where + A: serde::de::SeqAccess<'de>, + { + Ok(deserr::ValueKind::Sequence) + } + + fn visit_map(self, _map: A) -> Result + where + A: serde::de::MapAccess<'de>, + { + Ok(deserr::ValueKind::Map) + } +} + +pub struct DeserrRawValueVisitor<'a> { + alloc: &'a Bump, +} + +impl<'de> Visitor<'de> for DeserrRawValueVisitor<'de> { + type Value = deserr::Value>; + + fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(formatter, "any value") + } + + fn visit_bool(self, v: bool) -> Result + where + E: serde::de::Error, + { + Ok(deserr::Value::Boolean(v)) + } + + fn visit_i64(self, v: i64) -> Result + where + E: serde::de::Error, + { + Ok(deserr::Value::NegativeInteger(v)) + } + + fn visit_u64(self, v: u64) -> Result + where + E: serde::de::Error, + { + Ok(deserr::Value::Integer(v)) + } + + fn visit_f64(self, v: f64) -> Result + where + E: serde::de::Error, + { + Ok(deserr::Value::Float(v)) + } + + fn visit_str(self, v: &str) -> Result + where + E: serde::de::Error, + { + Ok(deserr::Value::String(v.to_string())) + } + + fn visit_string(self, v: String) -> Result + where + E: serde::de::Error, + { + Ok(deserr::Value::String(v)) + } + + fn visit_none(self) -> Result + where + E: serde::de::Error, + { + Ok(deserr::Value::Null) + } + + fn visit_some(self, deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + deserializer.deserialize_any(self) + } + + fn visit_unit(self) -> Result + where + E: serde::de::Error, + { + Ok(deserr::Value::Null) + } + + fn visit_newtype_struct(self, deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + deserializer.deserialize_any(self) + } + + fn visit_seq(self, 
mut seq: A) -> Result + where + A: serde::de::SeqAccess<'de>, + { + let mut raw_vec = raw_collections::RawVec::new_in(&self.alloc); + while let Some(next) = seq.next_element()? { + raw_vec.push(next); + } + Ok(deserr::Value::Sequence(DeserrRawVec { vec: raw_vec, alloc: self.alloc })) + } + + fn visit_map(self, map: A) -> Result + where + A: serde::de::MapAccess<'de>, + { + let _ = map; + Err(serde::de::Error::invalid_type(serde::de::Unexpected::Map, &self)) + } + + fn visit_enum(self, data: A) -> Result + where + A: serde::de::EnumAccess<'de>, + { + let _ = data; + Err(serde::de::Error::invalid_type(serde::de::Unexpected::Enum, &self)) + } +} diff --git a/milli/src/update/new/vector_document.rs b/milli/src/update/new/vector_document.rs index a5519a025..6796134db 100644 --- a/milli/src/update/new/vector_document.rs +++ b/milli/src/update/new/vector_document.rs @@ -1,29 +1,67 @@ use std::collections::BTreeSet; use bumpalo::Bump; +use deserr::{Deserr, IntoValue}; use heed::RoTxn; use raw_collections::RawMap; use serde::Serialize; use serde_json::value::RawValue; use super::document::{Document, DocumentFromDb, DocumentFromVersions, Versions}; +use super::indexer::de::DeserrRawValue; use crate::documents::FieldIdMapper; use crate::index::IndexEmbeddingConfig; -use crate::vector::parsed_vectors::RawVectors; -use crate::vector::Embedding; +use crate::vector::parsed_vectors::{ + RawVectors, VectorOrArrayOfVectors, RESERVED_VECTORS_FIELD_NAME, +}; +use crate::vector::{Embedding, EmbeddingConfigs}; use crate::{DocumentId, Index, InternalError, Result, UserError}; #[derive(Serialize)] #[serde(untagged)] pub enum Embeddings<'doc> { - FromJson(&'doc RawValue), + FromJsonExplicit(&'doc RawValue), + FromJsonImplicityUserProvided(&'doc RawValue), FromDb(Vec), } impl<'doc> Embeddings<'doc> { - pub fn into_vec(self) -> std::result::Result, serde_json::Error> { + pub fn into_vec( + self, + doc_alloc: &'doc Bump, + embedder_name: &str, + ) -> std::result::Result, deserr::errors::JsonError> { match self { - /// FIXME: this should be a VecOrArrayOfVec - Embeddings::FromJson(value) => serde_json::from_str(value.get()), + Embeddings::FromJsonExplicit(value) => { + let vectors_ref = deserr::ValuePointerRef::Key { + key: RESERVED_VECTORS_FIELD_NAME, + prev: &deserr::ValuePointerRef::Origin, + }; + let embedders_ref = + deserr::ValuePointerRef::Key { key: embedder_name, prev: &vectors_ref }; + + let embeddings_ref = + deserr::ValuePointerRef::Key { key: "embeddings", prev: &embedders_ref }; + + let v: VectorOrArrayOfVectors = VectorOrArrayOfVectors::deserialize_from_value( + DeserrRawValue::new_in(value, doc_alloc).into_value(), + embeddings_ref, + )?; + Ok(v.into_array_of_vectors().unwrap_or_default()) + } + Embeddings::FromJsonImplicityUserProvided(value) => { + let vectors_ref = deserr::ValuePointerRef::Key { + key: RESERVED_VECTORS_FIELD_NAME, + prev: &deserr::ValuePointerRef::Origin, + }; + let embedders_ref = + deserr::ValuePointerRef::Key { key: embedder_name, prev: &vectors_ref }; + + let v: VectorOrArrayOfVectors = VectorOrArrayOfVectors::deserialize_from_value( + DeserrRawValue::new_in(value, doc_alloc).into_value(), + embedders_ref, + )?; + Ok(v.into_array_of_vectors().unwrap_or_default()) + } Embeddings::FromDb(vec) => Ok(vec), } } @@ -109,7 +147,7 @@ impl<'t> VectorDocument<'t> for VectorDocumentFromDb<'t> { Ok((&*config_name, entry)) }) .chain(self.vectors_field.iter().flat_map(|map| map.iter()).map(|(name, value)| { - Ok((name, entry_from_raw_value(value).map_err(InternalError::SerdeJson)?)) + 
Ok((name, entry_from_raw_value(value, false).map_err(InternalError::SerdeJson)?)) })) } @@ -122,7 +160,8 @@ impl<'t> VectorDocument<'t> for VectorDocumentFromDb<'t> { } None => match self.vectors_field.as_ref().and_then(|obkv| obkv.get(key)) { Some(embedding_from_doc) => Some( - entry_from_raw_value(embedding_from_doc).map_err(InternalError::SerdeJson)?, + entry_from_raw_value(embedding_from_doc, false) + .map_err(InternalError::SerdeJson)?, ), None => None, }, @@ -132,26 +171,40 @@ impl<'t> VectorDocument<'t> for VectorDocumentFromDb<'t> { fn entry_from_raw_value( value: &RawValue, + has_configured_embedder: bool, ) -> std::result::Result, serde_json::Error> { let value: RawVectors = serde_json::from_str(value.get())?; - Ok(VectorEntry { - has_configured_embedder: false, - embeddings: value.embeddings().map(Embeddings::FromJson), - regenerate: value.must_regenerate(), + + Ok(match value { + RawVectors::Explicit(raw_explicit_vectors) => VectorEntry { + has_configured_embedder, + embeddings: raw_explicit_vectors.embeddings.map(Embeddings::FromJsonExplicit), + regenerate: raw_explicit_vectors.regenerate, + }, + RawVectors::ImplicitlyUserProvided(value) => VectorEntry { + has_configured_embedder, + embeddings: Some(Embeddings::FromJsonImplicityUserProvided(value)), + regenerate: false, + }, }) } pub struct VectorDocumentFromVersions<'doc> { vectors: RawMap<'doc>, + embedders: &'doc EmbeddingConfigs, } impl<'doc> VectorDocumentFromVersions<'doc> { - pub fn new(versions: &Versions<'doc>, bump: &'doc Bump) -> Result> { + pub fn new( + versions: &Versions<'doc>, + bump: &'doc Bump, + embedders: &'doc EmbeddingConfigs, + ) -> Result> { let document = DocumentFromVersions::new(versions); if let Some(vectors_field) = document.vectors_field()? { let vectors = RawMap::from_raw_value(vectors_field, bump).map_err(UserError::SerdeJson)?; - Ok(Some(Self { vectors })) + Ok(Some(Self { vectors, embedders })) } else { Ok(None) } @@ -161,14 +214,16 @@ impl<'doc> VectorDocumentFromVersions<'doc> { impl<'doc> VectorDocument<'doc> for VectorDocumentFromVersions<'doc> { fn iter_vectors(&self) -> impl Iterator)>> { self.vectors.iter().map(|(embedder, vectors)| { - let vectors = entry_from_raw_value(vectors).map_err(UserError::SerdeJson)?; + let vectors = entry_from_raw_value(vectors, self.embedders.contains(embedder)) + .map_err(UserError::SerdeJson)?; Ok((embedder, vectors)) }) } fn vectors_for_key(&self, key: &str) -> Result>> { let Some(vectors) = self.vectors.get(key) else { return Ok(None) }; - let vectors = entry_from_raw_value(vectors).map_err(UserError::SerdeJson)?; + let vectors = entry_from_raw_value(vectors, self.embedders.contains(key)) + .map_err(UserError::SerdeJson)?; Ok(Some(vectors)) } } @@ -186,14 +241,19 @@ impl<'doc> MergedVectorDocument<'doc> { db_fields_ids_map: &'doc Mapper, versions: &Versions<'doc>, doc_alloc: &'doc Bump, + embedders: &'doc EmbeddingConfigs, ) -> Result> { let db = VectorDocumentFromDb::new(docid, index, rtxn, db_fields_ids_map, doc_alloc)?; - let new_doc = VectorDocumentFromVersions::new(versions, doc_alloc)?; + let new_doc = VectorDocumentFromVersions::new(versions, doc_alloc, embedders)?; Ok(if db.is_none() && new_doc.is_none() { None } else { Some(Self { new_doc, db }) }) } - pub fn without_db(versions: &Versions<'doc>, doc_alloc: &'doc Bump) -> Result> { - let Some(new_doc) = VectorDocumentFromVersions::new(versions, doc_alloc)? 
else { + pub fn without_db( + versions: &Versions<'doc>, + doc_alloc: &'doc Bump, + embedders: &'doc EmbeddingConfigs, + ) -> Result> { + let Some(new_doc) = VectorDocumentFromVersions::new(versions, doc_alloc, embedders)? else { return Ok(None); }; Ok(Some(Self { new_doc: Some(new_doc), db: None })) diff --git a/milli/src/vector/mod.rs b/milli/src/vector/mod.rs index 2e9a498c0..a21e9e2ca 100644 --- a/milli/src/vector/mod.rs +++ b/milli/src/vector/mod.rs @@ -316,6 +316,10 @@ impl EmbeddingConfigs { Self(data) } + pub fn contains(&self, name: &str) -> bool { + self.0.contains_key(name) + } + /// Get an embedder configuration and template from its name. pub fn get(&self, name: &str) -> Option<(Arc, Arc, bool)> { self.0.get(name).cloned() diff --git a/milli/src/vector/parsed_vectors.rs b/milli/src/vector/parsed_vectors.rs index 526516fef..40e823f17 100644 --- a/milli/src/vector/parsed_vectors.rs +++ b/milli/src/vector/parsed_vectors.rs @@ -84,7 +84,6 @@ impl<'doc> RawVectors<'doc> { RawVectors::Explicit(RawExplicitVectors { regenerate, .. }) => *regenerate, } } - pub fn embeddings(&self) -> Option<&'doc RawValue> { match self { RawVectors::ImplicitlyUserProvided(embeddings) => Some(embeddings), From a77d5ea8c18e701458d00eb041a5ea1822067355 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 30 Oct 2024 14:03:29 +0100 Subject: [PATCH 183/247] Pass embedders to documents --- milli/src/update/new/indexer/mod.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/milli/src/update/new/indexer/mod.rs b/milli/src/update/new/indexer/mod.rs index f6a50ef1c..62ad05813 100644 --- a/milli/src/update/new/indexer/mod.rs +++ b/milli/src/update/new/indexer/mod.rs @@ -53,6 +53,7 @@ mod update_by_function; struct DocumentExtractor<'a> { document_sender: &'a DocumentSender<'a>, + embedders: &'a EmbeddingConfigs, } impl<'a, 'extractor> Extractor<'extractor> for DocumentExtractor<'a> { @@ -118,6 +119,7 @@ impl<'a, 'extractor> Extractor<'extractor> for DocumentExtractor<'a> { context.index, &context.db_fields_ids_map, &context.doc_alloc, + self.embedders, )?; let content = write_to_obkv( &content, @@ -135,7 +137,8 @@ impl<'a, 'extractor> Extractor<'extractor> for DocumentExtractor<'a> { let entry = field_distribution_delta.entry_ref(f).or_default(); *entry += 1; } - let inserted_vectors = insertion.inserted_vectors(&context.doc_alloc)?; + let inserted_vectors = + insertion.inserted_vectors(&context.doc_alloc, self.embedders)?; let content = write_to_obkv( &content, inserted_vectors.as_ref(), @@ -208,7 +211,7 @@ where // document but we need to create a function that collects and compresses documents. 
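        // Note (editorial): `embedders` is threaded into the document extractor so `_vectors` entries
        // can be matched against the configured embedder names when documents are rewritten.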
let document_sender = extractor_sender.document_sender(); - let document_extractor = DocumentExtractor { document_sender: &document_sender}; + let document_extractor = DocumentExtractor { document_sender: &document_sender, embedders }; let datastore = ThreadLocal::with_capacity(pool.current_num_threads()); for_each_document_change(document_changes, &document_extractor, indexing_context, &mut extractor_allocs, &datastore)?; From 3658f57f935dd63755df5d90467de78b2d5339b5 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 4 Nov 2024 15:10:40 +0100 Subject: [PATCH 184/247] Add progress --- index-scheduler/src/batch.rs | 44 ++++++ .../new/extract/faceted/extract_facets.rs | 22 ++- milli/src/update/new/extract/mod.rs | 16 +- .../extract/searchable/extract_word_docids.rs | 22 ++- .../src/update/new/extract/searchable/mod.rs | 38 +++-- .../update/new/indexer/document_changes.rs | 111 ++++++++++++- .../update/new/indexer/document_deletion.rs | 14 +- .../update/new/indexer/document_operation.rs | 4 + milli/src/update/new/indexer/mod.rs | 147 ++++++++++++++++-- milli/src/update/new/indexer/partial_dump.rs | 4 + .../update/new/indexer/update_by_function.rs | 6 +- 11 files changed, 380 insertions(+), 48 deletions(-) diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs index 60393e51d..740528555 100644 --- a/index-scheduler/src/batch.rs +++ b/index-scheduler/src/batch.rs @@ -22,6 +22,7 @@ use std::ffi::OsStr; use std::fmt; use std::fs::{self, File}; use std::io::BufWriter; +use std::sync::atomic::{self, AtomicU16, AtomicU32}; use bumpalo::collections::CollectIn; use bumpalo::Bump; @@ -30,6 +31,7 @@ use meilisearch_types::error::Code; use meilisearch_types::heed::{RoTxn, RwTxn}; use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader, PrimaryKey}; use meilisearch_types::milli::heed::CompactionOption; +use meilisearch_types::milli::update::new::indexer::document_changes::Progress; use meilisearch_types::milli::update::new::indexer::{ self, retrieve_or_guess_primary_key, UpdateByFunction, }; @@ -1221,6 +1223,40 @@ impl IndexScheduler { ) -> Result> { let indexer_alloc = Bump::new(); + let last_finished_steps = AtomicU16::new(0); + let last_finished_documents = AtomicU32::new(0); + + let send_progress = + |Progress { finished_steps, total_steps, step_name, finished_total_documents }| { + /* + let current = rayon::current_thread_index(); + + let last_finished_steps = + last_finished_steps.fetch_max(finished_steps, atomic::Ordering::Relaxed); + + if last_finished_steps > finished_steps { + return; + } + + if let Some((finished_documents, total_documents)) = finished_total_documents { + if last_finished_steps < finished_steps { + last_finished_documents.store(finished_documents, atomic::Ordering::Relaxed); + } else { + let last_finished_documents = last_finished_documents + .fetch_max(finished_documents, atomic::Ordering::Relaxed); + if last_finished_documents > finished_documents { + return; + } + } + tracing::warn!("Progress from {current:?}: {step_name} ({finished_steps}/{total_steps}), document {finished_documents}/{total_documents}") + } else { + tracing::warn!( + "Progress from {current:?}: {step_name} ({finished_steps}/{total_steps})" + ) + } + */ + }; + match operation { IndexOperation::DocumentClear { mut tasks, .. 
} => { let count = milli::update::ClearDocuments::new(index_wtxn, index).execute()?; @@ -1377,6 +1413,8 @@ impl IndexScheduler { &pool, &document_changes, embedders, + &|| must_stop_processing.get(), + &send_progress, )?; // tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done"); @@ -1465,6 +1503,7 @@ impl IndexScheduler { let document_changes = indexer.into_changes(&primary_key)?; let embedders = index.embedding_configs(index_wtxn)?; let embedders = self.embedders(embedders)?; + let must_stop_processing = &self.must_stop_processing; indexer::index( index_wtxn, @@ -1475,6 +1514,8 @@ impl IndexScheduler { &pool, &document_changes, embedders, + &|| must_stop_processing.get(), + &send_progress, )?; // tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done"); @@ -1604,6 +1645,7 @@ impl IndexScheduler { let document_changes = indexer.into_changes(&indexer_alloc, primary_key); let embedders = index.embedding_configs(index_wtxn)?; let embedders = self.embedders(embedders)?; + let must_stop_processing = &self.must_stop_processing; indexer::index( index_wtxn, @@ -1614,6 +1656,8 @@ impl IndexScheduler { &pool, &document_changes, embedders, + &|| must_stop_processing.get(), + &send_progress, )?; // tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done"); diff --git a/milli/src/update/new/extract/faceted/extract_facets.rs b/milli/src/update/new/extract/faceted/extract_facets.rs index f2cbad6ff..2d740f1a3 100644 --- a/milli/src/update/new/extract/faceted/extract_facets.rs +++ b/milli/src/update/new/extract/faceted/extract_facets.rs @@ -16,8 +16,8 @@ use super::FacetKind; use crate::facet::value_encoding::f64_into_bytes; use crate::update::new::extract::DocidsExtractor; use crate::update::new::indexer::document_changes::{ - for_each_document_change, DocumentChangeContext, DocumentChanges, Extractor, FullySend, - IndexingContext, RefCellExt, ThreadLocal, + extract, DocumentChangeContext, DocumentChanges, Extractor, FullySend, IndexingContext, + Progress, RefCellExt, ThreadLocal, }; use crate::update::new::DocumentChange; use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps}; @@ -250,12 +250,19 @@ fn truncate_str(s: &str) -> &str { impl DocidsExtractor for FacetedDocidsExtractor { #[tracing::instrument(level = "trace", skip_all, target = "indexing::extract::faceted")] - fn run_extraction<'pl, 'fid, 'indexer, 'index, DC: DocumentChanges<'pl>>( + fn run_extraction<'pl, 'fid, 'indexer, 'index, DC: DocumentChanges<'pl>, MSP, SP>( grenad_parameters: GrenadParameters, document_changes: &DC, - indexing_context: IndexingContext<'fid, 'indexer, 'index>, + indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>, extractor_allocs: &mut ThreadLocal>>, - ) -> Result> { + finished_steps: u16, + total_steps: u16, + step_name: &'static str, + ) -> Result> + where + MSP: Fn() -> bool + Sync, + SP: Fn(Progress) + Sync, + { let max_memory = grenad_parameters.max_memory_by_thread(); let index = indexing_context.index; @@ -276,12 +283,15 @@ impl DocidsExtractor for FacetedDocidsExtractor { grenad_parameters, max_memory, }; - for_each_document_change( + extract( document_changes, &extractor, indexing_context, extractor_allocs, &datastore, + finished_steps, + total_steps, + step_name, )?; } { diff --git a/milli/src/update/new/extract/mod.rs b/milli/src/update/new/extract/mod.rs index 
8a18eb074..fb02b2c93 100644 --- a/milli/src/update/new/extract/mod.rs +++ b/milli/src/update/new/extract/mod.rs @@ -13,17 +13,25 @@ use grenad::Merger; pub use searchable::*; pub use vectors::EmbeddingExtractor; -use super::indexer::document_changes::{DocumentChanges, FullySend, IndexingContext, ThreadLocal}; +use super::indexer::document_changes::{ + DocumentChanges, FullySend, IndexingContext, Progress, ThreadLocal, +}; use crate::update::{GrenadParameters, MergeDeladdCboRoaringBitmaps}; use crate::Result; pub trait DocidsExtractor { - fn run_extraction<'pl, 'fid, 'indexer, 'index, DC: DocumentChanges<'pl>>( + fn run_extraction<'pl, 'fid, 'indexer, 'index, DC: DocumentChanges<'pl>, MSP, SP>( grenad_parameters: GrenadParameters, document_changes: &DC, - indexing_context: IndexingContext<'fid, 'indexer, 'index>, + indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>, extractor_allocs: &mut ThreadLocal>>, - ) -> Result>; + finished_steps: u16, + total_steps: u16, + step_name: &'static str, + ) -> Result> + where + MSP: Fn() -> bool + Sync, + SP: Fn(Progress) + Sync; } /// TODO move in permissive json pointer diff --git a/milli/src/update/new/extract/searchable/extract_word_docids.rs b/milli/src/update/new/extract/searchable/extract_word_docids.rs index 80f36b01d..b9e4803c7 100644 --- a/milli/src/update/new/extract/searchable/extract_word_docids.rs +++ b/milli/src/update/new/extract/searchable/extract_word_docids.rs @@ -12,8 +12,8 @@ use super::tokenize_document::{tokenizer_builder, DocumentTokenizer}; use crate::update::new::extract::cache::CboCachedSorter; use crate::update::new::extract::perm_json_p::contained_in; use crate::update::new::indexer::document_changes::{ - for_each_document_change, DocumentChangeContext, DocumentChanges, Extractor, FullySend, - IndexingContext, RefCellExt, ThreadLocal, + extract, DocumentChangeContext, DocumentChanges, Extractor, FullySend, IndexingContext, + Progress, RefCellExt, ThreadLocal, }; use crate::update::new::DocumentChange; use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps}; @@ -341,12 +341,19 @@ impl<'extractor> Extractor<'extractor> for WordDocidsExtractorData<'extractor> { pub struct WordDocidsExtractors; impl WordDocidsExtractors { - pub fn run_extraction<'pl, 'fid, 'indexer, 'index, DC: DocumentChanges<'pl>>( + pub fn run_extraction<'pl, 'fid, 'indexer, 'index, DC: DocumentChanges<'pl>, MSP, SP>( grenad_parameters: GrenadParameters, document_changes: &DC, - indexing_context: IndexingContext<'fid, 'indexer, 'index>, + indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>, extractor_allocs: &mut ThreadLocal>>, - ) -> Result { + finished_steps: u16, + total_steps: u16, + step_name: &'static str, + ) -> Result + where + MSP: Fn() -> bool + Sync, + SP: Fn(Progress) + Sync, + { let max_memory = grenad_parameters.max_memory_by_thread(); let index = indexing_context.index; @@ -391,12 +398,15 @@ impl WordDocidsExtractors { max_memory, }; - for_each_document_change( + extract( document_changes, &extractor, indexing_context, extractor_allocs, &datastore, + finished_steps, + total_steps, + step_name, )?; } diff --git a/milli/src/update/new/extract/searchable/mod.rs b/milli/src/update/new/extract/searchable/mod.rs index dc429b1ba..e16e83167 100644 --- a/milli/src/update/new/extract/searchable/mod.rs +++ b/milli/src/update/new/extract/searchable/mod.rs @@ -17,8 +17,8 @@ use tokenize_document::{tokenizer_builder, DocumentTokenizer}; use super::cache::CboCachedSorter; use super::DocidsExtractor; use 
crate::update::new::indexer::document_changes::{ - for_each_document_change, DocumentChangeContext, DocumentChanges, Extractor, FullySend, - IndexingContext, ThreadLocal, + extract, DocumentChangeContext, DocumentChanges, Extractor, FullySend, IndexingContext, + Progress, ThreadLocal, }; use crate::update::new::DocumentChange; use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps}; @@ -69,12 +69,19 @@ impl<'extractor, EX: SearchableExtractor + Sync> Extractor<'extractor> } pub trait SearchableExtractor: Sized + Sync { - fn run_extraction<'pl, 'fid, 'indexer, 'index, DC: DocumentChanges<'pl>>( + fn run_extraction<'pl, 'fid, 'indexer, 'index, DC: DocumentChanges<'pl>, MSP, SP>( grenad_parameters: GrenadParameters, document_changes: &DC, - indexing_context: IndexingContext<'fid, 'indexer, 'index>, + indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>, extractor_allocs: &mut ThreadLocal>>, - ) -> Result> { + finished_steps: u16, + total_steps: u16, + step_name: &'static str, + ) -> Result> + where + MSP: Fn() -> bool + Sync, + SP: Fn(Progress) + Sync, + { let max_memory = grenad_parameters.max_memory_by_thread(); let rtxn = indexing_context.index.read_txn()?; @@ -118,12 +125,15 @@ pub trait SearchableExtractor: Sized + Sync { let span = tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction"); let _entered = span.enter(); - for_each_document_change( + extract( document_changes, &extractor_data, indexing_context, extractor_allocs, &datastore, + finished_steps, + total_steps, + step_name, )?; } { @@ -168,17 +178,27 @@ pub trait SearchableExtractor: Sized + Sync { } impl DocidsExtractor for T { - fn run_extraction<'pl, 'fid, 'indexer, 'index, DC: DocumentChanges<'pl>>( + fn run_extraction<'pl, 'fid, 'indexer, 'index, DC: DocumentChanges<'pl>, MSP, SP>( grenad_parameters: GrenadParameters, document_changes: &DC, - indexing_context: IndexingContext<'fid, 'indexer, 'index>, + indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>, extractor_allocs: &mut ThreadLocal>>, - ) -> Result> { + finished_steps: u16, + total_steps: u16, + step_name: &'static str, + ) -> Result> + where + MSP: Fn() -> bool + Sync, + SP: Fn(Progress) + Sync, + { Self::run_extraction( grenad_parameters, document_changes, indexing_context, extractor_allocs, + finished_steps, + total_steps, + step_name, ) } } diff --git a/milli/src/update/new/indexer/document_changes.rs b/milli/src/update/new/indexer/document_changes.rs index fd16137b9..aad190269 100644 --- a/milli/src/update/new/indexer/document_changes.rs +++ b/milli/src/update/new/indexer/document_changes.rs @@ -9,7 +9,7 @@ use rayon::iter::IndexedParallelIterator; use super::super::document_change::DocumentChange; use crate::fields_ids_map::metadata::FieldIdMapWithMetadata; use crate::update::new::parallel_iterator_ext::ParallelIteratorExt as _; -use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, Result}; +use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, Result}; pub trait RefCellExt { fn try_borrow_or_yield(&self) -> std::result::Result, std::cell::BorrowError>; @@ -335,6 +335,12 @@ pub trait DocumentChanges<'pl // lifetime of the underlying payload fn iter(&self, chunk_size: usize) -> impl IndexedParallelIterator>; + fn len(&self) -> usize; + + fn is_empty(&self) -> bool { + self.len() == 0 + } + fn item_to_document_change<'doc, // lifetime of a single `process` call T: MostlySend>( &'doc self, @@ -344,22 +350,72 @@ pub trait DocumentChanges<'pl // lifetime of the underlying 
payload ; } -#[derive(Clone, Copy)] pub struct IndexingContext< 'fid, // invariant lifetime of fields ids map 'indexer, // covariant lifetime of objects that are borrowed during the entire indexing operation 'index, // covariant lifetime of the index -> { + MSP, + SP, +> where + MSP: Fn() -> bool + Sync, + SP: Fn(Progress) + Sync, +{ pub index: &'index Index, pub db_fields_ids_map: &'indexer FieldsIdsMap, pub new_fields_ids_map: &'fid RwLock, pub doc_allocs: &'indexer ThreadLocal>>, pub fields_ids_map_store: &'indexer ThreadLocal>>>, + pub must_stop_processing: &'indexer MSP, + pub send_progress: &'indexer SP, +} + +impl< + 'fid, // invariant lifetime of fields ids map + 'indexer, // covariant lifetime of objects that are borrowed during the entire indexing operation + 'index, // covariant lifetime of the index + MSP, + SP, + > Copy + for IndexingContext< + 'fid, // invariant lifetime of fields ids map + 'indexer, // covariant lifetime of objects that are borrowed during the entire indexing operation + 'index, // covariant lifetime of the index + MSP, + SP, + > +where + MSP: Fn() -> bool + Sync, + SP: Fn(Progress) + Sync, +{ +} + +impl< + 'fid, // invariant lifetime of fields ids map + 'indexer, // covariant lifetime of objects that are borrowed during the entire indexing operation + 'index, // covariant lifetime of the index + MSP, + SP, + > Clone + for IndexingContext< + 'fid, // invariant lifetime of fields ids map + 'indexer, // covariant lifetime of objects that are borrowed during the entire indexing operation + 'index, // covariant lifetime of the index + MSP, + SP, + > +where + MSP: Fn() -> bool + Sync, + SP: Fn(Progress) + Sync, +{ + fn clone(&self) -> Self { + *self + } } const CHUNK_SIZE: usize = 100; -pub fn for_each_document_change< +#[allow(clippy::too_many_arguments)] +pub fn extract< 'pl, // covariant lifetime of the underlying payload 'extractor, // invariant lifetime of extractor_alloc 'fid, // invariant lifetime of fields ids map @@ -368,6 +424,8 @@ pub fn for_each_document_change< 'index, // covariant lifetime of the index EX, DC: DocumentChanges<'pl>, + MSP, + SP, >( document_changes: &DC, extractor: &EX, @@ -377,20 +435,29 @@ pub fn for_each_document_change< new_fields_ids_map, doc_allocs, fields_ids_map_store, - }: IndexingContext<'fid, 'indexer, 'index>, + must_stop_processing, + send_progress, + }: IndexingContext<'fid, 'indexer, 'index, MSP, SP>, extractor_allocs: &'extractor mut ThreadLocal>>, datastore: &'data ThreadLocal, + finished_steps: u16, + total_steps: u16, + step_name: &'static str, ) -> Result<()> where EX: Extractor<'extractor>, + MSP: Fn() -> bool + Sync, + SP: Fn(Progress) + Sync, { // Clean up and reuse the extractor allocs for extractor_alloc in extractor_allocs.iter_mut() { extractor_alloc.0.get_mut().reset(); } + let total_documents = document_changes.len(); + let pi = document_changes.iter(CHUNK_SIZE); - pi.try_arc_for_each_try_init( + pi.enumerate().try_arc_for_each_try_init( || { DocumentChangeContext::new( index, @@ -403,7 +470,19 @@ where move |index_alloc| extractor.init_data(index_alloc), ) }, - |context, items| { + |context, (finished_documents, items)| { + if (must_stop_processing)() { + return Err(Arc::new(InternalError::AbortedIndexation.into())); + } + let finished_documents = finished_documents * CHUNK_SIZE; + + (send_progress)(Progress { + finished_steps, + total_steps, + step_name, + finished_total_documents: Some((finished_documents as u32, total_documents as u32)), + }); + // Clean up and reuse the document-specific allocator 
context.doc_alloc.reset(); @@ -419,5 +498,21 @@ where res }, - ) + )?; + + (send_progress)(Progress { + finished_steps, + total_steps, + step_name, + finished_total_documents: Some((total_documents as u32, total_documents as u32)), + }); + + Ok(()) +} + +pub struct Progress { + pub finished_steps: u16, + pub total_steps: u16, + pub step_name: &'static str, + pub finished_total_documents: Option<(u32, u32)>, } diff --git a/milli/src/update/new/indexer/document_deletion.rs b/milli/src/update/new/indexer/document_deletion.rs index d193b65fa..130560a44 100644 --- a/milli/src/update/new/indexer/document_deletion.rs +++ b/milli/src/update/new/indexer/document_deletion.rs @@ -75,6 +75,10 @@ impl<'pl> DocumentChanges<'pl> for DocumentDeletionChanges<'pl> { Ok(Some(DocumentChange::Deletion(Deletion::create(*docid, external_document_id)))) } + + fn len(&self) -> usize { + self.to_delete.len() + } } #[cfg(test)] @@ -89,8 +93,7 @@ mod test { use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder}; use crate::index::tests::TempIndex; use crate::update::new::indexer::document_changes::{ - for_each_document_change, DocumentChangeContext, Extractor, IndexingContext, MostlySend, - ThreadLocal, + extract, DocumentChangeContext, Extractor, IndexingContext, MostlySend, ThreadLocal, }; use crate::update::new::indexer::DocumentDeletion; use crate::update::new::DocumentChange; @@ -165,17 +168,22 @@ mod test { new_fields_ids_map: &fields_ids_map, doc_allocs: &doc_allocs, fields_ids_map_store: &fields_ids_map_store, + must_stop_processing: &(|| false), + send_progress: &(|_progress| {}), }; for _ in 0..3 { let datastore = ThreadLocal::new(); - for_each_document_change( + extract( &changes, &deletion_tracker, context, &mut extractor_allocs, &datastore, + 0, + 1, + "test", ) .unwrap(); diff --git a/milli/src/update/new/indexer/document_operation.rs b/milli/src/update/new/indexer/document_operation.rs index 9ba74c69e..c0f1ffbdd 100644 --- a/milli/src/update/new/indexer/document_operation.rs +++ b/milli/src/update/new/indexer/document_operation.rs @@ -241,6 +241,10 @@ impl<'pl> DocumentChanges<'pl> for DocumentOperationChanges<'pl> { )?; Ok(change) } + + fn len(&self) -> usize { + self.docids_version_offsets.len() + } } trait MergeChanges { diff --git a/milli/src/update/new/indexer/mod.rs b/milli/src/update/new/indexer/mod.rs index 62ad05813..3bee9904f 100644 --- a/milli/src/update/new/indexer/mod.rs +++ b/milli/src/update/new/indexer/mod.rs @@ -5,8 +5,8 @@ use std::thread::{self, Builder}; use big_s::S; use bumpalo::Bump; use document_changes::{ - for_each_document_change, DocumentChangeContext, DocumentChanges, Extractor, FullySend, - IndexingContext, RefCellExt, ThreadLocal, + extract, DocumentChangeContext, DocumentChanges, Extractor, FullySend, IndexingContext, + Progress, RefCellExt, ThreadLocal, }; pub use document_deletion::DocumentDeletion; pub use document_operation::DocumentOperation; @@ -72,7 +72,7 @@ impl<'a, 'extractor> Extractor<'extractor> for DocumentExtractor<'a> { context: &DocumentChangeContext, ) -> Result<()> { let mut document_buffer = Vec::new(); - let mut field_distribution_delta = context.data.0.borrow_mut(); + let mut field_distribution_delta = context.data.0.borrow_mut_or_yield(); let mut new_fields_ids_map = context.new_fields_ids_map.borrow_mut_or_yield(); @@ -155,13 +155,70 @@ impl<'a, 'extractor> Extractor<'extractor> for DocumentExtractor<'a> { } } +mod steps { + pub const STEPS: &[&str] = &[ + "extracting documents", + "extracting facets", + "extracting words", 
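// Aside: one way a caller might collapse a `Progress` value (defined above) into a
// single percentage, weighting every step equally and using the per-step document
// counter when it is present. The weighting scheme is an assumption of this sketch,
// not something the patch prescribes.
fn overall_percentage(p: &Progress) -> f32 {
    // Each step contributes an equal share of the total percentage.
    let step_share = 100.0 / p.total_steps as f32;
    // Fraction of the current step that is done, when document counts are reported.
    let within_step = match p.finished_total_documents {
        Some((_, 0)) | None => 0.0,
        Some((done, total)) => done as f32 / total as f32,
    };
    p.finished_steps as f32 * step_share + within_step * step_share
}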
+ "extracting word proximity", + "extracting embeddings", + "writing to database", + "post-processing facets", + "post-processing words", + "finalizing", + ]; + + const fn step(step: u16) -> (u16, &'static str) { + (step, STEPS[step as usize]) + } + + pub const fn total_steps() -> u16 { + STEPS.len() as u16 + } + + pub const fn extract_documents() -> (u16, &'static str) { + step(0) + } + + pub const fn extract_facets() -> (u16, &'static str) { + step(1) + } + + pub const fn extract_words() -> (u16, &'static str) { + step(2) + } + + pub const fn extract_word_proximity() -> (u16, &'static str) { + step(3) + } + + pub const fn extract_embeddings() -> (u16, &'static str) { + step(4) + } + + pub const fn write_db() -> (u16, &'static str) { + step(5) + } + + pub const fn post_processing_facets() -> (u16, &'static str) { + step(6) + } + pub const fn post_processing_words() -> (u16, &'static str) { + step(7) + } + + pub const fn finalizing() -> (u16, &'static str) { + step(8) + } +} + /// This is the main function of this crate. /// /// Give it the output of the [`Indexer::document_changes`] method and it will execute it in the [`rayon::ThreadPool`]. /// /// TODO return stats #[allow(clippy::too_many_arguments)] // clippy: 😝 -pub fn index<'pl, 'indexer, 'index, DC>( +pub fn index<'pl, 'indexer, 'index, DC, MSP, SP>( wtxn: &mut RwTxn, index: &'index Index, db_fields_ids_map: &'indexer FieldsIdsMap, @@ -170,9 +227,13 @@ pub fn index<'pl, 'indexer, 'index, DC>( pool: &ThreadPool, document_changes: &DC, embedders: EmbeddingConfigs, + must_stop_processing: &'indexer MSP, + send_progress: &'indexer SP, ) -> Result<()> where DC: DocumentChanges<'pl>, + MSP: Fn() -> bool + Sync, + SP: Fn(Progress) + Sync, { let (merger_sender, writer_receiver) = merger_writer_channel(10_000); // This channel acts as a rendezvous point to ensure that we are one task ahead @@ -194,8 +255,12 @@ where new_fields_ids_map: &new_fields_ids_map, doc_allocs: &doc_allocs, fields_ids_map_store: &fields_ids_map_store, + must_stop_processing, + send_progress, }; + let total_steps = steps::total_steps(); + let mut field_distribution = index.field_distribution(wtxn)?; thread::scope(|s| -> Result<()> { @@ -213,7 +278,8 @@ where let document_sender = extractor_sender.document_sender(); let document_extractor = DocumentExtractor { document_sender: &document_sender, embedders }; let datastore = ThreadLocal::with_capacity(pool.current_num_threads()); - for_each_document_change(document_changes, &document_extractor, indexing_context, &mut extractor_allocs, &datastore)?; + let (finished_steps, step_name) = steps::extract_documents(); + extract(document_changes, &document_extractor, indexing_context, &mut extractor_allocs, &datastore, finished_steps, total_steps, step_name)?; for field_distribution_delta in datastore { let field_distribution_delta = field_distribution_delta.0.into_inner(); @@ -238,22 +304,29 @@ where { let span = tracing::trace_span!(target: "indexing::documents::extract", "faceted"); let _entered = span.enter(); + let (finished_steps, step_name) = steps::extract_facets(); extract_and_send_docids::< _, FacetedDocidsExtractor, FacetDocids, + _, + _ >( grenad_parameters, document_changes, indexing_context, &mut extractor_allocs, &extractor_sender, + finished_steps, + total_steps, + step_name )?; } { let span = tracing::trace_span!(target: "indexing::documents::extract", "word_docids"); let _entered = span.enter(); + let (finished_steps, step_name) = steps::extract_words(); let WordDocidsMergers { word_fid_docids, @@ -261,7 +334,7 
@@ where exact_word_docids, word_position_docids, fid_word_count_docids, - } = WordDocidsExtractors::run_extraction(grenad_parameters, document_changes, indexing_context, &mut extractor_allocs)?; + } = WordDocidsExtractors::run_extraction(grenad_parameters, document_changes, indexing_context, &mut extractor_allocs, finished_steps, total_steps, step_name)?; extractor_sender.send_searchable::(word_docids).unwrap(); extractor_sender.send_searchable::(word_fid_docids).unwrap(); extractor_sender.send_searchable::(exact_word_docids).unwrap(); @@ -276,16 +349,24 @@ where if proximity_precision == ProximityPrecision::ByWord { let span = tracing::trace_span!(target: "indexing::documents::extract", "word_pair_proximity_docids"); let _entered = span.enter(); + let (finished_steps, step_name) = steps::extract_word_proximity(); + + extract_and_send_docids::< _, WordPairProximityDocidsExtractor, WordPairProximityDocids, + _, + _ >( grenad_parameters, document_changes, indexing_context, &mut extractor_allocs, &extractor_sender, + finished_steps, + total_steps, + step_name, )?; } @@ -301,8 +382,10 @@ where let embedding_sender = todo!(); let extractor = EmbeddingExtractor::new(embedders, &embedding_sender, &field_distribution, request_threads()); let datastore = ThreadLocal::with_capacity(pool.current_num_threads()); + let (finished_steps, step_name) = steps::extract_embeddings(); - for_each_document_change(document_changes, &extractor, indexing_context, &mut extractor_allocs, &datastore)?; + + extract(document_changes, &extractor, indexing_context, &mut extractor_allocs, &datastore, finished_steps, total_steps, step_name)?; let mut user_provided = HashMap::new(); @@ -325,6 +408,8 @@ where { let span = tracing::trace_span!(target: "indexing::documents::extract", "FINISH"); let _entered = span.enter(); + let (finished_steps, step_name) = steps::write_db(); + (indexing_context.send_progress)(Progress { finished_steps, total_steps, step_name, finished_total_documents: None }); } // TODO THIS IS TOO MUCH @@ -501,15 +586,38 @@ where /// TODO handle the panicking threads handle.join().unwrap()?; let merger_result = merger_thread.join().unwrap()?; + let (finished_steps, step_name) = steps::post_processing_facets(); + (indexing_context.send_progress)(Progress { + finished_steps, + total_steps, + step_name, + finished_total_documents: None, + }); if let Some(facet_field_ids_delta) = merger_result.facet_field_ids_delta { compute_facet_level_database(index, wtxn, facet_field_ids_delta)?; } + let (finished_steps, step_name) = steps::post_processing_words(); + (indexing_context.send_progress)(Progress { + finished_steps, + total_steps, + step_name, + finished_total_documents: None, + }); + if let Some(prefix_delta) = merger_result.prefix_delta { compute_prefix_database(index, wtxn, prefix_delta)?; } + let (finished_steps, step_name) = steps::finalizing(); + (indexing_context.send_progress)(Progress { + finished_steps, + total_steps, + step_name, + finished_total_documents: None, + }); + Ok(()) as Result<_> })?; @@ -585,6 +693,7 @@ fn compute_facet_level_database( /// TODO: GrenadParameters::default() should be removed in favor a passed parameter /// TODO: manage the errors correctly /// TODO: we must have a single trait that also gives the extractor type +#[allow(clippy::too_many_arguments)] fn extract_and_send_docids< 'pl, 'fid, @@ -593,15 +702,31 @@ fn extract_and_send_docids< DC: DocumentChanges<'pl>, E: DocidsExtractor, D: MergerOperationType, + MSP, + SP, >( grenad_parameters: GrenadParameters, document_changes: 
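// Aside: the two trailing `_` type arguments in the `extract_and_send_docids` turbofish
// calls above stand for the MSP and SP closure types, which the compiler infers from
// `indexing_context`. A reduced sketch of the same inference pattern (the `run` helper
// is illustrative only):
fn run<MSP, SP>(must_stop: &MSP, send: &SP)
where
    MSP: Fn() -> bool + Sync,
    SP: Fn(Progress) + Sync,
{
    if !must_stop() {
        send(Progress {
            finished_steps: 0,
            total_steps: 1,
            step_name: "demo step",
            finished_total_documents: None,
        });
    }
}
// Both closure types are inferred at the call site: run(&|| false, &|_p: Progress| {});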
&DC, - indexing_context: IndexingContext<'fid, 'indexer, 'index>, + indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>, extractor_allocs: &mut ThreadLocal>>, sender: &ExtractorSender, -) -> Result<()> { - let merger = - E::run_extraction(grenad_parameters, document_changes, indexing_context, extractor_allocs)?; + finished_steps: u16, + total_steps: u16, + step_name: &'static str, +) -> Result<()> +where + MSP: Fn() -> bool + Sync, + SP: Fn(Progress) + Sync, +{ + let merger = E::run_extraction( + grenad_parameters, + document_changes, + indexing_context, + extractor_allocs, + finished_steps, + total_steps, + step_name, + )?; sender.send_searchable::(merger).unwrap(); Ok(()) } diff --git a/milli/src/update/new/indexer/partial_dump.rs b/milli/src/update/new/indexer/partial_dump.rs index 3913098ec..e58141af7 100644 --- a/milli/src/update/new/indexer/partial_dump.rs +++ b/milli/src/update/new/indexer/partial_dump.rs @@ -79,4 +79,8 @@ where let insertion = Insertion::create(docid, external_document_id, Versions::single(document)); Ok(Some(DocumentChange::Insertion(insertion))) } + + fn len(&self) -> usize { + self.iter.len() + } } diff --git a/milli/src/update/new/indexer/update_by_function.rs b/milli/src/update/new/indexer/update_by_function.rs index b08f8c380..3eb0cc306 100644 --- a/milli/src/update/new/indexer/update_by_function.rs +++ b/milli/src/update/new/indexer/update_by_function.rs @@ -9,7 +9,7 @@ use super::DocumentChanges; use crate::documents::Error::InvalidDocumentFormat; use crate::documents::PrimaryKey; use crate::error::{FieldIdMapMissingEntry, InternalError}; -use crate::update::new::document::{DocumentFromVersions, Versions}; +use crate::update::new::document::Versions; use crate::update::new::{Deletion, DocumentChange, KvReaderFieldId, Update}; use crate::{all_obkv_to_json, Error, FieldsIdsMap, Object, Result, UserError}; @@ -176,6 +176,10 @@ impl<'index> DocumentChanges<'index> for UpdateByFunctionChanges<'index> { }, } } + + fn len(&self) -> usize { + self.documents.len() + } } fn obkv_to_rhaimap(obkv: &KvReaderFieldId, fields_ids_map: &FieldsIdsMap) -> Result { From ad52c950ba614f0c7da0f0f4116779bb373c3715 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 9 Oct 2024 11:35:45 +0200 Subject: [PATCH 185/247] Only run word pair proximity docids extraction if proximity_precision enables it --- Cargo.lock | 1470 ++++++++--------- index-scheduler/src/batch.rs | 8 +- meili-snap/Cargo.toml | 2 +- milli/Cargo.toml | 4 +- .../index_documents/helpers/grenad_helpers.rs | 2 + milli/src/update/new/channel.rs | 300 +--- milli/src/update/new/extract/cache.rs | 784 ++++++--- milli/src/update/new/extract/documents.rs | 73 + .../new/extract/faceted/extract_facets.rs | 118 +- milli/src/update/new/extract/lru.rs | 234 --- milli/src/update/new/extract/mod.rs | 16 +- .../extract/searchable/extract_word_docids.rs | 319 ++-- .../extract_word_pair_proximity_docids.rs | 22 +- .../src/update/new/extract/searchable/mod.rs | 91 +- .../extract/searchable/tokenize_document.rs | 1 - milli/src/update/new/facet_search_builder.rs | 49 +- milli/src/update/new/indexer/de.rs | 2 +- .../update/new/indexer/document_changes.rs | 25 +- .../update/new/indexer/document_deletion.rs | 15 +- milli/src/update/new/indexer/mod.rs | 406 ++--- milli/src/update/new/indexer/partial_dump.rs | 4 +- .../update/new/indexer/update_by_function.rs | 6 +- milli/src/update/new/merger.rs | 375 ++--- milli/src/update/new/mod.rs | 3 + milli/src/update/new/parallel_iterator_ext.rs | 43 +- 
milli/src/update/new/words_prefix_docids.rs | 109 +- 26 files changed, 1977 insertions(+), 2504 deletions(-) create mode 100644 milli/src/update/new/extract/documents.rs delete mode 100644 milli/src/update/new/extract/lru.rs diff --git a/Cargo.lock b/Cargo.lock index 5cd1f3976..633fdca8f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4,11 +4,11 @@ version = 3 [[package]] name = "actix-codec" -version = "0.5.1" +version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "617a8268e3537fe1d8c9ead925fca49ef6400927ee7bc26750e90ecee14ce4b8" +checksum = "5f7b0a21988c1bf877cf4759ef5ddaac04c1c9fe808c9142ecb78ba97d97a28a" dependencies = [ - "bitflags 1.3.2", + "bitflags 2.6.0", "bytes", "futures-core", "futures-sink", @@ -36,9 +36,9 @@ dependencies = [ [[package]] name = "actix-http" -version = "3.8.0" +version = "3.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ae682f693a9cd7b058f2b0b5d9a6d7728a8555779bedbbc35dd88528611d020" +checksum = "d48f96fc3003717aeb9856ca3d02a8c7de502667ad76eeacd830b48d2e91fac4" dependencies = [ "actix-codec", "actix-rt", @@ -56,7 +56,7 @@ dependencies = [ "flate2", "futures-core", "h2 0.3.26", - "http 0.2.11", + "http 0.2.12", "httparse", "httpdate", "itoa", @@ -80,7 +80,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e01ed3140b2f8d422c68afa1ed2e85d996ea619c988ac834d255db32138655cb" dependencies = [ "quote", - "syn 2.0.60", + "syn 2.0.85", ] [[package]] @@ -91,7 +91,7 @@ checksum = "13d324164c51f63867b57e73ba5936ea151b8a41a1d23d1031eeb9f70d0236f8" dependencies = [ "bytestring", "cfg-if", - "http 0.2.11", + "http 0.2.12", "regex-lite", "serde", "tracing", @@ -110,9 +110,9 @@ dependencies = [ [[package]] name = "actix-server" -version = "2.2.0" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3e8613a75dd50cc45f473cee3c34d59ed677c0f7b44480ce3b8247d7dc519327" +checksum = "7ca2549781d8dd6d75c40cf6b6051260a2cc2f3c62343d761a969a0640646894" dependencies = [ "actix-rt", "actix-service", @@ -120,8 +120,7 @@ dependencies = [ "futures-core", "futures-util", "mio", - "num_cpus", - "socket2 0.4.9", + "socket2", "tokio", "tracing", ] @@ -168,9 +167,9 @@ dependencies = [ [[package]] name = "actix-web" -version = "4.8.0" +version = "4.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1988c02af8d2b718c05bc4aeb6a66395b7cdf32858c2c71131e5637a8c05a9ff" +checksum = "9180d76e5cc7ccbc4d60a506f2c727730b154010262df5b910eb17dbe4b8cb38" dependencies = [ "actix-codec", "actix-http", @@ -191,6 +190,7 @@ dependencies = [ "encoding_rs", "futures-core", "futures-util", + "impl-more", "itoa", "language-tags", "log", @@ -202,7 +202,7 @@ dependencies = [ "serde_json", "serde_urlencoded", "smallvec", - "socket2 0.5.5", + "socket2", "time", "url", ] @@ -216,23 +216,23 @@ dependencies = [ "actix-router", "proc-macro2", "quote", - "syn 2.0.60", + "syn 2.0.85", ] [[package]] name = "addr2line" -version = "0.20.0" +version = "0.24.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4fa78e18c64fce05e902adecd7a5eed15a5e0a3439f7b0e169f0252214865e3" +checksum = "dfbe277e56a376000877090da837660b4427aad530e3028d44e0bffe4f89a1c1" dependencies = [ "gimli", ] [[package]] -name = "adler" -version = "1.0.2" +name = "adler2" +version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" +checksum = 
"512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" [[package]] name = "aes" @@ -308,57 +308,58 @@ checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" [[package]] name = "anstream" -version = "0.6.13" +version = "0.6.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d96bd03f33fe50a863e394ee9718a706f988b9079b20c3784fb726e7678b62fb" +checksum = "23a1e53f0f5d86382dafe1cf314783b2044280f406e7e1506368220ad11b1338" dependencies = [ "anstyle", "anstyle-parse", "anstyle-query", "anstyle-wincon", "colorchoice", + "is_terminal_polyfill", "utf8parse", ] [[package]] name = "anstyle" -version = "1.0.6" +version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8901269c6307e8d93993578286ac0edf7f195079ffff5ebdeea6a59ffb7e36bc" +checksum = "8365de52b16c035ff4fcafe0092ba9390540e3e352870ac09933bebcaa2c8c56" [[package]] name = "anstyle-parse" -version = "0.2.1" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "938874ff5980b03a87c5524b3ae5b59cf99b1d6bc836848df7bc5ada9643c333" +checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9" dependencies = [ "utf8parse", ] [[package]] name = "anstyle-query" -version = "1.0.0" +version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ca11d4be1bab0c8bc8734a9aa7bf4ee8316d462a08c6ac5052f888fef5b494b" +checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c" dependencies = [ - "windows-sys 0.48.0", + "windows-sys 0.59.0", ] [[package]] name = "anstyle-wincon" -version = "3.0.1" +version = "3.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0699d10d2f4d628a98ee7b57b289abbc98ff3bad977cb3152709d4bf2330628" +checksum = "2109dbce0e72be3ec00bed26e6a7479ca384ad226efdd66db8fa2e3a38c83125" dependencies = [ "anstyle", - "windows-sys 0.48.0", + "windows-sys 0.59.0", ] [[package]] name = "anyhow" -version = "1.0.86" +version = "1.0.91" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3d1d046238990b9cf5bcde22a3fb3584ee5cf65fb2765f454ed428c7a0063da" +checksum = "c042108f3ed77fd83760a5fd79b53be043192bb3b9dba91d8c574c0ada7850c8" dependencies = [ "backtrace", ] @@ -380,9 +381,9 @@ dependencies = [ [[package]] name = "arrayvec" -version = "0.7.4" +version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711" +checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arroy" @@ -415,13 +416,13 @@ dependencies = [ [[package]] name = "async-trait" -version = "0.1.81" +version = "0.1.83" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e0c28dcc82d7c8ead5cb13beb15405b57b8546e93215673ff8ca0349a028107" +checksum = "721cae7de5c34fbb2acd27e21e6d2cf7b886dce0c27388d46c4e6c47ea4318dd" dependencies = [ "proc-macro2", "quote", - "syn 2.0.60", + "syn 2.0.85", ] [[package]] @@ -432,23 +433,23 @@ checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" [[package]] name = "autocfg" -version = "1.2.0" +version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1fdabc7756949593fe60f30ec81974b613357de856987752631dea1e3394c80" +checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" [[package]] name = "backtrace" -version = "0.3.68" +version = "0.3.74" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "4319208da049c43661739c5fade2ba182f09d1dc2299b32298d3a31692b17e12" +checksum = "8d82cb332cdfaed17ae235a638438ac4d4839913cc2af585c3c6746e8f8bee1a" dependencies = [ "addr2line", - "cc", "cfg-if", "libc", "miniz_oxide", "object", "rustc-demangle", + "windows-targets 0.52.6", ] [[package]] @@ -514,22 +515,20 @@ dependencies = [ [[package]] name = "bindgen" -version = "0.69.4" +version = "0.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a00dc851838a2120612785d195287475a3ac45514741da670b735818822129a0" +checksum = "f49d8fed880d473ea71efb9bf597651e77201bdd4893efe54c9e5d65ae04ce6f" dependencies = [ "bitflags 2.6.0", "cexpr", "clang-sys", - "itertools 0.12.1", - "lazy_static", - "lazycell", + "itertools 0.13.0", "proc-macro2", "quote", "regex", - "rustc-hash", + "rustc-hash 1.1.0", "shlex", - "syn 2.0.60", + "syn 2.0.85", ] [[package]] @@ -614,7 +613,7 @@ dependencies = [ "proc-macro-crate", "proc-macro2", "quote", - "syn 2.0.60", + "syn 2.0.85", "syn_derive", ] @@ -641,9 +640,9 @@ dependencies = [ [[package]] name = "bstr" -version = "1.9.1" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05efc5cfd9110c8416e471df0e96702d58690178e206e61b7173706673c93706" +checksum = "40723b8fb387abc38f4f4a37c09073622e41dd12327033091ef8950659e6dc0c" dependencies = [ "memchr", "regex-automata", @@ -704,28 +703,28 @@ dependencies = [ [[package]] name = "bytecount" -version = "0.6.3" +version = "0.6.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c676a478f63e9fa2dd5368a42f28bba0d6c560b775f38583c8bbaa7fcd67c9c" +checksum = "5ce89b21cab1437276d2650d57e971f9d548a2d9037cc231abdc0562b97498ce" [[package]] name = "bytemuck" -version = "1.16.1" +version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b236fc92302c97ed75b38da1f4917b5cdda4984745740f153a5d3059e48d725e" +checksum = "8334215b81e418a0a7bdb8ef0849474f40bb10c8b71f1c4ed315cff49f32494d" dependencies = [ "bytemuck_derive", ] [[package]] name = "bytemuck_derive" -version = "1.6.0" +version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4da9a32f3fed317401fa3c862968128267c3106685286e15d5aaa3d7389c2f60" +checksum = "bcfcc3cd946cb52f0bbfdbbcfa2f4e24f75ebb6c0e1002f7c25904fada18b9ec" dependencies = [ "proc-macro2", "quote", - "syn 2.0.60", + "syn 2.0.85", ] [[package]] @@ -736,15 +735,15 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.6.0" +version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "514de17de45fdb8dc022b1a7975556c53c86f9f0aa5f534b98977b171857c2c9" +checksum = "9ac0150caa2ae65ca5bd83f25c7de183dea78d4d366469f148435e2acfbad0da" [[package]] name = "bytestring" -version = "1.3.0" +version = "1.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "238e4886760d98c4f899360c834fa93e62cf7f721ac3c2da375cbdf4b8679aae" +checksum = "74d80203ea6b29df88012294f62733de21cfeab47f17b41af3a38bc30a03ee72" dependencies = [ "bytes", ] @@ -772,9 +771,9 @@ dependencies = [ [[package]] name = "camino" -version = "1.1.6" +version = "1.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c59e92b5a388f549b863a7bea62612c09f24c8393560709a54558a9abdfb3b9c" +checksum = "8b96ec4966b5813e2c0507c1f86115c8c5abaadc3980879c3424042a02fd1ad3" dependencies = [ "serde", ] 
@@ -789,7 +788,7 @@ dependencies = [ "candle-kernels", "cudarc", "gemm", - "half 2.4.0", + "half", "memmap2", "num-traits", "num_cpus", @@ -818,7 +817,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b006b30f66a0d94fc9cef0ac4de6ce510565f35ae2c6c35ce5d4aacfb0fc8eeb" dependencies = [ "candle-core", - "half 2.4.0", + "half", "num-traits", "rayon", "safetensors", @@ -847,9 +846,9 @@ dependencies = [ [[package]] name = "cargo-platform" -version = "0.1.6" +version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ceed8ef69d8518a5dda55c07425450b58a4e1946f4951eab6d7191ee86c2443d" +checksum = "24b1f0365a6c6bb4020cd05806fd0d33c44d38046b8bd7f0e40814b9763cabfc" dependencies = [ "serde", ] @@ -870,9 +869,9 @@ dependencies = [ [[package]] name = "cargo_toml" -version = "0.20.3" +version = "0.20.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4895c018bb228aa6b3ba1a0285543fcb4b704734c3fb1f72afaa75aa769500c1" +checksum = "88da5a13c620b4ca0078845707ea9c3faf11edbc3ffd8497d11d686211cd1ac0" dependencies = [ "serde", "toml", @@ -886,13 +885,13 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "cc" -version = "1.0.104" +version = "1.1.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74b6a57f98764a267ff415d50a25e6e166f3831a5071af4995296ea97d210490" +checksum = "c2e7962b54006dcfcc61cb72735f4d89bb97061dd6a7ed882ec6b8ee53714c6f" dependencies = [ "jobserver", "libc", - "once_cell", + "shlex", ] [[package]] @@ -958,9 +957,9 @@ dependencies = [ [[package]] name = "ciborium" -version = "0.2.1" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "effd91f6c78e5a4ace8a5d3c0b6bfaec9e2baaef55f3efc00e45fb2e477ee926" +checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e" dependencies = [ "ciborium-io", "ciborium-ll", @@ -969,18 +968,18 @@ dependencies = [ [[package]] name = "ciborium-io" -version = "0.2.1" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cdf919175532b369853f5d5e20b26b43112613fd6fe7aee757e35f7a44642656" +checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757" [[package]] name = "ciborium-ll" -version = "0.2.1" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "defaa24ecc093c77630e6c15e17c51f5e187bf35ee514f4e2d67baaa96dae22b" +checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9" dependencies = [ "ciborium-io", - "half 1.8.2", + "half", ] [[package]] @@ -995,9 +994,9 @@ dependencies = [ [[package]] name = "clang-sys" -version = "1.7.0" +version = "1.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67523a3b4be3ce1989d607a828d036249522dd9c1c8de7f4dd2dae43a37369d1" +checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4" dependencies = [ "glob", "libc", @@ -1006,9 +1005,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.9" +version = "4.5.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64acc1846d54c1fe936a78dc189c34e28d3f5afc348403f28ecf53660b9b8462" +checksum = "b97f376d85a664d5837dbae44bf546e6477a679ff6610010f17276f686d867e8" dependencies = [ "clap_builder", "clap_derive", @@ -1016,9 +1015,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.9" +version = "4.5.20" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "6fb8393d67ba2e7bfaf28a23458e4e2b543cc73a99595511eb207fdb8aede942" +checksum = "19bc80abd44e4bed93ca373a0704ccbd1b710dc5749406201bb018272808dc54" dependencies = [ "anstream", "anstyle", @@ -1028,21 +1027,21 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.5.8" +version = "4.5.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2bac35c6dafb060fd4d275d9a4ffae97917c13a6327903a8be2153cd964f7085" +checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab" dependencies = [ - "heck 0.5.0", + "heck", "proc-macro2", "quote", - "syn 2.0.60", + "syn 2.0.85", ] [[package]] name = "clap_lex" -version = "0.7.1" +version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4b82cf0babdbd58558212896d1a4272303a57bdb245c2bf1147185fb45640e70" +checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97" [[package]] name = "color-spantrace" @@ -1058,9 +1057,9 @@ dependencies = [ [[package]] name = "colorchoice" -version = "1.0.0" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" +checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990" [[package]] name = "concat-arrays" @@ -1075,15 +1074,15 @@ dependencies = [ [[package]] name = "console" -version = "0.15.7" +version = "0.15.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c926e00cc70edefdc64d3a5ff31cc65bb97a3460097762bd23afb4d8145fccf8" +checksum = "0e1f83fc076bd6dd27517eacdf25fef6c4dfe5f1d7448bafaaf3a26f13b5e4eb" dependencies = [ "encode_unicode", "lazy_static", "libc", "unicode-width", - "windows-sys 0.45.0", + "windows-sys 0.52.0", ] [[package]] @@ -1108,9 +1107,9 @@ dependencies = [ [[package]] name = "constant_time_eq" -version = "0.3.0" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f7144d30dcf0fafbce74250a3963025d8d52177934239851c917d29f1df280c2" +checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6" [[package]] name = "convert_case" @@ -1140,15 +1139,15 @@ dependencies = [ [[package]] name = "core-foundation-sys" -version = "0.8.4" +version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" +checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" [[package]] name = "cpufeatures" -version = "0.2.12" +version = "0.2.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53fe5e26ff1b7aef8bca9c6080520cfb8d9333c7568e1829cef191a9723e5504" +checksum = "608697df725056feaccfa42cffdaeeec3fccc4ffc38358ecd19b243e716a78e0" dependencies = [ "libc", ] @@ -1308,11 +1307,11 @@ dependencies = [ [[package]] name = "cudarc" -version = "0.11.7" +version = "0.11.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56ee2a3fbbd981e1c7ea73cc2af136e754eb22d17436de37155227ee4dbe0cf4" +checksum = "7a5bd4d1eee570c3b2ac64ed114125517dd1e541d88dd28fc259f1de4dba8d60" dependencies = [ - "half 2.4.0", + "half", "libloading", ] @@ -1328,12 +1327,12 @@ dependencies = [ [[package]] name = "darling" -version = "0.20.9" +version = "0.20.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83b2eb4d90d12bdda5ed17de686c2acb4c57914f8f921b8da7e112b5a36f3fe1" +checksum = 
"6f63b86c8a8826a49b8c21f08a2d07338eec8d900540f8630dc76284be802989" dependencies = [ - "darling_core 0.20.9", - "darling_macro 0.20.9", + "darling_core 0.20.10", + "darling_macro 0.20.10", ] [[package]] @@ -1352,16 +1351,16 @@ dependencies = [ [[package]] name = "darling_core" -version = "0.20.9" +version = "0.20.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "622687fe0bac72a04e5599029151f5796111b90f1baaa9b544d807a5e31cd120" +checksum = "95133861a8032aaea082871032f5815eb9e98cef03fa916ab4500513994df9e5" dependencies = [ "fnv", "ident_case", "proc-macro2", "quote", "strsim 0.11.1", - "syn 2.0.60", + "syn 2.0.85", ] [[package]] @@ -1377,13 +1376,13 @@ dependencies = [ [[package]] name = "darling_macro" -version = "0.20.9" +version = "0.20.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "733cabb43482b1a1b53eee8583c2b9e8684d592215ea83efd305dd31bc2f0178" +checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806" dependencies = [ - "darling_core 0.20.9", + "darling_core 0.20.10", "quote", - "syn 2.0.60", + "syn 2.0.85", ] [[package]] @@ -1415,9 +1414,9 @@ dependencies = [ [[package]] name = "deflate64" -version = "0.1.8" +version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83ace6c86376be0b6cdcf3fb41882e81d94b31587573d1cfa9d01cd06bba210d" +checksum = "da692b8d1080ea3045efaab14434d40468c3d8657e42abddfffca87b428f4c1b" [[package]] name = "deranged" @@ -1437,7 +1436,7 @@ checksum = "67e77553c4162a157adbf834ebae5b415acbecbeafc7a74b0e886657506a7611" dependencies = [ "proc-macro2", "quote", - "syn 2.0.60", + "syn 2.0.85", ] [[package]] @@ -1451,11 +1450,11 @@ dependencies = [ [[package]] name = "derive_builder" -version = "0.20.0" +version = "0.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0350b5cb0331628a5916d6c5c0b72e97393b8b6b03b47a9284f4e7f5a405ffd7" +checksum = "507dfb09ea8b7fa618fcf76e953f4f5e192547945816d5358edffe39f6f94947" dependencies = [ - "derive_builder_macro 0.20.0", + "derive_builder_macro 0.20.2", ] [[package]] @@ -1472,14 +1471,14 @@ dependencies = [ [[package]] name = "derive_builder_core" -version = "0.20.0" +version = "0.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d48cda787f839151732d396ac69e3473923d54312c070ee21e9effcaa8ca0b1d" +checksum = "2d5bcf7b024d6835cfb3d473887cd966994907effbe9227e8c8219824d06c4e8" dependencies = [ - "darling 0.20.9", + "darling 0.20.10", "proc-macro2", "quote", - "syn 2.0.60", + "syn 2.0.85", ] [[package]] @@ -1494,25 +1493,25 @@ dependencies = [ [[package]] name = "derive_builder_macro" -version = "0.20.0" +version = "0.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "206868b8242f27cecce124c19fd88157fbd0dd334df2587f36417bafbc85097b" +checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c" dependencies = [ - "derive_builder_core 0.20.0", - "syn 2.0.60", + "derive_builder_core 0.20.2", + "syn 2.0.85", ] [[package]] name = "derive_more" -version = "0.99.17" +version = "0.99.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fb810d30a7c1953f91334de7244731fc3f3c10d7fe163338a35b9f640960321" +checksum = "5f33878137e4dafd7fa914ad4e259e18a4e8e532b9617a2d0150262bf53abfce" dependencies = [ "convert_case 0.4.0", "proc-macro2", "quote", "rustc_version", - "syn 1.0.109", + "syn 2.0.85", ] [[package]] @@ -1541,7 +1540,7 @@ dependencies = [ "convert_case 0.6.0", "proc-macro2", 
"quote", - "syn 2.0.60", + "syn 2.0.85", ] [[package]] @@ -1605,7 +1604,7 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.60", + "syn 2.0.85", ] [[package]] @@ -1738,9 +1737,9 @@ checksum = "a246d82be1c9d791c5dfde9a2bd045fc3cbba3fa2b11ad558f27d01712f00569" [[package]] name = "encoding_rs" -version = "0.8.33" +version = "0.8.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7268b386296a025e474d5140678f75d6de9493ae55a5d709eeb9dd08149945e1" +checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3" dependencies = [ "cfg-if", ] @@ -1756,14 +1755,14 @@ dependencies = [ [[package]] name = "enum-as-inner" -version = "0.6.0" +version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ffccbb6966c05b32ef8fbac435df276c4ae4d3dc55a8cd0eb9745e6c12f546a" +checksum = "a1e6a265c649f3f5979b601d26f1d05ada116434c87741c9493cb56218f76cbc" dependencies = [ - "heck 0.4.1", + "heck", "proc-macro2", "quote", - "syn 2.0.60", + "syn 2.0.85", ] [[package]] @@ -1783,7 +1782,7 @@ checksum = "a1ab991c1362ac86c61ab6f556cff143daa22e5a15e4e189df818b2fd19fe65b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.60", + "syn 2.0.85", ] [[package]] @@ -1794,9 +1793,9 @@ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" [[package]] name = "errno" -version = "0.3.8" +version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" +checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba" dependencies = [ "libc", "windows-sys 0.52.0", @@ -1831,9 +1830,9 @@ dependencies = [ [[package]] name = "fastrand" -version = "2.1.0" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a" +checksum = "e8c02a5121d4ea3eb16a80748c74f5549a5665e4c21333c6098f283870fbdea6" [[package]] name = "file-store" @@ -1847,14 +1846,14 @@ dependencies = [ [[package]] name = "filetime" -version = "0.2.22" +version = "0.2.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4029edd3e734da6fe05b6cd7bd2960760a616bd2ddd0d59a0124746d6272af0" +checksum = "35c0522e981e68cbfa8c3f978441a5f34b30b96e146b33cd3359176b50fe8586" dependencies = [ "cfg-if", "libc", - "redox_syscall 0.3.5", - "windows-sys 0.48.0", + "libredox", + "windows-sys 0.59.0", ] [[package]] @@ -1869,9 +1868,9 @@ dependencies = [ [[package]] name = "flate2" -version = "1.0.30" +version = "1.0.34" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f54427cfd1c7829e2a139fcefea601bf088ebca651d2bf53ebc600eac295dae" +checksum = "a1b589b4dc103969ad3cf85c950899926ec64300a1a46d76c03a6072957036f0" dependencies = [ "crc32fast", "miniz_oxide", @@ -1920,9 +1919,9 @@ checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c" [[package]] name = "futures" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "645c6916888f6cb6350d2550b80fb63e734897a8498abe35cfb732b6487804b0" +checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876" dependencies = [ "futures-channel", "futures-core", @@ -1935,9 +1934,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.30" +version = "0.3.31" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78" +checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10" dependencies = [ "futures-core", "futures-sink", @@ -1945,15 +1944,15 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d" +checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" [[package]] name = "futures-executor" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a576fc72ae164fca6b9db127eaa9a9dda0d61316034f33a0a0d4eda41f02b01d" +checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f" dependencies = [ "futures-core", "futures-task", @@ -1962,38 +1961,38 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1" +checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" [[package]] name = "futures-macro" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" +checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", - "syn 2.0.60", + "syn 2.0.85", ] [[package]] name = "futures-sink" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5" +checksum = "e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7" [[package]] name = "futures-task" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004" +checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988" [[package]] name = "futures-util" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48" +checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81" dependencies = [ "futures-channel", "futures-core", @@ -2101,7 +2100,7 @@ checksum = "a2e7ea062c987abcd8db95db917b4ffb4ecdfd0668471d8dc54734fdff2354e8" dependencies = [ "bytemuck", "dyn-stack", - "half 2.4.0", + "half", "num-complex", "num-traits", "once_cell", @@ -2122,7 +2121,7 @@ dependencies = [ "dyn-stack", "gemm-common", "gemm-f32", - "half 2.4.0", + "half", "num-complex", "num-traits", "paste", @@ -2179,9 +2178,9 @@ checksum = "36d244a08113319b5ebcabad2b8b7925732d15eec46d7e7ac3c11734f3b7a6ad" [[package]] name = "getrandom" -version = "0.2.14" +version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94b22e06ecb0110981051723910cbf0b5f5e09a2062dd7663334ee79a9d1286c" +checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" dependencies = [ "cfg-if", "js-sys", @@ -2190,23 +2189,11 @@ dependencies = [ "wasm-bindgen", ] -[[package]] -name = "getset" -version = "0.1.2" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "e45727250e75cc04ff2846a66397da8ef2b3db8e40e0cef4df67950a07621eb9" -dependencies = [ - "proc-macro-error", - "proc-macro2", - "quote", - "syn 1.0.109", -] - [[package]] name = "gimli" -version = "0.27.3" +version = "0.31.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6c80984affa11d98d1b88b66ac8853f143217b399d3c74116778ff8fdb4ed2e" +checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f" [[package]] name = "git2" @@ -2250,7 +2237,7 @@ dependencies = [ "futures-core", "futures-sink", "futures-util", - "http 0.2.11", + "http 0.2.12", "indexmap", "slab", "tokio", @@ -2260,9 +2247,9 @@ dependencies = [ [[package]] name = "h2" -version = "0.4.5" +version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa82e28a107a8cc405f0839610bdc9b15f1e25ec7d696aa5cf173edbcb1486ab" +checksum = "524e8ac6999421f49a846c2d4411f337e53497d8ec55d67753beffa43c5d9205" dependencies = [ "atomic-waker", "bytes", @@ -2279,15 +2266,9 @@ dependencies = [ [[package]] name = "half" -version = "1.8.2" +version = "2.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" - -[[package]] -name = "half" -version = "2.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5eceaaeec696539ddaf7b333340f1af35a5aa87ae3e4f3ead0532f72affab2e" +checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888" dependencies = [ "bytemuck", "cfg-if", @@ -2347,12 +2328,6 @@ dependencies = [ "stable_deref_trait", ] -[[package]] -name = "heck" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" - [[package]] name = "heck" version = "0.5.0" @@ -2361,9 +2336,9 @@ checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" [[package]] name = "heed" -version = "0.20.3" +version = "0.20.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2bc30da4a93ff8cb98e535d595d6de42731d4719d707bc1c86f579158751a24e" +checksum = "7d4f449bab7320c56003d37732a917e18798e2f1709d80263face2b4f9436ddb" dependencies = [ "bitflags 2.6.0", "byteorder", @@ -2402,6 +2377,12 @@ version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024" +[[package]] +name = "hermit-abi" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbf6a919d6cf397374f7dfeeea91d974c7c0a7221d0d0f4f20d859d329e53fcc" + [[package]] name = "hex" version = "0.4.3" @@ -2435,9 +2416,9 @@ dependencies = [ [[package]] name = "http" -version = "0.2.11" +version = "0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8947b1a6fad4393052c7ba1f4cd97bed3e953a95c79c92ad9b051a04611d9fbb" +checksum = "601cbb57e577e2f5ef5be8e7b83f0f63994f25aa94d673e54a92d5c516d101f1" dependencies = [ "bytes", "fnv", @@ -2457,9 +2438,9 @@ dependencies = [ [[package]] name = "http-body" -version = "1.0.0" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cac85db508abc24a2e48553ba12a996e87244a0395ce011e62b37158745d643" +checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" dependencies = [ "bytes", "http 1.1.0", @@ -2480,26 +2461,26 @@ dependencies = 
[ [[package]] name = "httparse" -version = "1.8.0" +version = "1.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d897f394bad6a705d5f4104762e116a75639e470d80901eed05a860a95cb1904" +checksum = "7d71d3574edd2771538b901e6549113b4006ece66150fb69c0fb6d9a2adae946" [[package]] name = "httpdate" -version = "1.0.2" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4a1e36c821dbe04574f602848a19f742f4fb3c98d40449f11bcad18d6b17421" +checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" [[package]] name = "hyper" -version = "1.4.1" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50dfd22e0e76d0f662d429a5f80fcaf3855009297eab6a0a9f8543834744ba05" +checksum = "bbbff0a806a4728c99295b254c8838933b5b082d75e3cb70c8dab21fdfbcfa9a" dependencies = [ "bytes", "futures-channel", "futures-util", - "h2 0.4.5", + "h2 0.4.6", "http 1.1.0", "http-body", "httparse", @@ -2513,9 +2494,9 @@ dependencies = [ [[package]] name = "hyper-rustls" -version = "0.27.2" +version = "0.27.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ee4be2c948921a1a5320b629c4193916ed787a7f7f293fd3f7f5a6c9de74155" +checksum = "08afdbb5c31130e3034af566421053ab03787c640246a446327f550d11bcb333" dependencies = [ "futures-util", "http 1.1.0", @@ -2531,9 +2512,9 @@ dependencies = [ [[package]] name = "hyper-util" -version = "0.1.6" +version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ab92f4f49ee4fb4f997c784b7a2e0fa70050211e0b6a287f898c3c9785ca956" +checksum = "df2dcfbe0677734ab2f3ffa7fa7bfd4706bfdc1ef393f2ee30184aed67e631b4" dependencies = [ "bytes", "futures-channel", @@ -2542,9 +2523,8 @@ dependencies = [ "http-body", "hyper", "pin-project-lite", - "socket2 0.5.5", + "socket2", "tokio", - "tower", "tower-service", "tracing", ] @@ -2567,9 +2547,9 @@ dependencies = [ [[package]] name = "impl-more" -version = "0.1.6" +version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "206ca75c9c03ba3d4ace2460e57b189f39f43de612c2f85836e65c929701bb2d" +checksum = "aae21c3177a27788957044151cc2800043d127acaa460a47ebb9b84dfa2c6aa0" [[package]] name = "index-scheduler" @@ -2582,7 +2562,7 @@ dependencies = [ "bumpalo", "crossbeam", "csv", - "derive_builder 0.20.0", + "derive_builder 0.20.2", "dump", "enum-iterator", "file-store", @@ -2609,20 +2589,20 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.2.6" +version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "168fb715dda47215e360912c096649d23d58bf392ac62f73919e831745e40f26" +checksum = "707907fe3c25f5424cce2cb7e1cbcafee6bdbe735ca90ef77c29e84591e5b9da" dependencies = [ "equivalent", - "hashbrown 0.14.5", + "hashbrown 0.15.0", "serde", ] [[package]] name = "indicatif" -version = "0.17.7" +version = "0.17.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb28741c9db9a713d93deb3bb9515c20788cef5815265bee4980e87bde7e0f25" +checksum = "763a5a8f45087d6bcea4222e7b72c291a054edf80e4ef6efd2a4979878c7bea3" dependencies = [ "console", "instant", @@ -2657,18 +2637,18 @@ dependencies = [ [[package]] name = "instant" -version = "0.1.12" +version = "0.1.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c" +checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222" dependencies = 
[ "cfg-if", ] [[package]] name = "ipnet" -version = "2.8.0" +version = "2.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28b29a3cd74f0f4598934efe3aeba42bae0eb4680554128851ebbecb02af14e6" +checksum = "ddc24109865250148c2e0f3d25d4f0f479571723792d3802153c60922a4fb708" [[package]] name = "irg-kvariants" @@ -2682,15 +2662,21 @@ dependencies = [ [[package]] name = "is-terminal" -version = "0.4.12" +version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f23ff5ef2b80d608d61efee834934d862cd92461afc0560dedf493e4c033738b" +checksum = "261f68e344040fbd0edea105bef17c66edf46f984ddb1115b775ce31be948f4b" dependencies = [ - "hermit-abi", + "hermit-abi 0.4.0", "libc", "windows-sys 0.52.0", ] +[[package]] +name = "is_terminal_polyfill" +version = "1.70.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" + [[package]] name = "itertools" version = "0.10.5" @@ -2740,7 +2726,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c1e2b0210dc78b49337af9e49d7ae41a39dceac6e5985613f1cf7763e2f76a25" dependencies = [ "cedarwood", - "derive_builder 0.20.0", + "derive_builder 0.20.2", "fxhash", "lazy_static", "phf", @@ -2750,18 +2736,18 @@ dependencies = [ [[package]] name = "jobserver" -version = "0.1.31" +version = "0.1.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2b099aaa34a9751c5bf0878add70444e1ed2dd73f347be99003d4577277de6e" +checksum = "48d1dbcbbeb6a7fec7e059840aa538bd62aaccf972c7346c4d9d2059312853d0" dependencies = [ "libc", ] [[package]] name = "js-sys" -version = "0.3.69" +version = "0.3.72" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d" +checksum = "6a88f1bda2bd75b0452a14784937d796722fdebfe50df998aeb3f0b7603019a9" dependencies = [ "wasm-bindgen", ] @@ -2800,9 +2786,9 @@ dependencies = [ [[package]] name = "kstring" -version = "2.0.0" +version = "2.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec3066350882a1cd6d950d055997f379ac37fd39f81cd4d8ed186032eb3c5747" +checksum = "558bf9508a558512042d3095138b1f7b8fe90c5467d94f9f1da28b3731c5dbd1" dependencies = [ "serde", "static_assertions", @@ -2820,12 +2806,6 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" -[[package]] -name = "lazycell" -version = "1.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" - [[package]] name = "levenshtein_automata" version = "0.2.1" @@ -2837,9 +2817,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.155" +version = "0.2.161" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c" +checksum = "8e9489c2807c139ffd9c1794f4af0ebe86a828db53ecdc7fea2111d0fed085d1" [[package]] name = "libgit2-sys" @@ -2855,19 +2835,19 @@ dependencies = [ [[package]] name = "libloading" -version = "0.8.4" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e310b3a6b5907f99202fcdb4960ff45b93735d7c7d96b760fcff8db2dc0e103d" +checksum = "4979f22fdb869068da03c9f7528f8297c6fd2606bc3a4affe42e6a823fdb8da4" dependencies = [ "cfg-if", - "windows-targets 
0.52.4", + "windows-targets 0.48.5", ] [[package]] name = "libm" -version = "0.2.8" +version = "0.2.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" +checksum = "8355be11b20d696c8f18f6cc018c4e372165b1fa8126cef092399c9951984ffa" [[package]] name = "libmimalloc-sys" @@ -2881,9 +2861,9 @@ dependencies = [ [[package]] name = "libproc" -version = "0.14.8" +version = "0.14.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae9ea4b75e1a81675429dafe43441df1caea70081e82246a8cccf514884a88bb" +checksum = "e78a09b56be5adbcad5aa1197371688dc6bb249a26da3bca2011ee2fb987ebfb" dependencies = [ "bindgen", "errno", @@ -2891,10 +2871,21 @@ dependencies = [ ] [[package]] -name = "libz-sys" -version = "1.1.15" +name = "libredox" +version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "037731f5d3aaa87a5675e895b63ddff1a87624bc29f77004ea829809654e48f6" +checksum = "c0ff37bd590ca25063e35af745c343cb7a0271906fb7b37e4813e8f79f00268d" +dependencies = [ + "bitflags 2.6.0", + "libc", + "redox_syscall", +] + +[[package]] +name = "libz-sys" +version = "1.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2d16453e800a8cf6dd2fc3eb4bc99b786a9b90c663b8559a5b1a041bf89e472" dependencies = [ "cc", "libc", @@ -3058,7 +3049,7 @@ dependencies = [ "bincode", "byteorder", "csv", - "derive_builder 0.20.0", + "derive_builder 0.20.2", "encoding", "encoding_rs", "encoding_rs_io", @@ -3223,15 +3214,15 @@ checksum = "0717cef1bc8b636c6e1c1bbdefc09e6322da8a9321966e8928ef80d20f7f770f" [[package]] name = "linux-raw-sys" -version = "0.4.12" +version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4cd1a83af159aa67994778be9070f0ae1bd732942279cabb14f86f986a21456" +checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" [[package]] name = "liquid" -version = "0.26.6" +version = "0.26.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "10929f201279ba14da3297b957dcda1e0bf7a6f3bb5115688be684aa8864e9cc" +checksum = "7cdcc72b82748f47c2933c172313f5a9aea5b2c4eb3fa4c66b4ea55bb60bb4b1" dependencies = [ "doc-comment", "liquid-core", @@ -3242,12 +3233,12 @@ dependencies = [ [[package]] name = "liquid-core" -version = "0.26.6" +version = "0.26.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3aef4b2160791f456eb880c990a97746f693746f92302ef5f1d06111cf14b768" +checksum = "2752e978ffc53670f3f2e8b3ef09f348d6f7b5474a3be3f8a5befe5382e4effb" dependencies = [ "anymap2", - "itertools 0.12.1", + "itertools 0.13.0", "kstring", "liquid-derive", "num-traits", @@ -3260,22 +3251,22 @@ dependencies = [ [[package]] name = "liquid-derive" -version = "0.26.5" +version = "0.26.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "915f6d0a2963a27cd5205c1902f32ddfe3bc035816afd268cf88c0fc0f8d287e" +checksum = "3b51f1d220e3fa869e24cfd75915efe3164bd09bb11b3165db3f37f57bf673e3" dependencies = [ "proc-macro2", "quote", - "syn 2.0.60", + "syn 2.0.85", ] [[package]] name = "liquid-lib" -version = "0.26.6" +version = "0.26.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73f48fc446873f74d869582f5c4b8cbf3248c93395e410a67af5809b3731e44a" +checksum = "59b1a298d3d2287ee5b1e43840d885b8fdfc37d3f4e90d82aacfd04d021618da" dependencies = [ - "itertools 0.12.1", + "itertools 0.13.0", "liquid-core", "once_cell", 
"percent-encoding", @@ -3286,9 +3277,9 @@ dependencies = [ [[package]] name = "lmdb-master-sys" -version = "0.2.2" +version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57640c190703d5ccf4a86aff4aeb749b2d287a8cb1723c76b51f39d77ab53b24" +checksum = "472c3760e2a8d0f61f322fb36788021bb36d573c502b50fa3e2bcaac3ec326c9" dependencies = [ "cc", "doxygen-rs", @@ -3297,27 +3288,26 @@ dependencies = [ [[package]] name = "local-channel" -version = "0.1.3" +version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f303ec0e94c6c54447f84f3b0ef7af769858a9c4ef56ef2a986d3dcd4c3fc9c" +checksum = "b6cbc85e69b8df4b8bb8b89ec634e7189099cea8927a276b7384ce5488e53ec8" dependencies = [ "futures-core", "futures-sink", - "futures-util", "local-waker", ] [[package]] name = "local-waker" -version = "0.1.3" +version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e34f76eb3611940e0e7d53a9aaa4e6a3151f69541a282fd0dad5571420c53ff1" +checksum = "4d873d7c67ce09b42110d801813efbc9364414e356be9935700d368351657487" [[package]] name = "lock_api" -version = "0.4.11" +version = "0.4.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c168f8615b12bc01f9c17e2eb0cc07dcae1940121185446edc3744920e8ef45" +checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17" dependencies = [ "autocfg", "scopeguard", @@ -3331,9 +3321,9 @@ checksum = "9374ef4228402d4b7e403e5838cb880d9ee663314b0a900d5a6aabf0c213552e" [[package]] name = "log" -version = "0.4.21" +version = "0.4.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c" +checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" [[package]] name = "lzma-rs" @@ -3370,7 +3360,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.60", + "syn 2.0.85", ] [[package]] @@ -3480,7 +3470,7 @@ dependencies = [ "uuid", "wiremock", "yaup", - "zip 2.1.3", + "zip 2.2.0", ] [[package]] @@ -3555,9 +3545,9 @@ checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" [[package]] name = "memmap2" -version = "0.9.4" +version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe751422e4a8caa417e13c3ea66452215d7d63e19e604f4980461212f3ae1322" +checksum = "fd3f7eed9d3848f8b98834af67102b720745c4ec028fcd0aa0239277e7de374f" dependencies = [ "libc", "stable_deref_trait", @@ -3567,6 +3557,7 @@ dependencies = [ name = "milli" version = "1.11.0" dependencies = [ + "allocator-api2", "arroy", "big_s", "bimap", @@ -3590,7 +3581,7 @@ dependencies = [ "fxhash", "geoutils", "grenad", - "hashbrown 0.14.5", + "hashbrown 0.15.0", "heed", "hf-hub", "indexmap", @@ -3615,6 +3606,7 @@ dependencies = [ "rhai", "roaring", "rstar", + "rustc-hash 2.0.0", "serde", "serde_json", "slice-group-by", @@ -3650,9 +3642,9 @@ checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" [[package]] name = "mime_guess" -version = "2.0.4" +version = "2.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4192263c238a5f0d0c6bfd21f336a313a4ce1c450542449ca191bb657b4642ef" +checksum = "f7c44f8e672c00fe5308fa235f821cb4198414e1c77935c1ab6948d3fd78550e" dependencies = [ "mime", "unicase", @@ -3666,30 +3658,31 @@ checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" [[package]] name = "miniz_oxide" -version = "0.7.2" +version = "0.8.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d811f3e15f28568be3407c8e7fdb6514c1cda3cb30683f15b6a1a1dc4ea14a7" +checksum = "e2d80299ef12ff69b16a84bb182e3b9df68b5a91574d3d4fa6e41b65deec4df1" dependencies = [ - "adler", + "adler2", ] [[package]] name = "mio" -version = "0.8.11" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c" +checksum = "80e04d1dcff3aae0704555fe5fee3bcfaf3d1fdf8a7e521d5b9d2b42acb52cec" dependencies = [ + "hermit-abi 0.3.9", "libc", "log", "wasi", - "windows-sys 0.48.0", + "windows-sys 0.52.0", ] [[package]] name = "monostate" -version = "0.1.9" +version = "0.1.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "15f370ae88093ec6b11a710dec51321a61d420fafd1bad6e30d01bd9c920e8ee" +checksum = "0d208407d7552cd041d8cdb69a1bc3303e029c598738177a3d87082004dc0e1e" dependencies = [ "monostate-impl", "serde", @@ -3697,20 +3690,20 @@ dependencies = [ [[package]] name = "monostate-impl" -version = "0.1.9" +version = "0.1.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "371717c0a5543d6a800cac822eac735aa7d2d2fbb41002e9856a4089532dbdce" +checksum = "a7ce64b975ed4f123575d11afd9491f2e37bbd5813fbfbc0f09ae1fbddea74e0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.60", + "syn 2.0.85", ] [[package]] name = "mutually_exclusive_features" -version = "0.0.3" +version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d02c0b00610773bb7fc61d85e13d86c7858cbdf00e1a120bfc41bc055dbaa0e" +checksum = "e94e1e6445d314f972ff7395df2de295fe51b71821694f0b0e1e79c4f12c8577" [[package]] name = "nohash" @@ -3760,20 +3753,19 @@ dependencies = [ [[package]] name = "num-bigint" -version = "0.4.3" +version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f93ab6289c7b344a8a9f60f88d80aa20032336fe78da341afc91c8a2341fc75f" +checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" dependencies = [ - "autocfg", "num-integer", "num-traits", ] [[package]] name = "num-complex" -version = "0.4.5" +version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23c6602fda94a57c990fe0df199a035d83576b496aa29f4e634a8ac6004e68a6" +checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495" dependencies = [ "bytemuck", "num-traits", @@ -3787,19 +3779,18 @@ checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" [[package]] name = "num-integer" -version = "0.1.45" +version = "0.1.46" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "225d3389fb3509a24c93f5c29eb6bde2586b98d9f016636dff58d7c6f7569cd9" +checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" dependencies = [ - "autocfg", "num-traits", ] [[package]] name = "num-traits" -version = "0.2.18" +version = "0.2.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da0df0e5185db44f69b44f26786fe401b6c293d1907744beaa7fa62b2e5a517a" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" dependencies = [ "autocfg", "libm", @@ -3811,29 +3802,29 @@ version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" dependencies = [ - "hermit-abi", + "hermit-abi 0.3.9", "libc", ] [[package]] name = "num_enum" -version = "0.7.2" 
+version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02339744ee7253741199f897151b38e72257d13802d4ee837285cc2990a90845" +checksum = "4e613fc340b2220f734a8595782c551f1250e969d87d3be1ae0579e8d4065179" dependencies = [ "num_enum_derive", ] [[package]] name = "num_enum_derive" -version = "0.7.2" +version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "681030a937600a36906c185595136d26abfebb4aa9c65701cefcaf8578bb982b" +checksum = "af1844ef2428cc3e1cb900be36181049ef3d3193c63e43026cfe202983b27a56" dependencies = [ "proc-macro-crate", "proc-macro2", "quote", - "syn 2.0.60", + "syn 2.0.85", ] [[package]] @@ -3853,9 +3844,9 @@ checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" [[package]] name = "object" -version = "0.31.1" +version = "0.36.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8bda667d9f2b5051b8833f59f3bf748b28ef54f850f4fcb389a252aa383866d1" +checksum = "aedf0a2d09c573ed1d8d85b30c119153926a2b36dce0ab28322c09a117a4683e" dependencies = [ "memchr", ] @@ -3867,9 +3858,9 @@ source = "git+https://github.com/kerollmops/obkv?branch=unsized-kvreader#ce53587 [[package]] name = "once_cell" -version = "1.19.0" +version = "1.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" +checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" [[package]] name = "onig" @@ -3895,9 +3886,9 @@ dependencies = [ [[package]] name = "oorandom" -version = "11.1.3" +version = "11.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" +checksum = "b410bbe7e14ab526a0e86877eb47c6996a2bd7746f027ba551028c925390e4e9" [[package]] name = "option-ext" @@ -3907,9 +3898,9 @@ checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" [[package]] name = "ordered-float" -version = "4.2.1" +version = "4.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19ff2cf528c6c03d9ed653d6c4ce1dc0582dc4af309790ad92f07c1cd551b0be" +checksum = "83e7ccb95e240b7c9506a3d544f10d935e142cc90b0a1d56954fb44d89ad6b97" dependencies = [ "num-traits", ] @@ -3948,22 +3939,22 @@ dependencies = [ [[package]] name = "parking_lot_core" -version = "0.9.8" +version = "0.9.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93f00c865fe7cabf650081affecd3871070f26767e7b2070a3ffae14c654b447" +checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8" dependencies = [ "cfg-if", "libc", - "redox_syscall 0.3.5", + "redox_syscall", "smallvec", - "windows-targets 0.48.1", + "windows-targets 0.52.6", ] [[package]] name = "paste" -version = "1.0.14" +version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c" +checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" [[package]] name = "path-matchers" @@ -3992,11 +3983,11 @@ dependencies = [ [[package]] name = "pem" -version = "3.0.3" +version = "3.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b8fcc794035347fb64beda2d3b462595dd2753e3f268d89c5aae77e8cf2c310" +checksum = "8e459365e590736a54c3fa561947c84837534b8e9af6fc5bf781307e82658fae" dependencies = [ - "base64 0.21.7", + "base64 0.22.1", "serde", ] @@ -4016,19 +4007,20 @@ 
dependencies = [ [[package]] name = "pest" -version = "2.7.2" +version = "2.7.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1acb4a4365a13f749a93f1a094a7805e5cfa0955373a9de860d962eaa3a5fe5a" +checksum = "879952a81a83930934cbf1786752d6dedc3b1f29e8f8fb2ad1d0a36f377cf442" dependencies = [ + "memchr", "thiserror", "ucd-trie", ] [[package]] name = "pest_derive" -version = "2.7.2" +version = "2.7.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "666d00490d4ac815001da55838c500eafb0320019bbaa44444137c48b443a853" +checksum = "d214365f632b123a47fd913301e14c946c61d1c183ee245fa76eb752e59a02dd" dependencies = [ "pest", "pest_generator", @@ -4036,22 +4028,22 @@ dependencies = [ [[package]] name = "pest_generator" -version = "2.7.2" +version = "2.7.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68ca01446f50dbda87c1786af8770d535423fa8a53aec03b8f4e3d7eb10e0929" +checksum = "eb55586734301717aea2ac313f50b2eb8f60d2fc3dc01d190eefa2e625f60c4e" dependencies = [ "pest", "pest_meta", "proc-macro2", "quote", - "syn 2.0.60", + "syn 2.0.85", ] [[package]] name = "pest_meta" -version = "2.7.2" +version = "2.7.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56af0a30af74d0445c0bf6d9d051c979b516a1a5af790d251daee76005420a48" +checksum = "b75da2a70cf4d9cb76833c990ac9cd3923c9a8905a8929789ce347c84564d03d" dependencies = [ "once_cell", "pest", @@ -4098,7 +4090,7 @@ dependencies = [ "phf_shared", "proc-macro2", "quote", - "syn 2.0.60", + "syn 2.0.85", ] [[package]] @@ -4112,29 +4104,29 @@ dependencies = [ [[package]] name = "pin-project" -version = "1.1.4" +version = "1.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0302c4a0442c456bd56f841aee5c3bfd17967563f6fadc9ceb9f9c23cf3807e0" +checksum = "be57f64e946e500c8ee36ef6331845d40a93055567ec57e8fae13efd33759b95" dependencies = [ "pin-project-internal", ] [[package]] name = "pin-project-internal" -version = "1.1.4" +version = "1.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "266c042b60c9c76b8d53061e52b2e0d1116abc57cefc8c5cd671619a56ac3690" +checksum = "3c0f5fad0874fc7abcd4d750e76917eaebbecaa2c20bde22e1dbeeba8beb758c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.60", + "syn 2.0.85", ] [[package]] name = "pin-project-lite" -version = "0.2.14" +version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bda66fc9667c18cb2758a2ac84d1167245054bcf85d5d1aaa6923f45801bdd02" +checksum = "915a1e146535de9163f3987b8944ed8cf49a18bb0056bcebcdcece385cece4ff" [[package]] name = "pin-utils" @@ -4150,9 +4142,9 @@ checksum = "16f2611cd06a1ac239a0cea4521de9eb068a6ca110324ee00631aa68daa74fc0" [[package]] name = "pkg-config" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec" +checksum = "953ec861398dccce10c670dfeaf3ec4911ca479e9c02154b3a215178c5f566f2" [[package]] name = "platform-dirs" @@ -4165,9 +4157,9 @@ dependencies = [ [[package]] name = "plotters" -version = "0.3.5" +version = "0.3.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2c224ba00d7cadd4d5c660deaf2098e5e80e07846537c51f9cfa4be50c1fd45" +checksum = "5aeb6f403d7a4911efb1e33402027fc44f29b5bf6def3effcc22d7bb75f2b747" dependencies = [ "num-traits", "plotters-backend", @@ -4178,24 +4170,24 @@ dependencies = [ [[package]] name = 
"plotters-backend" -version = "0.3.5" +version = "0.3.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e76628b4d3a7581389a35d5b6e2139607ad7c75b17aed325f210aa91f4a9609" +checksum = "df42e13c12958a16b3f7f4386b9ab1f3e7933914ecea48da7139435263a4172a" [[package]] name = "plotters-svg" -version = "0.3.5" +version = "0.3.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38f6d39893cca0701371e3c27294f09797214b86f1fb951b89ade8ec04e2abab" +checksum = "51bae2ac328883f7acdfea3d66a7c35751187f870bc81f94563733a154d7a670" dependencies = [ "plotters-backend", ] [[package]] name = "portable-atomic" -version = "1.5.1" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3bccab0e7fd7cc19f820a1c8c91720af652d0c88dc9664dd72aef2614f04af3b" +checksum = "cc9c68a3f6da06753e9335d63e27f6b9754dd1920d941135b7ea8224f141adb2" [[package]] name = "powerfmt" @@ -4205,17 +4197,20 @@ checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" [[package]] name = "ppv-lite86" -version = "0.2.17" +version = "0.2.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" +checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04" +dependencies = [ + "zerocopy", +] [[package]] name = "proc-macro-crate" -version = "3.1.0" +version = "3.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d37c51ca738a55da99dc0c4a34860fd675453b8b36209178c2249bb13651284" +checksum = "8ecf48c7ca261d60b74ab1a7b20da18bede46776b2e55535cb958eb595c5fa7b" dependencies = [ - "toml_edit 0.21.0", + "toml_edit", ] [[package]] @@ -4227,7 +4222,6 @@ dependencies = [ "proc-macro-error-attr", "proc-macro2", "quote", - "syn 1.0.109", "version_check", ] @@ -4244,9 +4238,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.81" +version = "1.0.89" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d1597b0c024618f09a9c3b8655b7e430397a36d23fdafec26d6965e9eec3eba" +checksum = "f139b0662de085916d1fb67d2b4169d1addddda1919e696f3252b740b629986e" dependencies = [ "unicode-ident", ] @@ -4319,9 +4313,9 @@ dependencies = [ [[package]] name = "pulp" -version = "0.18.9" +version = "0.18.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03457ac216146f43f921500bac4e892d5cd32b0479b929cbfc90f95cd6c599c2" +checksum = "a0a01a0dc67cf4558d279f0c25b0962bd08fc6dec0137699eae304103e882fe6" dependencies = [ "bytemuck", "libm", @@ -4331,16 +4325,17 @@ dependencies = [ [[package]] name = "quinn" -version = "0.11.2" +version = "0.11.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e4ceeeeabace7857413798eb1ffa1e9c905a9946a57d81fb69b4b71c4d8eb3ad" +checksum = "8c7c5fdde3cdae7203427dc4f0a68fe0ed09833edc525a03456b153b79828684" dependencies = [ "bytes", "pin-project-lite", "quinn-proto", "quinn-udp", - "rustc-hash", + "rustc-hash 2.0.0", "rustls", + "socket2", "thiserror", "tokio", "tracing", @@ -4348,14 +4343,14 @@ dependencies = [ [[package]] name = "quinn-proto" -version = "0.11.3" +version = "0.11.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ddf517c03a109db8100448a4be38d498df8a210a99fe0e1b9eaf39e78c640efe" +checksum = "fadfaed2cd7f389d0161bb73eeb07b7b78f8691047a6f3e73caaeae55310a4a6" dependencies = [ "bytes", "rand", "ring", - "rustc-hash", + "rustc-hash 2.0.0", "rustls", "slab", "thiserror", @@ 
-4365,22 +4360,23 @@ dependencies = [ [[package]] name = "quinn-udp" -version = "0.5.2" +version = "0.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9096629c45860fc7fb143e125eb826b5e721e10be3263160c7d60ca832cf8c46" +checksum = "e346e016eacfff12233c243718197ca12f148c84e1e84268a896699b41c71780" dependencies = [ + "cfg_aliases", "libc", "once_cell", - "socket2 0.5.5", + "socket2", "tracing", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] name = "quote" -version = "1.0.36" +version = "1.0.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" +checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" dependencies = [ "proc-macro2", ] @@ -4434,7 +4430,7 @@ dependencies = [ [[package]] name = "raw-collections" version = "0.1.0" -source = "git+https://github.com/dureuill/raw-collections.git#0ecd143c1707d237e3c4d749bc685418da2fccc2" +source = "git+https://github.com/dureuill/raw-collections.git#4ab9619207632c20f4e0c2e126d9d909cc58ef65" dependencies = [ "allocator-api2", "bumpalo", @@ -4500,38 +4496,29 @@ checksum = "03251193000f4bd3b042892be858ee50e8b3719f2b08e5833ac4353724632430" [[package]] name = "redox_syscall" -version = "0.2.16" +version = "0.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a" +checksum = "9b6dfecf2c74bce2466cabf93f6664d6998a69eb21e39f4207930065b27b771f" dependencies = [ - "bitflags 1.3.2", -] - -[[package]] -name = "redox_syscall" -version = "0.3.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29" -dependencies = [ - "bitflags 1.3.2", + "bitflags 2.6.0", ] [[package]] name = "redox_users" -version = "0.4.3" +version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b033d837a7cf162d7993aded9304e30a83213c648b6e389db233191f891e5c2b" +checksum = "ba009ff324d1fc1b900bd1fdb31564febe58a8ccc8a6fdbb93b543d33b13ca43" dependencies = [ "getrandom", - "redox_syscall 0.2.16", + "libredox", "thiserror", ] [[package]] name = "regex" -version = "1.10.5" +version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b91213439dad192326a0d7c6ee3955910425f441d7038e0d6933b0aec5c4517f" +checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" dependencies = [ "aho-corasick", "memchr", @@ -4541,9 +4528,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.6" +version = "0.4.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86b83b8b9847f9bf95ef68afb0b8e6cdb80f498442f5179a29fad448fcc1eaea" +checksum = "368758f23274712b504848e9d5a6f010445cc8b87a7cdb4d7cbee666c1288da3" dependencies = [ "aho-corasick", "memchr", @@ -4558,9 +4545,9 @@ checksum = "53a49587ad06b26609c52e423de037e7f57f20d53535d66e08c695f347df952a" [[package]] name = "regex-syntax" -version = "0.8.2" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" +checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" [[package]] name = "rend" @@ -4573,9 +4560,9 @@ dependencies = [ [[package]] name = "reqwest" -version = "0.12.5" +version = "0.12.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"c7d6d2a27d57148378eb5e111173f4276ad26340ecc5c49a4a2152167a2d6a37" +checksum = "a77c62af46e79de0a562e1a9849205ffcb7fc1238876e9bd743357570e04046f" dependencies = [ "base64 0.22.1", "bytes", @@ -4613,7 +4600,7 @@ dependencies = [ "wasm-streams", "web-sys", "webpki-roots", - "winreg", + "windows-registry", ] [[package]] @@ -4640,7 +4627,7 @@ source = "git+https://github.com/rhaiscript/rhai?rev=ef3df63121d27aacd838f366f2b dependencies = [ "proc-macro2", "quote", - "syn 2.0.60", + "syn 2.0.85", ] [[package]] @@ -4660,9 +4647,9 @@ dependencies = [ [[package]] name = "rkyv" -version = "0.7.44" +version = "0.7.45" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5cba464629b3394fc4dbc6f940ff8f5b4ff5c7aef40f29166fd4ad12acbc99c0" +checksum = "9008cd6385b9e161d8229e1f6549dd23c3d022f132a2ea37ac3a10ac4935779b" dependencies = [ "bitvec", "bytecheck", @@ -4678,9 +4665,9 @@ dependencies = [ [[package]] name = "rkyv_derive" -version = "0.7.44" +version = "0.7.45" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7dddfff8de25e6f62b9d64e6e432bf1c6736c57d20323e15ee10435fbda7c65" +checksum = "503d1d27590a2b0a3a4ca4c94755aa2875657196ecbf401a42eff41d7de532c0" dependencies = [ "proc-macro2", "quote", @@ -4711,9 +4698,9 @@ dependencies = [ [[package]] name = "rust_decimal" -version = "1.35.0" +version = "1.36.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1790d1c4c0ca81211399e0e0af16333276f375209e71a37b67698a373db5b47a" +checksum = "b082d80e3e3cc52b2ed634388d436fe1f4de6af5786cc2de9ba9737527bdf555" dependencies = [ "arrayvec", "borsh", @@ -4727,9 +4714,9 @@ dependencies = [ [[package]] name = "rustc-demangle" -version = "0.1.23" +version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" +checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" [[package]] name = "rustc-hash" @@ -4738,19 +4725,25 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" [[package]] -name = "rustc_version" -version = "0.4.0" +name = "rustc-hash" +version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" +checksum = "583034fd73374156e66797ed8e5b0d5690409c9226b22d87cb7f19821c05d152" + +[[package]] +name = "rustc_version" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" dependencies = [ "semver", ] [[package]] name = "rustix" -version = "0.38.31" +version = "0.38.38" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ea3e1a662af26cd7a3ba09c0297a31af215563ecf42817c98df621387f4e949" +checksum = "aa260229e6538e52293eeb577aabd09945a09d6d9cc0fc550ed7529056c2e32a" dependencies = [ "bitflags 2.6.0", "errno", @@ -4761,9 +4754,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.23.11" +version = "0.23.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4828ea528154ae444e5a642dbb7d5623354030dc9822b83fd9bb79683c7399d0" +checksum = "eee87ff5d9b36712a58574e12e9f0ea80f915a5b0ac518d322b24a465617925e" dependencies = [ "log", "once_cell", @@ -4776,25 +4769,24 @@ dependencies = [ [[package]] name = "rustls-pemfile" -version = "2.1.2" +version = "2.2.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "29993a25686778eb88d4189742cd713c9bce943bc54251a33509dc63cbacf73d" +checksum = "dce314e5fee3f39953d46bb63bb8a46d40c2f8fb7cc5a3b6cab2bde9721d6e50" dependencies = [ - "base64 0.22.1", "rustls-pki-types", ] [[package]] name = "rustls-pki-types" -version = "1.7.0" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "976295e77ce332211c0d24d92c0e83e50f5c5f046d11082cea19f3df13a3562d" +checksum = "16f1201b3c9a7ee8039bcadc17b7e605e2945b27eee7631788c1bd2b0643674b" [[package]] name = "rustls-webpki" -version = "0.102.5" +version = "0.102.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9a6fccd794a42c2c105b513a2f62bc3fd8f3ba57a4593677ceb0bd035164d78" +checksum = "64ca1bc8749bd4cf37b5ce386cc146580777b4e8572c7b97baf22c83f444bee9" dependencies = [ "ring", "rustls-pki-types", @@ -4803,21 +4795,21 @@ dependencies = [ [[package]] name = "rustversion" -version = "1.0.17" +version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "955d28af4278de8121b7ebeb796b6a45735dc01436d898801014aced2773a3d6" +checksum = "0e819f2bc632f285be6d7cd36e25940d45b2391dd6d9b939e79de557f7014248" [[package]] name = "ryu" -version = "1.0.17" +version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e86697c916019a8588c99b5fac3cead74ec0b4b819707a682fd4d23fa0ce1ba1" +checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" [[package]] name = "safetensors" -version = "0.4.2" +version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d980e6bfb34436fb0a81e42bc41af43f11805bbbca443e7f68e9faaabe669ed" +checksum = "44560c11236a6130a46ce36c836a62936dc81ebf8c36a37947423571be0e55b6" dependencies = [ "serde", "serde_json", @@ -4860,9 +4852,9 @@ dependencies = [ [[package]] name = "semver" -version = "1.0.18" +version = "1.0.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0293b4b29daaf487284529cc2f5675b8e57c61f70167ba415a463651fd6a918" +checksum = "61697e0a1c7e512e84a621326239844a24d8207b4669b41bc18b32ea5cbf988b" dependencies = [ "serde", ] @@ -4875,9 +4867,9 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" [[package]] name = "serde" -version = "1.0.210" +version = "1.0.214" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8e3592472072e6e22e0a54d5904d9febf8508f65fb8552499a1abc7d1078c3a" +checksum = "f55c3193aca71c12ad7890f1785d2b73e1b9f63a0bbc353c08ef26fe03fc56b5" dependencies = [ "serde_derive", ] @@ -4893,20 +4885,20 @@ dependencies = [ [[package]] name = "serde_derive" -version = "1.0.210" +version = "1.0.214" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "243902eda00fad750862fc144cea25caca5e20d615af0a81bee94ca738f1df1f" +checksum = "de523f781f095e28fa605cdce0f8307e451cc0fd14e2eb4cd2e98a355b147766" dependencies = [ "proc-macro2", "quote", - "syn 2.0.60", + "syn 2.0.85", ] [[package]] name = "serde_json" -version = "1.0.128" +version = "1.0.132" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ff5456707a1de34e7e37f2a6fd3d3f808c318259cbd01ab6377795054b483d8" +checksum = "d726bfaff4b320266d395898905d0eba0345aae23b54aee3a737e260fd46db03" dependencies = [ "indexmap", "itoa", @@ -4926,9 +4918,9 @@ dependencies = [ [[package]] name = "serde_spanned" -version = "0.6.6" +version = "0.6.8" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "79e674e01f999af37c49f70a6ede167a8a60b2503e56c5599532a65baa5969a0" +checksum = "87607cb1398ed59d48732e575a4c28a7a8ebf2454b964fe3f224f2afc07909e1" dependencies = [ "serde", ] @@ -4995,9 +4987,9 @@ checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] name = "signal-hook-registry" -version = "1.4.1" +version = "1.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d8229b473baa5980ac72ef434c4415e70c4b5e71b423043adb4ba059f89c99a1" +checksum = "a9e9e0b4211b72e7b8b6e85c807d36c212bdb33ea8587f7569562a84df5465b1" dependencies = [ "libc", ] @@ -5010,15 +5002,15 @@ checksum = "d66dc143e6b11c1eddc06d5c423cfc97062865baf299914ab64caa38182078fe" [[package]] name = "simdutf8" -version = "0.1.4" +version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f27f6278552951f1f2b8cf9da965d10969b2efdea95a6ec47987ab46edfe263a" +checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e" [[package]] name = "similar" -version = "2.2.1" +version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "420acb44afdae038210c99e69aae24109f32f15500aa708e81d46c9f29d55fcf" +checksum = "1de1d4f81173b03af4c0cbed3c898f6bff5b870e4a7f5d6f4057d62a7a4b686e" [[package]] name = "simple_asn1" @@ -5046,9 +5038,9 @@ checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d" [[package]] name = "slab" -version = "0.4.8" +version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6528351c9bc8ab22353f9d776db39a20288e8d6c37ef8cfe3317cf875eecfc2d" +checksum = "8f92a496fb766b417c996b9c5e57daf2f7ad3b0bebe1ccfca4856390e3d3bb67" dependencies = [ "autocfg", ] @@ -5092,22 +5084,12 @@ dependencies = [ [[package]] name = "socket2" -version = "0.4.9" +version = "0.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64a4a911eed85daf18834cfaa86a79b7d266ff93ff5ba14005426219480ed662" +checksum = "ce305eb0b4296696835b71df73eb912e0f1ffd2556a501fcede6e0c50349191c" dependencies = [ "libc", - "winapi", -] - -[[package]] -name = "socket2" -version = "0.5.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b5fac59a5cb5dd637972e5fca70daf0523c9067fcdc4842f053dae04a18f8e9" -dependencies = [ - "libc", - "windows-sys 0.48.0", + "windows-sys 0.52.0", ] [[package]] @@ -5176,31 +5158,31 @@ checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" [[package]] name = "strum" -version = "0.26.2" +version = "0.26.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d8cec3501a5194c432b2b7976db6b7d10ec95c253208b45f83f7136aa985e29" +checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06" dependencies = [ "strum_macros", ] [[package]] name = "strum_macros" -version = "0.26.2" +version = "0.26.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c6cf59daf282c0a494ba14fd21610a0325f9f90ec9d1231dea26bcb1d696c946" +checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" dependencies = [ - "heck 0.4.1", + "heck", "proc-macro2", "quote", "rustversion", - "syn 2.0.60", + "syn 2.0.85", ] [[package]] name = "subtle" -version = "2.5.0" +version = "2.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" +checksum = 
"13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" [[package]] name = "syn" @@ -5215,9 +5197,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.60" +version = "2.0.85" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "909518bc7b1c9b779f1bbf07f2929d35af9f0f37e47c6e9ef7f9dddc1e1821f3" +checksum = "5023162dfcd14ef8f32034d8bcd4cc5ddc61ef7a247c024a33e24e1f24d21b56" dependencies = [ "proc-macro2", "quote", @@ -5233,7 +5215,7 @@ dependencies = [ "proc-macro-error", "proc-macro2", "quote", - "syn 2.0.60", + "syn 2.0.85", ] [[package]] @@ -5241,6 +5223,9 @@ name = "sync_wrapper" version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7065abeca94b6a8a577f9bd45aa0867a2238b74e8eb67cf10d492bc39351394" +dependencies = [ + "futures-core", +] [[package]] name = "synchronoise" @@ -5259,7 +5244,7 @@ checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" dependencies = [ "proc-macro2", "quote", - "syn 2.0.60", + "syn 2.0.85", ] [[package]] @@ -5299,9 +5284,9 @@ checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" [[package]] name = "tar" -version = "0.4.41" +version = "0.4.42" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb797dad5fb5b76fcf519e702f4a589483b5ef06567f160c392832c1f5e44909" +checksum = "4ff6c40d3aedb5e06b57c6f669ad17ab063dd1e63d977c6a88e7f4dfa4f04020" dependencies = [ "filetime", "libc", @@ -5319,14 +5304,15 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.10.1" +version = "3.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85b77fafb263dd9d05cbeac119526425676db3784113aa9295c88498cbf8bff1" +checksum = "f0f2c9fc62d0beef6951ccffd757e241266a2c833136efbe35af6cd2567dca5b" dependencies = [ "cfg-if", "fastrand", + "once_cell", "rustix", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -5349,22 +5335,22 @@ dependencies = [ [[package]] name = "thiserror" -version = "1.0.61" +version = "1.0.65" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c546c80d6be4bc6a00c0f01730c08df82eaa7a7a61f11d656526506112cc1709" +checksum = "5d11abd9594d9b38965ef50805c5e469ca9cc6f197f883f717e0269a3057b3d5" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.61" +version = "1.0.65" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46c3384250002a6d5af4d114f2845d37b57521033f30d5c3f46c4d70e1197533" +checksum = "ae71770322cbd277e69d762a16c444af02aa0575ac0d174f0b9562d3b37f8602" dependencies = [ "proc-macro2", "quote", - "syn 2.0.60", + "syn 2.0.85", ] [[package]] @@ -5389,7 +5375,7 @@ dependencies = [ "fancy-regex 0.12.0", "lazy_static", "parking_lot", - "rustc-hash", + "rustc-hash 1.1.0", ] [[package]] @@ -5446,9 +5432,9 @@ dependencies = [ [[package]] name = "tinyvec" -version = "1.6.0" +version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50" +checksum = "445e881f4f6d382d5f27c034e25eb92edd7c784ceab92a0937db7f2e9471b938" dependencies = [ "tinyvec_macros", ] @@ -5491,32 +5477,31 @@ dependencies = [ [[package]] name = "tokio" -version = "1.38.0" +version = "1.41.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba4f4a02a7a80d6f274636f0aa95c7e383b912d41fe721a31f29e29698585a4a" +checksum = "145f3413504347a2be84393cc8a7d2fb4d863b375909ea59f2158261aa258bbb" dependencies = [ 
"backtrace", "bytes", "libc", "mio", - "num_cpus", "parking_lot", "pin-project-lite", "signal-hook-registry", - "socket2 0.5.5", + "socket2", "tokio-macros", - "windows-sys 0.48.0", + "windows-sys 0.52.0", ] [[package]] name = "tokio-macros" -version = "2.3.0" +version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f5ae998a069d4b5aba8ee9dad856af7d520c3699e6159b185c2acd48155d39a" +checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752" dependencies = [ "proc-macro2", "quote", - "syn 2.0.60", + "syn 2.0.85", ] [[package]] @@ -5532,9 +5517,9 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.7.11" +version = "0.7.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9cf6b47b3771c49ac75ad09a6162f53ad4b8088b76ac60e8ec1455b31a189fe1" +checksum = "61e7c3654c13bcd040d4a03abee2c75b1d14a37b423cf5a813ceae1cc903ec6a" dependencies = [ "bytes", "futures-core", @@ -5545,75 +5530,43 @@ dependencies = [ [[package]] name = "toml" -version = "0.8.14" +version = "0.8.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6f49eb2ab21d2f26bd6db7bf383edc527a7ebaee412d17af4d40fdccd442f335" +checksum = "a1ed1f98e3fdc28d6d910e6737ae6ab1a93bf1985935a1193e68f93eeb68d24e" dependencies = [ "serde", "serde_spanned", "toml_datetime", - "toml_edit 0.22.15", + "toml_edit", ] [[package]] name = "toml_datetime" -version = "0.6.6" +version = "0.6.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4badfd56924ae69bcc9039335b2e017639ce3f9b001c393c1b2d1ef846ce2cbf" +checksum = "0dd7358ecb8fc2f8d014bf86f6f638ce72ba252a2c3a2572f2a795f1d23efb41" dependencies = [ "serde", ] [[package]] name = "toml_edit" -version = "0.21.0" +version = "0.22.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d34d383cd00a163b4a5b85053df514d45bc330f6de7737edfe0a93311d1eaa03" -dependencies = [ - "indexmap", - "toml_datetime", - "winnow 0.5.40", -] - -[[package]] -name = "toml_edit" -version = "0.22.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d59a3a72298453f564e2b111fa896f8d07fabb36f51f06d7e875fc5e0b5a3ef1" +checksum = "4ae48d6208a266e853d946088ed816055e556cc6028c5e8e2b84d9fa5dd7c7f5" dependencies = [ "indexmap", "serde", "serde_spanned", "toml_datetime", - "winnow 0.6.13", + "winnow", ] -[[package]] -name = "tower" -version = "0.4.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c" -dependencies = [ - "futures-core", - "futures-util", - "pin-project", - "pin-project-lite", - "tokio", - "tower-layer", - "tower-service", -] - -[[package]] -name = "tower-layer" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c20c8dbed6283a09604c3e69b4b7eeb54e298b8a600d4d5ecb5ad39de609f1d0" - [[package]] name = "tower-service" -version = "0.3.2" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" +checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3" [[package]] name = "tracing" @@ -5629,9 +5582,9 @@ dependencies = [ [[package]] name = "tracing-actix-web" -version = "0.7.11" +version = "0.7.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ee9e39a66d9b615644893ffc1704d2a89b5b315b7fd0228ad3182ca9a306b19" +checksum = 
"6b87073920bcce23e9f5cb0d2671e9f01d6803bb5229c159b2f5ce6806d73ffc" dependencies = [ "actix-web", "mutually_exclusive_features", @@ -5648,7 +5601,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.60", + "syn 2.0.85", ] [[package]] @@ -5727,9 +5680,9 @@ dependencies = [ [[package]] name = "try-lock" -version = "0.2.4" +version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3528ecfd12c466c6f163363caf2d02a71161dd5e1cc6ae7b34207ea2d42d81ed" +checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" [[package]] name = "typenum" @@ -5739,9 +5692,9 @@ checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825" [[package]] name = "ucd-trie" -version = "0.1.6" +version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed646292ffc8188ef8ea4d1e0e0150fb15a5c2e12ad9b8fc191ae7a8a7f3c4b9" +checksum = "2896d95c02a80c6d6a5d6e953d479f5ddf2dfdb6a244441010e373ac0fb88971" [[package]] name = "unescaper" @@ -5754,18 +5707,15 @@ dependencies = [ [[package]] name = "unicase" -version = "2.6.0" +version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50f37be617794602aabbeee0be4f259dc1778fabe05e2d67ee8f79326d5cb4f6" -dependencies = [ - "version_check", -] +checksum = "7e51b68083f157f853b6379db119d1c1be0e6e4dec98101079dec41f6f5cf6df" [[package]] name = "unicode-bidi" -version = "0.3.15" +version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08f95100a766bf4f8f28f90d77e0a5461bbdb219042e7679bebe79004fed8d75" +checksum = "5ab17db44d7388991a428b2ee655ce0c212e862eff1768a455c58f9aad6e7893" [[package]] name = "unicode-blocks" @@ -5775,15 +5725,15 @@ checksum = "6b12e05d9e06373163a9bb6bb8c263c261b396643a99445fe6b9811fd376581b" [[package]] name = "unicode-ident" -version = "1.0.12" +version = "1.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" +checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe" [[package]] name = "unicode-normalization" -version = "0.1.23" +version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a56d1686db2308d901306f92a263857ef59ea39678a5458e7cb17f01415101f5" +checksum = "5033c97c4262335cded6d6fc3e5c18ab755e1a3dc96376350f3d8e9f009ad956" dependencies = [ "tinyvec", ] @@ -5799,15 +5749,15 @@ dependencies = [ [[package]] name = "unicode-segmentation" -version = "1.11.0" +version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4c87d22b6e3f4a18d4d40ef354e97c90fcb14dd91d7dc0aa9d8a1172ebf7202" +checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" [[package]] name = "unicode-width" -version = "0.1.11" +version = "0.1.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e51733f11c9c4f72aa0c160008246859e340b00807569a0da0e7a1079b27ba85" +checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af" [[package]] name = "unicode_categories" @@ -5823,9 +5773,9 @@ checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" [[package]] name = "ureq" -version = "2.10.0" +version = "2.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72139d247e5f97a3eff96229a7ae85ead5328a39efe76f8bf5a06313d505b6ea" +checksum = 
"b74fc6b57825be3373f7054754755f03ac3a8f5d70015ccad699ba2029956f4a" dependencies = [ "base64 0.22.1", "flate2", @@ -5860,21 +5810,21 @@ checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da" [[package]] name = "utf8-width" -version = "0.1.6" +version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5190c9442dcdaf0ddd50f37420417d219ae5261bbf5db120d0f9bab996c9cba1" +checksum = "86bd8d4e895da8537e5315b8254664e6b769c4ff3db18321b297a1e7004392e3" [[package]] name = "utf8parse" -version = "0.2.1" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] name = "uuid" -version = "1.10.0" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81dfa00651efa65069b0b6b651f4aaa31ba9e3c3ce0137aaad053604ee7e0314" +checksum = "f8c5f0a0af699448548ad1a2fbf920fb4bee257eae39953ba95cb84891a0446a" dependencies = [ "getrandom", "serde", @@ -5894,24 +5844,24 @@ checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" [[package]] name = "vergen" -version = "9.0.0" +version = "9.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c32e7318e93a9ac53693b6caccfb05ff22e04a44c7cf8a279051f24c09da286f" +checksum = "349ed9e45296a581f455bc18039878f409992999bc1d5da12a6800eb18c8752f" dependencies = [ "anyhow", - "derive_builder 0.20.0", + "derive_builder 0.20.2", "rustversion", "vergen-lib", ] [[package]] name = "vergen-git2" -version = "1.0.0" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a62c52cd2b2b8b7ec75fc20111b3022ac3ff83e4fc14b9497cfcfd39c54f9c67" +checksum = "e771aff771c0d7c2f42e434e2766d304d917e29b40f0424e8faaaa936bbc3f29" dependencies = [ "anyhow", - "derive_builder 0.20.0", + "derive_builder 0.20.2", "git2", "rustversion", "time", @@ -5921,21 +5871,20 @@ dependencies = [ [[package]] name = "vergen-lib" -version = "0.1.3" +version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e06bee42361e43b60f363bad49d63798d0f42fb1768091812270eca00c784720" +checksum = "229eaddb0050920816cf051e619affaf18caa3dd512de8de5839ccbc8e53abb0" dependencies = [ "anyhow", - "derive_builder 0.20.0", - "getset", + "derive_builder 0.20.2", "rustversion", ] [[package]] name = "version_check" -version = "0.9.4" +version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" [[package]] name = "walkdir" @@ -5975,34 +5924,35 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.92" +version = "0.2.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4be2531df63900aeb2bca0daaaddec08491ee64ceecbee5076636a3b026795a8" +checksum = "128d1e363af62632b8eb57219c8fd7877144af57558fb2ef0368d0087bddeb2e" dependencies = [ "cfg-if", + "once_cell", "wasm-bindgen-macro", ] [[package]] name = "wasm-bindgen-backend" -version = "0.2.92" +version = "0.2.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "614d787b966d3989fa7bb98a654e369c762374fd3213d212cfc0251257e747da" +checksum = 
"cb6dd4d3ca0ddffd1dd1c9c04f94b868c37ff5fac97c30b97cff2d74fce3a358" dependencies = [ "bumpalo", "log", "once_cell", "proc-macro2", "quote", - "syn 2.0.60", + "syn 2.0.85", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-futures" -version = "0.4.37" +version = "0.4.45" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c02dbc21516f9f1f04f187958890d7e6026df8d16540b7ad9492bc34a67cea03" +checksum = "cc7ec4f8827a71586374db3e87abdb5a2bb3a15afed140221307c3ec06b1f63b" dependencies = [ "cfg-if", "js-sys", @@ -6012,9 +5962,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.92" +version = "0.2.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1f8823de937b71b9460c0c34e25f3da88250760bec0ebac694b49997550d726" +checksum = "e79384be7f8f5a9dd5d7167216f022090cf1f9ec128e6e6a482a2cb5c5422c56" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -6022,28 +5972,28 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.92" +version = "0.2.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" +checksum = "26c6ab57572f7a24a4985830b120de1594465e5d500f24afe89e16b4e833ef68" dependencies = [ "proc-macro2", "quote", - "syn 2.0.60", + "syn 2.0.85", "wasm-bindgen-backend", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.92" +version = "0.2.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96" +checksum = "65fc09f10666a9f147042251e0dda9c18f166ff7de300607007e96bdebc1068d" [[package]] name = "wasm-streams" -version = "0.4.0" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b65dc4c90b63b118468cf747d8bf3566c1913ef60be765b5730ead9e0a3ba129" +checksum = "15053d8d85c7eccdbefef60f06769760a563c7f0a9d6902a13d35c7800b0ad65" dependencies = [ "futures-util", "js-sys", @@ -6054,9 +6004,9 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.64" +version = "0.3.72" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b85cbef8c220a6abc02aefd892dfc0fc23afb1c6a426316ec33253a3877249b" +checksum = "f6488b90108c040df0fe62fa815cbdee25124641df01814dd7282749234c6112" dependencies = [ "js-sys", "wasm-bindgen", @@ -6064,9 +6014,9 @@ dependencies = [ [[package]] name = "webpki-roots" -version = "0.26.1" +version = "0.26.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3de34ae270483955a94f4b21bdaaeb83d508bb84a01435f393818edb0012009" +checksum = "841c67bff177718f1d4dfefde8d8f0e78f9b6589319ba88312f567fc5841a958" dependencies = [ "rustls-pki-types", ] @@ -6099,11 +6049,11 @@ checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" [[package]] name = "winapi-util" -version = "0.1.6" +version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f29e6f9198ba0d26b4c9f07dbe6f9ed633e1f3d5b8b414090084349e46a52596" +checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" dependencies = [ - "winapi", + "windows-sys 0.59.0", ] [[package]] @@ -6119,7 +6069,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e48a53791691ab099e5e2ad123536d0fff50652600abaf43bbf952894110d0be" dependencies = [ "windows-core", - "windows-targets 0.52.4", + "windows-targets 0.52.6", ] [[package]] @@ -6128,16 +6078,37 
@@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" dependencies = [ - "windows-targets 0.52.4", + "windows-targets 0.52.6", ] [[package]] -name = "windows-sys" -version = "0.45.0" +name = "windows-registry" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0" +checksum = "e400001bb720a623c1c69032f8e3e4cf09984deec740f007dd2b03ec864804b0" dependencies = [ - "windows-targets 0.42.2", + "windows-result", + "windows-strings", + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-result" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d1043d8214f791817bab27572aaa8af63732e11bf84aa21a45a78d6c317ae0e" +dependencies = [ + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-strings" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4cd9b125c486025df0eabcb585e62173c6c9eddcec5d117d3b6e8c30e2ee4d10" +dependencies = [ + "windows-result", + "windows-targets 0.52.6", ] [[package]] @@ -6146,7 +6117,7 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" dependencies = [ - "windows-targets 0.48.1", + "windows-targets 0.48.5", ] [[package]] @@ -6155,217 +6126,157 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ - "windows-targets 0.52.4", + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets 0.52.6", ] [[package]] name = "windows-targets" -version = "0.42.2" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071" +checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" dependencies = [ - "windows_aarch64_gnullvm 0.42.2", - "windows_aarch64_msvc 0.42.2", - "windows_i686_gnu 0.42.2", - "windows_i686_msvc 0.42.2", - "windows_x86_64_gnu 0.42.2", - "windows_x86_64_gnullvm 0.42.2", - "windows_x86_64_msvc 0.42.2", + "windows_aarch64_gnullvm 0.48.5", + "windows_aarch64_msvc 0.48.5", + "windows_i686_gnu 0.48.5", + "windows_i686_msvc 0.48.5", + "windows_x86_64_gnu 0.48.5", + "windows_x86_64_gnullvm 0.48.5", + "windows_x86_64_msvc 0.48.5", ] [[package]] name = "windows-targets" -version = "0.48.1" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05d4b17490f70499f20b9e791dcf6a299785ce8af4d709018206dc5b4953e95f" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" dependencies = [ - "windows_aarch64_gnullvm 0.48.0", - "windows_aarch64_msvc 0.48.0", - "windows_i686_gnu 0.48.0", - "windows_i686_msvc 0.48.0", - "windows_x86_64_gnu 0.48.0", - "windows_x86_64_gnullvm 0.48.0", - "windows_x86_64_msvc 0.48.0", -] - -[[package]] -name = "windows-targets" -version = "0.52.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7dd37b7e5ab9018759f893a1952c9420d060016fc19a472b4bb20d1bdd694d1b" -dependencies = [ - 
"windows_aarch64_gnullvm 0.52.4", - "windows_aarch64_msvc 0.52.4", - "windows_i686_gnu 0.52.4", - "windows_i686_msvc 0.52.4", - "windows_x86_64_gnu 0.52.4", - "windows_x86_64_gnullvm 0.52.4", - "windows_x86_64_msvc 0.52.4", + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", + "windows_i686_gnullvm", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", ] [[package]] name = "windows_aarch64_gnullvm" -version = "0.42.2" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8" +checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" [[package]] name = "windows_aarch64_gnullvm" -version = "0.48.0" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" - -[[package]] -name = "windows_aarch64_gnullvm" -version = "0.52.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" [[package]] name = "windows_aarch64_msvc" -version = "0.42.2" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43" +checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" [[package]] name = "windows_aarch64_msvc" -version = "0.48.0" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" - -[[package]] -name = "windows_aarch64_msvc" -version = "0.52.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" [[package]] name = "windows_i686_gnu" -version = "0.42.2" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f" +checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" [[package]] name = "windows_i686_gnu" -version = "0.48.0" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" [[package]] -name = "windows_i686_gnu" -version = "0.52.4" +name = "windows_i686_gnullvm" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" [[package]] name = "windows_i686_msvc" -version = "0.42.2" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060" +checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" [[package]] name = "windows_i686_msvc" -version = "0.48.0" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" - -[[package]] -name = "windows_i686_msvc" -version = "0.52.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" [[package]] name = "windows_x86_64_gnu" -version = "0.42.2" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36" +checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" [[package]] name = "windows_x86_64_gnu" -version = "0.48.0" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" - -[[package]] -name = "windows_x86_64_gnu" -version = "0.52.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" [[package]] name = "windows_x86_64_gnullvm" -version = "0.42.2" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3" +checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" [[package]] name = "windows_x86_64_gnullvm" -version = "0.48.0" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" - -[[package]] -name = "windows_x86_64_gnullvm" -version = "0.52.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" [[package]] name = "windows_x86_64_msvc" -version = "0.42.2" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0" +checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" [[package]] name = "windows_x86_64_msvc" -version = "0.48.0" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" - -[[package]] -name = "windows_x86_64_msvc" -version = "0.52.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" [[package]] name = "winnow" -version = "0.5.40" +version = "0.6.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f593a95398737aeed53e489c785df13f3618e41dbcd6718c6addbf1395aa6876" +checksum = "36c1fec1a2bb5866f07c25f68c26e565c4c200aebb96d7e55710c19d3e8ac49b" dependencies = [ "memchr", ] -[[package]] -name = "winnow" -version = "0.6.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59b5e5f6c299a3c7890b876a2a587f3115162487e704907d9b6cd29473052ba1" -dependencies = [ - "memchr", -] - -[[package]] -name = "winreg" -version = "0.52.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"a277a57398d4bfa075df44f501a17cfdf8542d224f0d36095a2adc7aee4ef0a5" -dependencies = [ - "cfg-if", - "windows-sys 0.48.0", -] - [[package]] name = "wiremock" -version = "0.6.0" +version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec874e1eef0df2dcac546057fe5e29186f09c378181cd7b635b4b7bcc98e9d81" +checksum = "7fff469918e7ca034884c7fd8f93fe27bacb7fcb599fd879df6c7b429a29b646" dependencies = [ "assert-json-diff", "async-trait", - "base64 0.21.7", + "base64 0.22.1", "deadpool", "futures", "http 1.1.0", @@ -6443,9 +6354,9 @@ dependencies = [ [[package]] name = "yoke" -version = "0.7.3" +version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "65e71b2e4f287f467794c671e2b8f8a5f3716b3c829079a1c44740148eff07e4" +checksum = "6c5b1314b079b0930c31e3af543d8ee1757b1951ae1e1565ec704403a7240ca5" dependencies = [ "serde", "stable_deref_trait", @@ -6455,54 +6366,55 @@ dependencies = [ [[package]] name = "yoke-derive" -version = "0.7.3" +version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e6936f0cce458098a201c245a11bef556c6a0181129c7034d10d76d1ec3a2b8" +checksum = "28cc31741b18cb6f1d5ff12f5b7523e3d6eb0852bbbad19d73905511d9849b95" dependencies = [ "proc-macro2", "quote", - "syn 2.0.60", + "syn 2.0.85", "synstructure", ] [[package]] name = "zerocopy" -version = "0.7.32" +version = "0.7.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74d4d3961e53fa4c9a25a8637fc2bfaf2595b3d3ae34875568a5cf64787716be" +checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" dependencies = [ + "byteorder", "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.7.32" +version = "0.7.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ce1b18ccd8e73a9321186f97e46f9f04b778851177567b1975109d26a08d2a6" +checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.60", + "syn 2.0.85", ] [[package]] name = "zerofrom" -version = "0.1.3" +version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "655b0814c5c0b19ade497851070c640773304939a6c0fd5f5fb43da0696d05b7" +checksum = "91ec111ce797d0e0784a1116d0ddcdbea84322cd79e5d5ad173daeba4f93ab55" dependencies = [ "zerofrom-derive", ] [[package]] name = "zerofrom-derive" -version = "0.1.3" +version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6a647510471d372f2e6c2e6b7219e44d8c574d24fdc11c610a61455782f18c3" +checksum = "0ea7b4a3637ea8669cedf0f1fd5c286a17f3de97b8dd5a70a6c167a1730e63a5" dependencies = [ "proc-macro2", "quote", - "syn 2.0.60", + "syn 2.0.85", "synstructure", ] @@ -6523,7 +6435,7 @@ checksum = "ce36e65b0d2999d2aafac989fb249189a141aee1f53c612c1f37d72631959f69" dependencies = [ "proc-macro2", "quote", - "syn 2.0.60", + "syn 2.0.85", ] [[package]] @@ -6543,9 +6455,9 @@ dependencies = [ [[package]] name = "zip" -version = "2.1.3" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "775a2b471036342aa69bc5a602bc889cb0a06cda00477d0c69566757d5553d39" +checksum = "dc5e4288ea4057ae23afc69a4472434a87a2495cafce6632fd1c4ec9f5cf3494" dependencies = [ "aes", "arbitrary", @@ -6595,18 +6507,18 @@ dependencies = [ [[package]] name = "zstd-safe" -version = "7.2.0" +version = "7.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"fa556e971e7b568dc775c136fc9de8c779b1c2fc3a63defaafadffdbd3181afa" +checksum = "54a3ab4db68cea366acc5c897c7b4d4d1b8994a9cd6e6f841f8964566a419059" dependencies = [ "zstd-sys", ] [[package]] name = "zstd-sys" -version = "2.0.10+zstd.1.5.6" +version = "2.0.13+zstd.1.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c253a4914af5bafc8fa8c86ee400827e83cf6ec01195ec1f1ed8441bf00d65aa" +checksum = "38ff0f21cfee8f97d94cef41359e0c89aa6113028ab0291aa8ca0038995a95aa" dependencies = [ "cc", "pkg-config", diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs index 14bbcfe53..fdf213a6b 100644 --- a/index-scheduler/src/batch.rs +++ b/index-scheduler/src/batch.rs @@ -33,13 +33,11 @@ use meilisearch_types::milli::heed::CompactionOption; use meilisearch_types::milli::update::new::indexer::{ self, retrieve_or_guess_primary_key, UpdateByFunction, }; -use meilisearch_types::milli::update::{ - IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings as MilliSettings, -}; +use meilisearch_types::milli::update::{IndexDocumentsMethod, Settings as MilliSettings}; use meilisearch_types::milli::vector::parsed_vectors::{ ExplicitVectors, VectorOrArrayOfVectors, RESERVED_VECTORS_FIELD_NAME, }; -use meilisearch_types::milli::{self, Filter, Object}; +use meilisearch_types::milli::{self, Filter}; use meilisearch_types::settings::{apply_settings_to_builder, Settings, Unchecked}; use meilisearch_types::tasks::{Details, IndexSwap, Kind, KindWithContent, Status, Task}; use meilisearch_types::{compression, Index, VERSION_FILE_NAME}; @@ -50,7 +48,7 @@ use uuid::Uuid; use crate::autobatcher::{self, BatchKind}; use crate::utils::{self, swap_index_uid_in_task}; -use crate::{Error, IndexScheduler, MustStopProcessing, ProcessingTasks, Result, TaskId}; +use crate::{Error, IndexScheduler, ProcessingTasks, Result, TaskId}; /// Represents a combination of tasks that can all be processed at the same time. /// diff --git a/meili-snap/Cargo.toml b/meili-snap/Cargo.toml index e86feabd9..6c68e563c 100644 --- a/meili-snap/Cargo.toml +++ b/meili-snap/Cargo.toml @@ -11,6 +11,6 @@ edition.workspace = true license.workspace = true [dependencies] -insta = { version = "^1.39.0", features = ["json", "redactions"] } +insta = { version = "=1.39.0", features = ["json", "redactions"] } md5 = "0.7.0" once_cell = "1.19" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 46633bdec..df463c902 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -94,10 +94,12 @@ tracing = "0.1.40" ureq = { version = "2.10.0", features = ["json"] } url = "2.5.2" rayon-par-bridge = "0.1.0" -hashbrown = "0.14.5" +hashbrown = "0.15.0" raw-collections = { git = "https://github.com/dureuill/raw-collections.git", version = "0.1.0" } bumpalo = "3.16.0" thread_local = "1.1.8" +allocator-api2 = "0.2.18" +rustc-hash = "2.0.0" [dev-dependencies] mimalloc = { version = "0.1.43", default-features = false } diff --git a/milli/src/update/index_documents/helpers/grenad_helpers.rs b/milli/src/update/index_documents/helpers/grenad_helpers.rs index 220567208..b7da39878 100644 --- a/milli/src/update/index_documents/helpers/grenad_helpers.rs +++ b/milli/src/update/index_documents/helpers/grenad_helpers.rs @@ -83,6 +83,8 @@ pub fn writer_into_reader( grenad::Reader::new(BufReader::new(file)).map_err(Into::into) } +/// # Safety +/// We use memory mapping inside. So, according to the Rust community, it's unsafe. 
pub unsafe fn as_cloneable_grenad( reader: &grenad::Reader>, ) -> Result> { diff --git a/milli/src/update/new/channel.rs b/milli/src/update/new/channel.rs index d63180ba1..af6e2215c 100644 --- a/milli/src/update/new/channel.rs +++ b/milli/src/update/new/channel.rs @@ -1,42 +1,33 @@ -use std::fs::File; use std::marker::PhantomData; +use std::sync::atomic::{AtomicUsize, Ordering}; use crossbeam_channel::{IntoIter, Receiver, SendError, Sender}; -use grenad::Merger; use heed::types::Bytes; -use memmap2::Mmap; use roaring::RoaringBitmap; use super::extract::FacetKind; use super::StdResult; -use crate::index::main_key::{DOCUMENTS_IDS_KEY, WORDS_FST_KEY, WORDS_PREFIXES_FST_KEY}; +use crate::index::main_key::DOCUMENTS_IDS_KEY; use crate::update::new::KvReaderFieldId; -use crate::update::MergeDeladdCboRoaringBitmaps; use crate::{DocumentId, Index}; /// The capacity of the channel is currently in number of messages. -pub fn merger_writer_channel(cap: usize) -> (MergerSender, WriterReceiver) { +pub fn extractor_writer_channel(cap: usize) -> (ExtractorSender, WriterReceiver) { let (sender, receiver) = crossbeam_channel::bounded(cap); ( - MergerSender { + ExtractorSender { sender, send_count: Default::default(), writer_contentious_count: Default::default(), - merger_contentious_count: Default::default(), + extractor_contentious_count: Default::default(), }, WriterReceiver(receiver), ) } -/// The capacity of the channel is currently in number of messages. -pub fn extractors_merger_channels(cap: usize) -> (ExtractorSender, MergerReceiver) { - let (sender, receiver) = crossbeam_channel::bounded(cap); - (ExtractorSender(sender), MergerReceiver(receiver)) -} - -pub enum KeyValueEntry { - SmallInMemory { key_length: usize, data: Box<[u8]> }, - LargeOnDisk { key: Box<[u8]>, value: Mmap }, +pub struct KeyValueEntry { + pub key_length: usize, + pub data: Box<[u8]>, } impl KeyValueEntry { @@ -44,32 +35,22 @@ impl KeyValueEntry { let mut data = Vec::with_capacity(key.len() + value.len()); data.extend_from_slice(key); data.extend_from_slice(value); - KeyValueEntry::SmallInMemory { key_length: key.len(), data: data.into_boxed_slice() } + KeyValueEntry { key_length: key.len(), data: data.into_boxed_slice() } } pub fn from_small_key_bitmap(key: &[u8], bitmap: RoaringBitmap) -> Self { let mut data = Vec::with_capacity(key.len() + bitmap.serialized_size()); data.extend_from_slice(key); bitmap.serialize_into(&mut data).unwrap(); - KeyValueEntry::SmallInMemory { key_length: key.len(), data: data.into_boxed_slice() } - } - - pub fn from_large_key_value(key: &[u8], value: Mmap) -> Self { - KeyValueEntry::LargeOnDisk { key: key.to_vec().into_boxed_slice(), value } + KeyValueEntry { key_length: key.len(), data: data.into_boxed_slice() } } pub fn key(&self) -> &[u8] { - match self { - KeyValueEntry::SmallInMemory { key_length, data } => &data.as_ref()[..*key_length], - KeyValueEntry::LargeOnDisk { key, value: _ } => key.as_ref(), - } + &self.data[..self.key_length] } pub fn value(&self) -> &[u8] { - match self { - KeyValueEntry::SmallInMemory { key_length, data } => &data.as_ref()[*key_length..], - KeyValueEntry::LargeOnDisk { key: _, value } => value.as_ref(), - } + &self.data[self.key_length..] 
} } @@ -92,37 +73,6 @@ pub enum EntryOperation { Write(KeyValueEntry), } -pub struct DocumentEntry { - docid: DocumentId, - content: Box<[u8]>, -} - -impl DocumentEntry { - pub fn new_uncompressed(docid: DocumentId, content: Box) -> Self { - DocumentEntry { docid, content: content.into() } - } - - pub fn new_compressed(docid: DocumentId, content: Box<[u8]>) -> Self { - DocumentEntry { docid, content } - } - - pub fn key(&self) -> [u8; 4] { - self.docid.to_be_bytes() - } - - pub fn content(&self) -> &[u8] { - &self.content - } -} - -pub struct DocumentDeletionEntry(DocumentId); - -impl DocumentDeletionEntry { - pub fn key(&self) -> [u8; 4] { - self.0.to_be_bytes() - } -} - pub struct WriterOperation { database: Database, entry: EntryOperation, @@ -206,34 +156,32 @@ impl IntoIterator for WriterReceiver { } } -pub struct MergerSender { +pub struct ExtractorSender { sender: Sender, - /// The number of message we send in total in the channel. - send_count: std::cell::Cell, + /// The number of message we sent in total in the channel. + send_count: AtomicUsize, /// The number of times we sent something in a channel that was full. - writer_contentious_count: std::cell::Cell, + writer_contentious_count: AtomicUsize, /// The number of times we sent something in a channel that was empty. - merger_contentious_count: std::cell::Cell, + extractor_contentious_count: AtomicUsize, } -impl Drop for MergerSender { +impl Drop for ExtractorSender { fn drop(&mut self) { + let send_count = *self.send_count.get_mut(); + let writer_contentious_count = *self.writer_contentious_count.get_mut(); + let extractor_contentious_count = *self.extractor_contentious_count.get_mut(); eprintln!( - "Merger channel stats: {} sends, {} writer contentions ({}%), {} merger contentions ({}%)", - self.send_count.get(), - self.writer_contentious_count.get(), - (self.writer_contentious_count.get() as f32 / self.send_count.get() as f32) * 100.0, - self.merger_contentious_count.get(), - (self.merger_contentious_count.get() as f32 / self.send_count.get() as f32) * 100.0 + "Extractor channel stats: {send_count} sends, \ + {writer_contentious_count} writer contentions ({}%), \ + {extractor_contentious_count} extractor contentions ({}%)", + (writer_contentious_count as f32 / send_count as f32) * 100.0, + (extractor_contentious_count as f32 / send_count as f32) * 100.0 ) } } -impl MergerSender { - pub fn main(&self) -> MainSender<'_> { - MainSender(self) - } - +impl ExtractorSender { pub fn docids(&self) -> WordDocidsSender<'_, D> { WordDocidsSender { sender: self, _marker: PhantomData } } @@ -263,12 +211,12 @@ impl MergerSender { fn send(&self, op: WriterOperation) -> StdResult<(), SendError<()>> { if self.sender.is_full() { - self.writer_contentious_count.set(self.writer_contentious_count.get() + 1); + self.writer_contentious_count.fetch_add(1, Ordering::SeqCst); } if self.sender.is_empty() { - self.merger_contentious_count.set(self.merger_contentious_count.get() + 1); + self.extractor_contentious_count.fetch_add(1, Ordering::SeqCst); } - self.send_count.set(self.send_count.get() + 1); + self.send_count.fetch_add(1, Ordering::SeqCst); match self.sender.send(op) { Ok(()) => Ok(()), Err(SendError(_)) => Err(SendError(())), @@ -276,129 +224,48 @@ impl MergerSender { } } -pub struct MainSender<'a>(&'a MergerSender); - -impl MainSender<'_> { - pub fn write_words_fst(&self, value: Mmap) -> StdResult<(), SendError<()>> { - let entry = EntryOperation::Write(KeyValueEntry::from_large_key_value( - WORDS_FST_KEY.as_bytes(), - value, - )); - match 
self.0.send(WriterOperation { database: Database::Main, entry }) { - Ok(()) => Ok(()), - Err(SendError(_)) => Err(SendError(())), - } - } - - pub fn write_words_prefixes_fst(&self, value: Mmap) -> StdResult<(), SendError<()>> { - let entry = EntryOperation::Write(KeyValueEntry::from_large_key_value( - WORDS_PREFIXES_FST_KEY.as_bytes(), - value, - )); - match self.0.send(WriterOperation { database: Database::Main, entry }) { - Ok(()) => Ok(()), - Err(SendError(_)) => Err(SendError(())), - } - } - - pub fn delete(&self, key: &[u8]) -> StdResult<(), SendError<()>> { - let entry = EntryOperation::Delete(KeyEntry::from_key(key)); - match self.0.send(WriterOperation { database: Database::Main, entry }) { - Ok(()) => Ok(()), - Err(SendError(_)) => Err(SendError(())), - } - } -} - pub enum ExactWordDocids {} pub enum FidWordCountDocids {} pub enum WordDocids {} pub enum WordFidDocids {} pub enum WordPairProximityDocids {} pub enum WordPositionDocids {} -pub enum FacetDocids {} pub trait DatabaseType { const DATABASE: Database; } -pub trait MergerOperationType { - fn new_merger_operation(merger: Merger) -> MergerOperation; -} - impl DatabaseType for ExactWordDocids { const DATABASE: Database = Database::ExactWordDocids; } -impl MergerOperationType for ExactWordDocids { - fn new_merger_operation(merger: Merger) -> MergerOperation { - MergerOperation::ExactWordDocidsMerger(merger) - } -} - impl DatabaseType for FidWordCountDocids { const DATABASE: Database = Database::FidWordCountDocids; } -impl MergerOperationType for FidWordCountDocids { - fn new_merger_operation(merger: Merger) -> MergerOperation { - MergerOperation::FidWordCountDocidsMerger(merger) - } -} - impl DatabaseType for WordDocids { const DATABASE: Database = Database::WordDocids; } -impl MergerOperationType for WordDocids { - fn new_merger_operation(merger: Merger) -> MergerOperation { - MergerOperation::WordDocidsMerger(merger) - } -} - impl DatabaseType for WordFidDocids { const DATABASE: Database = Database::WordFidDocids; } -impl MergerOperationType for WordFidDocids { - fn new_merger_operation(merger: Merger) -> MergerOperation { - MergerOperation::WordFidDocidsMerger(merger) - } -} - impl DatabaseType for WordPairProximityDocids { const DATABASE: Database = Database::WordPairProximityDocids; } -impl MergerOperationType for WordPairProximityDocids { - fn new_merger_operation(merger: Merger) -> MergerOperation { - MergerOperation::WordPairProximityDocidsMerger(merger) - } -} - impl DatabaseType for WordPositionDocids { const DATABASE: Database = Database::WordPositionDocids; } -impl MergerOperationType for WordPositionDocids { - fn new_merger_operation(merger: Merger) -> MergerOperation { - MergerOperation::WordPositionDocidsMerger(merger) - } -} - -impl MergerOperationType for FacetDocids { - fn new_merger_operation(merger: Merger) -> MergerOperation { - MergerOperation::FacetDocidsMerger(merger) - } -} - pub trait DocidsSender { fn write(&self, key: &[u8], value: &[u8]) -> StdResult<(), SendError<()>>; fn delete(&self, key: &[u8]) -> StdResult<(), SendError<()>>; } pub struct WordDocidsSender<'a, D> { - sender: &'a MergerSender, + sender: &'a ExtractorSender, _marker: PhantomData, } @@ -421,7 +288,7 @@ impl DocidsSender for WordDocidsSender<'_, D> { } pub struct FacetDocidsSender<'a> { - sender: &'a MergerSender, + sender: &'a ExtractorSender, } impl DocidsSender for FacetDocidsSender<'_> { @@ -456,7 +323,7 @@ impl DocidsSender for FacetDocidsSender<'_> { } pub struct FacetSearchableSender<'a> { - sender: &'a MergerSender, + 
sender: &'a ExtractorSender, } impl FacetSearchableSender<'_> { @@ -481,25 +348,9 @@ impl FacetSearchableSender<'_> { Err(SendError(_)) => Err(SendError(())), } } - - pub fn write_fst(&self, key: &[u8], value: Mmap) -> StdResult<(), SendError<()>> { - let entry = EntryOperation::Write(KeyValueEntry::from_large_key_value(key, value)); - match self.sender.send(WriterOperation { database: Database::FacetIdStringFst, entry }) { - Ok(()) => Ok(()), - Err(SendError(_)) => Err(SendError(())), - } - } - - pub fn delete_fst(&self, key: &[u8]) -> StdResult<(), SendError<()>> { - let entry = EntryOperation::Delete(KeyEntry::from_key(key)); - match self.sender.send(WriterOperation { database: Database::FacetIdStringFst, entry }) { - Ok(()) => Ok(()), - Err(SendError(_)) => Err(SendError(())), - } - } } -pub struct DocumentsSender<'a>(&'a MergerSender); +pub struct DocumentsSender<'a>(&'a ExtractorSender); impl DocumentsSender<'_> { /// TODO do that efficiently @@ -542,86 +393,3 @@ impl DocumentsSender<'_> { } } } - -pub enum MergerOperation { - ExactWordDocidsMerger(Merger), - FidWordCountDocidsMerger(Merger), - WordDocidsMerger(Merger), - WordFidDocidsMerger(Merger), - WordPairProximityDocidsMerger(Merger), - WordPositionDocidsMerger(Merger), - FacetDocidsMerger(Merger), - DeleteDocument { docid: DocumentId, external_id: String }, - InsertDocument { docid: DocumentId, external_id: String, document: Box }, - FinishedDocument, -} - -pub struct MergerReceiver(Receiver); - -impl IntoIterator for MergerReceiver { - type Item = MergerOperation; - type IntoIter = IntoIter; - - fn into_iter(self) -> Self::IntoIter { - self.0.into_iter() - } -} - -pub struct ExtractorSender(Sender); - -impl ExtractorSender { - pub fn document_sender(&self) -> DocumentSender<'_> { - DocumentSender(Some(&self.0)) - } - - pub fn send_searchable( - &self, - merger: Merger, - ) -> StdResult<(), SendError<()>> { - match self.0.send(D::new_merger_operation(merger)) { - Ok(()) => Ok(()), - Err(SendError(_)) => Err(SendError(())), - } - } -} - -pub struct DocumentSender<'a>(Option<&'a Sender>); - -impl DocumentSender<'_> { - pub fn insert( - &self, - docid: DocumentId, - external_id: String, - document: Box, - ) -> StdResult<(), SendError<()>> { - let sender = self.0.unwrap(); - match sender.send(MergerOperation::InsertDocument { docid, external_id, document }) { - Ok(()) => Ok(()), - Err(SendError(_)) => Err(SendError(())), - } - } - - pub fn delete(&self, docid: DocumentId, external_id: String) -> StdResult<(), SendError<()>> { - let sender = self.0.unwrap(); - match sender.send(MergerOperation::DeleteDocument { docid, external_id }) { - Ok(()) => Ok(()), - Err(SendError(_)) => Err(SendError(())), - } - } - - pub fn finish(mut self) -> StdResult<(), SendError<()>> { - let sender = self.0.take().unwrap(); - match sender.send(MergerOperation::FinishedDocument) { - Ok(()) => Ok(()), - Err(SendError(_)) => Err(SendError(())), - } - } -} - -impl Drop for DocumentSender<'_> { - fn drop(&mut self) { - if let Some(sender) = self.0.take() { - let _ = sender.send(MergerOperation::FinishedDocument); - } - } -} diff --git a/milli/src/update/new/extract/cache.rs b/milli/src/update/new/extract/cache.rs index 2fbe427f3..a366435d8 100644 --- a/milli/src/update/new/extract/cache.rs +++ b/milli/src/update/new/extract/cache.rs @@ -1,299 +1,611 @@ -use std::fmt::Write as _; -use std::mem; -use std::num::NonZeroUsize; +//! # How the Merge Algorithm works +//! +//! Each extractor create #Threads caches and balances the entries +//! 
based on the hash of the keys. To do that we can use the
+//! hashbrown::hash_map::RawEntryBuilderMut::from_key_hashed_nocheck.
+//! This way we can compute the hash on our own, decide on the cache to
+//! target, and insert it into the right HashMap.
+//!
+//! #Thread -> caches
+//! t1 -> [t1c1, t1c2, t1c3]
+//! t2 -> [t2c1, t2c2, t2c3]
+//! t3 -> [t3c1, t3c2, t3c3]
+//!
+//! When the extractors are done filling the caches, we want to merge
+//! the content of all the caches. We do a transpose and each thread is
+//! assigned the associated cache. By doing that we know that every key
+//! is put in a known cache and will collide with keys in the other
+//! caches of the other threads.
+//!
+//! #Thread -> caches
+//! t1 -> [t1c1, t2c1, t3c1]
+//! t2 -> [t1c2, t2c2, t3c2]
+//! t3 -> [t1c3, t2c3, t3c3]
+//!
+//! When we encounter a miss in the other caches we must still try
+//! to find it in the spilled entries. This is the reason why we use
+//! a grenad sorter/reader so that we can seek "efficiently" for a key.
+//!
+//! ## More Detailed Algorithm
+//!
+//! Each sub-cache has an in-memory HashMap and some spilled
+//! lexicographically ordered entries on disk (grenad). We first iterate
+//! over the spilled entries of all the caches at once by using a merge
+//! join algorithm. This algorithm will merge the entries by using its
+//! merge function.
+//!
+//! Every time a merged entry is emitted by the merge join algorithm we also
+//! fetch the value from the other in-memory caches (HashMaps) to finish
+//! the merge. Every time we retrieve an entry from the in-memory caches
+//! we mark it with a tombstone for later.
+//!
+//! Once we are done with the spilled entries we iterate over the in-memory
+//! HashMaps. We iterate over the first one, retrieve the content from the
+//! other ones and mark them with a tombstone again. We also make sure
+//! to ignore the dead (tombstoned) ones.
+//!
+//! ## Memory Control
+//!
+//! We can detect that there is no more memory available when the
+//! bump allocator reaches a threshold. When this is the case we
+//! freeze the cache. There is one bump allocator per thread and the
+//! memory must be well balanced as we manage one type of extraction
+//! at a time with well-balanced documents.
+//!
+//! It means that the unknown new keys added to the
+//! cache are directly spilled to disk: basically a key followed by a
+//! del/add bitmap. For the known keys we can keep modifying them in
+//! the materialized version in the cache: update the del/add bitmaps.
+//!
+//! For now we can use a grenad sorter for spilling even though I think
+//! it's not the most efficient way (too many files open, sorting entries).
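+//!
+//! As a rough sketch (simplified from the real insertion paths below, which
+//! go through `raw_entry_mut`), a key always lands in the same bucket on
+//! every thread because only its hash is used:
+//!
+//! ```ignore
+//! let hash = hasher.hash_one(key);
+//! let bucket = hash as usize % buckets;
+//! // every thread computes the same bucket for the same key
+//! caches[bucket].insert(key, deladd);
+//! ```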
-use grenad::{MergeFunction, Sorter}; -use roaring::bitmap::Statistics; +use std::cmp::Ordering; +use std::collections::binary_heap::PeekMut; +use std::collections::BinaryHeap; +use std::fs::File; +use std::hash::BuildHasher; +use std::io::BufReader; +use std::{io, iter, mem}; + +use bumpalo::Bump; +use grenad::ReaderCursor; +use hashbrown::hash_map::RawEntryMut; +use hashbrown::HashMap; +use raw_collections::map::FrozenMap; use roaring::RoaringBitmap; -use smallvec::SmallVec; +use rustc_hash::FxBuildHasher; -use super::lru::Lru; use crate::update::del_add::{DelAdd, KvWriterDelAdd}; -use crate::CboRoaringBitmapCodec; +use crate::update::new::indexer::document_changes::MostlySend; +use crate::update::new::KvReaderDelAdd; +use crate::update::MergeDeladdCboRoaringBitmaps; +use crate::{CboRoaringBitmapCodec, Result}; -const KEY_SIZE: usize = 12; +/// A cache that stores bytes keys associated to CboDelAddRoaringBitmaps. +/// +/// Internally balances the content over `N` buckets for future merging. +pub struct BalancedCaches<'extractor> { + hasher: FxBuildHasher, + alloc: &'extractor Bump, + max_memory: Option, + caches: InnerCaches<'extractor>, +} -#[derive(Debug)] -pub struct CboCachedSorter { - cache: Lru, DelAddRoaringBitmap>, - sorter: Sorter, +enum InnerCaches<'extractor> { + Normal(NormalCaches<'extractor>), + Spilling(SpillingCaches<'extractor>), +} + +impl<'extractor> BalancedCaches<'extractor> { + pub fn new_in(buckets: usize, max_memory: Option, alloc: &'extractor Bump) -> Self { + Self { + hasher: FxBuildHasher, + max_memory, + caches: InnerCaches::Normal(NormalCaches { + caches: iter::repeat_with(|| HashMap::with_hasher_in(FxBuildHasher, alloc)) + .take(buckets) + .collect(), + }), + alloc, + } + } + + fn buckets(&self) -> usize { + match &self.caches { + InnerCaches::Normal(caches) => caches.caches.len(), + InnerCaches::Spilling(caches) => caches.caches.len(), + } + } + + pub fn insert_del_u32(&mut self, key: &[u8], n: u32) -> Result<()> { + if self.max_memory.map_or(false, |mm| self.alloc.allocated_bytes() >= mm) { + self.start_spilling()?; + } + + let buckets = self.buckets(); + match &mut self.caches { + InnerCaches::Normal(normal) => { + normal.insert_del_u32(&self.hasher, self.alloc, buckets, key, n); + Ok(()) + } + InnerCaches::Spilling(spilling) => { + spilling.insert_del_u32(&self.hasher, buckets, key, n) + } + } + } + + pub fn insert_add_u32(&mut self, key: &[u8], n: u32) -> Result<()> { + if self.max_memory.map_or(false, |mm| self.alloc.allocated_bytes() >= mm) { + self.start_spilling()?; + } + + let buckets = self.buckets(); + match &mut self.caches { + InnerCaches::Normal(normal) => { + normal.insert_add_u32(&self.hasher, self.alloc, buckets, key, n); + Ok(()) + } + InnerCaches::Spilling(spilling) => { + spilling.insert_add_u32(&self.hasher, buckets, key, n) + } + } + } + + /// Make sure the cache is no longer allocating data + /// and writes every new and unknow entry to disk. 
+ fn start_spilling(&mut self) -> Result<()> { + let BalancedCaches { hasher: _, alloc, max_memory: _, caches } = self; + + if let InnerCaches::Normal(normal_caches) = caches { + eprintln!( + "We are spilling after we allocated {} bytes on thread #{}", + alloc.allocated_bytes(), + rayon::current_thread_index().unwrap_or(0) + ); + + let allocated: usize = normal_caches.caches.iter().map(|m| m.allocation_size()).sum(); + eprintln!("The last allocated HasMap took {allocated} bytes"); + + let dummy = NormalCaches { caches: Vec::new() }; + let NormalCaches { caches: cache_maps } = mem::replace(normal_caches, dummy); + *caches = InnerCaches::Spilling(SpillingCaches::from_cache_maps(cache_maps)); + } + + Ok(()) + } + + pub fn freeze(&mut self) -> Result>> { + match &mut self.caches { + InnerCaches::Normal(NormalCaches { caches }) => caches + .iter_mut() + .enumerate() + .map(|(bucket, map)| { + Ok(FrozenCache { bucket, cache: FrozenMap::new(map), spilled: Vec::new() }) + }) + .collect(), + InnerCaches::Spilling(SpillingCaches { caches, spilled_entries, .. }) => caches + .iter_mut() + .zip(mem::take(spilled_entries)) + .enumerate() + .map(|(bucket, (map, sorter))| { + let spilled = sorter + .into_reader_cursors()? + .into_iter() + .map(ReaderCursor::into_inner) + .map(BufReader::new) + .map(|bufreader| grenad::Reader::new(bufreader).map_err(Into::into)) + .collect::>()?; + Ok(FrozenCache { bucket, cache: FrozenMap::new(map), spilled }) + }) + .collect(), + } + } +} + +unsafe impl MostlySend for BalancedCaches<'_> {} + +struct NormalCaches<'extractor> { + caches: Vec>, +} + +impl<'extractor> NormalCaches<'extractor> { + pub fn insert_del_u32( + &mut self, + hasher: &FxBuildHasher, + alloc: &'extractor Bump, + buckets: usize, + key: &[u8], + n: u32, + ) { + let hash = hasher.hash_one(key); + let bucket = compute_bucket_from_hash(buckets, hash); + + match self.caches[bucket].raw_entry_mut().from_hash(hash, |&k| k == key) { + RawEntryMut::Occupied(mut entry) => { + entry.get_mut().del.get_or_insert_with(RoaringBitmap::default).insert(n); + } + RawEntryMut::Vacant(entry) => { + entry.insert_hashed_nocheck( + hash, + alloc.alloc_slice_copy(key), + DelAddRoaringBitmap::new_del_u32(n), + ); + } + } + } + + pub fn insert_add_u32( + &mut self, + hasher: &FxBuildHasher, + alloc: &'extractor Bump, + buckets: usize, + key: &[u8], + n: u32, + ) { + let hash = hasher.hash_one(key); + let bucket = compute_bucket_from_hash(buckets, hash); + match self.caches[bucket].raw_entry_mut().from_hash(hash, |&k| k == key) { + RawEntryMut::Occupied(mut entry) => { + entry.get_mut().add.get_or_insert_with(RoaringBitmap::default).insert(n); + } + RawEntryMut::Vacant(entry) => { + entry.insert_hashed_nocheck( + hash, + alloc.alloc_slice_copy(key), + DelAddRoaringBitmap::new_add_u32(n), + ); + } + } + } +} + +struct SpillingCaches<'extractor> { + caches: Vec>, + spilled_entries: Vec>, deladd_buffer: Vec, cbo_buffer: Vec, - total_insertions: usize, - fitted_in_key: usize, } -impl CboCachedSorter { - pub fn new(cap: NonZeroUsize, sorter: Sorter) -> Self { - CboCachedSorter { - cache: Lru::new(cap), - sorter, +impl<'extractor> SpillingCaches<'extractor> { + fn from_cache_maps( + caches: Vec< + HashMap<&'extractor [u8], DelAddRoaringBitmap, FxBuildHasher, &'extractor Bump>, + >, + ) -> SpillingCaches<'extractor> { + SpillingCaches { + spilled_entries: iter::repeat_with(|| { + let mut builder = grenad::SorterBuilder::new(MergeDeladdCboRoaringBitmaps); + builder.dump_threshold(0); + builder.allow_realloc(false); + builder.build() + }) 
+ .take(caches.len()) + .collect(), + caches, deladd_buffer: Vec::new(), cbo_buffer: Vec::new(), - total_insertions: 0, - fitted_in_key: 0, } } -} -impl CboCachedSorter { - pub fn insert_del_u32(&mut self, key: &[u8], n: u32) -> grenad::Result<(), MF::Error> { - match self.cache.get_mut(key) { - Some(DelAddRoaringBitmap { del, add: _ }) => { - del.get_or_insert_with(RoaringBitmap::default).insert(n); - } - None => { - self.total_insertions += 1; - self.fitted_in_key += (key.len() <= KEY_SIZE) as usize; - let value = DelAddRoaringBitmap::new_del_u32(n); - if let Some((key, deladd)) = self.cache.push(key.into(), value) { - self.write_entry(key, deladd)?; - } - } - } - - Ok(()) - } - - pub fn insert_del( + pub fn insert_del_u32( &mut self, + hasher: &FxBuildHasher, + buckets: usize, key: &[u8], - bitmap: RoaringBitmap, - ) -> grenad::Result<(), MF::Error> { - match self.cache.get_mut(key) { - Some(DelAddRoaringBitmap { del, add: _ }) => { - *del.get_or_insert_with(RoaringBitmap::default) |= bitmap; + n: u32, + ) -> Result<()> { + let hash = hasher.hash_one(key); + let bucket = compute_bucket_from_hash(buckets, hash); + match self.caches[bucket].raw_entry_mut().from_hash(hash, |&k| k == key) { + RawEntryMut::Occupied(mut entry) => { + entry.get_mut().del.get_or_insert_with(RoaringBitmap::default).insert(n); + Ok(()) } - None => { - self.total_insertions += 1; - self.fitted_in_key += (key.len() <= KEY_SIZE) as usize; - let value = DelAddRoaringBitmap::new_del(bitmap); - if let Some((key, deladd)) = self.cache.push(key.into(), value) { - self.write_entry(key, deladd)?; - } + RawEntryMut::Vacant(_entry) => { + let deladd = DelAddRoaringBitmap::new_del_u32(n); + spill_entry_to_sorter( + &mut self.spilled_entries[bucket], + &mut self.deladd_buffer, + &mut self.cbo_buffer, + key, + deladd, + ) } } - - Ok(()) } - pub fn insert_add_u32(&mut self, key: &[u8], n: u32) -> grenad::Result<(), MF::Error> { - match self.cache.get_mut(key) { - Some(DelAddRoaringBitmap { del: _, add }) => { - add.get_or_insert_with(RoaringBitmap::default).insert(n); - } - None => { - self.total_insertions += 1; - self.fitted_in_key += (key.len() <= KEY_SIZE) as usize; - let value = DelAddRoaringBitmap::new_add_u32(n); - if let Some((key, deladd)) = self.cache.push(key.into(), value) { - self.write_entry(key, deladd)?; - } - } - } - - Ok(()) - } - - pub fn insert_add( + pub fn insert_add_u32( &mut self, + hasher: &FxBuildHasher, + buckets: usize, key: &[u8], - bitmap: RoaringBitmap, - ) -> grenad::Result<(), MF::Error> { - match self.cache.get_mut(key) { - Some(DelAddRoaringBitmap { del: _, add }) => { - *add.get_or_insert_with(RoaringBitmap::default) |= bitmap; + n: u32, + ) -> Result<()> { + let hash = hasher.hash_one(key); + let bucket = compute_bucket_from_hash(buckets, hash); + match self.caches[bucket].raw_entry_mut().from_hash(hash, |&k| k == key) { + RawEntryMut::Occupied(mut entry) => { + entry.get_mut().add.get_or_insert_with(RoaringBitmap::default).insert(n); + Ok(()) } - None => { - self.total_insertions += 1; - self.fitted_in_key += (key.len() <= KEY_SIZE) as usize; - let value = DelAddRoaringBitmap::new_add(bitmap); - if let Some((key, deladd)) = self.cache.push(key.into(), value) { - self.write_entry(key, deladd)?; + RawEntryMut::Vacant(_entry) => { + let deladd = DelAddRoaringBitmap::new_add_u32(n); + spill_entry_to_sorter( + &mut self.spilled_entries[bucket], + &mut self.deladd_buffer, + &mut self.cbo_buffer, + key, + deladd, + ) + } + } + } +} + +#[inline] +fn compute_bucket_from_hash(buckets: usize, hash: 
u64) -> usize { + hash as usize % buckets +} + +fn spill_entry_to_sorter( + spilled_entries: &mut grenad::Sorter, + deladd_buffer: &mut Vec, + cbo_buffer: &mut Vec, + key: &[u8], + deladd: DelAddRoaringBitmap, +) -> Result<()> { + deladd_buffer.clear(); + let mut value_writer = KvWriterDelAdd::new(deladd_buffer); + + match deladd { + DelAddRoaringBitmap { del: Some(del), add: None } => { + cbo_buffer.clear(); + CboRoaringBitmapCodec::serialize_into(&del, cbo_buffer); + value_writer.insert(DelAdd::Deletion, &cbo_buffer)?; + } + DelAddRoaringBitmap { del: None, add: Some(add) } => { + cbo_buffer.clear(); + CboRoaringBitmapCodec::serialize_into(&add, cbo_buffer); + value_writer.insert(DelAdd::Addition, &cbo_buffer)?; + } + DelAddRoaringBitmap { del: Some(del), add: Some(add) } => { + cbo_buffer.clear(); + CboRoaringBitmapCodec::serialize_into(&del, cbo_buffer); + value_writer.insert(DelAdd::Deletion, &cbo_buffer)?; + + cbo_buffer.clear(); + CboRoaringBitmapCodec::serialize_into(&add, cbo_buffer); + value_writer.insert(DelAdd::Addition, &cbo_buffer)?; + } + DelAddRoaringBitmap { del: None, add: None } => return Ok(()), + } + + let bytes = value_writer.into_inner().unwrap(); + spilled_entries.insert(key, bytes).map_err(Into::into) +} + +pub struct FrozenCache<'a, 'extractor> { + bucket: usize, + cache: FrozenMap<'a, 'extractor, &'extractor [u8], DelAddRoaringBitmap, FxBuildHasher>, + spilled: Vec>>, +} + +pub fn transpose_and_freeze_caches<'a, 'extractor>( + caches: &'a mut [BalancedCaches<'extractor>], +) -> Result>>> { + let width = caches.first().map(BalancedCaches::buckets).unwrap_or(0); + let mut bucket_caches: Vec<_> = iter::repeat_with(Vec::new).take(width).collect(); + + for thread_cache in caches { + for frozen in thread_cache.freeze()? { + bucket_caches[frozen.bucket].push(frozen); + } + } + + Ok(bucket_caches) +} + +/// Merges the caches that must be all associated to the same bucket. +/// +/// # Panics +/// +/// - If the bucket IDs in these frozen caches are not exactly the same. +pub fn merge_caches(frozen: Vec, mut f: F) -> Result<()> +where + F: for<'a> FnMut(&'a [u8], DelAddRoaringBitmap) -> Result<()>, +{ + let mut maps = Vec::new(); + let mut readers = Vec::new(); + let mut current_bucket = None; + for FrozenCache { bucket, cache, ref mut spilled } in frozen { + assert_eq!(*current_bucket.get_or_insert(bucket), bucket); + maps.push(cache); + readers.append(spilled); + } + + // First manage the spilled entries by looking into the HashMaps, + // merge them and mark them as dummy. 
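+    // The spilled readers are all lexicographically ordered, so we drive a
+    // k-way merge join over them with a heap of cursors. Note that `Entry`'s
+    // `Ord` implementation is reversed, which turns `BinaryHeap` (a max-heap)
+    // into a min-heap: `pop()` yields the cursor sitting on the smallest key.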
+ let mut heap = BinaryHeap::new(); + for (source_index, source) in readers.into_iter().enumerate() { + let mut cursor = source.into_cursor()?; + if cursor.move_on_next()?.is_some() { + heap.push(Entry { cursor, source_index }); + } + } + + loop { + let mut first_entry = match heap.pop() { + Some(entry) => entry, + None => break, + }; + + let (first_key, first_value) = match first_entry.cursor.current() { + Some((key, value)) => (key, value), + None => break, + }; + + let mut output = DelAddRoaringBitmap::from_bytes(first_value)?; + while let Some(mut entry) = heap.peek_mut() { + if let Some((key, _value)) = entry.cursor.current() { + if first_key == key { + let new = DelAddRoaringBitmap::from_bytes(first_value)?; + output = output.merge(new); + // When we are done we the current value of this entry move make + // it move forward and let the heap reorganize itself (on drop) + if entry.cursor.move_on_next()?.is_none() { + PeekMut::pop(entry); + } + } else { + break; } } } - Ok(()) - } - - pub fn insert_del_add_u32(&mut self, key: &[u8], n: u32) -> grenad::Result<(), MF::Error> { - match self.cache.get_mut(key) { - Some(DelAddRoaringBitmap { del, add }) => { - del.get_or_insert_with(RoaringBitmap::default).insert(n); - add.get_or_insert_with(RoaringBitmap::default).insert(n); - } - None => { - self.total_insertions += 1; - self.fitted_in_key += (key.len() <= KEY_SIZE) as usize; - let value = DelAddRoaringBitmap::new_del_add_u32(n); - if let Some((key, deladd)) = self.cache.push(key.into(), value) { - self.write_entry(key, deladd)?; + // Once we merged all of the spilled bitmaps we must also + // fetch the entries from the non-spilled entries (the HashMaps). + for (map_index, map) in maps.iter_mut().enumerate() { + if first_entry.source_index != map_index { + if let Some(new) = map.get_mut(first_key) { + output = output.merge(mem::take(new)); } } } - Ok(()) + // We send the merged entry outside. + (f)(first_key, output)?; + + // Don't forget to put the first entry back into the heap. + if first_entry.cursor.move_on_next()?.is_some() { + heap.push(first_entry) + } } - fn write_entry>( - &mut self, - key: A, - deladd: DelAddRoaringBitmap, - ) -> grenad::Result<(), MF::Error> { - /// TODO we must create a serialization trait to correctly serialize bitmaps - self.deladd_buffer.clear(); - let mut value_writer = KvWriterDelAdd::new(&mut self.deladd_buffer); - match deladd { - DelAddRoaringBitmap { del: Some(del), add: None } => { - self.cbo_buffer.clear(); - CboRoaringBitmapCodec::serialize_into(&del, &mut self.cbo_buffer); - value_writer.insert(DelAdd::Deletion, &self.cbo_buffer)?; - } - DelAddRoaringBitmap { del: None, add: Some(add) } => { - self.cbo_buffer.clear(); - CboRoaringBitmapCodec::serialize_into(&add, &mut self.cbo_buffer); - value_writer.insert(DelAdd::Addition, &self.cbo_buffer)?; - } - DelAddRoaringBitmap { del: Some(del), add: Some(add) } => { - self.cbo_buffer.clear(); - CboRoaringBitmapCodec::serialize_into(&del, &mut self.cbo_buffer); - value_writer.insert(DelAdd::Deletion, &self.cbo_buffer)?; + // Then manage the content on the HashMap entries that weren't taken (mem::take). 
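+    // Entries that were already merged during the spilled phase above were
+    // left empty by `mem::take` (the "tombstone" mentioned in the module
+    // documentation), so the `is_empty` check below skips them; only the
+    // remaining live entries still need to be merged across the other maps.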
+ while let Some(mut map) = maps.pop() { + for (key, output) in map.iter_mut() { + let mut output = mem::take(output); - self.cbo_buffer.clear(); - CboRoaringBitmapCodec::serialize_into(&add, &mut self.cbo_buffer); - value_writer.insert(DelAdd::Addition, &self.cbo_buffer)?; + // Make sure we don't try to work with entries already managed by the spilled + if !output.is_empty() { + for rhs in maps.iter_mut() { + if let Some(new) = rhs.get_mut(key) { + output = output.merge(mem::take(new)); + } + } + + // We send the merged entry outside. + (f)(key, output)?; } - DelAddRoaringBitmap { del: None, add: None } => return Ok(()), } - let bytes = value_writer.into_inner().unwrap(); - self.sorter.insert(key, bytes) } - pub fn direct_insert(&mut self, key: &[u8], val: &[u8]) -> grenad::Result<(), MF::Error> { - self.sorter.insert(key, val) - } + Ok(()) +} - pub fn into_sorter(mut self) -> grenad::Result, MF::Error> { - let mut all_n_containers = Vec::new(); - let mut all_n_array_containers = Vec::new(); - let mut all_n_bitset_containers = Vec::new(); - let mut all_n_values_array_containers = Vec::new(); - let mut all_n_values_bitset_containers = Vec::new(); - let mut all_cardinality = Vec::new(); +struct Entry { + cursor: ReaderCursor, + source_index: usize, +} - let default_arc = Lru::new(NonZeroUsize::MIN); - for (key, deladd) in mem::replace(&mut self.cache, default_arc) { - for bitmap in [&deladd.del, &deladd.add].into_iter().flatten() { - let Statistics { - n_containers, - n_array_containers, - n_bitset_containers, - n_values_array_containers, - n_values_bitset_containers, - cardinality, - .. - } = bitmap.statistics(); - all_n_containers.push(n_containers); - all_n_array_containers.push(n_array_containers); - all_n_bitset_containers.push(n_bitset_containers); - all_n_values_array_containers.push(n_values_array_containers); - all_n_values_bitset_containers.push(n_values_bitset_containers); - all_cardinality.push(cardinality as u32); - } - - self.write_entry(key, deladd)?; - } - - let mut output = String::new(); - - for (name, mut slice) in [ - ("n_containers", all_n_containers), - ("n_array_containers", all_n_array_containers), - ("n_bitset_containers", all_n_bitset_containers), - ("n_values_array_containers", all_n_values_array_containers), - ("n_values_bitset_containers", all_n_values_bitset_containers), - ("cardinality", all_cardinality), - ] { - let _ = writeln!(&mut output, "{name} (p100) {:?}", Stats::from_slice(&mut slice)); - // let _ = writeln!(&mut output, "{name} (p99) {:?}", Stats::from_slice_p99(&mut slice)); - } - - let _ = writeln!( - &mut output, - "LruCache stats: {} <= {KEY_SIZE} bytes ({}%) on a total of {} insertions", - self.fitted_in_key, - (self.fitted_in_key as f32 / self.total_insertions as f32) * 100.0, - self.total_insertions, - ); - - eprintln!("{output}"); - - Ok(self.sorter) +impl Ord for Entry { + fn cmp(&self, other: &Entry) -> Ordering { + let skey = self.cursor.current().map(|(k, _)| k); + let okey = other.cursor.current().map(|(k, _)| k); + skey.cmp(&okey).then(self.source_index.cmp(&other.source_index)).reverse() } } -#[derive(Default, Debug)] -struct Stats { - pub len: usize, - pub average: f32, - pub mean: u32, - pub min: u32, - pub max: u32, -} +impl Eq for Entry {} -impl Stats { - fn from_slice(slice: &mut [u32]) -> Stats { - slice.sort_unstable(); - Self::from_sorted_slice(slice) - } - - fn from_slice_p99(slice: &mut [u32]) -> Stats { - slice.sort_unstable(); - let new_len = slice.len() - (slice.len() as f32 / 100.0) as usize; - match slice.get(..new_len) 
{ - Some(slice) => Self::from_sorted_slice(slice), - None => Stats::default(), - } - } - - fn from_sorted_slice(slice: &[u32]) -> Stats { - let sum: f64 = slice.iter().map(|i| *i as f64).sum(); - let average = (sum / slice.len() as f64) as f32; - let mean = *slice.len().checked_div(2).and_then(|middle| slice.get(middle)).unwrap_or(&0); - let min = *slice.first().unwrap_or(&0); - let max = *slice.last().unwrap_or(&0); - Stats { len: slice.len(), average, mean, min, max } +impl PartialEq for Entry { + fn eq(&self, other: &Entry) -> bool { + self.cmp(other) == Ordering::Equal } } -#[derive(Debug, Clone)] +impl PartialOrd for Entry { + fn partial_cmp(&self, other: &Entry) -> Option { + Some(self.cmp(other)) + } +} + +#[derive(Debug, Default, Clone)] pub struct DelAddRoaringBitmap { - pub(crate) del: Option, - pub(crate) add: Option, + pub del: Option, + pub add: Option, } impl DelAddRoaringBitmap { - fn new_del_add_u32(n: u32) -> Self { - DelAddRoaringBitmap { - del: Some(RoaringBitmap::from([n])), - add: Some(RoaringBitmap::from([n])), - } + fn from_bytes(bytes: &[u8]) -> io::Result { + let reader = KvReaderDelAdd::from_slice(bytes); + + let del = match reader.get(DelAdd::Deletion) { + Some(bytes) => CboRoaringBitmapCodec::deserialize_from(bytes).map(Some)?, + None => None, + }; + + let add = match reader.get(DelAdd::Addition) { + Some(bytes) => CboRoaringBitmapCodec::deserialize_from(bytes).map(Some)?, + None => None, + }; + + Ok(DelAddRoaringBitmap { del, add }) } - fn new_del(bitmap: RoaringBitmap) -> Self { - DelAddRoaringBitmap { del: Some(bitmap), add: None } + pub fn empty() -> DelAddRoaringBitmap { + DelAddRoaringBitmap { del: None, add: None } } - fn new_del_u32(n: u32) -> Self { + pub fn is_empty(&self) -> bool { + let DelAddRoaringBitmap { del, add } = self; + del.is_none() && add.is_none() + } + + pub fn insert_del_u32(&mut self, n: u32) { + self.del.get_or_insert_with(RoaringBitmap::new).insert(n); + } + + pub fn insert_add_u32(&mut self, n: u32) { + self.add.get_or_insert_with(RoaringBitmap::new).insert(n); + } + + pub fn new_del_u32(n: u32) -> Self { DelAddRoaringBitmap { del: Some(RoaringBitmap::from([n])), add: None } } - fn new_add(bitmap: RoaringBitmap) -> Self { - DelAddRoaringBitmap { del: None, add: Some(bitmap) } - } - - fn new_add_u32(n: u32) -> Self { + pub fn new_add_u32(n: u32) -> Self { DelAddRoaringBitmap { del: None, add: Some(RoaringBitmap::from([n])) } } + + pub fn merge(self, rhs: DelAddRoaringBitmap) -> DelAddRoaringBitmap { + let DelAddRoaringBitmap { del, add } = self; + let DelAddRoaringBitmap { del: ndel, add: nadd } = rhs; + + let del = match (del, ndel) { + (None, None) => None, + (None, Some(del)) | (Some(del), None) => Some(del), + (Some(del), Some(ndel)) => Some(del | ndel), + }; + + let add = match (add, nadd) { + (None, None) => None, + (None, Some(add)) | (Some(add), None) => Some(add), + (Some(add), Some(nadd)) => Some(add | nadd), + }; + + DelAddRoaringBitmap { del, add } + } + + pub fn apply_to(&self, documents_ids: &mut RoaringBitmap) { + let DelAddRoaringBitmap { del, add } = self; + + if let Some(del) = del { + *documents_ids -= del; + } + + if let Some(add) = add { + *documents_ids |= add; + } + } } diff --git a/milli/src/update/new/extract/documents.rs b/milli/src/update/new/extract/documents.rs new file mode 100644 index 000000000..21fe4d518 --- /dev/null +++ b/milli/src/update/new/extract/documents.rs @@ -0,0 +1,73 @@ +use std::cell::RefCell; + +use bumpalo::Bump; + +use super::DelAddRoaringBitmap; +use 
crate::update::new::channel::DocumentsSender; +use crate::update::new::document::write_to_obkv; +use crate::update::new::indexer::document_changes::{ + DocumentChangeContext, Extractor, FullySend, RefCellExt as _, +}; +use crate::update::new::DocumentChange; +use crate::Result; + +pub struct DocumentsExtractor<'a> { + documents_sender: &'a DocumentsSender<'a>, +} + +impl<'a> DocumentsExtractor<'a> { + pub fn new(documents_sender: &'a DocumentsSender<'a>) -> Self { + Self { documents_sender } + } +} + +impl<'a, 'extractor> Extractor<'extractor> for DocumentsExtractor<'a> { + type Data = FullySend>; + + fn init_data(&self, _extractor_alloc: &'extractor Bump) -> Result { + Ok(FullySend(RefCell::new(DelAddRoaringBitmap::empty()))) + } + + fn process( + &self, + change: DocumentChange, + context: &DocumentChangeContext, + ) -> Result<()> { + let mut document_buffer = Vec::new(); + let mut delta_documents_ids = context.data.0.borrow_mut_or_yield(); + + let new_fields_ids_map = context.new_fields_ids_map.borrow_or_yield(); + let new_fields_ids_map = &*new_fields_ids_map; + let new_fields_ids_map = new_fields_ids_map.local_map(); + + let external_docid = change.external_docid().to_owned(); + + // document but we need to create a function that collects and compresses documents. + match change { + DocumentChange::Deletion(deletion) => { + let docid = deletion.docid(); + self.documents_sender.delete(docid, external_docid).unwrap(); + delta_documents_ids.insert_del_u32(docid); + } + /// TODO: change NONE by SOME(vector) when implemented + DocumentChange::Update(update) => { + let docid = update.docid(); + let content = + update.new(&context.txn, context.index, &context.db_fields_ids_map)?; + let content = + write_to_obkv(&content, None, new_fields_ids_map, &mut document_buffer)?; + self.documents_sender.uncompressed(docid, external_docid, content).unwrap(); + } + DocumentChange::Insertion(insertion) => { + let docid = insertion.docid(); + let content = insertion.new(); + let content = + write_to_obkv(&content, None, new_fields_ids_map, &mut document_buffer)?; + self.documents_sender.uncompressed(docid, external_docid, content).unwrap(); + delta_documents_ids.insert_add_u32(docid); + // extracted_dictionary_sender.send(self, dictionary: &[u8]); + } + } + Ok(()) + } +} diff --git a/milli/src/update/new/extract/faceted/extract_facets.rs b/milli/src/update/new/extract/faceted/extract_facets.rs index 9f3ed18d8..6844dd6f2 100644 --- a/milli/src/update/new/extract/faceted/extract_facets.rs +++ b/milli/src/update/new/extract/faceted/extract_facets.rs @@ -1,16 +1,12 @@ use std::cell::RefCell; use std::collections::HashSet; -use std::fmt::Debug; -use std::fs::File; use std::ops::DerefMut as _; use bumpalo::Bump; -use grenad::{MergeFunction, Merger}; use heed::RoTxn; -use rayon::iter::{ParallelBridge as _, ParallelIterator as _}; use serde_json::Value; -use super::super::cache::CboCachedSorter; +use super::super::cache::BalancedCaches; use super::facet_document::extract_document_facets; use super::FacetKind; use crate::facet::value_encoding::f64_into_bytes; @@ -20,44 +16,30 @@ use crate::update::new::indexer::document_changes::{ IndexingContext, RefCellExt, ThreadLocal, }; use crate::update::new::DocumentChange; -use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps}; +use crate::update::GrenadParameters; use crate::{DocumentId, FieldId, Index, Result, MAX_FACET_VALUE_LENGTH}; -pub struct FacetedExtractorData<'extractor> { - attributes_to_extract: &'extractor [&'extractor str], +pub 
struct FacetedExtractorData<'a> { + attributes_to_extract: &'a [&'a str], grenad_parameters: GrenadParameters, - max_memory: Option, + buckets: usize, } -impl<'extractor> Extractor<'extractor> for FacetedExtractorData<'extractor> { - type Data = FullySend>>; +impl<'a, 'extractor> Extractor<'extractor> for FacetedExtractorData<'a> { + type Data = RefCell>; - fn init_data( - &self, - _extractor_alloc: raw_collections::alloc::RefBump<'extractor>, - ) -> Result { - Ok(FullySend(RefCell::new(CboCachedSorter::new( - // TODO use a better value - 1_000_000.try_into().unwrap(), - create_sorter( - grenad::SortAlgorithm::Stable, - MergeDeladdCboRoaringBitmaps, - self.grenad_parameters.chunk_compression_type, - self.grenad_parameters.chunk_compression_level, - self.grenad_parameters.max_nb_chunks, - self.max_memory, - // *NOTE*: this must not be set to true: - // 1. we're already using max parallelism in the pool, so it wouldn't help - // 2. it creates correctness issues if it causes to yield a borrow-mut wielding task - false, - ), - )))) + fn init_data(&self, extractor_alloc: &'extractor Bump) -> Result { + Ok(RefCell::new(BalancedCaches::new_in( + self.buckets, + self.grenad_parameters.max_memory, + extractor_alloc, + ))) } fn process( &self, change: DocumentChange, - context: &crate::update::new::indexer::document_changes::DocumentChangeContext, + context: &DocumentChangeContext, ) -> Result<()> { FacetedDocidsExtractor::extract_document_change(context, self.attributes_to_extract, change) } @@ -67,16 +49,14 @@ pub struct FacetedDocidsExtractor; impl FacetedDocidsExtractor { fn extract_document_change( - context: &DocumentChangeContext< - FullySend>>, - >, + context: &DocumentChangeContext>, attributes_to_extract: &[&str], document_change: DocumentChange, ) -> Result<()> { let index = &context.index; let rtxn = &context.txn; let mut new_fields_ids_map = context.new_fields_ids_map.borrow_mut_or_yield(); - let mut cached_sorter = context.data.0.borrow_mut_or_yield(); + let mut cached_sorter = context.data.borrow_mut_or_yield(); match document_change { DocumentChange::Deletion(inner) => extract_document_facets( attributes_to_extract, @@ -86,7 +66,7 @@ impl FacetedDocidsExtractor { Self::facet_fn_with_options( &context.doc_alloc, cached_sorter.deref_mut(), - CboCachedSorter::insert_del_u32, + BalancedCaches::insert_del_u32, inner.docid(), fid, value, @@ -102,7 +82,7 @@ impl FacetedDocidsExtractor { Self::facet_fn_with_options( &context.doc_alloc, cached_sorter.deref_mut(), - CboCachedSorter::insert_del_u32, + BalancedCaches::insert_del_u32, inner.docid(), fid, value, @@ -118,7 +98,7 @@ impl FacetedDocidsExtractor { Self::facet_fn_with_options( &context.doc_alloc, cached_sorter.deref_mut(), - CboCachedSorter::insert_add_u32, + BalancedCaches::insert_add_u32, inner.docid(), fid, value, @@ -134,7 +114,7 @@ impl FacetedDocidsExtractor { Self::facet_fn_with_options( &context.doc_alloc, cached_sorter.deref_mut(), - CboCachedSorter::insert_add_u32, + BalancedCaches::insert_add_u32, inner.docid(), fid, value, @@ -144,25 +124,20 @@ impl FacetedDocidsExtractor { } } - fn facet_fn_with_options( + fn facet_fn_with_options<'extractor>( doc_alloc: &Bump, - cached_sorter: &mut CboCachedSorter, - cache_fn: impl Fn(&mut CboCachedSorter, &[u8], u32) -> grenad::Result<(), MF::Error>, + cached_sorter: &mut BalancedCaches<'extractor>, + cache_fn: impl Fn(&mut BalancedCaches<'extractor>, &[u8], u32) -> Result<()>, docid: DocumentId, fid: FieldId, value: &Value, - ) -> Result<()> - where - MF: MergeFunction, - MF::Error: 
Debug, - grenad::Error: Into, - { + ) -> Result<()> { let mut buffer = bumpalo::collections::Vec::new_in(doc_alloc); // Exists // key: fid buffer.push(FacetKind::Exists as u8); buffer.extend_from_slice(&fid.to_be_bytes()); - cache_fn(cached_sorter, &buffer, docid).map_err(Into::into)?; + cache_fn(cached_sorter, &buffer, docid)?; match value { // Number @@ -177,8 +152,7 @@ impl FacetedDocidsExtractor { buffer.push(0); // level 0 buffer.extend_from_slice(&ordered); buffer.extend_from_slice(&n.to_be_bytes()); - - cache_fn(cached_sorter, &buffer, docid).map_err(Into::into) + cache_fn(cached_sorter, &buffer, docid) } else { Ok(()) } @@ -193,7 +167,7 @@ impl FacetedDocidsExtractor { buffer.extend_from_slice(&fid.to_be_bytes()); buffer.push(0); // level 0 buffer.extend_from_slice(truncated.as_bytes()); - cache_fn(cached_sorter, &buffer, docid).map_err(Into::into) + cache_fn(cached_sorter, &buffer, docid) } // Null // key: fid @@ -201,7 +175,7 @@ impl FacetedDocidsExtractor { buffer.clear(); buffer.push(FacetKind::Null as u8); buffer.extend_from_slice(&fid.to_be_bytes()); - cache_fn(cached_sorter, &buffer, docid).map_err(Into::into) + cache_fn(cached_sorter, &buffer, docid) } // Empty // key: fid @@ -209,13 +183,13 @@ impl FacetedDocidsExtractor { buffer.clear(); buffer.push(FacetKind::Empty as u8); buffer.extend_from_slice(&fid.to_be_bytes()); - cache_fn(cached_sorter, &buffer, docid).map_err(Into::into) + cache_fn(cached_sorter, &buffer, docid) } Value::Object(o) if o.is_empty() => { buffer.clear(); buffer.push(FacetKind::Empty as u8); buffer.extend_from_slice(&fid.to_be_bytes()); - cache_fn(cached_sorter, &buffer, docid).map_err(Into::into) + cache_fn(cached_sorter, &buffer, docid) } // Otherwise, do nothing /// TODO: What about Value::Bool? @@ -242,16 +216,13 @@ fn truncate_str(s: &str) -> &str { impl DocidsExtractor for FacetedDocidsExtractor { #[tracing::instrument(level = "trace", skip_all, target = "indexing::extract::faceted")] - fn run_extraction<'pl, 'fid, 'indexer, 'index, DC: DocumentChanges<'pl>>( + fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>>( grenad_parameters: GrenadParameters, document_changes: &DC, indexing_context: IndexingContext<'fid, 'indexer, 'index>, - extractor_allocs: &mut ThreadLocal>>, - ) -> Result> { - let max_memory = grenad_parameters.max_memory_by_thread(); - + extractor_allocs: &'extractor mut ThreadLocal>, + ) -> Result>> { let index = indexing_context.index; - let rtxn = index.read_txn()?; let attributes_to_extract = Self::attributes_to_extract(&rtxn, index)?; let attributes_to_extract: Vec<_> = @@ -266,7 +237,7 @@ impl DocidsExtractor for FacetedDocidsExtractor { let extractor = FacetedExtractorData { attributes_to_extract: &attributes_to_extract, grenad_parameters, - max_memory, + buckets: rayon::current_num_threads(), }; for_each_document_change( document_changes, @@ -276,26 +247,7 @@ impl DocidsExtractor for FacetedDocidsExtractor { &datastore, )?; } - { - let mut builder = grenad::MergerBuilder::new(MergeDeladdCboRoaringBitmaps); - let span = - tracing::trace_span!(target: "indexing::documents::extract", "merger_building"); - let _entered = span.enter(); - let readers: Vec<_> = datastore - .into_iter() - .par_bridge() - .map(|cached_sorter| { - let cached_sorter = cached_sorter.0.into_inner(); - let sorter = cached_sorter.into_sorter()?; - sorter.into_reader_cursors() - }) - .collect(); - - for reader in readers { - builder.extend(reader?); - } - Ok(builder.build()) - } + 
Ok(datastore.into_iter().map(RefCell::into_inner).collect()) } } diff --git a/milli/src/update/new/extract/lru.rs b/milli/src/update/new/extract/lru.rs deleted file mode 100644 index 3eca47cb2..000000000 --- a/milli/src/update/new/extract/lru.rs +++ /dev/null @@ -1,234 +0,0 @@ -use std::borrow::Borrow; -use std::hash::{BuildHasher, Hash}; -use std::iter::repeat_with; -use std::mem; -use std::num::NonZeroUsize; - -use hashbrown::hash_map::{DefaultHashBuilder, Entry}; -use hashbrown::HashMap; - -#[derive(Debug)] -pub struct Lru { - lookup: HashMap, - storage: FixedSizeList>, -} - -impl Lru { - /// Creates a new LRU cache that holds at most `capacity` elements. - pub fn new(capacity: NonZeroUsize) -> Self { - Self { lookup: HashMap::new(), storage: FixedSizeList::new(capacity.get()) } - } -} - -impl Lru { - /// Creates a new LRU cache that holds at most `capacity` elements - /// and uses the provided hash builder to hash keys. - pub fn with_hasher(capacity: NonZeroUsize, hash_builder: S) -> Lru { - Self { - lookup: HashMap::with_hasher(hash_builder), - storage: FixedSizeList::new(capacity.get()), - } - } -} - -impl Lru { - /// Returns a mutable reference to the value of the key in the cache or `None` if it is not present in the cache. - /// - /// Moves the key to the head of the LRU list if it exists. - pub fn get_mut(&mut self, key: &Q) -> Option<&mut V> - where - K: Borrow, - Q: Hash + Eq + ?Sized, - { - let idx = *self.lookup.get(key)?; - self.storage.move_front(idx).map(|node| &mut node.value) - } -} - -impl Lru { - pub fn push(&mut self, key: K, value: V) -> Option<(K, V)> { - match self.lookup.entry(key) { - Entry::Occupied(occ) => { - // It's fine to unwrap here because: - // * the entry already exists - let node = self.storage.move_front(*occ.get()).unwrap(); - let old_value = mem::replace(&mut node.value, value); - let old_key = occ.replace_key(); - Some((old_key, old_value)) - } - Entry::Vacant(vac) => { - let key = vac.key().clone(); - if self.storage.is_full() { - // It's fine to unwrap here because: - // * the cache capacity is non zero - // * the cache is full - let idx = self.storage.back_idx(); - let node = self.storage.move_front(idx).unwrap(); - let LruNode { key, value } = mem::replace(node, LruNode { key, value }); - vac.insert(idx); - self.lookup.remove(&key); - Some((key, value)) - } else { - // It's fine to unwrap here because: - // * the cache capacity is non zero - // * the cache is not full - let (idx, _) = self.storage.push_front(LruNode { key, value }).unwrap(); - vac.insert(idx); - None - } - } - } - } -} - -impl IntoIterator for Lru { - type Item = (K, V); - type IntoIter = IntoIter; - - fn into_iter(self) -> Self::IntoIter { - IntoIter { lookup_iter: self.lookup.into_iter(), nodes: self.storage.nodes } - } -} - -pub struct IntoIter { - lookup_iter: hashbrown::hash_map::IntoIter, - nodes: Box<[Option>>]>, -} - -impl Iterator for IntoIter { - type Item = (K, V); - - fn next(&mut self) -> Option { - let (_key, idx) = self.lookup_iter.next()?; - let LruNode { key, value } = self.nodes.get_mut(idx)?.take()?.data; - Some((key, value)) - } -} - -#[derive(Debug)] -struct LruNode { - key: K, - value: V, -} - -#[derive(Debug)] -struct FixedSizeListNode { - prev: usize, - next: usize, - data: T, -} - -#[derive(Debug)] -struct FixedSizeList { - nodes: Box<[Option>]>, - /// Also corresponds to the first `None` in the nodes. - length: usize, - // TODO Also, we probably do not need one of the front and back cursors. 
- front: usize, - back: usize, -} - -impl FixedSizeList { - fn new(capacity: usize) -> Self { - Self { - nodes: repeat_with(|| None).take(capacity).collect::>().into_boxed_slice(), - length: 0, - front: usize::MAX, - back: usize::MAX, - } - } - - #[inline] - fn capacity(&self) -> usize { - self.nodes.len() - } - - #[inline] - fn len(&self) -> usize { - self.length - } - - #[inline] - fn is_empty(&self) -> bool { - self.len() == 0 - } - - #[inline] - fn is_full(&self) -> bool { - self.len() == self.capacity() - } - - #[inline] - fn back_idx(&self) -> usize { - self.back - } - - #[inline] - fn next(&mut self) -> Option { - if self.is_full() { - None - } else { - let current_free = self.length; - self.length += 1; - Some(current_free) - } - } - - #[inline] - fn node_mut(&mut self, idx: usize) -> Option<&mut FixedSizeListNode> { - self.nodes.get_mut(idx).and_then(|node| node.as_mut()) - } - - #[inline] - fn node_ref(&self, idx: usize) -> Option<&FixedSizeListNode> { - self.nodes.get(idx).and_then(|node| node.as_ref()) - } - - #[inline] - fn move_front(&mut self, idx: usize) -> Option<&mut T> { - let node = self.nodes.get_mut(idx)?.take()?; - if let Some(prev) = self.node_mut(node.prev) { - prev.next = node.next; - } else { - self.front = node.next; - } - if let Some(next) = self.node_mut(node.next) { - next.prev = node.prev; - } else { - self.back = node.prev; - } - - if let Some(front) = self.node_mut(self.front) { - front.prev = idx; - } - if self.node_ref(self.back).is_none() { - self.back = idx; - } - - let node = self.nodes.get_mut(idx).unwrap().insert(FixedSizeListNode { - prev: usize::MAX, - next: self.front, - data: node.data, - }); - self.front = idx; - Some(&mut node.data) - } - - #[inline] - fn push_front(&mut self, data: T) -> Option<(usize, &mut T)> { - let idx = self.next()?; - if let Some(front) = self.node_mut(self.front) { - front.prev = idx; - } - if self.node_ref(self.back).is_none() { - self.back = idx; - } - let node = self.nodes.get_mut(idx).unwrap().insert(FixedSizeListNode { - prev: usize::MAX, - next: self.front, - data, - }); - self.front = idx; - Some((idx, &mut node.data)) - } -} diff --git a/milli/src/update/new/extract/mod.rs b/milli/src/update/new/extract/mod.rs index 1c86d80af..3271c454f 100644 --- a/milli/src/update/new/extract/mod.rs +++ b/milli/src/update/new/extract/mod.rs @@ -1,27 +1,25 @@ mod cache; +mod documents; mod faceted; -mod lru; mod searchable; -use std::cell::RefCell; -use std::fs::File; - use bumpalo::Bump; +pub use cache::{merge_caches, transpose_and_freeze_caches, BalancedCaches, DelAddRoaringBitmap}; +pub use documents::*; pub use faceted::*; -use grenad::Merger; pub use searchable::*; use super::indexer::document_changes::{DocumentChanges, FullySend, IndexingContext, ThreadLocal}; -use crate::update::{GrenadParameters, MergeDeladdCboRoaringBitmaps}; +use crate::update::GrenadParameters; use crate::Result; pub trait DocidsExtractor { - fn run_extraction<'pl, 'fid, 'indexer, 'index, DC: DocumentChanges<'pl>>( + fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>>( grenad_parameters: GrenadParameters, document_changes: &DC, indexing_context: IndexingContext<'fid, 'indexer, 'index>, - extractor_allocs: &mut ThreadLocal>>, - ) -> Result>; + extractor_allocs: &'extractor mut ThreadLocal>, + ) -> Result>>; } /// TODO move in permissive json pointer diff --git a/milli/src/update/new/extract/searchable/extract_word_docids.rs b/milli/src/update/new/extract/searchable/extract_word_docids.rs index a5cbd3700..23bca784f 
100644 --- a/milli/src/update/new/extract/searchable/extract_word_docids.rs +++ b/milli/src/update/new/extract/searchable/extract_word_docids.rs @@ -1,113 +1,46 @@ use std::cell::RefCell; use std::collections::HashMap; -use std::fs::File; use std::mem::size_of; -use std::num::NonZero; use std::ops::DerefMut as _; use bumpalo::collections::vec::Vec as BumpVec; use bumpalo::Bump; -use grenad::{Merger, MergerBuilder}; use heed::RoTxn; use super::tokenize_document::{tokenizer_builder, DocumentTokenizer}; -use crate::update::new::extract::cache::CboCachedSorter; +use crate::update::new::extract::cache::BalancedCaches; use crate::update::new::extract::perm_json_p::contained_in; use crate::update::new::indexer::document_changes::{ for_each_document_change, DocumentChangeContext, DocumentChanges, Extractor, FullySend, - IndexingContext, RefCellExt, ThreadLocal, + IndexingContext, MostlySend, RefCellExt, ThreadLocal, }; use crate::update::new::DocumentChange; -use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps}; +use crate::update::GrenadParameters; use crate::{bucketed_position, DocumentId, FieldId, Index, Result, MAX_POSITION_PER_ATTRIBUTE}; const MAX_COUNTED_WORDS: usize = 30; -pub struct WordDocidsCachedSorters { - word_fid_docids: CboCachedSorter, - word_docids: CboCachedSorter, - exact_word_docids: CboCachedSorter, - word_position_docids: CboCachedSorter, - fid_word_count_docids: CboCachedSorter, +pub struct WordDocidsBalancedCaches<'extractor> { + word_fid_docids: BalancedCaches<'extractor>, + word_docids: BalancedCaches<'extractor>, + exact_word_docids: BalancedCaches<'extractor>, + word_position_docids: BalancedCaches<'extractor>, + fid_word_count_docids: BalancedCaches<'extractor>, fid_word_count: HashMap, current_docid: Option, } -impl WordDocidsCachedSorters { - pub fn new( - indexer: GrenadParameters, - max_memory: Option, - capacity: NonZero, - ) -> Self { - let max_memory = max_memory.map(|max_memory| max_memory / 4); - - let word_fid_docids = CboCachedSorter::new( - capacity, - create_sorter( - grenad::SortAlgorithm::Stable, - MergeDeladdCboRoaringBitmaps, - indexer.chunk_compression_type, - indexer.chunk_compression_level, - indexer.max_nb_chunks, - max_memory, - false, - ), - ); - let word_docids = CboCachedSorter::new( - capacity, - create_sorter( - grenad::SortAlgorithm::Stable, - MergeDeladdCboRoaringBitmaps, - indexer.chunk_compression_type, - indexer.chunk_compression_level, - indexer.max_nb_chunks, - max_memory, - false, - ), - ); - let exact_word_docids = CboCachedSorter::new( - capacity, - create_sorter( - grenad::SortAlgorithm::Stable, - MergeDeladdCboRoaringBitmaps, - indexer.chunk_compression_type, - indexer.chunk_compression_level, - indexer.max_nb_chunks, - max_memory, - false, - ), - ); - let word_position_docids = CboCachedSorter::new( - capacity, - create_sorter( - grenad::SortAlgorithm::Stable, - MergeDeladdCboRoaringBitmaps, - indexer.chunk_compression_type, - indexer.chunk_compression_level, - indexer.max_nb_chunks, - max_memory, - false, - ), - ); - let fid_word_count_docids = CboCachedSorter::new( - capacity, - create_sorter( - grenad::SortAlgorithm::Stable, - MergeDeladdCboRoaringBitmaps, - indexer.chunk_compression_type, - indexer.chunk_compression_level, - indexer.max_nb_chunks, - max_memory, - false, - ), - ); +unsafe impl<'extractor> MostlySend for WordDocidsBalancedCaches<'extractor> {} +impl<'extractor> WordDocidsBalancedCaches<'extractor> { + /// TODO Make sure to give the same max_memory to all of them, without splitting 
it + pub fn new_in(buckets: usize, max_memory: Option, alloc: &'extractor Bump) -> Self { Self { - word_fid_docids, - word_docids, - exact_word_docids, - word_position_docids, - fid_word_count_docids, + word_fid_docids: BalancedCaches::new_in(buckets, max_memory, alloc), + word_docids: BalancedCaches::new_in(buckets, max_memory, alloc), + exact_word_docids: BalancedCaches::new_in(buckets, max_memory, alloc), + word_position_docids: BalancedCaches::new_in(buckets, max_memory, alloc), + fid_word_count_docids: BalancedCaches::new_in(buckets, max_memory, alloc), fid_word_count: HashMap::new(), current_docid: None, } @@ -198,6 +131,7 @@ impl WordDocidsCachedSorters { .entry(field_id) .and_modify(|(current_count, _new_count)| *current_count += 1) .or_insert((1, 0)); + self.current_docid = Some(docid); Ok(()) @@ -227,37 +161,29 @@ impl WordDocidsCachedSorters { } } -struct WordDocidsMergerBuilders { - word_fid_docids: MergerBuilder, - word_docids: MergerBuilder, - exact_word_docids: MergerBuilder, - word_position_docids: MergerBuilder, - fid_word_count_docids: MergerBuilder, +pub struct WordDocidsCaches<'extractor> { + pub word_docids: Vec>, + pub word_fid_docids: Vec>, + pub exact_word_docids: Vec>, + pub word_position_docids: Vec>, + pub fid_word_count_docids: Vec>, } -pub struct WordDocidsMergers { - pub word_fid_docids: Merger, - pub word_docids: Merger, - pub exact_word_docids: Merger, - pub word_position_docids: Merger, - pub fid_word_count_docids: Merger, -} - -impl WordDocidsMergerBuilders { +impl<'extractor> WordDocidsCaches<'extractor> { fn new() -> Self { Self { - word_fid_docids: MergerBuilder::new(MergeDeladdCboRoaringBitmaps), - word_docids: MergerBuilder::new(MergeDeladdCboRoaringBitmaps), - exact_word_docids: MergerBuilder::new(MergeDeladdCboRoaringBitmaps), - word_position_docids: MergerBuilder::new(MergeDeladdCboRoaringBitmaps), - fid_word_count_docids: MergerBuilder::new(MergeDeladdCboRoaringBitmaps), + word_docids: Vec::new(), + word_fid_docids: Vec::new(), + exact_word_docids: Vec::new(), + word_position_docids: Vec::new(), + fid_word_count_docids: Vec::new(), } } - fn add_sorters(&mut self, other: WordDocidsCachedSorters) -> Result<()> { - let WordDocidsCachedSorters { - word_fid_docids, + fn push(&mut self, other: WordDocidsBalancedCaches<'extractor>) -> Result<()> { + let WordDocidsBalancedCaches { word_docids, + word_fid_docids, exact_word_docids, word_position_docids, fid_word_count_docids, @@ -265,78 +191,37 @@ impl WordDocidsMergerBuilders { current_docid: _, } = other; - let mut word_fid_docids_readers = Ok(vec![]); - let mut word_docids_readers = Ok(vec![]); - let mut exact_word_docids_readers = Ok(vec![]); - let mut word_position_docids_readers = Ok(vec![]); - let mut fid_word_count_docids_readers = Ok(vec![]); - rayon::scope(|s| { - s.spawn(|_| { - word_fid_docids_readers = - word_fid_docids.into_sorter().and_then(|s| s.into_reader_cursors()); - }); - s.spawn(|_| { - word_docids_readers = - word_docids.into_sorter().and_then(|s| s.into_reader_cursors()); - }); - s.spawn(|_| { - exact_word_docids_readers = - exact_word_docids.into_sorter().and_then(|s| s.into_reader_cursors()); - }); - s.spawn(|_| { - word_position_docids_readers = - word_position_docids.into_sorter().and_then(|s| s.into_reader_cursors()); - }); - s.spawn(|_| { - fid_word_count_docids_readers = - fid_word_count_docids.into_sorter().and_then(|s| s.into_reader_cursors()); - }); - }); - self.word_fid_docids.extend(word_fid_docids_readers?); - self.word_docids.extend(word_docids_readers?); - 
self.exact_word_docids.extend(exact_word_docids_readers?); - self.word_position_docids.extend(word_position_docids_readers?); - self.fid_word_count_docids.extend(fid_word_count_docids_readers?); + self.word_docids.push(word_docids); + self.word_fid_docids.push(word_fid_docids); + self.exact_word_docids.push(exact_word_docids); + self.word_position_docids.push(word_position_docids); + self.fid_word_count_docids.push(fid_word_count_docids); Ok(()) } - - fn build(self) -> WordDocidsMergers { - WordDocidsMergers { - word_fid_docids: self.word_fid_docids.build(), - word_docids: self.word_docids.build(), - exact_word_docids: self.exact_word_docids.build(), - word_position_docids: self.word_position_docids.build(), - fid_word_count_docids: self.fid_word_count_docids.build(), - } - } } -pub struct WordDocidsExtractorData<'extractor> { - tokenizer: &'extractor DocumentTokenizer<'extractor>, +pub struct WordDocidsExtractorData<'a> { + tokenizer: &'a DocumentTokenizer<'a>, grenad_parameters: GrenadParameters, - max_memory: Option, + buckets: usize, } -impl<'extractor> Extractor<'extractor> for WordDocidsExtractorData<'extractor> { - type Data = FullySend>; +impl<'a, 'extractor> Extractor<'extractor> for WordDocidsExtractorData<'a> { + type Data = RefCell>>; - fn init_data( - &self, - _extractor_alloc: raw_collections::alloc::RefBump<'extractor>, - ) -> Result { - Ok(FullySend(RefCell::new(WordDocidsCachedSorters::new( - self.grenad_parameters, - self.max_memory, - // TODO use a better value - 200_000.try_into().unwrap(), + fn init_data(&self, extractor_alloc: &'extractor Bump) -> Result { + Ok(RefCell::new(Some(WordDocidsBalancedCaches::new_in( + self.buckets, + self.grenad_parameters.max_memory, + extractor_alloc, )))) } fn process( &self, change: DocumentChange, - context: &crate::update::new::indexer::document_changes::DocumentChangeContext, + context: &DocumentChangeContext, ) -> Result<()> { WordDocidsExtractors::extract_document_change(context, self.tokenizer, change) } @@ -345,16 +230,15 @@ impl<'extractor> Extractor<'extractor> for WordDocidsExtractorData<'extractor> { pub struct WordDocidsExtractors; impl WordDocidsExtractors { - pub fn run_extraction<'pl, 'fid, 'indexer, 'index, DC: DocumentChanges<'pl>>( + pub fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>>( grenad_parameters: GrenadParameters, document_changes: &DC, indexing_context: IndexingContext<'fid, 'indexer, 'index>, - extractor_allocs: &mut ThreadLocal>>, - ) -> Result { - let max_memory = grenad_parameters.max_memory_by_thread(); + extractor_allocs: &'extractor mut ThreadLocal>, + ) -> Result> { let index = indexing_context.index; - let rtxn = index.read_txn()?; + let stop_words = index.stop_words(&rtxn)?; let allowed_separators = index.allowed_separators(&rtxn)?; let allowed_separators: Option> = @@ -392,7 +276,7 @@ impl WordDocidsExtractors { let extractor = WordDocidsExtractorData { tokenizer: &document_tokenizer, grenad_parameters, - max_memory, + buckets: rayon::current_num_threads(), }; for_each_document_change( @@ -404,28 +288,23 @@ impl WordDocidsExtractors { )?; } - { - let span = - tracing::trace_span!(target: "indexing::documents::extract", "merger_building"); - let _entered = span.enter(); - let mut builder = WordDocidsMergerBuilders::new(); - for cache in datastore.into_iter().map(|cache| cache.0.into_inner()) { - builder.add_sorters(cache)?; - } - - Ok(builder.build()) + let mut merger = WordDocidsCaches::new(); + for cache in datastore.into_iter().flat_map(RefCell::into_inner) { 
+ merger.push(cache)?; } + + Ok(merger) } fn extract_document_change( - context: &DocumentChangeContext>>, + context: &DocumentChangeContext>>, document_tokenizer: &DocumentTokenizer, document_change: DocumentChange, ) -> Result<()> { let index = &context.index; let rtxn = &context.txn; - let mut cached_sorter = context.data.0.borrow_mut_or_yield(); - let cached_sorter = cached_sorter.deref_mut(); + let mut cached_sorter_ref = context.data.borrow_mut_or_yield(); + let cached_sorter = cached_sorter_ref.as_mut().unwrap(); let mut new_fields_ids_map = context.new_fields_ids_map.borrow_mut_or_yield(); let new_fields_ids_map = new_fields_ids_map.deref_mut(); let doc_alloc = &context.doc_alloc; @@ -436,16 +315,14 @@ impl WordDocidsExtractors { match document_change { DocumentChange::Deletion(inner) => { let mut token_fn = |fname: &str, fid, pos, word: &str| { - cached_sorter - .insert_del_u32( - fid, - pos, - word, - is_exact_attribute(fname), - inner.docid(), - doc_alloc, - ) - .map_err(crate::Error::from) + cached_sorter.insert_del_u32( + fid, + pos, + word, + is_exact_attribute(fname), + inner.docid(), + doc_alloc, + ) }; document_tokenizer.tokenize_document( inner.current(rtxn, index, context.db_fields_ids_map)?, @@ -455,16 +332,14 @@ impl WordDocidsExtractors { } DocumentChange::Update(inner) => { let mut token_fn = |fname: &str, fid, pos, word: &str| { - cached_sorter - .insert_del_u32( - fid, - pos, - word, - is_exact_attribute(fname), - inner.docid(), - doc_alloc, - ) - .map_err(crate::Error::from) + cached_sorter.insert_del_u32( + fid, + pos, + word, + is_exact_attribute(fname), + inner.docid(), + doc_alloc, + ) }; document_tokenizer.tokenize_document( inner.current(rtxn, index, context.db_fields_ids_map)?, @@ -473,16 +348,14 @@ impl WordDocidsExtractors { )?; let mut token_fn = |fname: &str, fid, pos, word: &str| { - cached_sorter - .insert_add_u32( - fid, - pos, - word, - is_exact_attribute(fname), - inner.docid(), - doc_alloc, - ) - .map_err(crate::Error::from) + cached_sorter.insert_add_u32( + fid, + pos, + word, + is_exact_attribute(fname), + inner.docid(), + doc_alloc, + ) }; document_tokenizer.tokenize_document( inner.new(rtxn, index, context.db_fields_ids_map)?, @@ -492,16 +365,14 @@ impl WordDocidsExtractors { } DocumentChange::Insertion(inner) => { let mut token_fn = |fname: &str, fid, pos, word: &str| { - cached_sorter - .insert_add_u32( - fid, - pos, - word, - is_exact_attribute(fname), - inner.docid(), - doc_alloc, - ) - .map_err(crate::Error::from) + cached_sorter.insert_add_u32( + fid, + pos, + word, + is_exact_attribute(fname), + inner.docid(), + doc_alloc, + ) }; document_tokenizer.tokenize_document( inner.new(), diff --git a/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs b/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs index 53e6515a9..6f354688c 100644 --- a/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs +++ b/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs @@ -8,13 +8,13 @@ use super::tokenize_document::DocumentTokenizer; use super::SearchableExtractor; use crate::proximity::{index_proximity, MAX_DISTANCE}; use crate::update::new::document::Document; -use crate::update::new::extract::cache::CboCachedSorter; -use crate::update::new::indexer::document_changes::{DocumentChangeContext, FullySend, RefCellExt}; +use crate::update::new::extract::cache::BalancedCaches; +use crate::update::new::indexer::document_changes::{DocumentChangeContext, 
RefCellExt}; use crate::update::new::DocumentChange; -use crate::update::MergeDeladdCboRoaringBitmaps; use crate::{FieldId, GlobalFieldsIdsMap, Index, Result}; pub struct WordPairProximityDocidsExtractor; + impl SearchableExtractor for WordPairProximityDocidsExtractor { fn attributes_to_extract<'a>( rtxn: &'a RoTxn, @@ -28,11 +28,10 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { } // This method is reimplemented to count the number of words in the document in each field - // and to store the docids of the documents that have a number of words in a given field equal to or under than MAX_COUNTED_WORDS. + // and to store the docids of the documents that have a number of words in a given field + // equal to or under MAX_COUNTED_WORDS. fn extract_document_change( - context: &DocumentChangeContext< - FullySend<RefCell<CboCachedSorter<MergeDeladdCboRoaringBitmaps>>>, - >, + context: &DocumentChangeContext<RefCell<BalancedCaches>>, document_tokenizer: &DocumentTokenizer, document_change: DocumentChange, ) -> Result<()> { @@ -48,7 +47,7 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { let mut new_fields_ids_map = context.new_fields_ids_map.borrow_mut_or_yield(); let new_fields_ids_map = &mut *new_fields_ids_map; - let mut cached_sorter = context.data.0.borrow_mut_or_yield(); + let mut cached_sorter = context.data.borrow_mut_or_yield(); let cached_sorter = &mut *cached_sorter; // is a vecdequeue, and will be smol, so can stay on the heap for now @@ -139,7 +138,7 @@ fn build_key<'a>( fn word_positions_into_word_pair_proximity( word_positions: &mut VecDeque<(Rc<str>, u16)>, word_pair_proximity: &mut impl FnMut((Rc<str>, Rc<str>), u8), -) -> Result<()> { +) { let (head_word, head_position) = word_positions.pop_front().unwrap(); for (word, position) in word_positions.iter() { let prox = index_proximity(head_position as u32, *position as u32) as u8; @@ -147,7 +146,6 @@ fn word_positions_into_word_pair_proximity( word_pair_proximity((head_word.clone(), word.clone()), prox); } } - Ok(()) } fn process_document_tokens<'doc>( @@ -163,7 +161,7 @@ fn process_document_tokens<'doc>( .front() .map_or(false, |(_w, p)| index_proximity(*p as u32, pos as u32) >= MAX_DISTANCE) { - word_positions_into_word_pair_proximity(word_positions, word_pair_proximity)?; + word_positions_into_word_pair_proximity(word_positions, word_pair_proximity); } // insert the new word. 
@@ -173,7 +171,7 @@ fn process_document_tokens<'doc>( document_tokenizer.tokenize_document(document, fields_ids_map, &mut token_fn)?; while !word_positions.is_empty() { - word_positions_into_word_pair_proximity(word_positions, word_pair_proximity)?; + word_positions_into_word_pair_proximity(word_positions, word_pair_proximity); } Ok(()) diff --git a/milli/src/update/new/extract/searchable/mod.rs b/milli/src/update/new/extract/searchable/mod.rs index 8934ee892..374718def 100644 --- a/milli/src/update/new/extract/searchable/mod.rs +++ b/milli/src/update/new/extract/searchable/mod.rs @@ -3,76 +3,60 @@ mod extract_word_pair_proximity_docids; mod tokenize_document; use std::cell::RefCell; -use std::fs::File; use std::marker::PhantomData; use bumpalo::Bump; -pub use extract_word_docids::{WordDocidsExtractors, WordDocidsMergers}; +pub use extract_word_docids::{WordDocidsCaches, WordDocidsExtractors}; pub use extract_word_pair_proximity_docids::WordPairProximityDocidsExtractor; -use grenad::Merger; use heed::RoTxn; -use rayon::iter::{ParallelBridge, ParallelIterator}; use tokenize_document::{tokenizer_builder, DocumentTokenizer}; -use super::cache::CboCachedSorter; +use super::cache::BalancedCaches; use super::DocidsExtractor; use crate::update::new::indexer::document_changes::{ for_each_document_change, DocumentChangeContext, DocumentChanges, Extractor, FullySend, IndexingContext, ThreadLocal, }; use crate::update::new::DocumentChange; -use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps}; +use crate::update::GrenadParameters; use crate::{Index, Result, MAX_POSITION_PER_ATTRIBUTE}; -pub struct SearchableExtractorData<'extractor, EX: SearchableExtractor> { - tokenizer: &'extractor DocumentTokenizer<'extractor>, +pub struct SearchableExtractorData<'a, EX: SearchableExtractor> { + tokenizer: &'a DocumentTokenizer<'a>, grenad_parameters: GrenadParameters, - max_memory: Option, + buckets: usize, _ex: PhantomData, } -impl<'extractor, EX: SearchableExtractor + Sync> Extractor<'extractor> - for SearchableExtractorData<'extractor, EX> +impl<'a, 'extractor, EX: SearchableExtractor + Sync> Extractor<'extractor> + for SearchableExtractorData<'a, EX> { - type Data = FullySend>>; + type Data = RefCell>; - fn init_data( - &self, - _extractor_alloc: raw_collections::alloc::RefBump<'extractor>, - ) -> Result { - Ok(FullySend(RefCell::new(CboCachedSorter::new( - // TODO use a better value - 1_000_000.try_into().unwrap(), - create_sorter( - grenad::SortAlgorithm::Stable, - MergeDeladdCboRoaringBitmaps, - self.grenad_parameters.chunk_compression_type, - self.grenad_parameters.chunk_compression_level, - self.grenad_parameters.max_nb_chunks, - self.max_memory, - false, - ), - )))) + fn init_data(&self, extractor_alloc: &'extractor Bump) -> Result { + Ok(RefCell::new(BalancedCaches::new_in( + self.buckets, + self.grenad_parameters.max_memory, + extractor_alloc, + ))) } fn process( &self, change: DocumentChange, - context: &crate::update::new::indexer::document_changes::DocumentChangeContext, + context: &DocumentChangeContext, ) -> Result<()> { EX::extract_document_change(context, self.tokenizer, change) } } pub trait SearchableExtractor: Sized + Sync { - fn run_extraction<'pl, 'fid, 'indexer, 'index, DC: DocumentChanges<'pl>>( + fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>>( grenad_parameters: GrenadParameters, document_changes: &DC, indexing_context: IndexingContext<'fid, 'indexer, 'index>, - extractor_allocs: &mut ThreadLocal>>, - ) -> Result> { - 
let max_memory = grenad_parameters.max_memory_by_thread(); - + extractor_allocs: &'extractor mut ThreadLocal>, + ) -> Result>> { let rtxn = indexing_context.index.read_txn()?; let stop_words = indexing_context.index.stop_words(&rtxn)?; let allowed_separators = indexing_context.index.allowed_separators(&rtxn)?; @@ -104,7 +88,7 @@ pub trait SearchableExtractor: Sized + Sync { let extractor_data: SearchableExtractorData = SearchableExtractorData { tokenizer: &document_tokenizer, grenad_parameters, - max_memory, + buckets: rayon::current_num_threads(), _ex: PhantomData, }; @@ -122,37 +106,12 @@ pub trait SearchableExtractor: Sized + Sync { &datastore, )?; } - { - let mut builder = grenad::MergerBuilder::new(MergeDeladdCboRoaringBitmaps); - let span = - tracing::trace_span!(target: "indexing::documents::extract", "merger_building"); - let _entered = span.enter(); - let readers: Vec<_> = datastore - .into_iter() - .par_bridge() - .map(|cache_entry| { - let cached_sorter: FullySend< - RefCell>, - > = cache_entry; - let cached_sorter = cached_sorter.0.into_inner(); - let sorter = cached_sorter.into_sorter()?; - sorter.into_reader_cursors() - }) - .collect(); - - for reader in readers { - builder.extend(reader?); - } - - Ok(builder.build()) - } + Ok(datastore.into_iter().map(RefCell::into_inner).collect()) } fn extract_document_change( - context: &DocumentChangeContext< - FullySend>>, - >, + context: &DocumentChangeContext>, document_tokenizer: &DocumentTokenizer, document_change: DocumentChange, ) -> Result<()>; @@ -164,12 +123,12 @@ pub trait SearchableExtractor: Sized + Sync { } impl DocidsExtractor for T { - fn run_extraction<'pl, 'fid, 'indexer, 'index, DC: DocumentChanges<'pl>>( + fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>>( grenad_parameters: GrenadParameters, document_changes: &DC, indexing_context: IndexingContext<'fid, 'indexer, 'index>, - extractor_allocs: &mut ThreadLocal>>, - ) -> Result> { + extractor_allocs: &'extractor mut ThreadLocal>, + ) -> Result>> { Self::run_extraction( grenad_parameters, document_changes, diff --git a/milli/src/update/new/extract/searchable/tokenize_document.rs b/milli/src/update/new/extract/searchable/tokenize_document.rs index b8fd24f1b..7c4ada467 100644 --- a/milli/src/update/new/extract/searchable/tokenize_document.rs +++ b/milli/src/update/new/extract/searchable/tokenize_document.rs @@ -171,7 +171,6 @@ mod test { use bumpalo::Bump; use charabia::TokenizerBuilder; use meili_snap::snapshot; - use raw_collections::RawMap; use serde_json::json; use serde_json::value::RawValue; diff --git a/milli/src/update/new/facet_search_builder.rs b/milli/src/update/new/facet_search_builder.rs index 4602b5a30..b9db80afb 100644 --- a/milli/src/update/new/facet_search_builder.rs +++ b/milli/src/update/new/facet_search_builder.rs @@ -1,31 +1,24 @@ use std::collections::{BTreeSet, HashMap}; -use charabia::{normalizer::NormalizerOption, Language, Normalize, StrDetection, Token}; +use charabia::normalizer::NormalizerOption; +use charabia::{Language, Normalize, StrDetection, Token}; use grenad::Sorter; -use heed::{ - types::{Bytes, SerdeJson}, - BytesDecode, BytesEncode, RoTxn, -}; +use heed::types::{Bytes, SerdeJson}; +use heed::{BytesDecode, BytesEncode, RoTxn, RwTxn}; +use super::channel::FacetSearchableSender; +use super::extract::FacetKind; +use super::fst_merger_builder::FstMergerBuilder; +use super::KvReaderDelAdd; +use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec}; +use crate::heed_codec::StrRefCodec; +use 
crate::update::del_add::{DelAdd, KvWriterDelAdd}; +use crate::update::{create_sorter, MergeDeladdBtreesetString}; use crate::{ - heed_codec::{ - facet::{FacetGroupKey, FacetGroupKeyCodec}, - StrRefCodec, - }, - update::{ - create_sorter, - del_add::{DelAdd, KvWriterDelAdd}, - MergeDeladdBtreesetString, - }, BEU16StrCodec, FieldId, GlobalFieldsIdsMap, Index, LocalizedAttributesRule, Result, MAX_FACET_VALUE_LENGTH, }; -use super::{ - channel::FacetSearchableSender, extract::FacetKind, fst_merger_builder::FstMergerBuilder, - KvReaderDelAdd, -}; - pub struct FacetSearchBuilder<'indexer> { registered_facets: HashMap, normalized_facet_string_docids_sorter: Sorter, @@ -49,6 +42,7 @@ impl<'indexer> FacetSearchBuilder<'indexer> { None, None, Some(0), + false, ); Self { @@ -84,7 +78,7 @@ impl<'indexer> FacetSearchBuilder<'indexer> { } let locales = self.locales(field_id); - let hyper_normalized_value = normalize_facet_string(left_bound, locales.as_deref()); + let hyper_normalized_value = normalize_facet_string(left_bound, locales); let set = BTreeSet::from_iter(std::iter::once(left_bound)); @@ -103,7 +97,7 @@ impl<'indexer> FacetSearchBuilder<'indexer> { } fn locales(&mut self, field_id: FieldId) -> Option<&[Language]> { - if self.localized_field_ids.get(&field_id).is_none() { + if !self.localized_field_ids.contains_key(&field_id) { let Some(field_name) = self.global_fields_ids_map.name(field_id) else { unreachable!("Field id {} not found in the global fields ids map", field_id); }; @@ -124,7 +118,8 @@ impl<'indexer> FacetSearchBuilder<'indexer> { pub fn merge_and_send( self, index: &Index, - rtxn: &RoTxn<'_>, + wtxn: &mut RwTxn, + rtxn: &RoTxn, sender: FacetSearchableSender, ) -> Result<()> { let reader = self.normalized_facet_string_docids_sorter.into_reader_cursors()?; @@ -139,13 +134,14 @@ impl<'indexer> FacetSearchBuilder<'indexer> { let mut fst_merger_builder: Option = None; while let Some((key, deladd)) = merger_iter.next()? { let (field_id, normalized_facet_string) = - BEU16StrCodec::bytes_decode(&key).map_err(heed::Error::Encoding)?; + BEU16StrCodec::bytes_decode(key).map_err(heed::Error::Encoding)?; if current_field_id != Some(field_id) { if let Some(fst_merger_builder) = fst_merger_builder { // send the previous fst to the channel let mmap = fst_merger_builder.build(&mut callback)?; - sender.write_fst(&field_id.to_be_bytes(), mmap).unwrap(); + // sender.write_fst(&field_id.to_be_bytes(), mmap).unwrap(); + todo!("What to do"); } println!("getting fst for field_id: {}", field_id); @@ -198,7 +194,8 @@ impl<'indexer> FacetSearchBuilder<'indexer> { if let (Some(field_id), Some(fst_merger_builder)) = (current_field_id, fst_merger_builder) { let mmap = fst_merger_builder.build(&mut callback)?; - sender.write_fst(&field_id.to_be_bytes(), mmap).unwrap(); + // sender.write_fst(&field_id.to_be_bytes(), mmap).unwrap(); + todo!("What to do"); } Ok(()) @@ -209,7 +206,7 @@ fn callback(_bytes: &[u8], _deladd: DelAdd, _is_modified: bool) -> Result<()> { Ok(()) } -fn merge_btreesets<'a>( +fn merge_btreesets( current: Option<&[u8]>, del: Option<&[u8]>, add: Option<&[u8]>, diff --git a/milli/src/update/new/indexer/de.rs b/milli/src/update/new/indexer/de.rs index fa6b5fa76..3da4fc239 100644 --- a/milli/src/update/new/indexer/de.rs +++ b/milli/src/update/new/indexer/de.rs @@ -49,7 +49,7 @@ impl<'de, 'p, 'indexer: 'de, Mapper: MutFieldIdMapper> Visitor<'de> visitor: MutFieldIdMapVisitor(self.fields_ids_map), })? 
{ - let Some(fid) = fid else { + let Some(_fid) = fid else { return Ok(Err(crate::UserError::AttributeLimitReached)); }; self.fields_ids_map = fields_ids_map; diff --git a/milli/src/update/new/indexer/document_changes.rs b/milli/src/update/new/indexer/document_changes.rs index 423ddbdcc..a6bef9330 100644 --- a/milli/src/update/new/indexer/document_changes.rs +++ b/milli/src/update/new/indexer/document_changes.rs @@ -3,7 +3,6 @@ use std::sync::{Arc, RwLock}; use bumpalo::Bump; use heed::RoTxn; -use raw_collections::alloc::RefBump; use rayon::iter::IndexedParallelIterator; use super::super::document_change::DocumentChange; @@ -104,6 +103,10 @@ pub struct FullySend<T>(pub T); // SAFETY: a type **fully** send is always mostly send as well. unsafe impl<T> MostlySend for FullySend<T> where T: Send {} +unsafe impl<T> MostlySend for RefCell<T> where T: MostlySend {} + +unsafe impl<T> MostlySend for Option<T> where T: MostlySend {} + impl<T> FullySend<T> { pub fn into(self) -> T { self.0 @@ -256,7 +259,7 @@ pub struct DocumentChangeContext< pub doc_alloc: Bump, /// Data allocated in this allocator is not cleared between each call to `process`, unless the data spills. - pub extractor_alloc: RefBump<'extractor>, + pub extractor_alloc: &'extractor Bump, /// Pool of doc allocators, used to retrieve the doc allocator we provided for the documents doc_allocs: &'doc ThreadLocal<FullySend<Cell<Bump>>>, @@ -279,14 +282,14 @@ impl< index: &'indexer Index, db_fields_ids_map: &'indexer FieldsIdsMap, new_fields_ids_map: &'fid RwLock<FieldsIdsMap>, - extractor_allocs: &'extractor ThreadLocal<FullySend<RefCell<Bump>>>, + extractor_allocs: &'extractor ThreadLocal<FullySend<Bump>>, doc_allocs: &'doc ThreadLocal<FullySend<Cell<Bump>>>, datastore: &'data ThreadLocal<T>, fields_ids_map_store: &'doc ThreadLocal<FullySend<RefCell<GlobalFieldsIdsMap<'fid>>>>, init_data: F, ) -> Result<Self> where - F: FnOnce(RefBump<'extractor>) -> Result<T>, + F: FnOnce(&'extractor Bump) -> Result<T>, { let doc_alloc = doc_allocs.get_or(|| FullySend(Cell::new(Bump::with_capacity(1024 * 1024 * 1024)))); @@ -297,9 +300,7 @@ impl< let fields_ids_map = &fields_ids_map.0; let extractor_alloc = extractor_allocs.get_or_default(); - let extractor_alloc = RefBump::new(extractor_alloc.0.borrow_or_yield()); - - let data = datastore.get_or_try(|| init_data(RefBump::clone(&extractor_alloc)))?; + let data = datastore.get_or_try(move || init_data(&extractor_alloc.0))?; let txn = index.read_txn()?; Ok(DocumentChangeContext { @@ -308,7 +309,7 @@ impl< db_fields_ids_map, new_fields_ids_map: fields_ids_map, doc_alloc, - extractor_alloc, + extractor_alloc: &extractor_alloc.0, data, doc_allocs, }) @@ -319,7 +320,7 @@ impl< pub trait Extractor<'extractor>: Sync { type Data: MostlySend; - fn init_data<'doc>(&'doc self, extractor_alloc: RefBump<'extractor>) -> Result<Self::Data>; + fn init_data<'doc>(&'doc self, extractor_alloc: &'extractor Bump) -> Result<Self::Data>; fn process<'doc>( &'doc self, @@ -375,15 +376,17 @@ pub fn for_each_document_change< doc_allocs, fields_ids_map_store, }: IndexingContext<'fid, 'indexer, 'index>, - extractor_allocs: &'extractor mut ThreadLocal<FullySend<RefCell<Bump>>>, + extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>, datastore: &'data ThreadLocal<EX::Data>, ) -> Result<()> where EX: Extractor<'extractor>, { + eprintln!("We are resetting the extractor allocators"); // Clean up and reuse the extractor allocs for extractor_alloc in extractor_allocs.iter_mut() { - extractor_alloc.0.get_mut().reset(); + eprintln!("\tWith {} bytes reset", extractor_alloc.0.allocated_bytes()); + extractor_alloc.0.reset(); } let pi = document_changes.iter(); diff --git a/milli/src/update/new/indexer/document_deletion.rs b/milli/src/update/new/indexer/document_deletion.rs index 
a9628f419..c62f5c28f 100644 --- a/milli/src/update/new/indexer/document_deletion.rs +++ b/milli/src/update/new/indexer/document_deletion.rs @@ -80,7 +80,6 @@ mod test { use std::sync::RwLock; use bumpalo::Bump; - use raw_collections::alloc::RefBump; use crate::index::tests::TempIndex; use crate::update::new::indexer::document_changes::{ @@ -95,11 +94,7 @@ mod test { fn test_deletions() { struct DeletionWithData<'extractor> { deleted: RefCell< - hashbrown::HashSet< - DocumentId, - hashbrown::hash_map::DefaultHashBuilder, - RefBump<'extractor>, - >, + hashbrown::HashSet, >, } @@ -110,10 +105,7 @@ mod test { impl<'extractor> Extractor<'extractor> for TrackDeletion<'extractor> { type Data = DeletionWithData<'extractor>; - fn init_data( - &self, - extractor_alloc: raw_collections::alloc::RefBump<'extractor>, - ) -> crate::Result { + fn init_data(&self, extractor_alloc: &'extractor Bump) -> crate::Result { let deleted = RefCell::new(hashbrown::HashSet::new_in(extractor_alloc)); Ok(DeletionWithData { deleted }) } @@ -173,8 +165,7 @@ mod test { println!("deleted by {index}: {:?}", data.deleted.borrow()); } for alloc in extractor_allocs.iter_mut() { - let alloc = &mut alloc.0; - alloc.get_mut().reset(); + alloc.0.reset(); } } } diff --git a/milli/src/update/new/indexer/mod.rs b/milli/src/update/new/indexer/mod.rs index 29ff2685e..1122d3ac9 100644 --- a/milli/src/update/new/indexer/mod.rs +++ b/milli/src/update/new/indexer/mod.rs @@ -1,26 +1,24 @@ -use std::cell::RefCell; +use std::cmp::Ordering; use std::sync::RwLock; use std::thread::{self, Builder}; use big_s::S; -use bumpalo::Bump; use document_changes::{ - for_each_document_change, DocumentChanges, Extractor, FullySend, IndexingContext, RefCellExt, - ThreadLocal, + for_each_document_change, DocumentChanges, FullySend, IndexingContext, ThreadLocal, }; pub use document_deletion::DocumentDeletion; pub use document_operation::DocumentOperation; +use heed::types::{Bytes, DecodeIgnore, Str}; use heed::{RoTxn, RwTxn}; +use itertools::{merge_join_by, EitherOrBoth}; pub use partial_dump::PartialDump; use rayon::ThreadPool; use time::OffsetDateTime; pub use update_by_function::UpdateByFunction; use super::channel::*; -use super::document::write_to_obkv; -use super::document_change::DocumentChange; use super::extract::*; -use super::merger::{merge_grenad_entries, FacetFieldIdsDelta}; +use super::merger::{FacetDatabases, FacetFieldIdsDelta}; use super::word_fst_builder::PrefixDelta; use super::words_prefix_docids::{ compute_word_prefix_docids, compute_word_prefix_fid_docids, compute_word_prefix_position_docids, @@ -28,75 +26,23 @@ use super::words_prefix_docids::{ use super::{StdResult, TopLevelMap}; use crate::documents::{PrimaryKey, DEFAULT_PRIMARY_KEY}; use crate::facet::FacetType; +use crate::index::main_key::{WORDS_FST_KEY, WORDS_PREFIXES_FST_KEY}; use crate::proximity::ProximityPrecision; -use crate::update::new::channel::ExtractorSender; +use crate::update::del_add::DelAdd; +use crate::update::new::word_fst_builder::{PrefixData, WordFstBuilder}; use crate::update::new::words_prefix_docids::compute_exact_word_prefix_docids; +use crate::update::new::{merge_and_send_docids, merge_and_send_facet_docids}; use crate::update::settings::InnerIndexSettings; use crate::update::{FacetsUpdateBulk, GrenadParameters}; -use crate::{Error, FieldsIdsMap, GlobalFieldsIdsMap, Index, Result, UserError}; +use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, Result, UserError}; -pub(crate) mod de; +pub mod de; pub mod document_changes; mod document_deletion; mod 
document_operation; mod partial_dump; mod update_by_function; -struct DocumentExtractor<'a> { - document_sender: &'a DocumentSender<'a>, -} - -impl<'a, 'extractor> Extractor<'extractor> for DocumentExtractor<'a> { - type Data = FullySend<()>; - - fn init_data( - &self, - _extractor_alloc: raw_collections::alloc::RefBump<'extractor>, - ) -> Result { - Ok(FullySend(())) - } - - fn process( - &self, - change: DocumentChange, - context: &document_changes::DocumentChangeContext, - ) -> Result<()> { - let mut document_buffer = Vec::new(); - - let new_fields_ids_map = context.new_fields_ids_map.borrow_or_yield(); - let new_fields_ids_map = &*new_fields_ids_map; - let new_fields_ids_map = new_fields_ids_map.local_map(); - - let external_docid = change.external_docid().to_owned(); - - // document but we need to create a function that collects and compresses documents. - match change { - DocumentChange::Deletion(deletion) => { - let docid = deletion.docid(); - self.document_sender.delete(docid, external_docid).unwrap(); - } - /// TODO: change NONE by SOME(vector) when implemented - DocumentChange::Update(update) => { - let docid = update.docid(); - let content = - update.new(&context.txn, context.index, &context.db_fields_ids_map)?; - let content = - write_to_obkv(&content, None, new_fields_ids_map, &mut document_buffer)?; - self.document_sender.insert(docid, external_docid, content.boxed()).unwrap(); - } - DocumentChange::Insertion(insertion) => { - let docid = insertion.docid(); - let content = insertion.new(); - let content = - write_to_obkv(&content, None, new_fields_ids_map, &mut document_buffer)?; - self.document_sender.insert(docid, external_docid, content.boxed()).unwrap(); - // extracted_dictionary_sender.send(self, dictionary: &[u8]); - } - } - Ok(()) - } -} - /// This is the main function of this crate. /// /// Give it the output of the [`Indexer::document_changes`] method and it will execute it in the [`rayon::ThreadPool`]. @@ -114,12 +60,11 @@ pub fn index<'pl, 'indexer, 'index, DC>( where DC: DocumentChanges<'pl>, { - let (merger_sender, writer_receiver) = merger_writer_channel(10_000); - // This channel acts as a rendezvous point to ensure that we are one task ahead - let (extractor_sender, merger_receiver) = extractors_merger_channels(4); - + // TODO find a better channel limit + let (extractor_sender, writer_receiver) = extractor_writer_channel(10_000); let new_fields_ids_map = RwLock::new(new_fields_ids_map); + let global_fields_ids_map = GlobalFieldsIdsMap::new(&new_fields_ids_map); let fields_ids_map_store = ThreadLocal::with_capacity(pool.current_num_threads()); let mut extractor_allocs = ThreadLocal::with_capacity(pool.current_num_threads()); let doc_allocs = ThreadLocal::with_capacity(pool.current_num_threads()); @@ -132,124 +77,171 @@ where fields_ids_map_store: &fields_ids_map_store, }; - thread::scope(|s| { + thread::scope(|s| -> crate::Result<_> { let indexer_span = tracing::Span::current(); // TODO manage the errors correctly - let handle = Builder::new().name(S("indexer-extractors")).spawn_scoped(s, move || { + let extractor_handle = Builder::new().name(S("indexer-extractors")).spawn_scoped(s, move || { pool.in_place_scope(|_s| { - let span = tracing::trace_span!(target: "indexing::documents", parent: &indexer_span, "extract"); + let span = tracing::trace_span!(target: "indexing::documents", parent: &indexer_span, "extract"); + let _entered = span.enter(); + + // document but we need to create a function that collects and compresses documents. 
+ let rtxn = index.read_txn().unwrap(); + let document_sender = extractor_sender.documents(); + let document_extractor = DocumentsExtractor::new(&document_sender); + let datastore = ThreadLocal::with_capacity(pool.current_num_threads()); + for_each_document_change(document_changes, &document_extractor, indexing_context, &mut extractor_allocs, &datastore)?; + + let mut documents_ids = index.documents_ids(&rtxn)?; + let delta_documents_ids = datastore.into_iter().map(|FullySend(d)| d.into_inner()).reduce(DelAddRoaringBitmap::merge).unwrap_or_default(); + delta_documents_ids.apply_to(&mut documents_ids); + extractor_sender.send_documents_ids(documents_ids).unwrap(); + + // document_sender.finish().unwrap(); + + const TEN_GIB: usize = 10 * 1024 * 1024 * 1024; + let current_num_threads = rayon::current_num_threads(); + let max_memory = TEN_GIB / current_num_threads; + eprintln!("A maximum of {max_memory} bytes will be used for each of the {current_num_threads} threads"); + let grenad_parameters = GrenadParameters { + max_memory: Some(max_memory), + ..GrenadParameters::default() + }; + + let facet_field_ids_delta; + + { + let span = tracing::trace_span!(target: "indexing::documents::extract", "faceted"); + let _entered = span.enter(); + facet_field_ids_delta = merge_and_send_facet_docids( + global_fields_ids_map, + FacetedDocidsExtractor::run_extraction(grenad_parameters, document_changes, indexing_context, &mut extractor_allocs)?, + FacetDatabases::new(index), + index, + extractor_sender.facet_docids(), + )?; + } + + { + let span = tracing::trace_span!(target: "indexing::documents::extract", "word_docids"); let _entered = span.enter(); - // document but we need to create a function that collects and compresses documents. - let document_sender = extractor_sender.document_sender(); - let document_extractor = DocumentExtractor { document_sender: &document_sender}; - let datastore = ThreadLocal::with_capacity(pool.current_num_threads()); - for_each_document_change(document_changes, &document_extractor, indexing_context, &mut extractor_allocs, &datastore)?; - - document_sender.finish().unwrap(); - - const TEN_GIB: usize = 10 * 1024 * 1024 * 1024; - let max_memory = TEN_GIB / dbg!(rayon::current_num_threads()); - let grenad_parameters = GrenadParameters { - max_memory: Some(max_memory), - ..GrenadParameters::default() - }; + let WordDocidsCaches { + word_docids, + word_fid_docids, + exact_word_docids, + word_position_docids, + fid_word_count_docids, + } = WordDocidsExtractors::run_extraction(grenad_parameters, document_changes, indexing_context, &mut extractor_allocs)?; + // TODO Word Docids Merger + // extractor_sender.send_searchable::(word_docids).unwrap(); { - let span = tracing::trace_span!(target: "indexing::documents::extract", "faceted"); + let span = tracing::trace_span!(target: "indexing::documents::merge", "word_docids"); let _entered = span.enter(); - extract_and_send_docids::< - _, - FacetedDocidsExtractor, - FacetDocids, - >( - grenad_parameters, - document_changes, - indexing_context, - &mut extractor_allocs, - &extractor_sender, - )?; - } - - { - let span = tracing::trace_span!(target: "indexing::documents::extract", "word_docids"); - let _entered = span.enter(); - - let WordDocidsMergers { - word_fid_docids, + merge_and_send_docids( word_docids, - exact_word_docids, - word_position_docids, - fid_word_count_docids, - } = WordDocidsExtractors::run_extraction(grenad_parameters, document_changes, indexing_context, &mut extractor_allocs)?; - 
extractor_sender.send_searchable::(word_docids).unwrap(); - extractor_sender.send_searchable::(word_fid_docids).unwrap(); - extractor_sender.send_searchable::(exact_word_docids).unwrap(); - extractor_sender.send_searchable::(word_position_docids).unwrap(); - extractor_sender.send_searchable::(fid_word_count_docids).unwrap(); - } - - // run the proximity extraction only if the precision is by word - // this works only if the settings didn't change during this transaction. - let rtxn = index.read_txn().unwrap(); - let proximity_precision = index.proximity_precision(&rtxn)?.unwrap_or_default(); - if proximity_precision == ProximityPrecision::ByWord { - let span = tracing::trace_span!(target: "indexing::documents::extract", "word_pair_proximity_docids"); - let _entered = span.enter(); - extract_and_send_docids::< - _, - WordPairProximityDocidsExtractor, - WordPairProximityDocids, - >( - grenad_parameters, - document_changes, - indexing_context, - &mut extractor_allocs, - &extractor_sender, + index.word_docids.remap_types(), + index, + extractor_sender.docids::(), )?; } + // Word Fid Docids Merging + // extractor_sender.send_searchable::(word_fid_docids).unwrap(); { - let span = tracing::trace_span!(target: "indexing::documents::extract", "FINISH"); + let span = tracing::trace_span!(target: "indexing::documents::merge", "word_fid_docids"); let _entered = span.enter(); + merge_and_send_docids( + word_fid_docids, + index.word_fid_docids.remap_types(), + index, + extractor_sender.docids::() + )?; } - // TODO THIS IS TOO MUCH - // - [ ] Extract fieldid docid facet number - // - [ ] Extract fieldid docid facet string - // - [ ] Extract facetid string fst - // - [ ] Extract facetid normalized string strings + // Exact Word Docids Merging + // extractor_sender.send_searchable::(exact_word_docids).unwrap(); + { + let span = tracing::trace_span!(target: "indexing::documents::merge", "exact_word_docids"); + let _entered = span.enter(); + merge_and_send_docids( + exact_word_docids, + index.exact_word_docids.remap_types(), + index, + extractor_sender.docids::(), + )?; + } - // TODO Inverted Indexes again - // - [x] Extract fieldid facet isempty docids - // - [x] Extract fieldid facet isnull docids - // - [x] Extract fieldid facet exists docids + // Word Position Docids Merging + // extractor_sender.send_searchable::(word_position_docids).unwrap(); + { + let span = tracing::trace_span!(target: "indexing::documents::merge", "word_position_docids"); + let _entered = span.enter(); + merge_and_send_docids( + word_position_docids, + index.word_position_docids.remap_types(), + index, + extractor_sender.docids::(), + )?; + } - // TODO This is the normal system - // - [x] Extract fieldid facet number docids - // - [x] Extract fieldid facet string docids + // Fid Word Count Docids Merging + // extractor_sender.send_searchable::(fid_word_count_docids).unwrap(); + { + let span = tracing::trace_span!(target: "indexing::documents::merge", "fid_word_count_docids"); + let _entered = span.enter(); + merge_and_send_docids( + fid_word_count_docids, + index.field_id_word_count_docids.remap_types(), + index, + extractor_sender.docids::(), + )?; + } + } - Ok(()) as Result<_> - }) + // run the proximity extraction only if the precision is by word + // this works only if the settings didn't change during this transaction. 
+ let proximity_precision = index.proximity_precision(&rtxn)?.unwrap_or_default(); + if proximity_precision == ProximityPrecision::ByWord { + let span = tracing::trace_span!(target: "indexing::documents::extract", "word_pair_proximity_docids"); + let _entered = span.enter(); + let caches = ::run_extraction(grenad_parameters, document_changes, indexing_context, &mut extractor_allocs)?; + merge_and_send_docids( + caches, + index.word_pair_proximity_docids.remap_types(), + index, + extractor_sender.docids::(), + )?; + } + + { + let span = tracing::trace_span!(target: "indexing::documents::extract", "FINISH"); + let _entered = span.enter(); + } + + // TODO THIS IS TOO MUCH + // - [ ] Extract fieldid docid facet number + // - [ ] Extract fieldid docid facet string + // - [ ] Extract facetid string fst + // - [ ] Extract facetid normalized string strings + + // TODO Inverted Indexes again + // - [x] Extract fieldid facet isempty docids + // - [x] Extract fieldid facet isnull docids + // - [x] Extract fieldid facet exists docids + + // TODO This is the normal system + // - [x] Extract fieldid facet number docids + // - [x] Extract fieldid facet string docids + + // TODO use None when needed + Result::Ok(facet_field_ids_delta) + }) })?; let global_fields_ids_map = GlobalFieldsIdsMap::new(&new_fields_ids_map); - let indexer_span = tracing::Span::current(); - // TODO manage the errors correctly - let merger_thread = Builder::new().name(S("indexer-merger")).spawn_scoped(s, move || { - let span = - tracing::trace_span!(target: "indexing::documents", parent: &indexer_span, "merge"); - let _entered = span.enter(); - let rtxn = index.read_txn().unwrap(); - merge_grenad_entries( - merger_receiver, - merger_sender, - &rtxn, - index, - global_fields_ids_map, - ) - })?; for operation in writer_receiver { let database = operation.database(index); @@ -264,18 +256,66 @@ where } /// TODO handle the panicking threads - handle.join().unwrap()?; - let merger_result = merger_thread.join().unwrap()?; + let facet_field_ids_delta = extractor_handle.join().unwrap()?; - if let Some(facet_field_ids_delta) = merger_result.facet_field_ids_delta { - compute_facet_level_database(index, wtxn, facet_field_ids_delta)?; - } + let prefix_delta = { + let rtxn = index.read_txn()?; + let words_fst = index.words_fst(&rtxn)?; + let mut word_fst_builder = WordFstBuilder::new(&words_fst)?; + let prefix_settings = index.prefix_settings(&rtxn)?; + word_fst_builder.with_prefix_settings(prefix_settings); - if let Some(prefix_delta) = merger_result.prefix_delta { + let previous_words = index.word_docids.iter(&rtxn)?.remap_data_type::(); + let current_words = index.word_docids.iter(wtxn)?.remap_data_type::(); + for eob in merge_join_by(previous_words, current_words, |lhs, rhs| match (lhs, rhs) { + (Ok((l, _)), Ok((r, _))) => l.cmp(r), + (Err(_), _) | (_, Err(_)) => Ordering::Equal, + }) { + match eob { + EitherOrBoth::Both(lhs, rhs) => { + if let Some(e) = lhs.err().or(rhs.err()) { + return Err(e.into()); + } + } + EitherOrBoth::Left(result) => { + let (word, _) = result?; + word_fst_builder.register_word(DelAdd::Deletion, word.as_ref())?; + } + EitherOrBoth::Right(result) => { + let (word, _) = result?; + word_fst_builder.register_word(DelAdd::Addition, word.as_ref())?; + } + } + } + + let span = tracing::trace_span!(target: "indexing::documents::merge", "words_fst"); + let _entered = span.enter(); + + let (word_fst_mmap, prefix_data) = word_fst_builder.build(index, &rtxn)?; + // 
extractor_sender.main().write_words_fst(word_fst_mmap).unwrap(); + index.main.remap_types::().put(wtxn, WORDS_FST_KEY, &word_fst_mmap)?; + if let Some(PrefixData { prefixes_fst_mmap, prefix_delta }) = prefix_data { + // extractor_sender.main().write_words_prefixes_fst(prefixes_fst_mmap).unwrap(); + index.main.remap_types::().put( + wtxn, + WORDS_PREFIXES_FST_KEY, + &prefixes_fst_mmap, + )?; + Some(prefix_delta) + } else { + None + } + }; + + // if let Some(facet_field_ids_delta) = merger_result.facet_field_ids_delta { + // compute_facet_level_database(index, wtxn, facet_field_ids_delta)?; + // } + + if let Some(prefix_delta) = prefix_delta { compute_prefix_database(index, wtxn, prefix_delta)?; } - Ok(()) as Result<_> + Result::Ok(()) })?; // required to into_inner the new_fields_ids_map @@ -347,30 +387,6 @@ fn compute_facet_level_database( Ok(()) } -/// TODO: GrenadParameters::default() should be removed in favor a passed parameter -/// TODO: manage the errors correctly -/// TODO: we must have a single trait that also gives the extractor type -fn extract_and_send_docids< - 'pl, - 'fid, - 'indexer, - 'index, - DC: DocumentChanges<'pl>, - E: DocidsExtractor, - D: MergerOperationType, ->( - grenad_parameters: GrenadParameters, - document_changes: &DC, - indexing_context: IndexingContext<'fid, 'indexer, 'index>, - extractor_allocs: &mut ThreadLocal>>, - sender: &ExtractorSender, -) -> Result<()> { - let merger = - E::run_extraction(grenad_parameters, document_changes, indexing_context, extractor_allocs)?; - sender.send_searchable::(merger).unwrap(); - Ok(()) -} - /// Returns the primary key that has already been set for this index or the /// one we will guess by searching for the first key that contains "id" as a substring, /// and whether the primary key changed diff --git a/milli/src/update/new/indexer/partial_dump.rs b/milli/src/update/new/indexer/partial_dump.rs index 10fc95a03..991a90ab8 100644 --- a/milli/src/update/new/indexer/partial_dump.rs +++ b/milli/src/update/new/indexer/partial_dump.rs @@ -1,12 +1,10 @@ use std::ops::DerefMut; use rayon::iter::IndexedParallelIterator; -use serde::Deserializer; use serde_json::value::RawValue; -use super::de::FieldAndDocidExtractor; use super::document_changes::{DocumentChangeContext, DocumentChanges, MostlySend, RefCellExt}; -use crate::documents::{DocumentIdExtractionError, PrimaryKey}; +use crate::documents::PrimaryKey; use crate::update::concurrent_available_ids::ConcurrentAvailableIds; use crate::update::new::document::DocumentFromVersions; use crate::update::new::document_change::Versions; diff --git a/milli/src/update/new/indexer/update_by_function.rs b/milli/src/update/new/indexer/update_by_function.rs index 826f918a4..8b34fec3d 100644 --- a/milli/src/update/new/indexer/update_by_function.rs +++ b/milli/src/update/new/indexer/update_by_function.rs @@ -1,5 +1,3 @@ -use std::collections::BTreeMap; - use raw_collections::RawMap; use rayon::iter::{IndexedParallelIterator, IntoParallelRefIterator, ParallelIterator}; use rhai::{Dynamic, Engine, OptimizationLevel, Scope, AST}; @@ -12,8 +10,8 @@ use crate::documents::PrimaryKey; use crate::error::{FieldIdMapMissingEntry, InternalError}; use crate::update::new::document::DocumentFromVersions; use crate::update::new::document_change::Versions; -use crate::update::new::{Deletion, DocumentChange, KvReaderFieldId, KvWriterFieldId, Update}; -use crate::{all_obkv_to_json, Error, FieldsIdsMap, GlobalFieldsIdsMap, Object, Result, UserError}; +use crate::update::new::{Deletion, DocumentChange, 
KvReaderFieldId, Update}; +use crate::{all_obkv_to_json, Error, FieldsIdsMap, Object, Result, UserError}; pub struct UpdateByFunction { documents: RoaringBitmap, diff --git a/milli/src/update/new/merger.rs b/milli/src/update/new/merger.rs index 740b215e2..7b3dd85aa 100644 --- a/milli/src/update/new/merger.rs +++ b/milli/src/update/new/merger.rs @@ -1,222 +1,20 @@ -use std::fs::File; use std::io::{self}; use bincode::ErrorKind; -use grenad::Merger; use hashbrown::HashSet; use heed::types::Bytes; use heed::{Database, RoTxn}; +use rayon::iter::{IntoParallelIterator, ParallelIterator}; use roaring::RoaringBitmap; use super::channel::*; -use super::extract::FacetKind; -use super::facet_search_builder::FacetSearchBuilder; -use super::word_fst_builder::{PrefixData, PrefixDelta}; -use super::{Deletion, DocumentChange, KvReaderDelAdd, KvReaderFieldId}; -use crate::update::del_add::DelAdd; -use crate::update::new::channel::MergerOperation; -use crate::update::new::word_fst_builder::WordFstBuilder; -use crate::update::MergeDeladdCboRoaringBitmaps; -use crate::{ - localized_attributes_rules, CboRoaringBitmapCodec, Error, FieldId, GeoPoint, - GlobalFieldsIdsMap, Index, Result, +use super::extract::{ + merge_caches, transpose_and_freeze_caches, BalancedCaches, DelAddRoaringBitmap, FacetKind, }; - -/// TODO We must return some infos/stats -#[tracing::instrument(level = "trace", skip_all, target = "indexing::documents", name = "merge")] -pub fn merge_grenad_entries( - receiver: MergerReceiver, - sender: MergerSender, - rtxn: &RoTxn, - index: &Index, - global_fields_ids_map: GlobalFieldsIdsMap<'_>, -) -> Result { - let mut buffer: Vec = Vec::new(); - let mut documents_ids = index.documents_ids(rtxn)?; - let mut geo_extractor = GeoExtractor::new(rtxn, index)?; - let mut merger_result = MergerResult::default(); - - for merger_operation in receiver { - match merger_operation { - MergerOperation::ExactWordDocidsMerger(merger) => { - let span = - tracing::trace_span!(target: "indexing::documents::merge", "exact_word_docids"); - let _entered = span.enter(); - merge_and_send_docids( - merger, - /// TODO do a MergerOperation::database(&Index) -> Database. 
- index.exact_word_docids.remap_types(), - rtxn, - &mut buffer, - sender.docids::(), - |_, _key| Ok(()), - )?; - } - MergerOperation::FidWordCountDocidsMerger(merger) => { - let span = tracing::trace_span!(target: "indexing::documents::merge", "fid_word_count_docids"); - let _entered = span.enter(); - merge_and_send_docids( - merger, - index.field_id_word_count_docids.remap_types(), - rtxn, - &mut buffer, - sender.docids::(), - |_, _key| Ok(()), - )?; - } - MergerOperation::WordDocidsMerger(merger) => { - let words_fst = index.words_fst(rtxn)?; - let mut word_fst_builder = WordFstBuilder::new(&words_fst)?; - let prefix_settings = index.prefix_settings(rtxn)?; - word_fst_builder.with_prefix_settings(prefix_settings); - - { - let span = - tracing::trace_span!(target: "indexing::documents::merge", "word_docids"); - let _entered = span.enter(); - - merge_and_send_docids( - merger, - index.word_docids.remap_types(), - rtxn, - &mut buffer, - sender.docids::(), - |deladd, key| word_fst_builder.register_word(deladd, key), - )?; - } - - { - let span = - tracing::trace_span!(target: "indexing::documents::merge", "words_fst"); - let _entered = span.enter(); - - let (word_fst_mmap, prefix_data) = word_fst_builder.build(index, rtxn)?; - sender.main().write_words_fst(word_fst_mmap).unwrap(); - if let Some(PrefixData { prefixes_fst_mmap, prefix_delta }) = prefix_data { - sender.main().write_words_prefixes_fst(prefixes_fst_mmap).unwrap(); - merger_result.prefix_delta = Some(prefix_delta); - } - } - } - MergerOperation::WordFidDocidsMerger(merger) => { - let span = - tracing::trace_span!(target: "indexing::documents::merge", "word_fid_docids"); - let _entered = span.enter(); - merge_and_send_docids( - merger, - index.word_fid_docids.remap_types(), - rtxn, - &mut buffer, - sender.docids::(), - |_, _key| Ok(()), - )?; - } - MergerOperation::WordPairProximityDocidsMerger(merger) => { - let span = tracing::trace_span!(target: "indexing::documents::merge", "word_pair_proximity_docids"); - let _entered = span.enter(); - merge_and_send_docids( - merger, - index.word_pair_proximity_docids.remap_types(), - rtxn, - &mut buffer, - sender.docids::(), - |_, _key| Ok(()), - )?; - } - MergerOperation::WordPositionDocidsMerger(merger) => { - let span = tracing::trace_span!(target: "indexing::documents::merge", "word_position_docids"); - let _entered = span.enter(); - merge_and_send_docids( - merger, - index.word_position_docids.remap_types(), - rtxn, - &mut buffer, - sender.docids::(), - |_, _key| Ok(()), - )?; - } - MergerOperation::InsertDocument { docid, external_id, document } => { - let span = - tracing::trace_span!(target: "indexing::documents::merge", "insert_document"); - let _entered = span.enter(); - documents_ids.insert(docid); - sender.documents().uncompressed(docid, external_id.clone(), &document).unwrap(); - - if let Some(geo_extractor) = geo_extractor.as_mut() { - let current = index.documents.remap_data_type::().get(rtxn, &docid)?; - let current: Option<&KvReaderFieldId> = current.map(Into::into); - let change = match current { - Some(current) => DocumentChange::Update(todo!()), - None => DocumentChange::Insertion(todo!()), - }; - geo_extractor.manage_change(&mut global_fields_ids_map, &change)?; - } - } - MergerOperation::DeleteDocument { docid, external_id } => { - let span = - tracing::trace_span!(target: "indexing::documents::merge", "delete_document"); - let _entered = span.enter(); - if !documents_ids.remove(docid) { - unreachable!("Tried deleting a document that we do not know about"); - } - 
sender.documents().delete(docid, external_id.clone()).unwrap(); - - if let Some(geo_extractor) = geo_extractor.as_mut() { - let change = DocumentChange::Deletion(Deletion::create(docid, todo!())); - geo_extractor.manage_change(&mut global_fields_ids_map, &change)?; - } - } - MergerOperation::FinishedDocument => { - // send the rtree - } - MergerOperation::FacetDocidsMerger(merger) => { - let span = - tracing::trace_span!(target: "indexing::documents::merge", "facet_docids"); - let _entered = span.enter(); - let mut facet_field_ids_delta = FacetFieldIdsDelta::new(); - let localized_attributes_rules = - index.localized_attributes_rules(rtxn)?.unwrap_or_default(); - let mut facet_search_builder = FacetSearchBuilder::new( - global_fields_ids_map.clone(), - localized_attributes_rules, - ); - merge_and_send_facet_docids( - merger, - FacetDatabases::new(index), - rtxn, - &mut buffer, - sender.facet_docids(), - &mut facet_field_ids_delta, - &mut facet_search_builder, - )?; - - merger_result.facet_field_ids_delta = Some(facet_field_ids_delta); - // merge and send the facet fst and the searchable facet values - facet_search_builder.merge_and_send(index, rtxn, sender.facet_searchable())?; - } - } - } - - { - let span = tracing::trace_span!(target: "indexing::documents::merge", "documents_ids"); - let _entered = span.enter(); - - // Send the documents ids unionized with the current one - sender.send_documents_ids(documents_ids).unwrap(); - } - - // ... - - Ok(merger_result) -} - -#[derive(Default, Debug)] -pub struct MergerResult { - /// The delta of the prefixes - pub prefix_delta: Option, - /// The field ids that have been modified - pub facet_field_ids_delta: Option, -} +use super::facet_search_builder::FacetSearchBuilder; +use super::DocumentChange; +use crate::update::del_add::DelAdd; +use crate::{CboRoaringBitmapCodec, Error, FieldId, GeoPoint, GlobalFieldsIdsMap, Index, Result}; pub struct GeoExtractor { rtree: Option>, @@ -267,80 +65,92 @@ impl GeoExtractor { } #[tracing::instrument(level = "trace", skip_all, target = "indexing::merge")] -fn merge_and_send_docids( - merger: Merger, +pub fn merge_and_send_docids<'extractor>( + mut caches: Vec>, database: Database, - rtxn: &RoTxn<'_>, - buffer: &mut Vec, - docids_sender: impl DocidsSender, - mut register_key: impl FnMut(DelAdd, &[u8]) -> Result<()>, + index: &Index, + docids_sender: impl DocidsSender + Sync, ) -> Result<()> { - let mut merger_iter = merger.into_stream_merger_iter().unwrap(); - while let Some((key, deladd)) = merger_iter.next().unwrap() { - let current = database.get(rtxn, key)?; - let deladd: &KvReaderDelAdd = deladd.into(); - let del = deladd.get(DelAdd::Deletion); - let add = deladd.get(DelAdd::Addition); - - match merge_cbo_bitmaps(current, del, add)? { - Operation::Write(bitmap) => { - let value = cbo_bitmap_serialize_into_vec(&bitmap, buffer); - docids_sender.write(key, value).unwrap(); - register_key(DelAdd::Addition, key)?; + transpose_and_freeze_caches(&mut caches)?.into_par_iter().try_for_each(|frozen| { + let rtxn = index.read_txn()?; + let mut buffer = Vec::new(); + merge_caches(frozen, |key, DelAddRoaringBitmap { del, add }| { + let current = database.get(&rtxn, key)?; + match merge_cbo_bitmaps(current, del, add)? 
{ + Operation::Write(bitmap) => { + let value = cbo_bitmap_serialize_into_vec(&bitmap, &mut buffer); + docids_sender.write(key, value).unwrap(); + Ok(()) + } + Operation::Delete => { + docids_sender.delete(key).unwrap(); + Ok(()) + } + Operation::Ignore => Ok(()), } - Operation::Delete => { - docids_sender.delete(key).unwrap(); - register_key(DelAdd::Deletion, key)?; - } - Operation::Ignore => (), - } - } - - Ok(()) + }) + }) } #[tracing::instrument(level = "trace", skip_all, target = "indexing::merge")] -fn merge_and_send_facet_docids( - merger: Merger, +pub fn merge_and_send_facet_docids<'indexer, 'extractor>( + global_fields_ids_map: GlobalFieldsIdsMap<'indexer>, + mut caches: Vec>, database: FacetDatabases, - rtxn: &RoTxn<'_>, - buffer: &mut Vec, - docids_sender: impl DocidsSender, - facet_field_ids_delta: &mut FacetFieldIdsDelta, - facet_search_builder: &mut FacetSearchBuilder, -) -> Result<()> { - let mut merger_iter = merger.into_stream_merger_iter().unwrap(); - while let Some((key, deladd)) = merger_iter.next().unwrap() { - let current = database.get_cbo_roaring_bytes_value(rtxn, key)?; - let deladd: &KvReaderDelAdd = deladd.into(); - let del = deladd.get(DelAdd::Deletion); - let add = deladd.get(DelAdd::Addition); + index: &Index, + docids_sender: impl DocidsSender + Sync, +) -> Result<(FacetFieldIdsDelta, FacetSearchBuilder<'indexer>)> { + transpose_and_freeze_caches(&mut caches)? + .into_par_iter() + .map(|frozen| { + let mut facet_field_ids_delta = FacetFieldIdsDelta::default(); + let rtxn = index.read_txn()?; + let localized_attributes_rules = index.localized_attributes_rules(&rtxn)?; + let mut facet_search_builder = FacetSearchBuilder::new( + global_fields_ids_map.clone(), + localized_attributes_rules.unwrap_or_default(), + ); + let mut buffer = Vec::new(); + merge_caches(frozen, |key, DelAddRoaringBitmap { del, add }| { + let current = database.get_cbo_roaring_bytes_value(&rtxn, key)?; + match merge_cbo_bitmaps(current, del, add)? { + Operation::Write(bitmap) => { + facet_field_ids_delta.register_from_key(key); + facet_search_builder.register_from_key(DelAdd::Addition, key)?; + let value = cbo_bitmap_serialize_into_vec(&bitmap, &mut buffer); + docids_sender.write(key, value).unwrap(); + Ok(()) + } + Operation::Delete => { + facet_field_ids_delta.register_from_key(key); + facet_search_builder.register_from_key(DelAdd::Deletion, key)?; + docids_sender.delete(key).unwrap(); + Ok(()) + } + Operation::Ignore => Ok(()), + } + })?; - match merge_cbo_bitmaps(current, del, add)? 
{ - Operation::Write(bitmap) => { - facet_field_ids_delta.register_from_key(key); - facet_search_builder.register_from_key(DelAdd::Addition, key)?; - let value = cbo_bitmap_serialize_into_vec(&bitmap, buffer); - docids_sender.write(key, value).unwrap(); - } - Operation::Delete => { - facet_field_ids_delta.register_from_key(key); - facet_search_builder.register_from_key(DelAdd::Deletion, key)?; - docids_sender.delete(key).unwrap(); - } - Operation::Ignore => (), - } - } - - Ok(()) + Ok((facet_field_ids_delta, facet_search_builder)) + }) + .reduce( + || Ok((FacetFieldIdsDelta::default(), todo!())), + |lhs, rhs| { + let (lhs_ffid, lhs_fsb) = lhs?; + let (rhs_ffid, rhs_fsb) = rhs?; + let ffid_merged = lhs_ffid.merge(rhs_ffid); + let fsb_merged = todo!(); + Ok((ffid_merged, fsb_merged)) + }, + ) } -struct FacetDatabases<'a> { +pub struct FacetDatabases<'a> { index: &'a Index, } impl<'a> FacetDatabases<'a> { - fn new(index: &'a Index) -> Self { + pub fn new(index: &'a Index) -> Self { Self { index } } @@ -361,7 +171,7 @@ impl<'a> FacetDatabases<'a> { } } -#[derive(Debug)] +#[derive(Debug, Default)] pub struct FacetFieldIdsDelta { /// The field ids that have been modified modified_facet_string_ids: HashSet, @@ -369,13 +179,6 @@ pub struct FacetFieldIdsDelta { } impl FacetFieldIdsDelta { - fn new() -> Self { - Self { - modified_facet_string_ids: HashSet::new(), - modified_facet_number_ids: HashSet::new(), - } - } - fn register_facet_string_id(&mut self, field_id: FieldId) { self.modified_facet_string_ids.insert(field_id); } @@ -414,6 +217,17 @@ impl FacetFieldIdsDelta { Some(self.modified_facet_number_ids.iter().copied().collect()) } } + + pub fn merge(mut self, rhs: Self) -> Self { + let Self { modified_facet_number_ids, modified_facet_string_ids } = rhs; + modified_facet_number_ids.into_iter().for_each(|fid| { + self.modified_facet_number_ids.insert(fid); + }); + modified_facet_string_ids.into_iter().for_each(|fid| { + self.modified_facet_string_ids.insert(fid); + }); + self + } } enum Operation { @@ -425,13 +239,10 @@ enum Operation { /// A function that merges the DelAdd CboRoaringBitmaps with the current bitmap. 
fn merge_cbo_bitmaps( current: Option<&[u8]>, - del: Option<&[u8]>, - add: Option<&[u8]>, + del: Option, + add: Option, ) -> Result { let current = current.map(CboRoaringBitmapCodec::deserialize_from).transpose()?; - let del = del.map(CboRoaringBitmapCodec::deserialize_from).transpose()?; - let add = add.map(CboRoaringBitmapCodec::deserialize_from).transpose()?; - match (current, del, add) { (None, None, None) => Ok(Operation::Ignore), // but it's strange (None, None, Some(add)) => Ok(Operation::Write(add)), diff --git a/milli/src/update/new/mod.rs b/milli/src/update/new/mod.rs index 16a6dd092..ee41bc0fd 100644 --- a/milli/src/update/new/mod.rs +++ b/milli/src/update/new/mod.rs @@ -1,4 +1,7 @@ pub use document_change::{Deletion, DocumentChange, Insertion, Update}; +pub use merger::{ + merge_and_send_docids, merge_and_send_facet_docids, FacetDatabases, FacetFieldIdsDelta, +}; pub use top_level_map::{CowStr, TopLevelMap}; use super::del_add::DelAdd; diff --git a/milli/src/update/new/parallel_iterator_ext.rs b/milli/src/update/new/parallel_iterator_ext.rs index 043457cfd..ff69d7acf 100644 --- a/milli/src/update/new/parallel_iterator_ext.rs +++ b/milli/src/update/new/parallel_iterator_ext.rs @@ -1,38 +1,8 @@ use std::sync::Arc; -use rayon::iter::{MapInit, ParallelIterator}; +use rayon::iter::ParallelIterator; pub trait ParallelIteratorExt: ParallelIterator { - /// Maps items based on the init function. - /// - /// The init function is ran only as necessary which is basically once by thread. - fn try_map_try_init( - self, - init: INIT, - map_op: F, - ) -> MapInit< - Self, - impl Fn() -> Result> + Sync + Send + Clone, - impl Fn(&mut Result>, Self::Item) -> Result> + Sync + Send + Clone, - > - where - E: Send + Sync, - F: Fn(&mut T, Self::Item) -> Result + Sync + Send + Clone, - INIT: Fn() -> Result + Sync + Send + Clone, - R: Send, - { - self.map_init( - move || match init() { - Ok(t) => Ok(t), - Err(err) => Err(Arc::new(err)), - }, - move |result, item| match result { - Ok(t) => map_op(t, item).map_err(Arc::new), - Err(err) => Err(err.clone()), - }, - ) - } - /// A method to run a closure of all the items and return an owned error. /// /// The init function is ran only as necessary which is basically once by thread. 
@@ -58,17 +28,6 @@ pub trait ParallelIteratorExt: ParallelIterator { Err(err) => Err(Arc::into_inner(err).expect("the error must be only owned by us")), } } - - fn try_arc_for_each(self, op: F) -> Result<(), E> - where - E: Send + Sync, - F: Fn(Self::Item) -> Result<(), Arc> + Sync + Send + Clone, - { - match self.try_for_each(op) { - Ok(()) => Ok(()), - Err(err) => Err(Arc::into_inner(err).expect("the error must be only owned by us")), - } - } } impl ParallelIteratorExt for T {} diff --git a/milli/src/update/new/words_prefix_docids.rs b/milli/src/update/new/words_prefix_docids.rs index 38c2b1744..edc09c5f3 100644 --- a/milli/src/update/new/words_prefix_docids.rs +++ b/milli/src/update/new/words_prefix_docids.rs @@ -1,10 +1,16 @@ +use std::cell::RefCell; use std::collections::HashSet; +use std::io::{BufReader, BufWriter, Read, Seek, Write}; use hashbrown::HashMap; use heed::types::Bytes; -use heed::{BytesDecode, Database, RwTxn}; -use roaring::RoaringBitmap; +use heed::{BytesDecode, Database, RoTxn, RwTxn}; +use rayon::iter::{IntoParallelIterator, ParallelIterator as _}; +use roaring::MultiOps; +use tempfile::tempfile; +use thread_local::ThreadLocal; +use super::indexer::document_changes::RefCellExt; use crate::heed_codec::StrBEU16Codec; use crate::{CboRoaringBitmapCodec, Index, Prefix, Result}; @@ -38,22 +44,103 @@ impl WordPrefixDocids { prefixes: &HashSet, ) -> Result<()> { // We fetch the docids associated to the newly added word prefix fst only. - let mut docids = RoaringBitmap::new(); - for prefix in prefixes { - docids.clear(); - let prefix = prefix.as_bytes(); - for result in self.database.prefix_iter(wtxn, prefix)? { - let (_word, data) = result?; - docids |= &data; - } + // And collect the CboRoaringBitmaps pointers in a HashMap. + let frozen = FrozenPrefixBitmaps::from_prefixes(self.database, wtxn, prefixes)?; - self.prefix_database.put(wtxn, prefix, &docids)?; + // We access this HashMap in parallel to compute the *union* of all + // of them and *serialize* them into files. There is one file per CPU. + let local_entries = ThreadLocal::with_capacity(rayon::current_num_threads()); + prefixes.into_par_iter().map(AsRef::as_ref).try_for_each(|prefix| { + let refcell = local_entries.get_or_try(|| { + tempfile().map(BufWriter::new).map(|f| RefCell::new((Vec::new(), f, Vec::new()))) + })?; + + let mut refmut = refcell.borrow_mut_or_yield(); + let (ref mut index, ref mut file, ref mut buffer) = *refmut; + + let output = frozen + .bitmaps(prefix) + .unwrap() + .iter() + .map(|bytes| CboRoaringBitmapCodec::deserialize_from(bytes)) + .union()?; + + buffer.clear(); + CboRoaringBitmapCodec::serialize_into(&output, buffer); + index.push(PrefixEntry { prefix, serialized_length: buffer.len() }); + file.write_all(buffer) + })?; + + drop(frozen); + + // We iterate over all the collected and serialized bitmaps through + // the files and entries to eventually put them in the final database. + for refcell in local_entries { + let (index, file, mut buffer) = refcell.into_inner(); + let mut file = file.into_inner().map_err(|e| e.into_error())?; + file.rewind()?; + let mut file = BufReader::new(file); + for PrefixEntry { prefix, serialized_length } in index { + buffer.resize(serialized_length, 0); + file.read_exact(&mut buffer)?; + self.prefix_database.remap_data_type::().put( + wtxn, + prefix.as_bytes(), + &buffer, + )?; + } } Ok(()) } } +/// Represents a prefix and the length the bitmap takes on disk.
+struct PrefixEntry<'a> { + prefix: &'a str, + serialized_length: usize, +} + +/// Stores prefixes along with all the pointers to the associated +/// CBoRoaringBitmaps. +/// +/// They are collected synchronously and stored into a HashMap. The +/// synchronous process does a small amount of work by just storing +/// pointers. It can then be accessed in parallel to get the associated +/// bitmap pointers. +struct FrozenPrefixBitmaps<'a, 'rtxn> { + prefixes_bitmaps: HashMap<&'a str, Vec<&'rtxn [u8]>>, +} + +impl<'a, 'rtxn> FrozenPrefixBitmaps<'a, 'rtxn> { + #[tracing::instrument(level = "trace", skip_all, target = "indexing::prefix")] + pub fn from_prefixes( + database: Database, + rtxn: &'rtxn RoTxn, + prefixes: &'a HashSet, + ) -> heed::Result { + let database = database.remap_data_type::(); + + let mut prefixes_bitmaps = HashMap::new(); + for prefix in prefixes { + let mut bitmap_bytes = Vec::new(); + for result in database.prefix_iter(rtxn, prefix.as_bytes())? { + let (_word, bytes) = result?; + bitmap_bytes.push(bytes); + } + assert!(prefixes_bitmaps.insert(prefix.as_str(), bitmap_bytes).is_none()); + } + + Ok(Self { prefixes_bitmaps }) + } + + pub fn bitmaps(&self, key: &str) -> Option<&[&'rtxn [u8]]> { + self.prefixes_bitmaps.get(key).map(AsRef::as_ref) + } +} + +unsafe impl<'a, 'rtxn> Sync for FrozenPrefixBitmaps<'a, 'rtxn> {} + struct WordPrefixIntegerDocids { database: Database, prefix_database: Database, From db556387143acfb27d8039d4dbac4f83a70ce9c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 5 Nov 2024 11:26:46 +0100 Subject: [PATCH 186/247] Do not forget to recompute common prefixes --- milli/src/update/new/indexer/mod.rs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/milli/src/update/new/indexer/mod.rs b/milli/src/update/new/indexer/mod.rs index 1122d3ac9..c2ca08c55 100644 --- a/milli/src/update/new/indexer/mod.rs +++ b/milli/src/update/new/indexer/mod.rs @@ -265,16 +265,18 @@ where let prefix_settings = index.prefix_settings(&rtxn)?; word_fst_builder.with_prefix_settings(prefix_settings); - let previous_words = index.word_docids.iter(&rtxn)?.remap_data_type::(); - let current_words = index.word_docids.iter(wtxn)?.remap_data_type::(); + let previous_words = index.word_docids.iter(&rtxn)?.remap_data_type::(); + let current_words = index.word_docids.iter(wtxn)?.remap_data_type::(); for eob in merge_join_by(previous_words, current_words, |lhs, rhs| match (lhs, rhs) { (Ok((l, _)), Ok((r, _))) => l.cmp(r), (Err(_), _) | (_, Err(_)) => Ordering::Equal, }) { match eob { EitherOrBoth::Both(lhs, rhs) => { - if let Some(e) = lhs.err().or(rhs.err()) { - return Err(e.into()); + let (word, lhs_bytes) = lhs?; + let (_, rhs_bytes) = rhs?; + if lhs_bytes != rhs_bytes { + word_fst_builder.register_word(DelAdd::Addition, word.as_ref())?; } } EitherOrBoth::Left(result) => { From 33b1f54b41ac85c5fa89ea9f49c80be323527183 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 5 Nov 2024 16:23:02 +0100 Subject: [PATCH 187/247] Progress, in the task queue --- index-scheduler/src/batch.rs | 59 ++++++++++----------------- index-scheduler/src/insta_snapshot.rs | 1 + index-scheduler/src/lib.rs | 28 +++++++++++-- index-scheduler/src/utils.rs | 2 + meilisearch-types/src/task_view.rs | 7 +++- meilisearch-types/src/tasks.rs | 56 +++++++++++++++++++++++++ 6 files changed, 111 insertions(+), 42 deletions(-) diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs index 740528555..bd307b19e 100644 ---
a/index-scheduler/src/batch.rs +++ b/index-scheduler/src/batch.rs @@ -22,7 +22,8 @@ use std::ffi::OsStr; use std::fmt; use std::fs::{self, File}; use std::io::BufWriter; -use std::sync::atomic::{self, AtomicU16, AtomicU32}; +use std::sync::atomic::{self, AtomicU64}; +use std::time::Duration; use bumpalo::collections::CollectIn; use bumpalo::Bump; @@ -31,7 +32,6 @@ use meilisearch_types::error::Code; use meilisearch_types::heed::{RoTxn, RwTxn}; use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader, PrimaryKey}; use meilisearch_types::milli::heed::CompactionOption; -use meilisearch_types::milli::update::new::indexer::document_changes::Progress; use meilisearch_types::milli::update::new::indexer::{ self, retrieve_or_guess_primary_key, UpdateByFunction, }; @@ -531,7 +531,7 @@ impl IndexScheduler { if let Some(task_id) = to_cancel.max() { // We retrieve the tasks that were processing before this tasks cancelation started. // We must *not* reset the processing tasks before calling this method. - let ProcessingTasks { started_at, processing } = + let ProcessingTasks { started_at, processing, progress: _ } = &*self.processing_tasks.read().unwrap(); return Ok(Some(Batch::TaskCancelation { task: self.get_task(rtxn, task_id)?.ok_or(Error::CorruptedTaskQueue)?, @@ -1223,39 +1223,29 @@ impl IndexScheduler { ) -> Result> { let indexer_alloc = Bump::new(); - let last_finished_steps = AtomicU16::new(0); - let last_finished_documents = AtomicU32::new(0); + let started_processing_at = std::time::Instant::now(); + let secs_since_started_processing_at = AtomicU64::new(0); + const PRINT_SECS_DELTA: u64 = 1; - let send_progress = - |Progress { finished_steps, total_steps, step_name, finished_total_documents }| { - /* - let current = rayon::current_thread_index(); + let processing_tasks = self.processing_tasks.clone(); - let last_finished_steps = - last_finished_steps.fetch_max(finished_steps, atomic::Ordering::Relaxed); + let must_stop_processing = self.must_stop_processing.clone(); - if last_finished_steps > finished_steps { - return; - } + let send_progress = |progress| { + let now = std::time::Instant::now(); + let elapsed = secs_since_started_processing_at.load(atomic::Ordering::Relaxed); + let previous = started_processing_at + Duration::from_secs(elapsed); + let elapsed = now - previous; - if let Some((finished_documents, total_documents)) = finished_total_documents { - if last_finished_steps < finished_steps { - last_finished_documents.store(finished_documents, atomic::Ordering::Relaxed); - } else { - let last_finished_documents = last_finished_documents - .fetch_max(finished_documents, atomic::Ordering::Relaxed); - if last_finished_documents > finished_documents { - return; - } - } - tracing::warn!("Progress from {current:?}: {step_name} ({finished_steps}/{total_steps}), document {finished_documents}/{total_documents}") - } else { - tracing::warn!( - "Progress from {current:?}: {step_name} ({finished_steps}/{total_steps})" - ) - } - */ - }; + if elapsed.as_secs() < PRINT_SECS_DELTA { + return; + } + + secs_since_started_processing_at + .store((now - started_processing_at).as_secs(), atomic::Ordering::Relaxed); + + processing_tasks.write().unwrap().update_progress(progress); + }; match operation { IndexOperation::DocumentClear { mut tasks, .. 
} => { @@ -1286,8 +1276,6 @@ impl IndexScheduler { operations, mut tasks, } => { - let started_processing_at = std::time::Instant::now(); - let must_stop_processing = self.must_stop_processing.clone(); let indexer_config = self.index_mapper.indexer_config(); // TODO: at some point, for better efficiency we might want to reuse the bumpalo for successive batches. // this is made difficult by the fact we're doing private clones of the index scheduler and sending it @@ -1503,7 +1491,6 @@ impl IndexScheduler { let document_changes = indexer.into_changes(&primary_key)?; let embedders = index.embedding_configs(index_wtxn)?; let embedders = self.embedders(embedders)?; - let must_stop_processing = &self.must_stop_processing; indexer::index( index_wtxn, @@ -1645,7 +1632,6 @@ impl IndexScheduler { let document_changes = indexer.into_changes(&indexer_alloc, primary_key); let embedders = index.embedding_configs(index_wtxn)?; let embedders = self.embedders(embedders)?; - let must_stop_processing = &self.must_stop_processing; indexer::index( index_wtxn, @@ -1679,7 +1665,6 @@ impl IndexScheduler { task.status = Status::Succeeded; } - let must_stop_processing = self.must_stop_processing.clone(); builder.execute( |indexing_step| tracing::debug!(update = ?indexing_step), || must_stop_processing.get(), diff --git a/index-scheduler/src/insta_snapshot.rs b/index-scheduler/src/insta_snapshot.rs index f295e35b6..f63a289eb 100644 --- a/index-scheduler/src/insta_snapshot.rs +++ b/index-scheduler/src/insta_snapshot.rs @@ -148,6 +148,7 @@ pub fn snapshot_task(task: &Task) -> String { enqueued_at: _, started_at: _, finished_at: _, + progress: _, error, canceled_by, details, diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index fe8244f9b..16b4a5897 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -55,11 +55,12 @@ use meilisearch_types::heed::types::{SerdeBincode, SerdeJson, Str, I128}; use meilisearch_types::heed::{self, Database, Env, PutFlags, RoTxn, RwTxn}; use meilisearch_types::milli::documents::DocumentsBatchBuilder; use meilisearch_types::milli::index::IndexEmbeddingConfig; +use meilisearch_types::milli::update::new::indexer::document_changes::Progress; use meilisearch_types::milli::update::IndexerConfig; use meilisearch_types::milli::vector::{Embedder, EmbedderOptions, EmbeddingConfigs}; use meilisearch_types::milli::{self, CboRoaringBitmapCodec, Index, RoaringBitmapCodec, BEU32}; use meilisearch_types::task_view::TaskView; -use meilisearch_types::tasks::{Kind, KindWithContent, Status, Task}; +use meilisearch_types::tasks::{Kind, KindWithContent, Status, Task, TaskProgress}; use rayon::current_num_threads; use rayon::prelude::{IntoParallelIterator, ParallelIterator}; use roaring::RoaringBitmap; @@ -161,12 +162,18 @@ struct ProcessingTasks { started_at: OffsetDateTime, /// The list of tasks ids that are currently running. processing: RoaringBitmap, + /// The progress on processing tasks + progress: Option, } impl ProcessingTasks { /// Creates an empty `ProcessingAt` struct. fn new() -> ProcessingTasks { - ProcessingTasks { started_at: OffsetDateTime::now_utc(), processing: RoaringBitmap::new() } + ProcessingTasks { + started_at: OffsetDateTime::now_utc(), + processing: RoaringBitmap::new(), + progress: None, + } } /// Stores the currently processing tasks, and the date time at which it started. 
@@ -175,8 +182,13 @@ impl ProcessingTasks { self.processing = processing; } + fn update_progress(&mut self, progress: Progress) { + self.progress.get_or_insert_with(TaskProgress::default).update(progress); + } + /// Set the processing tasks to an empty list fn stop_processing(&mut self) -> RoaringBitmap { + self.progress = None; std::mem::take(&mut self.processing) } @@ -956,7 +968,7 @@ impl IndexScheduler { tasks.into_iter().rev().take(query.limit.unwrap_or(u32::MAX) as usize), )?; - let ProcessingTasks { started_at, processing, .. } = + let ProcessingTasks { started_at, processing, progress, .. } = self.processing_tasks.read().map_err(|_| Error::CorruptedTaskQueue)?.clone(); let ret = tasks.into_iter(); @@ -966,7 +978,12 @@ impl IndexScheduler { Ok(( ret.map(|task| { if processing.contains(task.uid) { - Task { status: Status::Processing, started_at: Some(started_at), ..task } + Task { + status: Status::Processing, + progress: progress.clone(), + started_at: Some(started_at), + ..task + } } else { task } @@ -1008,6 +1025,7 @@ impl IndexScheduler { enqueued_at: OffsetDateTime::now_utc(), started_at: None, finished_at: None, + progress: None, error: None, canceled_by: None, details: kind.default_details(), @@ -1588,6 +1606,8 @@ impl<'a> Dump<'a> { enqueued_at: task.enqueued_at, started_at: task.started_at, finished_at: task.finished_at, + /// FIXME: should we update dump to contain progress information? 🤔 + progress: None, error: task.error, canceled_by: task.canceled_by, details: task.details, diff --git a/index-scheduler/src/utils.rs b/index-scheduler/src/utils.rs index 788a70fb8..7ae419495 100644 --- a/index-scheduler/src/utils.rs +++ b/index-scheduler/src/utils.rs @@ -345,6 +345,8 @@ impl IndexScheduler { enqueued_at, started_at, finished_at, + /// FIXME: assert something here? ask tamo 🤔 + progress: _, error: _, canceled_by, details, diff --git a/meilisearch-types/src/task_view.rs b/meilisearch-types/src/task_view.rs index 3075fa899..fd9367bf4 100644 --- a/meilisearch-types/src/task_view.rs +++ b/meilisearch-types/src/task_view.rs @@ -4,7 +4,9 @@ use time::{Duration, OffsetDateTime}; use crate::error::ResponseError; use crate::settings::{Settings, Unchecked}; -use crate::tasks::{serialize_duration, Details, IndexSwap, Kind, Status, Task, TaskId}; +use crate::tasks::{ + serialize_duration, Details, IndexSwap, Kind, Status, Task, TaskId, TaskProgress, +}; #[derive(Debug, Clone, PartialEq, Eq, Serialize)] #[serde(rename_all = "camelCase")] @@ -27,6 +29,8 @@ pub struct TaskView { pub started_at: Option, #[serde(with = "time::serde::rfc3339::option", default)] pub finished_at: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub progress: Option, } impl TaskView { @@ -43,6 +47,7 @@ impl TaskView { enqueued_at: task.enqueued_at, started_at: task.started_at, finished_at: task.finished_at, + progress: task.progress.clone(), } } } diff --git a/meilisearch-types/src/tasks.rs b/meilisearch-types/src/tasks.rs index 1dd6d3fbf..56d839432 100644 --- a/meilisearch-types/src/tasks.rs +++ b/meilisearch-types/src/tasks.rs @@ -4,6 +4,7 @@ use std::fmt::{Display, Write}; use std::str::FromStr; use enum_iterator::Sequence; +use milli::update::new::indexer::document_changes::Progress; use milli::update::IndexDocumentsMethod; use milli::Object; use roaring::RoaringBitmap; @@ -30,6 +31,8 @@ pub struct Task { #[serde(with = "time::serde::rfc3339::option")] pub finished_at: Option, + pub progress: Option, + pub error: Option, pub canceled_by: Option, pub details: Option
, @@ -38,6 +41,59 @@ pub struct Task { pub kind: KindWithContent, } +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct TaskProgress { + pub current_step: String, + pub finished_steps: u16, + pub total_steps: u16, + pub finished_documents: Option, + pub total_documents: Option, +} + +impl Default for TaskProgress { + fn default() -> Self { + Self::new() + } +} + +impl TaskProgress { + pub fn new() -> Self { + Self { + current_step: String::new(), + finished_steps: 0, + total_steps: 1, + finished_documents: None, + total_documents: None, + } + } + + pub fn update(&mut self, progress: Progress) { + if self.current_step != progress.step_name { + self.current_step.clear(); + self.current_step.push_str(progress.step_name); + } + self.total_steps = progress.total_steps; + if self.finished_steps > progress.finished_steps { + return; + } + if self.finished_steps < progress.finished_steps { + self.finished_documents = None; + self.total_documents = None; + } + self.finished_steps = progress.finished_steps; + if let Some((finished_documents, total_documents)) = progress.finished_total_documents { + if let Some(task_finished_documents) = self.finished_documents { + if task_finished_documents > finished_documents { + return; + } + } + self.finished_documents = Some(finished_documents); + self.total_documents = Some(total_documents); + } + } +} + impl Task { pub fn index_uid(&self) -> Option<&str> { use KindWithContent::*; From 8b260de5a068cb356d6f5fc1c829dcc8c1c87ed8 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 5 Nov 2024 16:46:43 +0100 Subject: [PATCH 188/247] Reimplement facet search and facet level and put them in dedicated functions --- milli/src/update/new/channel.rs | 38 ----- milli/src/update/new/facet_search_builder.rs | 30 ++-- milli/src/update/new/indexer/mod.rs | 170 ++++++++++++------- milli/src/update/new/merger.rs | 27 +-- 4 files changed, 125 insertions(+), 140 deletions(-) diff --git a/milli/src/update/new/channel.rs b/milli/src/update/new/channel.rs index af6e2215c..d6f2837b6 100644 --- a/milli/src/update/new/channel.rs +++ b/milli/src/update/new/channel.rs @@ -94,8 +94,6 @@ pub enum Database { FacetIdExistsDocids, FacetIdF64NumberDocids, FacetIdStringDocids, - FacetIdNormalizedStringStrings, - FacetIdStringFst, } impl Database { @@ -115,10 +113,6 @@ impl Database { Database::FacetIdExistsDocids => index.facet_id_exists_docids.remap_types(), Database::FacetIdF64NumberDocids => index.facet_id_f64_docids.remap_types(), Database::FacetIdStringDocids => index.facet_id_string_docids.remap_types(), - Database::FacetIdNormalizedStringStrings => { - index.facet_id_normalized_string_strings.remap_types() - } - Database::FacetIdStringFst => index.facet_id_string_fst.remap_types(), } } } @@ -194,10 +188,6 @@ impl ExtractorSender { DocumentsSender(self) } - pub fn facet_searchable(&self) -> FacetSearchableSender<'_> { - FacetSearchableSender { sender: self } - } - pub fn send_documents_ids(&self, documents_ids: RoaringBitmap) -> StdResult<(), SendError<()>> { let entry = EntryOperation::Write(KeyValueEntry::from_small_key_bitmap( DOCUMENTS_IDS_KEY.as_bytes(), documents_ids, )); @@ -322,34 +312,6 @@ impl DocidsSender for FacetDocidsSender<'_> { } } -pub struct FacetSearchableSender<'a> { - sender: &'a ExtractorSender, -} - -impl FacetSearchableSender<'_> { - pub fn write_facet(&self, key: &[u8], value: &[u8]) -> StdResult<(), SendError<()>> { - let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value(key, value)); - match self
.sender - .send(WriterOperation { database: Database::FacetIdNormalizedStringStrings, entry }) - { - Ok(()) => Ok(()), - Err(SendError(_)) => Err(SendError(())), - } - } - - pub fn delete_facet(&self, key: &[u8]) -> StdResult<(), SendError<()>> { - let entry = EntryOperation::Delete(KeyEntry::from_key(key)); - match self - .sender - .send(WriterOperation { database: Database::FacetIdNormalizedStringStrings, entry }) - { - Ok(()) => Ok(()), - Err(SendError(_)) => Err(SendError(())), - } - } -} - pub struct DocumentsSender<'a>(&'a ExtractorSender); impl DocumentsSender<'_> { diff --git a/milli/src/update/new/facet_search_builder.rs b/milli/src/update/new/facet_search_builder.rs index b9db80afb..839120540 100644 --- a/milli/src/update/new/facet_search_builder.rs +++ b/milli/src/update/new/facet_search_builder.rs @@ -6,7 +6,6 @@ use grenad::Sorter; use heed::types::{Bytes, SerdeJson}; use heed::{BytesDecode, BytesEncode, RoTxn, RwTxn}; -use super::channel::FacetSearchableSender; use super::extract::FacetKind; use super::fst_merger_builder::FstMergerBuilder; use super::KvReaderDelAdd; @@ -42,7 +41,7 @@ impl<'indexer> FacetSearchBuilder<'indexer> { None, None, Some(0), - false, + true, ); Self { @@ -115,13 +114,7 @@ impl<'indexer> FacetSearchBuilder<'indexer> { } #[tracing::instrument(level = "trace", skip_all, target = "indexing::facet_fst")] - pub fn merge_and_send( - self, - index: &Index, - wtxn: &mut RwTxn, - rtxn: &RoTxn, - sender: FacetSearchableSender, - ) -> Result<()> { + pub fn merge_and_write(self, index: &Index, wtxn: &mut RwTxn, rtxn: &RoTxn) -> Result<()> { let reader = self.normalized_facet_string_docids_sorter.into_reader_cursors()?; let mut builder = grenad::MergerBuilder::new(MergeDeladdBtreesetString); builder.extend(reader); @@ -138,24 +131,24 @@ impl<'indexer> FacetSearchBuilder<'indexer> { if current_field_id != Some(field_id) { if let Some(fst_merger_builder) = fst_merger_builder { - // send the previous fst to the channel let mmap = fst_merger_builder.build(&mut callback)?; - // sender.write_fst(&field_id.to_be_bytes(), mmap).unwrap(); - todo!("What to do"); + index + .facet_id_string_fst + .remap_data_type::() + .put(wtxn, &field_id, &mmap)?; } - println!("getting fst for field_id: {}", field_id); fst = index.facet_id_string_fst.get(rtxn, &field_id)?; fst_merger_builder = Some(FstMergerBuilder::new(fst.as_ref())?); current_field_id = Some(field_id); } - let current = database.get(rtxn, key)?; + let previous = database.get(rtxn, key)?; let deladd: &KvReaderDelAdd = deladd.into(); let del = deladd.get(DelAdd::Deletion); let add = deladd.get(DelAdd::Addition); - match merge_btreesets(current, del, add)? { + match merge_btreesets(previous, del, add)? 
{ Operation::Write(value) => { match fst_merger_builder.as_mut() { Some(fst_merger_builder) => { @@ -170,7 +163,7 @@ impl<'indexer> FacetSearchBuilder<'indexer> { let key = (field_id, normalized_facet_string); let key_bytes = BEU16StrCodec::bytes_encode(&key).map_err(heed::Error::Encoding)?; - sender.write_facet(&key_bytes, &value).unwrap(); + database.put(wtxn, &key_bytes, &value)?; } Operation::Delete => { match fst_merger_builder.as_mut() { @@ -186,7 +179,7 @@ impl<'indexer> FacetSearchBuilder<'indexer> { let key = (field_id, normalized_facet_string); let key_bytes = BEU16StrCodec::bytes_encode(&key).map_err(heed::Error::Encoding)?; - sender.delete_facet(&key_bytes).unwrap(); + database.delete(wtxn, &key_bytes)?; } Operation::Ignore => (), } @@ -194,8 +187,7 @@ impl<'indexer> FacetSearchBuilder<'indexer> { if let (Some(field_id), Some(fst_merger_builder)) = (current_field_id, fst_merger_builder) { let mmap = fst_merger_builder.build(&mut callback)?; - // sender.write_fst(&field_id.to_be_bytes(), mmap).unwrap(); - todo!("What to do"); + index.facet_id_string_fst.remap_data_type::().put(wtxn, &field_id, &mmap)?; } Ok(()) diff --git a/milli/src/update/new/indexer/mod.rs b/milli/src/update/new/indexer/mod.rs index c2ca08c55..430313fbd 100644 --- a/milli/src/update/new/indexer/mod.rs +++ b/milli/src/update/new/indexer/mod.rs @@ -18,6 +18,7 @@ pub use update_by_function::UpdateByFunction; use super::channel::*; use super::extract::*; +use super::facet_search_builder::FacetSearchBuilder; use super::merger::{FacetDatabases, FacetFieldIdsDelta}; use super::word_fst_builder::PrefixDelta; use super::words_prefix_docids::{ @@ -114,7 +115,6 @@ where let span = tracing::trace_span!(target: "indexing::documents::extract", "faceted"); let _entered = span.enter(); facet_field_ids_delta = merge_and_send_facet_docids( - global_fields_ids_map, FacetedDocidsExtractor::run_extraction(grenad_parameters, document_changes, indexing_context, &mut extractor_allocs)?, FacetDatabases::new(index), index, @@ -240,9 +240,6 @@ where }) })?; - let global_fields_ids_map = GlobalFieldsIdsMap::new(&new_fields_ids_map); - let indexer_span = tracing::Span::current(); - for operation in writer_receiver { let database = operation.database(index); match operation.entry() { @@ -258,65 +255,14 @@ where /// TODO handle the panicking threads let facet_field_ids_delta = extractor_handle.join().unwrap()?; - let prefix_delta = { - let rtxn = index.read_txn()?; - let words_fst = index.words_fst(&rtxn)?; - let mut word_fst_builder = WordFstBuilder::new(&words_fst)?; - let prefix_settings = index.prefix_settings(&rtxn)?; - word_fst_builder.with_prefix_settings(prefix_settings); - - let previous_words = index.word_docids.iter(&rtxn)?.remap_data_type::(); - let current_words = index.word_docids.iter(wtxn)?.remap_data_type::(); - for eob in merge_join_by(previous_words, current_words, |lhs, rhs| match (lhs, rhs) { - (Ok((l, _)), Ok((r, _))) => l.cmp(r), - (Err(_), _) | (_, Err(_)) => Ordering::Equal, - }) { - match eob { - EitherOrBoth::Both(lhs, rhs) => { - let (word, lhs_bytes) = lhs?; - let (_, rhs_bytes) = rhs?; - if lhs_bytes != rhs_bytes { - word_fst_builder.register_word(DelAdd::Addition, word.as_ref())?; - } - } - EitherOrBoth::Left(result) => { - let (word, _) = result?; - word_fst_builder.register_word(DelAdd::Deletion, word.as_ref())?; - } - EitherOrBoth::Right(result) => { - let (word, _) = result?; - word_fst_builder.register_word(DelAdd::Addition, word.as_ref())?; - } - } - } - - let span = tracing::trace_span!(target: 
"indexing::documents::merge", "words_fst"); - let _entered = span.enter(); - - let (word_fst_mmap, prefix_data) = word_fst_builder.build(index, &rtxn)?; - // extractor_sender.main().write_words_fst(word_fst_mmap).unwrap(); - index.main.remap_types::().put(wtxn, WORDS_FST_KEY, &word_fst_mmap)?; - if let Some(PrefixData { prefixes_fst_mmap, prefix_delta }) = prefix_data { - // extractor_sender.main().write_words_prefixes_fst(prefixes_fst_mmap).unwrap(); - index.main.remap_types::().put( - wtxn, - WORDS_PREFIXES_FST_KEY, - &prefixes_fst_mmap, - )?; - Some(prefix_delta) - } else { - None - } - }; - - // if let Some(facet_field_ids_delta) = merger_result.facet_field_ids_delta { - // compute_facet_level_database(index, wtxn, facet_field_ids_delta)?; - // } - - if let Some(prefix_delta) = prefix_delta { + if let Some(prefix_delta) = compute_word_fst(index, wtxn)? { compute_prefix_database(index, wtxn, prefix_delta)?; } + compute_facet_search_database(index, wtxn, global_fields_ids_map)?; + + compute_facet_level_database(index, wtxn, facet_field_ids_delta)?; + Result::Ok(()) })?; @@ -358,6 +304,110 @@ fn compute_prefix_database( compute_word_prefix_position_docids(wtxn, index, &modified, &deleted) } +#[tracing::instrument(level = "trace", skip_all, target = "indexing")] +fn compute_word_fst(index: &Index, wtxn: &mut RwTxn) -> Result> { + let rtxn = index.read_txn()?; + let words_fst = index.words_fst(&rtxn)?; + let mut word_fst_builder = WordFstBuilder::new(&words_fst)?; + let prefix_settings = index.prefix_settings(&rtxn)?; + word_fst_builder.with_prefix_settings(prefix_settings); + + let previous_words = index.word_docids.iter(&rtxn)?.remap_data_type::(); + let current_words = index.word_docids.iter(wtxn)?.remap_data_type::(); + for eob in merge_join_by(previous_words, current_words, |lhs, rhs| match (lhs, rhs) { + (Ok((l, _)), Ok((r, _))) => l.cmp(r), + (Err(_), _) | (_, Err(_)) => Ordering::Equal, + }) { + match eob { + EitherOrBoth::Both(lhs, rhs) => { + let (word, lhs_bytes) = lhs?; + let (_, rhs_bytes) = rhs?; + if lhs_bytes != rhs_bytes { + word_fst_builder.register_word(DelAdd::Addition, word.as_ref())?; + } + } + EitherOrBoth::Left(result) => { + let (word, _) = result?; + word_fst_builder.register_word(DelAdd::Deletion, word.as_ref())?; + } + EitherOrBoth::Right(result) => { + let (word, _) = result?; + word_fst_builder.register_word(DelAdd::Addition, word.as_ref())?; + } + } + } + + let span = tracing::trace_span!(target: "indexing::documents::merge", "words_fst"); + let _entered = span.enter(); + + let (word_fst_mmap, prefix_data) = word_fst_builder.build(index, &rtxn)?; + // extractor_sender.main().write_words_fst(word_fst_mmap).unwrap(); + index.main.remap_types::().put(wtxn, WORDS_FST_KEY, &word_fst_mmap)?; + if let Some(PrefixData { prefixes_fst_mmap, prefix_delta }) = prefix_data { + // extractor_sender.main().write_words_prefixes_fst(prefixes_fst_mmap).unwrap(); + index.main.remap_types::().put( + wtxn, + WORDS_PREFIXES_FST_KEY, + &prefixes_fst_mmap, + )?; + Ok(Some(prefix_delta)) + } else { + Ok(None) + } +} + +#[tracing::instrument(level = "trace", skip_all, target = "indexing::facet_search")] +fn compute_facet_search_database( + index: &Index, + wtxn: &mut RwTxn, + global_fields_ids_map: GlobalFieldsIdsMap, +) -> Result<()> { + let rtxn = index.read_txn()?; + let localized_attributes_rules = index.localized_attributes_rules(&rtxn)?; + let mut facet_search_builder = FacetSearchBuilder::new( + global_fields_ids_map, + localized_attributes_rules.unwrap_or_default(), + ); + + let 
previous_facet_id_string_docids = index + .facet_id_string_docids + .iter(&rtxn)? + .remap_data_type::() + .filter(|r| r.as_ref().map_or(true, |(k, _)| k.level == 0)); + let current_facet_id_string_docids = index + .facet_id_string_docids + .iter(wtxn)? + .remap_data_type::() + .filter(|r| r.as_ref().map_or(true, |(k, _)| k.level == 0)); + for eob in merge_join_by( + previous_facet_id_string_docids, + current_facet_id_string_docids, + |lhs, rhs| match (lhs, rhs) { + (Ok((l, _)), Ok((r, _))) => l.cmp(r), + (Err(_), _) | (_, Err(_)) => Ordering::Equal, + }, + ) { + match eob { + EitherOrBoth::Both(lhs, rhs) => { + let (_, _) = lhs?; + let (_, _) = rhs?; + } + EitherOrBoth::Left(result) => { + let (key, _) = result?; + facet_search_builder + .register_from_key(DelAdd::Deletion, key.left_bound.as_ref())?; + } + EitherOrBoth::Right(result) => { + let (key, _) = result?; + facet_search_builder + .register_from_key(DelAdd::Addition, key.left_bound.as_ref())?; + } + } + } + + facet_search_builder.merge_and_write(index, wtxn, &rtxn) +} + #[tracing::instrument(level = "trace", skip_all, target = "indexing::facet_field_ids")] fn compute_facet_level_database( index: &Index, diff --git a/milli/src/update/new/merger.rs b/milli/src/update/new/merger.rs index 7b3dd85aa..b1c5c5fd9 100644 --- a/milli/src/update/new/merger.rs +++ b/milli/src/update/new/merger.rs @@ -11,9 +11,7 @@ use super::channel::*; use super::extract::{ merge_caches, transpose_and_freeze_caches, BalancedCaches, DelAddRoaringBitmap, FacetKind, }; -use super::facet_search_builder::FacetSearchBuilder; use super::DocumentChange; -use crate::update::del_add::DelAdd; use crate::{CboRoaringBitmapCodec, Error, FieldId, GeoPoint, GlobalFieldsIdsMap, Index, Result}; pub struct GeoExtractor { @@ -93,37 +91,29 @@ pub fn merge_and_send_docids<'extractor>( } #[tracing::instrument(level = "trace", skip_all, target = "indexing::merge")] -pub fn merge_and_send_facet_docids<'indexer, 'extractor>( - global_fields_ids_map: GlobalFieldsIdsMap<'indexer>, +pub fn merge_and_send_facet_docids<'extractor>( mut caches: Vec>, database: FacetDatabases, index: &Index, docids_sender: impl DocidsSender + Sync, -) -> Result<(FacetFieldIdsDelta, FacetSearchBuilder<'indexer>)> { +) -> Result { transpose_and_freeze_caches(&mut caches)? .into_par_iter() .map(|frozen| { let mut facet_field_ids_delta = FacetFieldIdsDelta::default(); let rtxn = index.read_txn()?; - let localized_attributes_rules = index.localized_attributes_rules(&rtxn)?; - let mut facet_search_builder = FacetSearchBuilder::new( - global_fields_ids_map.clone(), - localized_attributes_rules.unwrap_or_default(), - ); let mut buffer = Vec::new(); merge_caches(frozen, |key, DelAddRoaringBitmap { del, add }| { let current = database.get_cbo_roaring_bytes_value(&rtxn, key)?; match merge_cbo_bitmaps(current, del, add)? 
{ Operation::Write(bitmap) => { facet_field_ids_delta.register_from_key(key); - facet_search_builder.register_from_key(DelAdd::Addition, key)?; let value = cbo_bitmap_serialize_into_vec(&bitmap, &mut buffer); docids_sender.write(key, value).unwrap(); Ok(()) } Operation::Delete => { facet_field_ids_delta.register_from_key(key); - facet_search_builder.register_from_key(DelAdd::Deletion, key)?; docids_sender.delete(key).unwrap(); Ok(()) } @@ -131,18 +121,9 @@ pub fn merge_and_send_facet_docids<'indexer, 'extractor>( } })?; - Ok((facet_field_ids_delta, facet_search_builder)) + Ok(facet_field_ids_delta) }) - .reduce( - || Ok((FacetFieldIdsDelta::default(), todo!())), - |lhs, rhs| { - let (lhs_ffid, lhs_fsb) = lhs?; - let (rhs_ffid, rhs_fsb) = rhs?; - let ffid_merged = lhs_ffid.merge(rhs_ffid); - let fsb_merged = todo!(); - Ok((ffid_merged, fsb_merged)) - }, - ) + .reduce(|| Ok(FacetFieldIdsDelta::default()), |lhs, rhs| Ok(lhs?.merge(rhs?))) } pub struct FacetDatabases<'a> { From a9ecbf0b64e7a11ea836f241c22d7d6840b1b3a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 6 Nov 2024 12:12:48 +0100 Subject: [PATCH 189/247] Use the Bbbul crate in the cache to better control memory --- Cargo.lock | 12 +- milli/src/update/new/extract/cache.rs | 194 +++++++++++++++++++++----- 2 files changed, 167 insertions(+), 39 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 633fdca8f..b49000574 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -572,6 +572,15 @@ dependencies = [ "serde", ] +[[package]] +name = "bitpacking" +version = "0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c1d3e2bfd8d06048a179f7b17afc3188effa10385e7b00dc65af6aae732ea92" +dependencies = [ + "crunchy", +] + [[package]] name = "bitvec" version = "1.0.1" @@ -4430,9 +4439,10 @@ dependencies = [ [[package]] name = "raw-collections" version = "0.1.0" -source = "git+https://github.com/dureuill/raw-collections.git#4ab9619207632c20f4e0c2e126d9d909cc58ef65" +source = "git+https://github.com/dureuill/raw-collections.git#48801130cb16d758ba9c610b0fe25747e4d85534" dependencies = [ "allocator-api2", + "bitpacking", "bumpalo", "hashbrown 0.15.0", "serde", diff --git a/milli/src/update/new/extract/cache.rs b/milli/src/update/new/extract/cache.rs index a366435d8..63590db69 100644 --- a/milli/src/update/new/extract/cache.rs +++ b/milli/src/update/new/extract/cache.rs @@ -72,7 +72,9 @@ use bumpalo::Bump; use grenad::ReaderCursor; use hashbrown::hash_map::RawEntryMut; use hashbrown::HashMap; +use raw_collections::bbbul::{BitPacker, BitPacker4x}; use raw_collections::map::FrozenMap; +use raw_collections::{Bbbul, FrozenBbbul}; use roaring::RoaringBitmap; use rustc_hash::FxBuildHasher; @@ -130,7 +132,7 @@ impl<'extractor> BalancedCaches<'extractor> { Ok(()) } InnerCaches::Spilling(spilling) => { - spilling.insert_del_u32(&self.hasher, buckets, key, n) + spilling.insert_del_u32(&self.hasher, self.alloc, buckets, key, n) } } } @@ -147,7 +149,7 @@ impl<'extractor> BalancedCaches<'extractor> { Ok(()) } InnerCaches::Spilling(spilling) => { - spilling.insert_add_u32(&self.hasher, buckets, key, n) + spilling.insert_add_u32(&self.hasher, self.alloc, buckets, key, n) } } } @@ -165,7 +167,7 @@ impl<'extractor> BalancedCaches<'extractor> { ); let allocated: usize = normal_caches.caches.iter().map(|m| m.allocation_size()).sum(); - eprintln!("The last allocated HasMap took {allocated} bytes"); + eprintln!("The last allocated HashMap took {allocated} bytes"); let dummy = NormalCaches { caches: Vec::new() }; let 
NormalCaches { caches: cache_maps } = mem::replace(normal_caches, dummy); @@ -181,6 +183,24 @@ impl<'extractor> BalancedCaches<'extractor> { .iter_mut() .enumerate() .map(|(bucket, map)| { + // safety: we are transmuting the Bbbul into a FrozenBbbul + // that are the same size. + let map = unsafe { + std::mem::transmute::< + &mut HashMap< + &[u8], + DelAddBbbul, // from this + FxBuildHasher, + &Bump, + >, + &mut HashMap< + &[u8], + FrozenDelAddBbbul, // to that + FxBuildHasher, + &Bump, + >, + >(map) + }; Ok(FrozenCache { bucket, cache: FrozenMap::new(map), spilled: Vec::new() }) }) .collect(), @@ -196,6 +216,24 @@ impl<'extractor> BalancedCaches<'extractor> { .map(BufReader::new) .map(|bufreader| grenad::Reader::new(bufreader).map_err(Into::into)) .collect::>()?; + // safety: we are transmuting the Bbbul into a FrozenBbbul + // that are the same size. + let map = unsafe { + std::mem::transmute::< + &mut HashMap< + &[u8], + DelAddBbbul, // from this + FxBuildHasher, + &Bump, + >, + &mut HashMap< + &[u8], + FrozenDelAddBbbul, // to that + FxBuildHasher, + &Bump, + >, + >(map) + }; Ok(FrozenCache { bucket, cache: FrozenMap::new(map), spilled }) }) .collect(), @@ -206,7 +244,14 @@ impl<'extractor> BalancedCaches<'extractor> { unsafe impl MostlySend for BalancedCaches<'_> {} struct NormalCaches<'extractor> { - caches: Vec>, + caches: Vec< + HashMap< + &'extractor [u8], + DelAddBbbul<'extractor, BitPacker4x>, + FxBuildHasher, + &'extractor Bump, + >, + >, } impl<'extractor> NormalCaches<'extractor> { @@ -223,13 +268,13 @@ impl<'extractor> NormalCaches<'extractor> { match self.caches[bucket].raw_entry_mut().from_hash(hash, |&k| k == key) { RawEntryMut::Occupied(mut entry) => { - entry.get_mut().del.get_or_insert_with(RoaringBitmap::default).insert(n); + entry.get_mut().del.get_or_insert_with(|| Bbbul::new_in(alloc)).insert(n); } RawEntryMut::Vacant(entry) => { entry.insert_hashed_nocheck( hash, alloc.alloc_slice_copy(key), - DelAddRoaringBitmap::new_del_u32(n), + DelAddBbbul::new_del_u32_in(n, alloc), ); } } @@ -247,13 +292,13 @@ impl<'extractor> NormalCaches<'extractor> { let bucket = compute_bucket_from_hash(buckets, hash); match self.caches[bucket].raw_entry_mut().from_hash(hash, |&k| k == key) { RawEntryMut::Occupied(mut entry) => { - entry.get_mut().add.get_or_insert_with(RoaringBitmap::default).insert(n); + entry.get_mut().add.get_or_insert_with(|| Bbbul::new_in(alloc)).insert(n); } RawEntryMut::Vacant(entry) => { entry.insert_hashed_nocheck( hash, alloc.alloc_slice_copy(key), - DelAddRoaringBitmap::new_add_u32(n), + DelAddBbbul::new_add_u32_in(n, alloc), ); } } @@ -261,7 +306,14 @@ impl<'extractor> NormalCaches<'extractor> { } struct SpillingCaches<'extractor> { - caches: Vec>, + caches: Vec< + HashMap< + &'extractor [u8], + DelAddBbbul<'extractor, BitPacker4x>, + FxBuildHasher, + &'extractor Bump, + >, + >, spilled_entries: Vec>, deladd_buffer: Vec, cbo_buffer: Vec, @@ -270,7 +322,12 @@ struct SpillingCaches<'extractor> { impl<'extractor> SpillingCaches<'extractor> { fn from_cache_maps( caches: Vec< - HashMap<&'extractor [u8], DelAddRoaringBitmap, FxBuildHasher, &'extractor Bump>, + HashMap< + &'extractor [u8], + DelAddBbbul<'extractor, BitPacker4x>, + FxBuildHasher, + &'extractor Bump, + >, >, ) -> SpillingCaches<'extractor> { SpillingCaches { @@ -291,6 +348,7 @@ impl<'extractor> SpillingCaches<'extractor> { pub fn insert_del_u32( &mut self, hasher: &FxBuildHasher, + alloc: &'extractor Bump, buckets: usize, key: &[u8], n: u32, @@ -299,25 +357,23 @@ impl<'extractor> 
SpillingCaches<'extractor> { let bucket = compute_bucket_from_hash(buckets, hash); match self.caches[bucket].raw_entry_mut().from_hash(hash, |&k| k == key) { RawEntryMut::Occupied(mut entry) => { - entry.get_mut().del.get_or_insert_with(RoaringBitmap::default).insert(n); + entry.get_mut().del.get_or_insert_with(|| Bbbul::new_in(alloc)).insert(n); Ok(()) } - RawEntryMut::Vacant(_entry) => { - let deladd = DelAddRoaringBitmap::new_del_u32(n); - spill_entry_to_sorter( - &mut self.spilled_entries[bucket], - &mut self.deladd_buffer, - &mut self.cbo_buffer, - key, - deladd, - ) - } + RawEntryMut::Vacant(_entry) => spill_entry_to_sorter( + &mut self.spilled_entries[bucket], + &mut self.deladd_buffer, + &mut self.cbo_buffer, + key, + DelAddRoaringBitmap::new_del_u32(n), + ), } } pub fn insert_add_u32( &mut self, hasher: &FxBuildHasher, + alloc: &'extractor Bump, buckets: usize, key: &[u8], n: u32, @@ -326,19 +382,16 @@ impl<'extractor> SpillingCaches<'extractor> { let bucket = compute_bucket_from_hash(buckets, hash); match self.caches[bucket].raw_entry_mut().from_hash(hash, |&k| k == key) { RawEntryMut::Occupied(mut entry) => { - entry.get_mut().add.get_or_insert_with(RoaringBitmap::default).insert(n); + entry.get_mut().add.get_or_insert_with(|| Bbbul::new_in(alloc)).insert(n); Ok(()) } - RawEntryMut::Vacant(_entry) => { - let deladd = DelAddRoaringBitmap::new_add_u32(n); - spill_entry_to_sorter( - &mut self.spilled_entries[bucket], - &mut self.deladd_buffer, - &mut self.cbo_buffer, - key, - deladd, - ) - } + RawEntryMut::Vacant(_entry) => spill_entry_to_sorter( + &mut self.spilled_entries[bucket], + &mut self.deladd_buffer, + &mut self.cbo_buffer, + key, + DelAddRoaringBitmap::new_add_u32(n), + ), } } } @@ -387,7 +440,13 @@ fn spill_entry_to_sorter( pub struct FrozenCache<'a, 'extractor> { bucket: usize, - cache: FrozenMap<'a, 'extractor, &'extractor [u8], DelAddRoaringBitmap, FxBuildHasher>, + cache: FrozenMap< + 'a, + 'extractor, + &'extractor [u8], + FrozenDelAddBbbul<'extractor, BitPacker4x>, + FxBuildHasher, + >, spilled: Vec>>, } @@ -467,7 +526,7 @@ where for (map_index, map) in maps.iter_mut().enumerate() { if first_entry.source_index != map_index { if let Some(new) = map.get_mut(first_key) { - output = output.merge(mem::take(new)); + output.append_and_clear_bbbul(new); } } } @@ -483,14 +542,15 @@ where // Then manage the content on the HashMap entries that weren't taken (mem::take). 
while let Some(mut map) = maps.pop() { - for (key, output) in map.iter_mut() { - let mut output = mem::take(output); + for (key, bbbul) in map.iter_mut() { + let mut output = DelAddRoaringBitmap::empty(); + output.append_and_clear_bbbul(bbbul); // Make sure we don't try to work with entries already managed by the spilled - if !output.is_empty() { + if !bbbul.is_empty() { for rhs in maps.iter_mut() { if let Some(new) = rhs.get_mut(key) { - output = output.merge(mem::take(new)); + output.append_and_clear_bbbul(new); } } @@ -530,6 +590,44 @@ impl PartialOrd for Entry { } } +pub struct DelAddBbbul<'bump, B> { + pub del: Option>, + pub add: Option>, +} + +impl<'bump, B: BitPacker> DelAddBbbul<'bump, B> { + pub fn insert_del_u32_in(&mut self, n: u32, bump: &'bump Bump) { + self.del.get_or_insert_with(|| Bbbul::new_in(bump)).insert(n); + } + + pub fn insert_add_u32_in(&mut self, n: u32, bump: &'bump Bump) { + self.add.get_or_insert_with(|| Bbbul::new_in(bump)).insert(n); + } + + pub fn new_del_u32_in(n: u32, bump: &'bump Bump) -> Self { + let mut bbbul = Bbbul::new_in(bump); + bbbul.insert(n); + DelAddBbbul { del: Some(bbbul), add: None } + } + + pub fn new_add_u32_in(n: u32, bump: &'bump Bump) -> Self { + let mut bbbul = Bbbul::new_in(bump); + bbbul.insert(n); + DelAddBbbul { del: None, add: Some(bbbul) } + } +} + +pub struct FrozenDelAddBbbul<'bump, B> { + pub del: Option>, + pub add: Option>, +} + +impl<'bump, B> FrozenDelAddBbbul<'bump, B> { + fn is_empty(&self) -> bool { + self.del.is_none() && self.add.is_none() + } +} + #[derive(Debug, Default, Clone)] pub struct DelAddRoaringBitmap { pub del: Option, @@ -578,6 +676,26 @@ impl DelAddRoaringBitmap { DelAddRoaringBitmap { del: None, add: Some(RoaringBitmap::from([n])) } } + pub fn append_and_clear_bbbul(&mut self, bbbul: &mut FrozenDelAddBbbul<'_, B>) { + let FrozenDelAddBbbul { del, add } = bbbul; + + if let Some(ref mut bbbul) = del.take() { + let del = self.del.get_or_insert_with(RoaringBitmap::new); + let mut iter = bbbul.iter_and_clear(); + while let Some(block) = iter.next_block() { + del.append(block.iter().copied()); + } + } + + if let Some(ref mut bbbul) = add.take() { + let add = self.add.get_or_insert_with(RoaringBitmap::new); + let mut iter = bbbul.iter_and_clear(); + while let Some(block) = iter.next_block() { + add.append(block.iter().copied()); + } + } + } + pub fn merge(self, rhs: DelAddRoaringBitmap) -> DelAddRoaringBitmap { let DelAddRoaringBitmap { del, add } = self; let DelAddRoaringBitmap { del: ndel, add: nadd } = rhs; From 10f49f0d75f038b3fe240e868300f6728414b456 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 6 Nov 2024 17:50:12 +0100 Subject: [PATCH 190/247] Post processing of the merge --- crates/meilitool/src/main.rs | 2 +- crates/milli/src/update/new/channel.rs | 30 +- crates/milli/src/update/new/document.rs | 4 +- .../milli/src/update/new/extract/documents.rs | 145 ++- .../new/extract/faceted/extract_facets.rs | 2 +- crates/milli/src/update/new/extract/mod.rs | 4 +- .../extract/searchable/extract_word_docids.rs | 45 +- .../src/update/new/extract/vectors/mod.rs | 32 +- crates/milli/src/update/new/indexer/mod.rs | 405 +++++++-- .../milli/src/update/new/vector_document.rs | 14 +- crates/milli/src/vector/error.rs | 2 +- milli/src/update/new/indexer/mod.rs | 823 ------------------ 12 files changed, 512 insertions(+), 996 deletions(-) delete mode 100644 milli/src/update/new/indexer/mod.rs diff --git a/crates/meilitool/src/main.rs b/crates/meilitool/src/main.rs index 978824356..f84cea98d 100644 --- 
a/crates/meilitool/src/main.rs +++ b/crates/meilitool/src/main.rs @@ -264,7 +264,7 @@ fn export_a_dump( format!("While iterating on content file {:?}", content_file_uuid) })? { dump_content_file - .push_document(&obkv_to_object(&doc, &documents_batch_index)?)?; + .push_document(&obkv_to_object(doc, &documents_batch_index)?)?; } dump_content_file.flush()?; count += 1; diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index dee82e6d9..a4896ee3f 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -2,16 +2,13 @@ use std::marker::PhantomData; use std::sync::atomic::{AtomicUsize, Ordering}; use crossbeam_channel::{IntoIter, Receiver, SendError, Sender}; -use grenad::Merger; use hashbrown::HashMap; use heed::types::Bytes; use roaring::RoaringBitmap; use super::extract::FacetKind; use super::StdResult; -use crate::index::main_key::DOCUMENTS_IDS_KEY; use crate::update::new::KvReaderFieldId; -use crate::update::MergeDeladdCboRoaringBitmaps; use crate::vector::Embedding; use crate::{DocumentId, Index}; @@ -41,14 +38,6 @@ impl KeyValueEntry { data.extend_from_slice(value); KeyValueEntry { key_length: key.len(), data: data.into_boxed_slice() } } - - pub fn from_small_key_bitmap(key: &[u8], bitmap: RoaringBitmap) -> Self { - let mut data = Vec::with_capacity(key.len() + bitmap.serialized_size()); - data.extend_from_slice(key); - bitmap.serialize_into(&mut data).unwrap(); - KeyValueEntry { key_length: key.len(), data: data.into_boxed_slice() } - } - pub fn key(&self) -> &[u8] { &self.data[..self.key_length] } @@ -113,7 +102,6 @@ pub enum Database { ExternalDocumentsIds, ExactWordDocids, FidWordCountDocids, - Main, WordDocids, WordFidDocids, WordPairProximityDocids, @@ -131,7 +119,6 @@ impl Database { Database::Documents => index.documents.remap_types(), Database::ExternalDocumentsIds => index.external_documents_ids.remap_types(), Database::ExactWordDocids => index.exact_word_docids.remap_types(), - Database::Main => index.main.remap_types(), Database::WordDocids => index.word_docids.remap_types(), Database::WordFidDocids => index.word_fid_docids.remap_types(), Database::WordPositionDocids => index.word_position_docids.remap_types(), @@ -217,12 +204,15 @@ impl ExtractorSender { DocumentsSender(self) } - pub fn send_documents_ids(&self, documents_ids: RoaringBitmap) -> StdResult<(), SendError<()>> { - let entry = EntryOperation::Write(KeyValueEntry::from_small_key_bitmap( - DOCUMENTS_IDS_KEY.as_bytes(), - documents_ids, - )); - match self.send_db_operation(DbOperation { database: Database::Main, entry }) { + pub fn embeddings(&self) -> EmbeddingSender<'_> { + EmbeddingSender(&self.sender) + } + + fn send_delete_vector(&self, docid: DocumentId) -> StdResult<(), SendError<()>> { + match self + .sender + .send(WriterOperation::ArroyOperation(ArroyOperation::DeleteVectors { docid })) + { Ok(()) => Ok(()), Err(SendError(_)) => Err(SendError(())), } @@ -381,6 +371,8 @@ impl DocumentsSender<'_> { Err(SendError(_)) => Err(SendError(())), }?; + self.0.send_delete_vector(docid)?; + let entry = EntryOperation::Delete(KeyEntry::from_key(external_id.as_bytes())); match self .0 diff --git a/crates/milli/src/update/new/document.rs b/crates/milli/src/update/new/document.rs index 068268c4e..14e4f72e5 100644 --- a/crates/milli/src/update/new/document.rs +++ b/crates/milli/src/update/new/document.rs @@ -286,11 +286,11 @@ where /// /// - If the document contains a top-level field that is not present in `fields_ids_map`. 
/// -pub fn write_to_obkv<'s, 'a, 'map>( +pub fn write_to_obkv<'s, 'a, 'map, 'buffer>( document: &'s impl Document<'s>, vector_document: Option<&'s impl VectorDocument<'s>>, fields_ids_map: &'a mut GlobalFieldsIdsMap<'map>, - mut document_buffer: &'a mut Vec, + mut document_buffer: &'a mut bumpalo::collections::Vec<'buffer, u8>, ) -> Result<&'a KvReaderFieldId> where 's: 'a, diff --git a/crates/milli/src/update/new/extract/documents.rs b/crates/milli/src/update/new/extract/documents.rs index 21fe4d518..79e1a2462 100644 --- a/crates/milli/src/update/new/extract/documents.rs +++ b/crates/milli/src/update/new/extract/documents.rs @@ -1,73 +1,140 @@ use std::cell::RefCell; use bumpalo::Bump; +use hashbrown::HashMap; use super::DelAddRoaringBitmap; use crate::update::new::channel::DocumentsSender; -use crate::update::new::document::write_to_obkv; +use crate::update::new::document::{write_to_obkv, Document as _}; use crate::update::new::indexer::document_changes::{ DocumentChangeContext, Extractor, FullySend, RefCellExt as _, }; use crate::update::new::DocumentChange; +use crate::vector::EmbeddingConfigs; use crate::Result; - pub struct DocumentsExtractor<'a> { - documents_sender: &'a DocumentsSender<'a>, + document_sender: &'a DocumentsSender<'a>, + embedders: &'a EmbeddingConfigs, } impl<'a> DocumentsExtractor<'a> { - pub fn new(documents_sender: &'a DocumentsSender<'a>) -> Self { - Self { documents_sender } + pub fn new(document_sender: &'a DocumentsSender<'a>, embedders: &'a EmbeddingConfigs) -> Self { + Self { document_sender, embedders } } } +#[derive(Default)] +pub struct DocumentExtractorData { + pub docids_delta: DelAddRoaringBitmap, + pub field_distribution_delta: HashMap, +} + impl<'a, 'extractor> Extractor<'extractor> for DocumentsExtractor<'a> { - type Data = FullySend>; + type Data = FullySend>; fn init_data(&self, _extractor_alloc: &'extractor Bump) -> Result { - Ok(FullySend(RefCell::new(DelAddRoaringBitmap::empty()))) + Ok(FullySend(Default::default())) } - fn process( + fn process<'doc>( &self, - change: DocumentChange, + changes: impl Iterator>>, context: &DocumentChangeContext, ) -> Result<()> { - let mut document_buffer = Vec::new(); - let mut delta_documents_ids = context.data.0.borrow_mut_or_yield(); + let mut document_buffer = bumpalo::collections::Vec::new_in(&context.doc_alloc); + let mut document_extractor_data = context.data.0.borrow_mut_or_yield(); - let new_fields_ids_map = context.new_fields_ids_map.borrow_or_yield(); - let new_fields_ids_map = &*new_fields_ids_map; - let new_fields_ids_map = new_fields_ids_map.local_map(); + let mut new_fields_ids_map = context.new_fields_ids_map.borrow_mut_or_yield(); - let external_docid = change.external_docid().to_owned(); + for change in changes { + let change = change?; + let external_docid = change.external_docid().to_owned(); - // document but we need to create a function that collects and compresses documents. 
- match change { - DocumentChange::Deletion(deletion) => { - let docid = deletion.docid(); - self.documents_sender.delete(docid, external_docid).unwrap(); - delta_documents_ids.insert_del_u32(docid); - } - /// TODO: change NONE by SOME(vector) when implemented - DocumentChange::Update(update) => { - let docid = update.docid(); - let content = - update.new(&context.txn, context.index, &context.db_fields_ids_map)?; - let content = - write_to_obkv(&content, None, new_fields_ids_map, &mut document_buffer)?; - self.documents_sender.uncompressed(docid, external_docid, content).unwrap(); - } - DocumentChange::Insertion(insertion) => { - let docid = insertion.docid(); - let content = insertion.new(); - let content = - write_to_obkv(&content, None, new_fields_ids_map, &mut document_buffer)?; - self.documents_sender.uncompressed(docid, external_docid, content).unwrap(); - delta_documents_ids.insert_add_u32(docid); - // extracted_dictionary_sender.send(self, dictionary: &[u8]); + // document but we need to create a function that collects and compresses documents. + match change { + DocumentChange::Deletion(deletion) => { + let docid = deletion.docid(); + let content = deletion.current( + &context.txn, + context.index, + &context.db_fields_ids_map, + )?; + for res in content.iter_top_level_fields() { + let (f, _) = res?; + let entry = document_extractor_data + .field_distribution_delta + .entry_ref(f) + .or_default(); + *entry -= 1; + } + document_extractor_data.docids_delta.insert_del_u32(docid); + self.document_sender.delete(docid, external_docid).unwrap(); + } + DocumentChange::Update(update) => { + let docid = update.docid(); + let content = + update.current(&context.txn, context.index, &context.db_fields_ids_map)?; + for res in content.iter_top_level_fields() { + let (f, _) = res?; + let entry = document_extractor_data + .field_distribution_delta + .entry_ref(f) + .or_default(); + *entry -= 1; + } + let content = update.updated(); + for res in content.iter_top_level_fields() { + let (f, _) = res?; + let entry = document_extractor_data + .field_distribution_delta + .entry_ref(f) + .or_default(); + *entry += 1; + } + + let content = + update.merged(&context.txn, context.index, &context.db_fields_ids_map)?; + let vector_content = update.merged_vectors( + &context.txn, + context.index, + &context.db_fields_ids_map, + &context.doc_alloc, + self.embedders, + )?; + let content = write_to_obkv( + &content, + vector_content.as_ref(), + &mut new_fields_ids_map, + &mut document_buffer, + )?; + self.document_sender.uncompressed(docid, external_docid, content).unwrap(); + } + DocumentChange::Insertion(insertion) => { + let docid = insertion.docid(); + let content = insertion.inserted(); + for res in content.iter_top_level_fields() { + let (f, _) = res?; + let entry = document_extractor_data + .field_distribution_delta + .entry_ref(f) + .or_default(); + *entry += 1; + } + let inserted_vectors = + insertion.inserted_vectors(&context.doc_alloc, self.embedders)?; + let content = write_to_obkv( + &content, + inserted_vectors.as_ref(), + &mut new_fields_ids_map, + &mut document_buffer, + )?; + document_extractor_data.docids_delta.insert_add_u32(docid); + self.document_sender.uncompressed(docid, external_docid, content).unwrap(); + // extracted_dictionary_sender.send(self, dictionary: &[u8]); + } } } + Ok(()) } } diff --git a/crates/milli/src/update/new/extract/faceted/extract_facets.rs b/crates/milli/src/update/new/extract/faceted/extract_facets.rs index 108e4d422..1aaae1cb8 100644 --- 
a/crates/milli/src/update/new/extract/faceted/extract_facets.rs +++ b/crates/milli/src/update/new/extract/faceted/extract_facets.rs @@ -228,7 +228,7 @@ impl DocidsExtractor for FacetedDocidsExtractor { grenad_parameters: GrenadParameters, document_changes: &DC, indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>, - extractor_allocs: &mut ThreadLocal>, + extractor_allocs: &'extractor mut ThreadLocal>, finished_steps: u16, total_steps: u16, step_name: &'static str, diff --git a/crates/milli/src/update/new/extract/mod.rs b/crates/milli/src/update/new/extract/mod.rs index 7f6b72c93..af6a29d07 100644 --- a/crates/milli/src/update/new/extract/mod.rs +++ b/crates/milli/src/update/new/extract/mod.rs @@ -14,7 +14,7 @@ pub use vectors::EmbeddingExtractor; use super::indexer::document_changes::{ DocumentChanges, FullySend, IndexingContext, Progress, ThreadLocal, }; -use crate::update::{GrenadParameters, MergeDeladdCboRoaringBitmaps}; +use crate::update::GrenadParameters; use crate::Result; pub trait DocidsExtractor { @@ -26,7 +26,7 @@ pub trait DocidsExtractor { finished_steps: u16, total_steps: u16, step_name: &'static str, - ) -> Result> + ) -> Result>> where MSP: Fn() -> bool + Sync, SP: Fn(Progress) + Sync; diff --git a/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs b/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs index 23bca784f..cadea7251 100644 --- a/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs +++ b/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs @@ -11,8 +11,8 @@ use super::tokenize_document::{tokenizer_builder, DocumentTokenizer}; use crate::update::new::extract::cache::BalancedCaches; use crate::update::new::extract::perm_json_p::contained_in; use crate::update::new::indexer::document_changes::{ - for_each_document_change, DocumentChangeContext, DocumentChanges, Extractor, FullySend, - IndexingContext, MostlySend, RefCellExt, ThreadLocal, + extract, DocumentChangeContext, DocumentChanges, Extractor, FullySend, IndexingContext, + MostlySend, Progress, RefCellExt, ThreadLocal, }; use crate::update::new::DocumentChange; use crate::update::GrenadParameters; @@ -218,24 +218,44 @@ impl<'a, 'extractor> Extractor<'extractor> for WordDocidsExtractorData<'a> { )))) } - fn process( + fn process<'doc>( &self, - change: DocumentChange, + changes: impl Iterator>>, context: &DocumentChangeContext, ) -> Result<()> { - WordDocidsExtractors::extract_document_change(context, self.tokenizer, change) + for change in changes { + let change = change?; + WordDocidsExtractors::extract_document_change(context, self.tokenizer, change)?; + } + Ok(()) } } pub struct WordDocidsExtractors; impl WordDocidsExtractors { - pub fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>>( + pub fn run_extraction< + 'pl, + 'fid, + 'indexer, + 'index, + 'extractor, + DC: DocumentChanges<'pl>, + MSP, + SP, + >( grenad_parameters: GrenadParameters, document_changes: &DC, - indexing_context: IndexingContext<'fid, 'indexer, 'index>, + indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>, extractor_allocs: &'extractor mut ThreadLocal>, - ) -> Result> { + finished_steps: u16, + total_steps: u16, + step_name: &'static str, + ) -> Result> + where + MSP: Fn() -> bool + Sync, + SP: Fn(Progress) + Sync, + { let index = indexing_context.index; let rtxn = index.read_txn()?; @@ -279,12 +299,15 @@ impl WordDocidsExtractors { buckets: rayon::current_num_threads(), }; - for_each_document_change( + 
extract( document_changes, &extractor, indexing_context, extractor_allocs, &datastore, + finished_steps, + total_steps, + step_name, )?; } @@ -358,7 +381,7 @@ impl WordDocidsExtractors { ) }; document_tokenizer.tokenize_document( - inner.new(rtxn, index, context.db_fields_ids_map)?, + inner.merged(rtxn, index, context.db_fields_ids_map)?, new_fields_ids_map, &mut token_fn, )?; @@ -375,7 +398,7 @@ impl WordDocidsExtractors { ) }; document_tokenizer.tokenize_document( - inner.new(), + inner.inserted(), new_fields_ids_map, &mut token_fn, )?; diff --git a/crates/milli/src/update/new/extract/vectors/mod.rs b/crates/milli/src/update/new/extract/vectors/mod.rs index 70bd4d42d..a5cf915e4 100644 --- a/crates/milli/src/update/new/extract/vectors/mod.rs +++ b/crates/milli/src/update/new/extract/vectors/mod.rs @@ -8,7 +8,7 @@ use super::cache::DelAddRoaringBitmap; use crate::error::FaultSource; use crate::prompt::Prompt; use crate::update::new::channel::EmbeddingSender; -use crate::update::new::indexer::document_changes::{Extractor, FullySend}; +use crate::update::new::indexer::document_changes::{Extractor, MostlySend}; use crate::update::new::vector_document::VectorDocument; use crate::update::new::DocumentChange; use crate::vector::error::{ @@ -36,15 +36,17 @@ impl<'a> EmbeddingExtractor<'a> { } } -impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { - type Data = FullySend>>; +pub struct EmbeddingExtractorData<'extractor>( + pub HashMap, +); - fn init_data<'doc>( - &'doc self, - _extractor_alloc: raw_collections::alloc::RefBump<'extractor>, - ) -> crate::Result { - /// TODO: use the extractor_alloc in the hashbrown once you merge the branch where it is no longer a RefBump - Ok(FullySend(Default::default())) +unsafe impl MostlySend for EmbeddingExtractorData<'_> {} + +impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { + type Data = RefCell>; + + fn init_data<'doc>(&'doc self, extractor_alloc: &'extractor Bump) -> crate::Result { + Ok(RefCell::new(EmbeddingExtractorData(HashMap::new_in(extractor_alloc)))) } fn process<'doc>( @@ -72,7 +74,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { embedder_id, embedder_name, prompt, - &context.data.0, + context.data, &self.possible_embedding_mistakes, self.threads, self.sender, @@ -252,7 +254,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { // Currently this is the case as: // 1. BVec are inside of the bumaplo // 2. All other fields are either trivial (u8) or references. 
-struct Chunks<'a> { +struct Chunks<'a, 'extractor> { texts: BVec<'a, &'a str>, ids: BVec<'a, DocumentId>, @@ -261,19 +263,19 @@ struct Chunks<'a> { embedder_name: &'a str, prompt: &'a Prompt, possible_embedding_mistakes: &'a PossibleEmbeddingMistakes, - user_provided: &'a RefCell>, + user_provided: &'a RefCell>, threads: &'a ThreadPoolNoAbort, sender: &'a EmbeddingSender<'a>, } -impl<'a> Chunks<'a> { +impl<'a, 'extractor> Chunks<'a, 'extractor> { #[allow(clippy::too_many_arguments)] pub fn new( embedder: &'a Embedder, embedder_id: u8, embedder_name: &'a str, prompt: &'a Prompt, - user_provided: &'a RefCell>, + user_provided: &'a RefCell>, possible_embedding_mistakes: &'a PossibleEmbeddingMistakes, threads: &'a ThreadPoolNoAbort, sender: &'a EmbeddingSender<'a>, @@ -417,7 +419,7 @@ impl<'a> Chunks<'a> { fn set_regenerate(&self, docid: DocumentId, regenerate: bool) { let mut user_provided = self.user_provided.borrow_mut(); - let user_provided = user_provided.entry_ref(self.embedder_name).or_default(); + let user_provided = user_provided.0.entry_ref(self.embedder_name).or_default(); if regenerate { // regenerate == !user_provided user_provided.del.get_or_insert(Default::default()).insert(docid); diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 430313fbd..7688f29da 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -1,17 +1,17 @@ use std::cmp::Ordering; -use std::sync::RwLock; +use std::sync::{OnceLock, RwLock}; use std::thread::{self, Builder}; use big_s::S; -use document_changes::{ - for_each_document_change, DocumentChanges, FullySend, IndexingContext, ThreadLocal, -}; +use document_changes::{extract, DocumentChanges, IndexingContext, Progress, ThreadLocal}; pub use document_deletion::DocumentDeletion; pub use document_operation::DocumentOperation; +use hashbrown::HashMap; use heed::types::{Bytes, DecodeIgnore, Str}; use heed::{RoTxn, RwTxn}; use itertools::{merge_join_by, EitherOrBoth}; pub use partial_dump::PartialDump; +use rand::SeedableRng as _; use rayon::ThreadPool; use time::OffsetDateTime; pub use update_by_function::UpdateByFunction; @@ -19,37 +19,100 @@ pub use update_by_function::UpdateByFunction; use super::channel::*; use super::extract::*; use super::facet_search_builder::FacetSearchBuilder; -use super::merger::{FacetDatabases, FacetFieldIdsDelta}; -use super::word_fst_builder::PrefixDelta; +use super::merger::FacetFieldIdsDelta; +use super::word_fst_builder::{PrefixData, PrefixDelta, WordFstBuilder}; use super::words_prefix_docids::{ compute_word_prefix_docids, compute_word_prefix_fid_docids, compute_word_prefix_position_docids, }; use super::{StdResult, TopLevelMap}; use crate::documents::{PrimaryKey, DEFAULT_PRIMARY_KEY}; use crate::facet::FacetType; +use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder}; use crate::index::main_key::{WORDS_FST_KEY, WORDS_PREFIXES_FST_KEY}; use crate::proximity::ProximityPrecision; use crate::update::del_add::DelAdd; -use crate::update::new::word_fst_builder::{PrefixData, WordFstBuilder}; +use crate::update::new::extract::EmbeddingExtractor; use crate::update::new::words_prefix_docids::compute_exact_word_prefix_docids; -use crate::update::new::{merge_and_send_docids, merge_and_send_facet_docids}; +use crate::update::new::{merge_and_send_docids, merge_and_send_facet_docids, FacetDatabases}; use crate::update::settings::InnerIndexSettings; use crate::update::{FacetsUpdateBulk, GrenadParameters}; -use 
crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, Result, UserError}; +use crate::vector::{ArroyWrapper, EmbeddingConfigs, Embeddings}; +use crate::{ + FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, Result, ThreadPoolNoAbort, + ThreadPoolNoAbortBuilder, UserError, +}; -pub mod de; +pub(crate) mod de; pub mod document_changes; mod document_deletion; mod document_operation; mod partial_dump; mod update_by_function; +mod steps { + pub const STEPS: &[&str] = &[ + "extracting documents", + "extracting facets", + "extracting words", + "extracting word proximity", + "extracting embeddings", + "writing to database", + "post-processing facets", + "post-processing words", + "finalizing", + ]; + + const fn step(step: u16) -> (u16, &'static str) { + (step, STEPS[step as usize]) + } + + pub const fn total_steps() -> u16 { + STEPS.len() as u16 + } + + pub const fn extract_documents() -> (u16, &'static str) { + step(0) + } + + pub const fn extract_facets() -> (u16, &'static str) { + step(1) + } + + pub const fn extract_words() -> (u16, &'static str) { + step(2) + } + + pub const fn extract_word_proximity() -> (u16, &'static str) { + step(3) + } + + pub const fn extract_embeddings() -> (u16, &'static str) { + step(4) + } + + pub const fn write_db() -> (u16, &'static str) { + step(5) + } + + pub const fn post_processing_facets() -> (u16, &'static str) { + step(6) + } + pub const fn post_processing_words() -> (u16, &'static str) { + step(7) + } + + pub const fn finalizing() -> (u16, &'static str) { + step(8) + } +} + /// This is the main function of this crate. /// /// Give it the output of the [`Indexer::document_changes`] method and it will execute it in the [`rayon::ThreadPool`]. /// /// TODO return stats -pub fn index<'pl, 'indexer, 'index, DC>( +#[allow(clippy::too_many_arguments)] // clippy: 😝 +pub fn index<'pl, 'indexer, 'index, DC, MSP, SP>( wtxn: &mut RwTxn, index: &'index Index, db_fields_ids_map: &'indexer FieldsIdsMap, @@ -57,15 +120,23 @@ pub fn index<'pl, 'indexer, 'index, DC>( new_primary_key: Option>, pool: &ThreadPool, document_changes: &DC, + embedders: EmbeddingConfigs, + must_stop_processing: &'indexer MSP, + send_progress: &'indexer SP, ) -> Result<()> where DC: DocumentChanges<'pl>, + MSP: Fn() -> bool + Sync, + SP: Fn(Progress) + Sync, { - // TODO find a better channel limit let (extractor_sender, writer_receiver) = extractor_writer_channel(10_000); + + let metadata_builder = MetadataBuilder::from_index(index, wtxn)?; + + let new_fields_ids_map = FieldIdMapWithMetadata::new(new_fields_ids_map, metadata_builder); + let new_fields_ids_map = RwLock::new(new_fields_ids_map); - let global_fields_ids_map = GlobalFieldsIdsMap::new(&new_fields_ids_map); let fields_ids_map_store = ThreadLocal::with_capacity(pool.current_num_threads()); let mut extractor_allocs = ThreadLocal::with_capacity(pool.current_num_threads()); let doc_allocs = ThreadLocal::with_capacity(pool.current_num_threads()); @@ -76,46 +147,69 @@ where new_fields_ids_map: &new_fields_ids_map, doc_allocs: &doc_allocs, fields_ids_map_store: &fields_ids_map_store, + must_stop_processing, + send_progress, }; - thread::scope(|s| -> crate::Result<_> { + let total_steps = steps::total_steps(); + + let mut field_distribution = index.field_distribution(wtxn)?; + let mut document_ids = index.documents_ids(wtxn)?; + + thread::scope(|s| -> Result<()> { let indexer_span = tracing::Span::current(); + let embedders = &embedders; + // prevent moving the field_distribution and document_ids in the inner closure... 
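 // Reborrowing as `&mut` lets the `move` closure capture references rather than the owned
 // values, so both can still be written to the index after the scope ends.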
+ let field_distribution = &mut field_distribution; + let document_ids = &mut document_ids; // TODO manage the errors correctly let extractor_handle = Builder::new().name(S("indexer-extractors")).spawn_scoped(s, move || { pool.in_place_scope(|_s| { - let span = tracing::trace_span!(target: "indexing::documents", parent: &indexer_span, "extract"); - let _entered = span.enter(); + let span = tracing::trace_span!(target: "indexing::documents", parent: &indexer_span, "extract"); + let _entered = span.enter(); - // document but we need to create a function that collects and compresses documents. - let rtxn = index.read_txn().unwrap(); - let document_sender = extractor_sender.documents(); - let document_extractor = DocumentsExtractor::new(&document_sender); - let datastore = ThreadLocal::with_capacity(pool.current_num_threads()); - for_each_document_change(document_changes, &document_extractor, indexing_context, &mut extractor_allocs, &datastore)?; + let rtxn = index.read_txn()?; - let mut documents_ids = index.documents_ids(&rtxn)?; - let delta_documents_ids = datastore.into_iter().map(|FullySend(d)| d.into_inner()).reduce(DelAddRoaringBitmap::merge).unwrap_or_default(); - delta_documents_ids.apply_to(&mut documents_ids); - extractor_sender.send_documents_ids(documents_ids).unwrap(); + // document but we need to create a function that collects and compresses documents. + let document_sender = extractor_sender.documents(); + let document_extractor = DocumentsExtractor::new(&document_sender, embedders); + let datastore = ThreadLocal::with_capacity(pool.current_num_threads()); + let (finished_steps, step_name) = steps::extract_documents(); + extract(document_changes, &document_extractor, indexing_context, &mut extractor_allocs, &datastore, finished_steps, total_steps, step_name)?; - // document_sender.finish().unwrap(); - const TEN_GIB: usize = 10 * 1024 * 1024 * 1024; - let current_num_threads = rayon::current_num_threads(); - let max_memory = TEN_GIB / current_num_threads; - eprintln!("A maximum of {max_memory} bytes will be used for each of the {current_num_threads} threads"); - let grenad_parameters = GrenadParameters { - max_memory: Some(max_memory), - ..GrenadParameters::default() - }; + for document_extractor_data in datastore { + let document_extractor_data = document_extractor_data.0.into_inner(); + for (field, delta) in document_extractor_data.field_distribution_delta { + let current = field_distribution.entry(field).or_default(); + // adding the delta should never cause a negative result, as we are removing fields that previously existed. 
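 // (`saturating_add_signed` clamps at zero rather than wrapping if a negative delta ever exceeds the current count.)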
+ *current = current.saturating_add_signed(delta); + } + document_extractor_data.docids_delta.apply_to(document_ids); + } - let facet_field_ids_delta; + field_distribution.retain(|_, v| *v == 0); + + const TEN_GIB: usize = 10 * 1024 * 1024 * 1024; + let current_num_threads = rayon::current_num_threads(); + let max_memory = TEN_GIB / current_num_threads; + eprintln!("A maximum of {max_memory} bytes will be used for each of the {current_num_threads} threads"); + + let grenad_parameters = GrenadParameters { + max_memory: Some(max_memory), + ..GrenadParameters::default() + }; + + let facet_field_ids_delta; { let span = tracing::trace_span!(target: "indexing::documents::extract", "faceted"); let _entered = span.enter(); + + let (finished_steps, step_name) = steps::extract_facets(); + facet_field_ids_delta = merge_and_send_facet_docids( - FacetedDocidsExtractor::run_extraction(grenad_parameters, document_changes, indexing_context, &mut extractor_allocs)?, + FacetedDocidsExtractor::run_extraction(grenad_parameters, document_changes, indexing_context, &mut extractor_allocs, finished_steps, total_steps, step_name)?, FacetDatabases::new(index), index, extractor_sender.facet_docids(), @@ -125,6 +219,7 @@ where { let span = tracing::trace_span!(target: "indexing::documents::extract", "word_docids"); let _entered = span.enter(); + let (finished_steps, step_name) = steps::extract_words(); let WordDocidsCaches { word_docids, @@ -132,7 +227,7 @@ where exact_word_docids, word_position_docids, fid_word_count_docids, - } = WordDocidsExtractors::run_extraction(grenad_parameters, document_changes, indexing_context, &mut extractor_allocs)?; + } = WordDocidsExtractors::run_extraction(grenad_parameters, document_changes, indexing_context, &mut extractor_allocs, finished_steps, total_steps, step_name)?; // TODO Word Docids Merger // extractor_sender.send_searchable::(word_docids).unwrap(); @@ -206,7 +301,10 @@ where if proximity_precision == ProximityPrecision::ByWord { let span = tracing::trace_span!(target: "indexing::documents::extract", "word_pair_proximity_docids"); let _entered = span.enter(); - let caches = ::run_extraction(grenad_parameters, document_changes, indexing_context, &mut extractor_allocs)?; + + let (finished_steps, step_name) = steps::extract_word_proximity(); + + let caches = ::run_extraction(grenad_parameters, document_changes, indexing_context, &mut extractor_allocs, finished_steps, total_steps, step_name)?; merge_and_send_docids( caches, index.word_pair_proximity_docids.remap_types(), @@ -215,62 +313,212 @@ where )?; } - { - let span = tracing::trace_span!(target: "indexing::documents::extract", "FINISH"); - let _entered = span.enter(); - } + 'vectors: { + let span = tracing::trace_span!(target: "indexing::documents::extract", "vectors"); + let _entered = span.enter(); - // TODO THIS IS TOO MUCH - // - [ ] Extract fieldid docid facet number - // - [ ] Extract fieldid docid facet string - // - [ ] Extract facetid string fst - // - [ ] Extract facetid normalized string strings + let index_embeddings = index.embedding_configs(&rtxn)?; + if index_embeddings.is_empty() { + break 'vectors; + } - // TODO Inverted Indexes again - // - [x] Extract fieldid facet isempty docids - // - [x] Extract fieldid facet isnull docids - // - [x] Extract fieldid facet exists docids + let embedding_sender = extractor_sender.embeddings(); + let extractor = EmbeddingExtractor::new(embedders, &embedding_sender, field_distribution, request_threads()); + let datastore = 
ThreadLocal::with_capacity(pool.current_num_threads()); + let (finished_steps, step_name) = steps::extract_embeddings(); - // TODO This is the normal system - // - [x] Extract fieldid facet number docids - // - [x] Extract fieldid facet string docids - // TODO use None when needed - Result::Ok(facet_field_ids_delta) - }) + extract(document_changes, &extractor, indexing_context, &mut extractor_allocs, &datastore, finished_steps, total_steps, step_name)?; + + + let mut user_provided = HashMap::new(); + for data in datastore { + let data = data.into_inner().0; + for (embedder, deladd) in data.into_iter() { + let user_provided = user_provided.entry(embedder).or_insert(Default::default()); + if let Some(del) = deladd.del { + *user_provided -= del; + } + if let Some(add) = deladd.add { + *user_provided |= add; + } + } + } + + embedding_sender.finish(user_provided).unwrap(); + } + + { + let span = tracing::trace_span!(target: "indexing::documents::extract", "FINISH"); + let _entered = span.enter(); + let (finished_steps, step_name) = steps::write_db(); + (indexing_context.send_progress)(Progress { finished_steps, total_steps, step_name, finished_total_documents: None }); + } + + // TODO THIS IS TOO MUCH + // - [ ] Extract fieldid docid facet number + // - [ ] Extract fieldid docid facet string + // - [ ] Extract facetid string fst + // - [ ] Extract facetid normalized string strings + + // TODO Inverted Indexes again + // - [x] Extract fieldid facet isempty docids + // - [x] Extract fieldid facet isnull docids + // - [x] Extract fieldid facet exists docids + + // TODO This is the normal system + // - [x] Extract fieldid facet number docids + // - [x] Extract fieldid facet string docids + + Result::Ok(facet_field_ids_delta) + }) })?; + let global_fields_ids_map = GlobalFieldsIdsMap::new(&new_fields_ids_map); + + let indexer_span = tracing::Span::current(); + + let vector_arroy = index.vector_arroy; + let mut rng = rand::rngs::StdRng::seed_from_u64(42); + let indexer_span = tracing::Span::current(); + let arroy_writers: Result> = embedders + .inner_as_ref() + .iter() + .map(|(embedder_name, (embedder, _, was_quantized))| { + let embedder_index = index.embedder_category_id.get(wtxn, embedder_name)?.ok_or( + InternalError::DatabaseMissingEntry { + db_name: "embedder_category_id", + key: None, + }, + )?; + + let dimensions = embedder.dimensions(); + let writer = ArroyWrapper::new(vector_arroy, embedder_index, *was_quantized); + + Ok(( + embedder_index, + (embedder_name.as_str(), embedder.as_ref(), writer, dimensions), + )) + }) + .collect(); + + let mut arroy_writers = arroy_writers?; for operation in writer_receiver { - let database = operation.database(index); - match operation.entry() { - EntryOperation::Delete(e) => { - if !database.delete(wtxn, e.entry())? { - unreachable!("We tried to delete an unknown key") + match operation { + WriterOperation::DbOperation(db_operation) => { + let database = db_operation.database(index); + match db_operation.entry() { + EntryOperation::Delete(e) => { + if !database.delete(wtxn, e.entry())? 
{ + unreachable!("We tried to delete an unknown key") + } + } + EntryOperation::Write(e) => database.put(wtxn, e.key(), e.value())?, } } - EntryOperation::Write(e) => database.put(wtxn, e.key(), e.value())?, + WriterOperation::ArroyOperation(arroy_operation) => match arroy_operation { + ArroyOperation::DeleteVectors { docid } => { + for (_embedder_index, (_embedder_name, _embedder, writer, dimensions)) in + &mut arroy_writers + { + let dimensions = *dimensions; + writer.del_items(wtxn, dimensions, docid)?; + } + } + ArroyOperation::SetVectors { + docid, + embedder_id, + embeddings: raw_embeddings, + } => { + let (_, _, writer, dimensions) = + arroy_writers.get(&embedder_id).expect("requested a missing embedder"); + // TODO: switch to Embeddings + let mut embeddings = Embeddings::new(*dimensions); + for embedding in raw_embeddings { + embeddings.append(embedding).unwrap(); + } + + writer.del_items(wtxn, *dimensions, docid)?; + writer.add_items(wtxn, docid, &embeddings)?; + } + ArroyOperation::SetVector { docid, embedder_id, embedding } => { + let (_, _, writer, dimensions) = + arroy_writers.get(&embedder_id).expect("requested a missing embedder"); + writer.del_items(wtxn, *dimensions, docid)?; + writer.add_item(wtxn, docid, &embedding)?; + } + ArroyOperation::Finish { mut user_provided } => { + let span = tracing::trace_span!(target: "indexing::vectors", parent: &indexer_span, "build"); + let _entered = span.enter(); + for (_embedder_index, (_embedder_name, _embedder, writer, dimensions)) in + &mut arroy_writers + { + let dimensions = *dimensions; + writer.build_and_quantize( + wtxn, + &mut rng, + dimensions, + false, + &indexing_context.must_stop_processing, + )?; + } + + let mut configs = index.embedding_configs(wtxn)?; + + for config in &mut configs { + if let Some(user_provided) = user_provided.remove(&config.name) { + config.user_provided = user_provided; + } + } + + index.put_embedding_configs(wtxn, configs)?; + } + }, } } - /// TODO handle the panicking threads let facet_field_ids_delta = extractor_handle.join().unwrap()?; + let (finished_steps, step_name) = steps::post_processing_facets(); + (indexing_context.send_progress)(Progress { + finished_steps, + total_steps, + step_name, + finished_total_documents: None, + }); + + compute_facet_level_database(index, wtxn, facet_field_ids_delta)?; + + compute_facet_search_database(index, wtxn, global_fields_ids_map)?; + + let (finished_steps, step_name) = steps::post_processing_words(); + (indexing_context.send_progress)(Progress { + finished_steps, + total_steps, + step_name, + finished_total_documents: None, + }); + if let Some(prefix_delta) = compute_word_fst(index, wtxn)? 
{ compute_prefix_database(index, wtxn, prefix_delta)?; } - compute_facet_search_database(index, wtxn, global_fields_ids_map)?; + let (finished_steps, step_name) = steps::finalizing(); + (indexing_context.send_progress)(Progress { + finished_steps, + total_steps, + step_name, + finished_total_documents: None, + }); - compute_facet_level_database(index, wtxn, facet_field_ids_delta)?; - - Result::Ok(()) + Ok(()) as Result<_> })?; // required to into_inner the new_fields_ids_map drop(fields_ids_map_store); - let fields_ids_map = new_fields_ids_map.into_inner().unwrap(); - index.put_fields_ids_map(wtxn, &fields_ids_map)?; + let new_fields_ids_map = new_fields_ids_map.into_inner().unwrap(); + index.put_fields_ids_map(wtxn, new_fields_ids_map.as_fields_ids_map())?; if let Some(new_primary_key) = new_primary_key { index.put_primary_key(wtxn, new_primary_key.name())?; @@ -280,7 +528,8 @@ where let mut inner_index_settings = InnerIndexSettings::from_index(index, wtxn)?; inner_index_settings.recompute_facets(wtxn, index)?; inner_index_settings.recompute_searchables(wtxn, index)?; - + index.put_field_distribution(wtxn, &field_distribution)?; + index.put_documents_ids(wtxn, &document_ids)?; index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?; Ok(()) @@ -517,3 +766,15 @@ pub fn retrieve_or_guess_primary_key<'a>( Err(err) => Ok(Err(err)), } } + +fn request_threads() -> &'static ThreadPoolNoAbort { + static REQUEST_THREADS: OnceLock = OnceLock::new(); + + REQUEST_THREADS.get_or_init(|| { + ThreadPoolNoAbortBuilder::new() + .num_threads(crate::vector::REQUEST_PARALLELISM) + .thread_name(|index| format!("embedding-request-{index}")) + .build() + .unwrap() + }) +} diff --git a/crates/milli/src/update/new/vector_document.rs b/crates/milli/src/update/new/vector_document.rs index 6796134db..dc73c5268 100644 --- a/crates/milli/src/update/new/vector_document.rs +++ b/crates/milli/src/update/new/vector_document.rs @@ -14,7 +14,7 @@ use crate::index::IndexEmbeddingConfig; use crate::vector::parsed_vectors::{ RawVectors, VectorOrArrayOfVectors, RESERVED_VECTORS_FIELD_NAME, }; -use crate::vector::{Embedding, EmbeddingConfigs}; +use crate::vector::{ArroyWrapper, Embedding, EmbeddingConfigs}; use crate::{DocumentId, Index, InternalError, Result, UserError}; #[derive(Serialize)] @@ -117,16 +117,10 @@ impl<'t> VectorDocumentFromDb<'t> { embedder_id: u8, config: &IndexEmbeddingConfig, ) -> Result> { - let readers = self.index.arroy_readers(self.rtxn, embedder_id, config.config.quantized()); - let mut vectors = Vec::new(); - for reader in readers { - let reader = reader?; - let Some(vector) = reader.item_vector(self.rtxn, self.docid)? 
else { - break; - }; + let reader = + ArroyWrapper::new(self.index.vector_arroy, embedder_id, config.config.quantized()); + let vectors = reader.item_vectors(self.rtxn, self.docid)?; - vectors.push(vector); - } Ok(VectorEntry { has_configured_embedder: true, embeddings: Some(Embeddings::FromDb(vectors)), diff --git a/crates/milli/src/vector/error.rs b/crates/milli/src/vector/error.rs index d5e0697d6..41765f6ab 100644 --- a/crates/milli/src/vector/error.rs +++ b/crates/milli/src/vector/error.rs @@ -454,7 +454,7 @@ impl UnusedVectorsDistribution { } pub struct UnusedVectorsDistributionBump<'doc>( - hashbrown::HashMap<&'doc str, u64, hashbrown::hash_map::DefaultHashBuilder, &'doc Bump>, + hashbrown::HashMap<&'doc str, u64, hashbrown::DefaultHashBuilder, &'doc Bump>, ); impl<'doc> UnusedVectorsDistributionBump<'doc> { diff --git a/milli/src/update/new/indexer/mod.rs b/milli/src/update/new/indexer/mod.rs deleted file mode 100644 index 3bee9904f..000000000 --- a/milli/src/update/new/indexer/mod.rs +++ /dev/null @@ -1,823 +0,0 @@ -use std::cell::RefCell; -use std::sync::{OnceLock, RwLock}; -use std::thread::{self, Builder}; - -use big_s::S; -use bumpalo::Bump; -use document_changes::{ - extract, DocumentChangeContext, DocumentChanges, Extractor, FullySend, IndexingContext, - Progress, RefCellExt, ThreadLocal, -}; -pub use document_deletion::DocumentDeletion; -pub use document_operation::DocumentOperation; -use hashbrown::HashMap; -use heed::{RoTxn, RwTxn}; -use itertools::{EitherOrBoth, Itertools}; -pub use partial_dump::PartialDump; -use rand::SeedableRng as _; -use rayon::ThreadPool; -use time::OffsetDateTime; -pub use update_by_function::UpdateByFunction; - -use super::channel::*; -use super::document::{write_to_obkv, Document}; -use super::document_change::DocumentChange; -use super::extract::*; -use super::merger::{merge_grenad_entries, FacetFieldIdsDelta}; -use super::word_fst_builder::PrefixDelta; -use super::words_prefix_docids::{ - compute_word_prefix_docids, compute_word_prefix_fid_docids, compute_word_prefix_position_docids, -}; -use super::{StdResult, TopLevelMap}; -use crate::documents::{PrimaryKey, DEFAULT_PRIMARY_KEY}; -use crate::facet::FacetType; -use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder}; -use crate::proximity::ProximityPrecision; -use crate::update::new::channel::ExtractorSender; -use crate::update::new::extract::EmbeddingExtractor; -use crate::update::new::words_prefix_docids::compute_exact_word_prefix_docids; -use crate::update::settings::InnerIndexSettings; -use crate::update::{FacetsUpdateBulk, GrenadParameters}; -use crate::vector::{ArroyWrapper, EmbeddingConfigs}; -use crate::{ - FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, Result, ThreadPoolNoAbort, - ThreadPoolNoAbortBuilder, UserError, -}; - -pub(crate) mod de; -pub mod document_changes; -mod document_deletion; -mod document_operation; -mod partial_dump; -mod update_by_function; - -struct DocumentExtractor<'a> { - document_sender: &'a DocumentSender<'a>, - embedders: &'a EmbeddingConfigs, -} - -impl<'a, 'extractor> Extractor<'extractor> for DocumentExtractor<'a> { - type Data = FullySend>>; - - fn init_data( - &self, - _extractor_alloc: raw_collections::alloc::RefBump<'extractor>, - ) -> Result { - Ok(FullySend(Default::default())) - } - - fn process<'doc>( - &self, - changes: impl Iterator>>, - context: &DocumentChangeContext, - ) -> Result<()> { - let mut document_buffer = Vec::new(); - let mut field_distribution_delta = context.data.0.borrow_mut_or_yield(); - - let mut 
new_fields_ids_map = context.new_fields_ids_map.borrow_mut_or_yield(); - - for change in changes { - let change = change?; - let external_docid = change.external_docid().to_owned(); - - // document but we need to create a function that collects and compresses documents. - match change { - DocumentChange::Deletion(deletion) => { - let docid = deletion.docid(); - let content = deletion.current( - &context.txn, - context.index, - &context.db_fields_ids_map, - )?; - for res in content.iter_top_level_fields() { - let (f, _) = res?; - let entry = field_distribution_delta.entry_ref(f).or_default(); - *entry -= 1; - } - self.document_sender.delete(docid, external_docid).unwrap(); - } - DocumentChange::Update(update) => { - let docid = update.docid(); - let content = - update.current(&context.txn, context.index, &context.db_fields_ids_map)?; - for res in content.iter_top_level_fields() { - let (f, _) = res?; - let entry = field_distribution_delta.entry_ref(f).or_default(); - *entry -= 1; - } - let content = update.updated(); - for res in content.iter_top_level_fields() { - let (f, _) = res?; - let entry = field_distribution_delta.entry_ref(f).or_default(); - *entry += 1; - } - - let content = - update.merged(&context.txn, context.index, &context.db_fields_ids_map)?; - let vector_content = update.merged_vectors( - &context.txn, - context.index, - &context.db_fields_ids_map, - &context.doc_alloc, - self.embedders, - )?; - let content = write_to_obkv( - &content, - vector_content.as_ref(), - &mut new_fields_ids_map, - &mut document_buffer, - )?; - self.document_sender.insert(docid, external_docid, content.boxed()).unwrap(); - } - DocumentChange::Insertion(insertion) => { - let docid = insertion.docid(); - let content = insertion.inserted(); - for res in content.iter_top_level_fields() { - let (f, _) = res?; - let entry = field_distribution_delta.entry_ref(f).or_default(); - *entry += 1; - } - let inserted_vectors = - insertion.inserted_vectors(&context.doc_alloc, self.embedders)?; - let content = write_to_obkv( - &content, - inserted_vectors.as_ref(), - &mut new_fields_ids_map, - &mut document_buffer, - )?; - self.document_sender.insert(docid, external_docid, content.boxed()).unwrap(); - // extracted_dictionary_sender.send(self, dictionary: &[u8]); - } - } - } - - Ok(()) - } -} - -mod steps { - pub const STEPS: &[&str] = &[ - "extracting documents", - "extracting facets", - "extracting words", - "extracting word proximity", - "extracting embeddings", - "writing to database", - "post-processing facets", - "post-processing words", - "finalizing", - ]; - - const fn step(step: u16) -> (u16, &'static str) { - (step, STEPS[step as usize]) - } - - pub const fn total_steps() -> u16 { - STEPS.len() as u16 - } - - pub const fn extract_documents() -> (u16, &'static str) { - step(0) - } - - pub const fn extract_facets() -> (u16, &'static str) { - step(1) - } - - pub const fn extract_words() -> (u16, &'static str) { - step(2) - } - - pub const fn extract_word_proximity() -> (u16, &'static str) { - step(3) - } - - pub const fn extract_embeddings() -> (u16, &'static str) { - step(4) - } - - pub const fn write_db() -> (u16, &'static str) { - step(5) - } - - pub const fn post_processing_facets() -> (u16, &'static str) { - step(6) - } - pub const fn post_processing_words() -> (u16, &'static str) { - step(7) - } - - pub const fn finalizing() -> (u16, &'static str) { - step(8) - } -} - -/// This is the main function of this crate. 
-/// -/// Give it the output of the [`Indexer::document_changes`] method and it will execute it in the [`rayon::ThreadPool`]. -/// -/// TODO return stats -#[allow(clippy::too_many_arguments)] // clippy: 😝 -pub fn index<'pl, 'indexer, 'index, DC, MSP, SP>( - wtxn: &mut RwTxn, - index: &'index Index, - db_fields_ids_map: &'indexer FieldsIdsMap, - new_fields_ids_map: FieldsIdsMap, - new_primary_key: Option>, - pool: &ThreadPool, - document_changes: &DC, - embedders: EmbeddingConfigs, - must_stop_processing: &'indexer MSP, - send_progress: &'indexer SP, -) -> Result<()> -where - DC: DocumentChanges<'pl>, - MSP: Fn() -> bool + Sync, - SP: Fn(Progress) + Sync, -{ - let (merger_sender, writer_receiver) = merger_writer_channel(10_000); - // This channel acts as a rendezvous point to ensure that we are one task ahead - let (extractor_sender, merger_receiver) = extractors_merger_channels(4); - - let metadata_builder = MetadataBuilder::from_index(index, wtxn)?; - - let new_fields_ids_map = FieldIdMapWithMetadata::new(new_fields_ids_map, metadata_builder); - - let new_fields_ids_map = RwLock::new(new_fields_ids_map); - - let fields_ids_map_store = ThreadLocal::with_capacity(pool.current_num_threads()); - let mut extractor_allocs = ThreadLocal::with_capacity(pool.current_num_threads()); - let doc_allocs = ThreadLocal::with_capacity(pool.current_num_threads()); - - let indexing_context = IndexingContext { - index, - db_fields_ids_map, - new_fields_ids_map: &new_fields_ids_map, - doc_allocs: &doc_allocs, - fields_ids_map_store: &fields_ids_map_store, - must_stop_processing, - send_progress, - }; - - let total_steps = steps::total_steps(); - - let mut field_distribution = index.field_distribution(wtxn)?; - - thread::scope(|s| -> Result<()> { - let indexer_span = tracing::Span::current(); - let embedders = &embedders; - // prevent moving the field_distribution in the inner closure... - let field_distribution = &mut field_distribution; - // TODO manage the errors correctly - let handle = Builder::new().name(S("indexer-extractors")).spawn_scoped(s, move || { - pool.in_place_scope(|_s| { - let span = tracing::trace_span!(target: "indexing::documents", parent: &indexer_span, "extract"); - let _entered = span.enter(); - - // document but we need to create a function that collects and compresses documents. - let document_sender = extractor_sender.document_sender(); - let document_extractor = DocumentExtractor { document_sender: &document_sender, embedders }; - let datastore = ThreadLocal::with_capacity(pool.current_num_threads()); - let (finished_steps, step_name) = steps::extract_documents(); - extract(document_changes, &document_extractor, indexing_context, &mut extractor_allocs, &datastore, finished_steps, total_steps, step_name)?; - - for field_distribution_delta in datastore { - let field_distribution_delta = field_distribution_delta.0.into_inner(); - for (field, delta) in field_distribution_delta { - let current = field_distribution.entry(field).or_default(); - // adding the delta should never cause a negative result, as we are removing fields that previously existed. 
- *current = current.saturating_add_signed(delta); - } - } - - field_distribution.retain(|_, v| *v == 0); - - document_sender.finish().unwrap(); - - const TEN_GIB: usize = 10 * 1024 * 1024 * 1024; - let max_memory = TEN_GIB / dbg!(rayon::current_num_threads()); - let grenad_parameters = GrenadParameters { - max_memory: Some(max_memory), - ..GrenadParameters::default() - }; - - { - let span = tracing::trace_span!(target: "indexing::documents::extract", "faceted"); - let _entered = span.enter(); - let (finished_steps, step_name) = steps::extract_facets(); - extract_and_send_docids::< - _, - FacetedDocidsExtractor, - FacetDocids, - _, - _ - >( - grenad_parameters, - document_changes, - indexing_context, - &mut extractor_allocs, - &extractor_sender, - finished_steps, - total_steps, - step_name - )?; - } - - { - let span = tracing::trace_span!(target: "indexing::documents::extract", "word_docids"); - let _entered = span.enter(); - let (finished_steps, step_name) = steps::extract_words(); - - let WordDocidsMergers { - word_fid_docids, - word_docids, - exact_word_docids, - word_position_docids, - fid_word_count_docids, - } = WordDocidsExtractors::run_extraction(grenad_parameters, document_changes, indexing_context, &mut extractor_allocs, finished_steps, total_steps, step_name)?; - extractor_sender.send_searchable::(word_docids).unwrap(); - extractor_sender.send_searchable::(word_fid_docids).unwrap(); - extractor_sender.send_searchable::(exact_word_docids).unwrap(); - extractor_sender.send_searchable::(word_position_docids).unwrap(); - extractor_sender.send_searchable::(fid_word_count_docids).unwrap(); - } - - // run the proximity extraction only if the precision is by word - // this works only if the settings didn't change during this transaction. - let rtxn = index.read_txn().unwrap(); - let proximity_precision = index.proximity_precision(&rtxn)?.unwrap_or_default(); - if proximity_precision == ProximityPrecision::ByWord { - let span = tracing::trace_span!(target: "indexing::documents::extract", "word_pair_proximity_docids"); - let _entered = span.enter(); - let (finished_steps, step_name) = steps::extract_word_proximity(); - - - extract_and_send_docids::< - _, - WordPairProximityDocidsExtractor, - WordPairProximityDocids, - _, - _ - >( - grenad_parameters, - document_changes, - indexing_context, - &mut extractor_allocs, - &extractor_sender, - finished_steps, - total_steps, - step_name, - )?; - } - - 'vectors: { - let span = tracing::trace_span!(target: "indexing::documents::extract", "vectors"); - let _entered = span.enter(); - - let index_embeddings = index.embedding_configs(&rtxn)?; - if index_embeddings.is_empty() { - break 'vectors; - } - /// FIXME: need access to `merger_sender` - let embedding_sender = todo!(); - let extractor = EmbeddingExtractor::new(embedders, &embedding_sender, &field_distribution, request_threads()); - let datastore = ThreadLocal::with_capacity(pool.current_num_threads()); - let (finished_steps, step_name) = steps::extract_embeddings(); - - - extract(document_changes, &extractor, indexing_context, &mut extractor_allocs, &datastore, finished_steps, total_steps, step_name)?; - - - let mut user_provided = HashMap::new(); - for data in datastore { - let data = data.0.into_inner(); - for (embedder, deladd) in data.into_iter() { - let user_provided = user_provided.entry(embedder).or_insert(Default::default()); - if let Some(del) = deladd.del { - *user_provided -= del; - } - if let Some(add) = deladd.add { - *user_provided |= add; - } - } - } - - 
embedding_sender.finish(user_provided).unwrap(); - } - - { - let span = tracing::trace_span!(target: "indexing::documents::extract", "FINISH"); - let _entered = span.enter(); - let (finished_steps, step_name) = steps::write_db(); - (indexing_context.send_progress)(Progress { finished_steps, total_steps, step_name, finished_total_documents: None }); - } - - // TODO THIS IS TOO MUCH - // - [ ] Extract fieldid docid facet number - // - [ ] Extract fieldid docid facet string - // - [ ] Extract facetid string fst - // - [ ] Extract facetid normalized string strings - - // TODO Inverted Indexes again - // - [x] Extract fieldid facet isempty docids - // - [x] Extract fieldid facet isnull docids - // - [x] Extract fieldid facet exists docids - - // TODO This is the normal system - // - [x] Extract fieldid facet number docids - // - [x] Extract fieldid facet string docids - - Ok(()) as Result<_> - }) - })?; - - let global_fields_ids_map = GlobalFieldsIdsMap::new(&new_fields_ids_map); - - let indexer_span = tracing::Span::current(); - // TODO manage the errors correctly - let merger_thread = Builder::new().name(S("indexer-merger")).spawn_scoped(s, move || { - let span = - tracing::trace_span!(target: "indexing::documents", parent: &indexer_span, "merge"); - let _entered = span.enter(); - let rtxn = index.read_txn().unwrap(); - merge_grenad_entries( - merger_receiver, - merger_sender, - &rtxn, - index, - global_fields_ids_map, - ) - })?; - - let vector_arroy = index.vector_arroy; - let mut rng = rand::rngs::StdRng::seed_from_u64(42); - let indexer_span = tracing::Span::current(); - let arroy_writers: Result> = embedders - .inner_as_ref() - .iter() - .map(|(embedder_name, (embedder, _, was_quantized))| { - let embedder_index = index.embedder_category_id.get(wtxn, embedder_name)?.ok_or( - InternalError::DatabaseMissingEntry { - db_name: "embedder_category_id", - key: None, - }, - )?; - - let dimensions = embedder.dimensions(); - - let writers: Vec<_> = crate::vector::arroy_db_range_for_embedder(embedder_index) - .map(|k| ArroyWrapper::new(vector_arroy, k, *was_quantized)) - .collect(); - - Ok(( - embedder_index, - (embedder_name.as_str(), embedder.as_ref(), writers, dimensions), - )) - }) - .collect(); - - let mut arroy_writers = arroy_writers?; - for operation in writer_receiver { - match operation { - WriterOperation::DbOperation(db_operation) => { - let database = db_operation.database(index); - match db_operation.entry() { - EntryOperation::Delete(e) => { - if !database.delete(wtxn, e.entry())? { - unreachable!("We tried to delete an unknown key") - } - } - EntryOperation::Write(e) => database.put(wtxn, e.key(), e.value())?, - } - } - WriterOperation::ArroyOperation(arroy_operation) => match arroy_operation { - ArroyOperation::DeleteVectors { docid } => { - for (_embedder_index, (_embedder_name, _embedder, writers, dimensions)) in - &mut arroy_writers - { - let dimensions = *dimensions; - for writer in writers { - // Uses invariant: vectors are packed in the first writers. - if !writer.del_item(wtxn, dimensions, docid)? 
{ - break; - } - } - } - } - ArroyOperation::SetVectors { docid, embedder_id, embeddings } => { - let (_, _, writers, dimensions) = - arroy_writers.get(&embedder_id).expect("requested a missing embedder"); - for res in writers.iter().zip_longest(&embeddings) { - match res { - EitherOrBoth::Both(writer, embedding) => { - writer.add_item(wtxn, *dimensions, docid, embedding)?; - } - EitherOrBoth::Left(writer) => { - let deleted = writer.del_item(wtxn, *dimensions, docid)?; - if !deleted { - break; - } - } - EitherOrBoth::Right(_embedding) => { - let external_document_id = index - .external_id_of(wtxn, std::iter::once(docid))? - .into_iter() - .next() - .unwrap()?; - return Err(UserError::TooManyVectors( - external_document_id, - embeddings.len(), - ) - .into()); - } - } - } - } - ArroyOperation::SetVector { docid, embedder_id, embedding } => { - let (_, _, writers, dimensions) = - arroy_writers.get(&embedder_id).expect("requested a missing embedder"); - for res in writers.iter().zip_longest(std::iter::once(&embedding)) { - match res { - EitherOrBoth::Both(writer, embedding) => { - writer.add_item(wtxn, *dimensions, docid, embedding)?; - } - EitherOrBoth::Left(writer) => { - let deleted = writer.del_item(wtxn, *dimensions, docid)?; - if !deleted { - break; - } - } - EitherOrBoth::Right(_embedding) => { - unreachable!("1 vs 256 vectors") - } - } - } - } - ArroyOperation::Finish { mut user_provided } => { - let span = tracing::trace_span!(target: "indexing::vectors", parent: &indexer_span, "build"); - let _entered = span.enter(); - for (_embedder_index, (_embedder_name, _embedder, writers, dimensions)) in - &mut arroy_writers - { - let dimensions = *dimensions; - for writer in writers { - if writer.need_build(wtxn, dimensions)? { - writer.build(wtxn, &mut rng, dimensions)?; - } else if writer.is_empty(wtxn, dimensions)? 
{ - break; - } - } - } - - let mut configs = index.embedding_configs(wtxn)?; - - for config in &mut configs { - if let Some(user_provided) = user_provided.remove(&config.name) { - config.user_provided = user_provided; - } - } - - index.put_embedding_configs(wtxn, configs)?; - } - }, - } - } - - /// TODO handle the panicking threads - handle.join().unwrap()?; - let merger_result = merger_thread.join().unwrap()?; - let (finished_steps, step_name) = steps::post_processing_facets(); - (indexing_context.send_progress)(Progress { - finished_steps, - total_steps, - step_name, - finished_total_documents: None, - }); - - if let Some(facet_field_ids_delta) = merger_result.facet_field_ids_delta { - compute_facet_level_database(index, wtxn, facet_field_ids_delta)?; - } - - let (finished_steps, step_name) = steps::post_processing_words(); - (indexing_context.send_progress)(Progress { - finished_steps, - total_steps, - step_name, - finished_total_documents: None, - }); - - if let Some(prefix_delta) = merger_result.prefix_delta { - compute_prefix_database(index, wtxn, prefix_delta)?; - } - - let (finished_steps, step_name) = steps::finalizing(); - (indexing_context.send_progress)(Progress { - finished_steps, - total_steps, - step_name, - finished_total_documents: None, - }); - - Ok(()) as Result<_> - })?; - - // required to into_inner the new_fields_ids_map - drop(fields_ids_map_store); - - let new_fields_ids_map = new_fields_ids_map.into_inner().unwrap(); - index.put_fields_ids_map(wtxn, new_fields_ids_map.as_fields_ids_map())?; - - if let Some(new_primary_key) = new_primary_key { - index.put_primary_key(wtxn, new_primary_key.name())?; - } - - // used to update the localized and weighted maps while sharing the update code with the settings pipeline. - let mut inner_index_settings = InnerIndexSettings::from_index(index, wtxn)?; - inner_index_settings.recompute_facets(wtxn, index)?; - inner_index_settings.recompute_searchables(wtxn, index)?; - index.put_field_distribution(wtxn, &field_distribution)?; - index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?; - - Ok(()) -} - -#[tracing::instrument(level = "trace", skip_all, target = "indexing::prefix")] -fn compute_prefix_database( - index: &Index, - wtxn: &mut RwTxn, - prefix_delta: PrefixDelta, -) -> Result<()> { - eprintln!("prefix_delta: {:?}", &prefix_delta); - let PrefixDelta { modified, deleted } = prefix_delta; - // Compute word prefix docids - compute_word_prefix_docids(wtxn, index, &modified, &deleted)?; - // Compute exact word prefix docids - compute_exact_word_prefix_docids(wtxn, index, &modified, &deleted)?; - // Compute word prefix fid docids - compute_word_prefix_fid_docids(wtxn, index, &modified, &deleted)?; - // Compute word prefix position docids - compute_word_prefix_position_docids(wtxn, index, &modified, &deleted) -} - -#[tracing::instrument(level = "trace", skip_all, target = "indexing::facet_field_ids")] -fn compute_facet_level_database( - index: &Index, - wtxn: &mut RwTxn, - facet_field_ids_delta: FacetFieldIdsDelta, -) -> Result<()> { - eprintln!("facet_field_ids_delta: {:?}", &facet_field_ids_delta); - if let Some(modified_facet_string_ids) = facet_field_ids_delta.modified_facet_string_ids() { - let span = tracing::trace_span!(target: "indexing::facet_field_ids", "string"); - let _entered = span.enter(); - FacetsUpdateBulk::new_not_updating_level_0( - index, - modified_facet_string_ids, - FacetType::String, - ) - .execute(wtxn)?; - } - if let Some(modified_facet_number_ids) = facet_field_ids_delta.modified_facet_number_ids() { - 
let span = tracing::trace_span!(target: "indexing::facet_field_ids", "number"); - let _entered = span.enter(); - FacetsUpdateBulk::new_not_updating_level_0( - index, - modified_facet_number_ids, - FacetType::Number, - ) - .execute(wtxn)?; - } - - Ok(()) -} - -/// TODO: GrenadParameters::default() should be removed in favor a passed parameter -/// TODO: manage the errors correctly -/// TODO: we must have a single trait that also gives the extractor type -#[allow(clippy::too_many_arguments)] -fn extract_and_send_docids< - 'pl, - 'fid, - 'indexer, - 'index, - DC: DocumentChanges<'pl>, - E: DocidsExtractor, - D: MergerOperationType, - MSP, - SP, ->( - grenad_parameters: GrenadParameters, - document_changes: &DC, - indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>, - extractor_allocs: &mut ThreadLocal>>, - sender: &ExtractorSender, - finished_steps: u16, - total_steps: u16, - step_name: &'static str, -) -> Result<()> -where - MSP: Fn() -> bool + Sync, - SP: Fn(Progress) + Sync, -{ - let merger = E::run_extraction( - grenad_parameters, - document_changes, - indexing_context, - extractor_allocs, - finished_steps, - total_steps, - step_name, - )?; - sender.send_searchable::(merger).unwrap(); - Ok(()) -} - -/// Returns the primary key that has already been set for this index or the -/// one we will guess by searching for the first key that contains "id" as a substring, -/// and whether the primary key changed -/// TODO move this elsewhere -pub fn retrieve_or_guess_primary_key<'a>( - rtxn: &'a RoTxn<'a>, - index: &Index, - new_fields_ids_map: &mut FieldsIdsMap, - primary_key_from_op: Option<&'a str>, - first_document: Option<&'a TopLevelMap<'a>>, -) -> Result, bool), UserError>> { - // make sure that we have a declared primary key, either fetching it from the index or attempting to guess it. - - // do we have an existing declared primary key? - let (primary_key, has_changed) = if let Some(primary_key_from_db) = index.primary_key(rtxn)? { - // did we request a primary key in the operation? - match primary_key_from_op { - // we did, and it is different from the DB one - Some(primary_key_from_op) if primary_key_from_op != primary_key_from_db => { - // is the index empty? - if index.number_of_documents(rtxn)? == 0 { - // change primary key - (primary_key_from_op, true) - } else { - return Ok(Err(UserError::PrimaryKeyCannotBeChanged( - primary_key_from_db.to_string(), - ))); - } - } - _ => (primary_key_from_db, false), - } - } else { - // no primary key in the DB => let's set one - // did we request a primary key in the operation? - let primary_key = if let Some(primary_key_from_op) = primary_key_from_op { - // set primary key from operation - primary_key_from_op - } else { - // guess primary key - let first_document = match first_document { - Some(document) => document, - // previous indexer when no pk is set + we send an empty payload => index_primary_key_no_candidate_found - None => return Ok(Err(UserError::NoPrimaryKeyCandidateFound)), - }; - - let mut guesses: Vec<&str> = first_document - .keys() - .map(AsRef::as_ref) - .filter(|name| name.to_lowercase().ends_with(DEFAULT_PRIMARY_KEY)) - .collect(); - - // sort the keys in lexicographical order, so that fields are always in the same order. - guesses.sort_unstable(); - - match guesses.as_slice() { - [] => return Ok(Err(UserError::NoPrimaryKeyCandidateFound)), - [name] => { - tracing::info!("Primary key was not specified in index. 
Inferred to '{name}'"); - *name - } - multiple => { - return Ok(Err(UserError::MultiplePrimaryKeyCandidatesFound { - candidates: multiple - .iter() - .map(|candidate| candidate.to_string()) - .collect(), - })) - } - } - }; - (primary_key, true) - }; - - match PrimaryKey::new_or_insert(primary_key, new_fields_ids_map) { - Ok(primary_key) => Ok(Ok((primary_key, has_changed))), - Err(err) => Ok(Err(err)), - } -} - -fn request_threads() -> &'static ThreadPoolNoAbort { - static REQUEST_THREADS: OnceLock = OnceLock::new(); - - REQUEST_THREADS.get_or_init(|| { - ThreadPoolNoAbortBuilder::new() - .num_threads(crate::vector::REQUEST_PARALLELISM) - .thread_name(|index| format!("embedding-request-{index}")) - .build() - .unwrap() - }) -} From 03650e3217025207046f28874eebb444aa31db9a Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 7 Nov 2024 09:39:46 +0100 Subject: [PATCH 191/247] Reverse order of computation --- crates/milli/src/update/new/indexer/mod.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 7688f29da..016aaeae5 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -487,9 +487,8 @@ where finished_total_documents: None, }); - compute_facet_level_database(index, wtxn, facet_field_ids_delta)?; - compute_facet_search_database(index, wtxn, global_fields_ids_map)?; + compute_facet_level_database(index, wtxn, facet_field_ids_delta)?; let (finished_steps, step_name) = steps::post_processing_words(); (indexing_context.send_progress)(Progress { From e2138170adf9afd017ccea73a9bf939e3a20e931 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 7 Nov 2024 10:06:07 +0100 Subject: [PATCH 192/247] some warning fix --- crates/milli/src/update/new/extract/vectors/mod.rs | 5 +++-- crates/milli/src/update/new/indexer/de.rs | 2 +- crates/milli/src/update/new/indexer/partial_dump.rs | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/crates/milli/src/update/new/extract/vectors/mod.rs b/crates/milli/src/update/new/extract/vectors/mod.rs index a5cf915e4..55121fb14 100644 --- a/crates/milli/src/update/new/extract/vectors/mod.rs +++ b/crates/milli/src/update/new/extract/vectors/mod.rs @@ -98,7 +98,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { let new_vectors = update.updated_vectors(&context.doc_alloc, self.embedders)?; if let Some(new_vectors) = &new_vectors { - unused_vectors_distribution.append(new_vectors); + unused_vectors_distribution.append(new_vectors)?; } for chunks in &mut all_chunks { @@ -187,7 +187,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { let new_vectors = insertion.inserted_vectors(&context.doc_alloc, self.embedders)?; if let Some(new_vectors) = &new_vectors { - unused_vectors_distribution.append(new_vectors); + unused_vectors_distribution.append(new_vectors)?; } for chunks in &mut all_chunks { @@ -343,6 +343,7 @@ impl<'a, 'extractor> Chunks<'a, 'extractor> { res } + #[allow(clippy::too_many_arguments)] pub fn embed_chunks( texts: &mut BVec<'a, &'a str>, ids: &mut BVec<'a, DocumentId>, diff --git a/crates/milli/src/update/new/indexer/de.rs b/crates/milli/src/update/new/indexer/de.rs index 94ab4c2c1..832aadd43 100644 --- a/crates/milli/src/update/new/indexer/de.rs +++ b/crates/milli/src/update/new/indexer/de.rs @@ -594,7 +594,7 @@ impl<'de> Visitor<'de> for DeserrRawValueVisitor<'de> { where A: serde::de::SeqAccess<'de>, { - let mut raw_vec = 
raw_collections::RawVec::new_in(&self.alloc); + let mut raw_vec = raw_collections::RawVec::new_in(self.alloc); while let Some(next) = seq.next_element()? { raw_vec.push(next); } diff --git a/crates/milli/src/update/new/indexer/partial_dump.rs b/crates/milli/src/update/new/indexer/partial_dump.rs index e58141af7..4984eb354 100644 --- a/crates/milli/src/update/new/indexer/partial_dump.rs +++ b/crates/milli/src/update/new/indexer/partial_dump.rs @@ -6,7 +6,7 @@ use serde_json::value::RawValue; use super::document_changes::{DocumentChangeContext, DocumentChanges, MostlySend, RefCellExt}; use crate::documents::PrimaryKey; use crate::update::concurrent_available_ids::ConcurrentAvailableIds; -use crate::update::new::document::{DocumentFromVersions, Versions}; +use crate::update::new::document::Versions; use crate::update::new::{DocumentChange, Insertion}; use crate::{Error, InternalError, Result, UserError}; From 39366a67c40c000ecd436a68580a7d3523db6390 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 7 Nov 2024 10:39:58 +0100 Subject: [PATCH 193/247] Top level fields don't return vector fields --- crates/milli/src/update/new/document.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/crates/milli/src/update/new/document.rs b/crates/milli/src/update/new/document.rs index 14e4f72e5..f43eb63e4 100644 --- a/crates/milli/src/update/new/document.rs +++ b/crates/milli/src/update/new/document.rs @@ -27,6 +27,9 @@ pub trait Document<'doc> { self.len() == 0 } + /// Get the **top-level** with the specified name, if exists. + /// + /// - The `_vectors` and `_geo` fields are **ignored** by this method, meaning e.g. `top_level_field("_vectors")` will return `Ok(None)` fn top_level_field(&self, k: &str) -> Result>; /// Returns the unparsed value of the `_vectors` field from the document data. 
@@ -105,6 +108,9 @@ impl<'t, Mapper: FieldIdMapper> Document<'t> for DocumentFromDb<'t, Mapper> { } fn top_level_field(&self, k: &str) -> Result> { + if k == RESERVED_VECTORS_FIELD_NAME || k == "_geo" { + return Ok(None); + } self.field(k) } } @@ -393,6 +399,9 @@ impl<'doc> Versions<'doc> { self.data.is_empty() } pub fn top_level_field(&self, k: &str) -> Option<&'doc RawValue> { + if k == RESERVED_VECTORS_FIELD_NAME || k == "_geo" { + return None; + } self.data.get(k) } } From c9f478bc4585c72fdcbcdae3092a70c94fe596b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 6 Nov 2024 18:59:49 +0100 Subject: [PATCH 194/247] Fix bbbul merger --- Cargo.lock | 2 +- crates/milli/src/update/new/extract/cache.rs | 51 +++++++++----------- crates/milli/src/update/new/indexer/mod.rs | 2 - 3 files changed, 23 insertions(+), 32 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c09c28d25..b0e5978b5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4473,7 +4473,7 @@ dependencies = [ [[package]] name = "raw-collections" version = "0.1.0" -source = "git+https://github.com/dureuill/raw-collections.git#e04a52424e1124ca63df66338a79c628e8f3bfd7" +source = "git+https://github.com/dureuill/raw-collections.git#15e5d7bdebc0c149b2a28b2454f307c717d07f8a" dependencies = [ "allocator-api2", "bitpacking", diff --git a/crates/milli/src/update/new/extract/cache.rs b/crates/milli/src/update/new/extract/cache.rs index 63590db69..dd43feefb 100644 --- a/crates/milli/src/update/new/extract/cache.rs +++ b/crates/milli/src/update/new/extract/cache.rs @@ -526,7 +526,7 @@ where for (map_index, map) in maps.iter_mut().enumerate() { if first_entry.source_index != map_index { if let Some(new) = map.get_mut(first_key) { - output.append_and_clear_bbbul(new); + output.union_and_clear_bbbul(new); } } } @@ -543,20 +543,22 @@ where // Then manage the content on the HashMap entries that weren't taken (mem::take). while let Some(mut map) = maps.pop() { for (key, bbbul) in map.iter_mut() { - let mut output = DelAddRoaringBitmap::empty(); - output.append_and_clear_bbbul(bbbul); - // Make sure we don't try to work with entries already managed by the spilled - if !bbbul.is_empty() { - for rhs in maps.iter_mut() { - if let Some(new) = rhs.get_mut(key) { - output.append_and_clear_bbbul(new); - } - } - - // We send the merged entry outside. - (f)(key, output)?; + if bbbul.is_empty() { + continue; } + + let mut output = DelAddRoaringBitmap::empty(); + output.union_and_clear_bbbul(bbbul); + + for rhs in maps.iter_mut() { + if let Some(new) = rhs.get_mut(key) { + output.union_and_clear_bbbul(new); + } + } + + // We send the merged entry outside. 
+ (f)(key, output)?; } } @@ -596,14 +598,6 @@ pub struct DelAddBbbul<'bump, B> { } impl<'bump, B: BitPacker> DelAddBbbul<'bump, B> { - pub fn insert_del_u32_in(&mut self, n: u32, bump: &'bump Bump) { - self.del.get_or_insert_with(|| Bbbul::new_in(bump)).insert(n); - } - - pub fn insert_add_u32_in(&mut self, n: u32, bump: &'bump Bump) { - self.add.get_or_insert_with(|| Bbbul::new_in(bump)).insert(n); - } - pub fn new_del_u32_in(n: u32, bump: &'bump Bump) -> Self { let mut bbbul = Bbbul::new_in(bump); bbbul.insert(n); @@ -655,11 +649,6 @@ impl DelAddRoaringBitmap { DelAddRoaringBitmap { del: None, add: None } } - pub fn is_empty(&self) -> bool { - let DelAddRoaringBitmap { del, add } = self; - del.is_none() && add.is_none() - } - pub fn insert_del_u32(&mut self, n: u32) { self.del.get_or_insert_with(RoaringBitmap::new).insert(n); } @@ -676,14 +665,16 @@ impl DelAddRoaringBitmap { DelAddRoaringBitmap { del: None, add: Some(RoaringBitmap::from([n])) } } - pub fn append_and_clear_bbbul(&mut self, bbbul: &mut FrozenDelAddBbbul<'_, B>) { + pub fn union_and_clear_bbbul(&mut self, bbbul: &mut FrozenDelAddBbbul<'_, B>) { let FrozenDelAddBbbul { del, add } = bbbul; if let Some(ref mut bbbul) = del.take() { let del = self.del.get_or_insert_with(RoaringBitmap::new); let mut iter = bbbul.iter_and_clear(); while let Some(block) = iter.next_block() { - del.append(block.iter().copied()); + let iter = block.iter().copied(); + let block = RoaringBitmap::from_sorted_iter(iter).unwrap(); + *del |= block; } } @@ -691,7 +682,9 @@ impl DelAddRoaringBitmap { let add = self.add.get_or_insert_with(RoaringBitmap::new); let mut iter = bbbul.iter_and_clear(); while let Some(block) = iter.next_block() { - add.append(block.iter().copied()); + let iter = block.iter().copied(); + let block = RoaringBitmap::from_sorted_iter(iter).unwrap(); + *add |= block; } } } diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 016aaeae5..0f83ff79f 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -589,10 +589,8 @@ fn compute_word_fst(index: &Index, wtxn: &mut RwTxn) -> Result().put(wtxn, WORDS_FST_KEY, &word_fst_mmap)?; if let Some(PrefixData { prefixes_fst_mmap, prefix_delta }) = prefix_data { - // extractor_sender.main().write_words_prefixes_fst(prefixes_fst_mmap).unwrap(); index.main.remap_types::().put( wtxn, WORDS_PREFIXES_FST_KEY, From 1477b81d38740178303fc3d0585bc548d9c15519 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 7 Nov 2024 11:23:49 +0100 Subject: [PATCH 195/247] Support cancelation in merge and send --- crates/milli/src/update/new/indexer/mod.rs | 30 +++++++++++++++++----- crates/milli/src/update/new/merger.rs | 16 +++++++++--- 2 files changed, 37 insertions(+), 9 deletions(-) diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 0f83ff79f..81596f3fe 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -57,6 +57,7 @@ mod steps { "extracting word proximity", "extracting embeddings", "writing to database", + "writing embeddings to database", "post-processing facets", "post-processing words", "finalizing", @@ -94,15 +95,19 @@ mod steps { step(5) } - pub const fn post_processing_facets() -> (u16, &'static str) { + pub const fn write_embedding_db() -> (u16, &'static str) { step(6) } - pub const fn post_processing_words() -> (u16, &'static str) { + + pub const fn post_processing_facets() -> (u16, 
&'static str) { step(7) } + pub const fn post_processing_words() -> (u16, &'static str) { + step(8) + } pub const fn finalizing() -> (u16, &'static str) { - step(8) + step(9) } } @@ -239,6 +244,7 @@ where index.word_docids.remap_types(), index, extractor_sender.docids::(), + &indexing_context.must_stop_processing, )?; } @@ -251,7 +257,8 @@ where word_fid_docids, index.word_fid_docids.remap_types(), index, - extractor_sender.docids::() + extractor_sender.docids::(), + &indexing_context.must_stop_processing, )?; } @@ -265,6 +272,7 @@ where index.exact_word_docids.remap_types(), index, extractor_sender.docids::(), + &indexing_context.must_stop_processing, )?; } @@ -278,6 +286,7 @@ where index.word_position_docids.remap_types(), index, extractor_sender.docids::(), + &indexing_context.must_stop_processing, )?; } @@ -291,6 +300,7 @@ where index.field_id_word_count_docids.remap_types(), index, extractor_sender.docids::(), + &indexing_context.must_stop_processing, )?; } } @@ -310,6 +320,7 @@ where index.word_pair_proximity_docids.remap_types(), index, extractor_sender.docids::(), + &indexing_context.must_stop_processing, )?; } @@ -376,8 +387,6 @@ where let global_fields_ids_map = GlobalFieldsIdsMap::new(&new_fields_ids_map); - let indexer_span = tracing::Span::current(); - let vector_arroy = index.vector_arroy; let mut rng = rand::rngs::StdRng::seed_from_u64(42); let indexer_span = tracing::Span::current(); @@ -450,6 +459,15 @@ where ArroyOperation::Finish { mut user_provided } => { let span = tracing::trace_span!(target: "indexing::vectors", parent: &indexer_span, "build"); let _entered = span.enter(); + + let (finished_steps, step_name) = steps::write_embedding_db(); + (indexing_context.send_progress)(Progress { + finished_steps, + total_steps, + step_name, + finished_total_documents: None, + }); + for (_embedder_index, (_embedder_name, _embedder, writer, dimensions)) in &mut arroy_writers { diff --git a/crates/milli/src/update/new/merger.rs b/crates/milli/src/update/new/merger.rs index b1c5c5fd9..4eca113ea 100644 --- a/crates/milli/src/update/new/merger.rs +++ b/crates/milli/src/update/new/merger.rs @@ -12,7 +12,10 @@ use super::extract::{ merge_caches, transpose_and_freeze_caches, BalancedCaches, DelAddRoaringBitmap, FacetKind, }; use super::DocumentChange; -use crate::{CboRoaringBitmapCodec, Error, FieldId, GeoPoint, GlobalFieldsIdsMap, Index, Result}; +use crate::{ + CboRoaringBitmapCodec, Error, FieldId, GeoPoint, GlobalFieldsIdsMap, Index, InternalError, + Result, +}; pub struct GeoExtractor { rtree: Option>, @@ -63,15 +66,22 @@ impl GeoExtractor { } #[tracing::instrument(level = "trace", skip_all, target = "indexing::merge")] -pub fn merge_and_send_docids<'extractor>( +pub fn merge_and_send_docids<'extractor, MSP>( mut caches: Vec>, database: Database, index: &Index, docids_sender: impl DocidsSender + Sync, -) -> Result<()> { + must_stop_processing: &MSP, +) -> Result<()> +where + MSP: Fn() -> bool + Sync, +{ transpose_and_freeze_caches(&mut caches)?.into_par_iter().try_for_each(|frozen| { let rtxn = index.read_txn()?; let mut buffer = Vec::new(); + if must_stop_processing() { + return Err(InternalError::AbortedIndexation.into()); + } merge_caches(frozen, |key, DelAddRoaringBitmap { del, add }| { let current = database.get(&rtxn, key)?; match merge_cbo_bitmaps(current, del, add)? 
{ From 0e4e9e866a2c0315cecebbfab601deceb5eb30d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 7 Nov 2024 11:36:09 +0100 Subject: [PATCH 196/247] Move the RefCellExt trait in a dedicated module --- .../milli/src/update/new/extract/documents.rs | 5 +- .../new/extract/faceted/extract_facets.rs | 3 +- .../extract/searchable/extract_word_docids.rs | 3 +- .../extract_word_pair_proximity_docids.rs | 3 +- .../update/new/indexer/document_changes.rs | 53 +------------------ .../src/update/new/indexer/partial_dump.rs | 3 +- .../update/new/indexer/update_by_function.rs | 3 +- crates/milli/src/update/new/mod.rs | 1 + crates/milli/src/update/new/ref_cell_ext.rs | 52 ++++++++++++++++++ .../src/update/new/words_prefix_docids.rs | 2 +- 10 files changed, 67 insertions(+), 61 deletions(-) create mode 100644 crates/milli/src/update/new/ref_cell_ext.rs diff --git a/crates/milli/src/update/new/extract/documents.rs b/crates/milli/src/update/new/extract/documents.rs index 79e1a2462..2c93a5def 100644 --- a/crates/milli/src/update/new/extract/documents.rs +++ b/crates/milli/src/update/new/extract/documents.rs @@ -6,9 +6,8 @@ use hashbrown::HashMap; use super::DelAddRoaringBitmap; use crate::update::new::channel::DocumentsSender; use crate::update::new::document::{write_to_obkv, Document as _}; -use crate::update::new::indexer::document_changes::{ - DocumentChangeContext, Extractor, FullySend, RefCellExt as _, -}; +use crate::update::new::indexer::document_changes::{DocumentChangeContext, Extractor, FullySend}; +use crate::update::new::ref_cell_ext::RefCellExt as _; use crate::update::new::DocumentChange; use crate::vector::EmbeddingConfigs; use crate::Result; diff --git a/crates/milli/src/update/new/extract/faceted/extract_facets.rs b/crates/milli/src/update/new/extract/faceted/extract_facets.rs index 1aaae1cb8..11dc8f3c7 100644 --- a/crates/milli/src/update/new/extract/faceted/extract_facets.rs +++ b/crates/milli/src/update/new/extract/faceted/extract_facets.rs @@ -13,8 +13,9 @@ use crate::facet::value_encoding::f64_into_bytes; use crate::update::new::extract::DocidsExtractor; use crate::update::new::indexer::document_changes::{ extract, DocumentChangeContext, DocumentChanges, Extractor, FullySend, IndexingContext, - Progress, RefCellExt, ThreadLocal, + Progress, ThreadLocal, }; +use crate::update::new::ref_cell_ext::RefCellExt as _; use crate::update::new::DocumentChange; use crate::update::GrenadParameters; use crate::{DocumentId, FieldId, Index, Result, MAX_FACET_VALUE_LENGTH}; diff --git a/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs b/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs index cadea7251..89583bd93 100644 --- a/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs +++ b/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs @@ -12,8 +12,9 @@ use crate::update::new::extract::cache::BalancedCaches; use crate::update::new::extract::perm_json_p::contained_in; use crate::update::new::indexer::document_changes::{ extract, DocumentChangeContext, DocumentChanges, Extractor, FullySend, IndexingContext, - MostlySend, Progress, RefCellExt, ThreadLocal, + MostlySend, Progress, ThreadLocal, }; +use crate::update::new::ref_cell_ext::RefCellExt as _; use crate::update::new::DocumentChange; use crate::update::GrenadParameters; use crate::{bucketed_position, DocumentId, FieldId, Index, Result, MAX_POSITION_PER_ATTRIBUTE}; diff --git 
a/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs b/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs index 796b8c943..7f9fff38f 100644 --- a/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs +++ b/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs @@ -9,7 +9,8 @@ use super::SearchableExtractor; use crate::proximity::{index_proximity, MAX_DISTANCE}; use crate::update::new::document::Document; use crate::update::new::extract::cache::BalancedCaches; -use crate::update::new::indexer::document_changes::{DocumentChangeContext, RefCellExt}; +use crate::update::new::indexer::document_changes::DocumentChangeContext; +use crate::update::new::ref_cell_ext::RefCellExt as _; use crate::update::new::DocumentChange; use crate::{FieldId, GlobalFieldsIdsMap, Index, Result}; diff --git a/crates/milli/src/update/new/indexer/document_changes.rs b/crates/milli/src/update/new/indexer/document_changes.rs index 0a9155b1c..b9bf79e47 100644 --- a/crates/milli/src/update/new/indexer/document_changes.rs +++ b/crates/milli/src/update/new/indexer/document_changes.rs @@ -1,4 +1,4 @@ -use std::cell::{Cell, Ref, RefCell, RefMut}; +use std::cell::{Cell, RefCell}; use std::sync::{Arc, RwLock}; use bumpalo::Bump; @@ -10,57 +10,6 @@ use crate::fields_ids_map::metadata::FieldIdMapWithMetadata; use crate::update::new::parallel_iterator_ext::ParallelIteratorExt as _; use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, Result}; -pub trait RefCellExt { - fn try_borrow_or_yield(&self) -> std::result::Result, std::cell::BorrowError>; - fn try_borrow_mut_or_yield( - &self, - ) -> std::result::Result, std::cell::BorrowMutError>; - - fn borrow_or_yield(&self) -> Ref<'_, T> { - self.try_borrow_or_yield().unwrap() - } - - fn borrow_mut_or_yield(&self) -> RefMut<'_, T> { - self.try_borrow_mut_or_yield().unwrap() - } -} - -impl RefCellExt for RefCell { - fn try_borrow_or_yield(&self) -> std::result::Result, std::cell::BorrowError> { - /// TODO: move this trait and impl elsewhere - loop { - match self.try_borrow() { - Ok(borrow) => break Ok(borrow), - Err(error) => { - tracing::warn!("dynamic borrow failed, yielding to local tasks"); - match rayon::yield_local() { - Some(rayon::Yield::Executed) => continue, - _ => return Err(error), - } - } - } - } - } - - fn try_borrow_mut_or_yield( - &self, - ) -> std::result::Result, std::cell::BorrowMutError> { - loop { - match self.try_borrow_mut() { - Ok(borrow) => break Ok(borrow), - Err(error) => { - tracing::warn!("dynamic borrow failed, yielding to local tasks"); - - match rayon::yield_local() { - Some(rayon::Yield::Executed) => continue, - _ => return Err(error), - } - } - } - } - } -} - /// A trait for types that are **not** [`Send`] only because they would then allow concurrent access to a type that is not [`Sync`]. /// /// The primary example of such a type is `&T`, with `T: !Sync`. 
diff --git a/crates/milli/src/update/new/indexer/partial_dump.rs b/crates/milli/src/update/new/indexer/partial_dump.rs index 4984eb354..2da047824 100644 --- a/crates/milli/src/update/new/indexer/partial_dump.rs +++ b/crates/milli/src/update/new/indexer/partial_dump.rs @@ -3,10 +3,11 @@ use std::ops::DerefMut; use rayon::iter::IndexedParallelIterator; use serde_json::value::RawValue; -use super::document_changes::{DocumentChangeContext, DocumentChanges, MostlySend, RefCellExt}; +use super::document_changes::{DocumentChangeContext, DocumentChanges, MostlySend}; use crate::documents::PrimaryKey; use crate::update::concurrent_available_ids::ConcurrentAvailableIds; use crate::update::new::document::Versions; +use crate::update::new::ref_cell_ext::RefCellExt as _; use crate::update::new::{DocumentChange, Insertion}; use crate::{Error, InternalError, Result, UserError}; diff --git a/crates/milli/src/update/new/indexer/update_by_function.rs b/crates/milli/src/update/new/indexer/update_by_function.rs index 3eb0cc306..eb7252445 100644 --- a/crates/milli/src/update/new/indexer/update_by_function.rs +++ b/crates/milli/src/update/new/indexer/update_by_function.rs @@ -4,12 +4,13 @@ use rayon::slice::ParallelSlice as _; use rhai::{Dynamic, Engine, OptimizationLevel, Scope, AST}; use roaring::RoaringBitmap; -use super::document_changes::{DocumentChangeContext, MostlySend, RefCellExt}; +use super::document_changes::{DocumentChangeContext, MostlySend}; use super::DocumentChanges; use crate::documents::Error::InvalidDocumentFormat; use crate::documents::PrimaryKey; use crate::error::{FieldIdMapMissingEntry, InternalError}; use crate::update::new::document::Versions; +use crate::update::new::ref_cell_ext::RefCellExt as _; use crate::update::new::{Deletion, DocumentChange, KvReaderFieldId, Update}; use crate::{all_obkv_to_json, Error, FieldsIdsMap, Object, Result, UserError}; diff --git a/crates/milli/src/update/new/mod.rs b/crates/milli/src/update/new/mod.rs index 36a9a3f92..7a749228e 100644 --- a/crates/milli/src/update/new/mod.rs +++ b/crates/milli/src/update/new/mod.rs @@ -16,6 +16,7 @@ mod fst_merger_builder; pub mod indexer; mod merger; mod parallel_iterator_ext; +mod ref_cell_ext; mod top_level_map; pub mod vector_document; mod word_fst_builder; diff --git a/crates/milli/src/update/new/ref_cell_ext.rs b/crates/milli/src/update/new/ref_cell_ext.rs new file mode 100644 index 000000000..b147c00e5 --- /dev/null +++ b/crates/milli/src/update/new/ref_cell_ext.rs @@ -0,0 +1,52 @@ +use std::cell::{Ref, RefCell, RefMut}; + +pub trait RefCellExt { + fn try_borrow_or_yield(&self) -> std::result::Result, std::cell::BorrowError>; + fn try_borrow_mut_or_yield( + &self, + ) -> std::result::Result, std::cell::BorrowMutError>; + + fn borrow_or_yield(&self) -> Ref<'_, T> { + self.try_borrow_or_yield().unwrap() + } + + fn borrow_mut_or_yield(&self) -> RefMut<'_, T> { + self.try_borrow_mut_or_yield().unwrap() + } +} + +impl RefCellExt for RefCell { + fn try_borrow_or_yield(&self) -> std::result::Result, std::cell::BorrowError> { + /// TODO: move this trait and impl elsewhere + loop { + match self.try_borrow() { + Ok(borrow) => break Ok(borrow), + Err(error) => { + tracing::warn!("dynamic borrow failed, yielding to local tasks"); + match rayon::yield_local() { + Some(rayon::Yield::Executed) => continue, + _ => return Err(error), + } + } + } + } + } + + fn try_borrow_mut_or_yield( + &self, + ) -> std::result::Result, std::cell::BorrowMutError> { + loop { + match self.try_borrow_mut() { + Ok(borrow) => break Ok(borrow), + 
Err(error) => { + tracing::warn!("dynamic borrow failed, yielding to local tasks"); + + match rayon::yield_local() { + Some(rayon::Yield::Executed) => continue, + _ => return Err(error), + } + } + } + } + } +} diff --git a/crates/milli/src/update/new/words_prefix_docids.rs b/crates/milli/src/update/new/words_prefix_docids.rs index edc09c5f3..ffc0c5048 100644 --- a/crates/milli/src/update/new/words_prefix_docids.rs +++ b/crates/milli/src/update/new/words_prefix_docids.rs @@ -10,7 +10,7 @@ use roaring::MultiOps; use tempfile::tempfile; use thread_local::ThreadLocal; -use super::indexer::document_changes::RefCellExt; +use super::ref_cell_ext::RefCellExt as _; use crate::heed_codec::StrBEU16Codec; use crate::{CboRoaringBitmapCodec, Index, Prefix, Result}; From 01f8f30a7ad1fdaf523de4d9425998bb342bc150 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 7 Nov 2024 15:08:56 +0100 Subject: [PATCH 197/247] Fix indentation --- crates/milli/src/update/new/indexer/mod.rs | 145 ++++++++++----------- 1 file changed, 72 insertions(+), 73 deletions(-) diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 81596f3fe..e7f0cc825 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -170,42 +170,41 @@ where // TODO manage the errors correctly let extractor_handle = Builder::new().name(S("indexer-extractors")).spawn_scoped(s, move || { pool.in_place_scope(|_s| { - let span = tracing::trace_span!(target: "indexing::documents", parent: &indexer_span, "extract"); - let _entered = span.enter(); + let span = tracing::trace_span!(target: "indexing::documents", parent: &indexer_span, "extract"); + let _entered = span.enter(); - let rtxn = index.read_txn()?; + let rtxn = index.read_txn()?; - // document but we need to create a function that collects and compresses documents. - let document_sender = extractor_sender.documents(); - let document_extractor = DocumentsExtractor::new(&document_sender, embedders); - let datastore = ThreadLocal::with_capacity(pool.current_num_threads()); - let (finished_steps, step_name) = steps::extract_documents(); - extract(document_changes, &document_extractor, indexing_context, &mut extractor_allocs, &datastore, finished_steps, total_steps, step_name)?; + // document but we need to create a function that collects and compresses documents. + let document_sender = extractor_sender.documents(); + let document_extractor = DocumentsExtractor::new(&document_sender, embedders); + let datastore = ThreadLocal::with_capacity(pool.current_num_threads()); + let (finished_steps, step_name) = steps::extract_documents(); + extract(document_changes, &document_extractor, indexing_context, &mut extractor_allocs, &datastore, finished_steps, total_steps, step_name)?; - - for document_extractor_data in datastore { - let document_extractor_data = document_extractor_data.0.into_inner(); - for (field, delta) in document_extractor_data.field_distribution_delta { - let current = field_distribution.entry(field).or_default(); - // adding the delta should never cause a negative result, as we are removing fields that previously existed. 
- *current = current.saturating_add_signed(delta); - } - document_extractor_data.docids_delta.apply_to(document_ids); + for document_extractor_data in datastore { + let document_extractor_data = document_extractor_data.0.into_inner(); + for (field, delta) in document_extractor_data.field_distribution_delta { + let current = field_distribution.entry(field).or_default(); + // adding the delta should never cause a negative result, as we are removing fields that previously existed. + *current = current.saturating_add_signed(delta); } + document_extractor_data.docids_delta.apply_to(document_ids); + } - field_distribution.retain(|_, v| *v == 0); + field_distribution.retain(|_, v| *v == 0); - const TEN_GIB: usize = 10 * 1024 * 1024 * 1024; - let current_num_threads = rayon::current_num_threads(); - let max_memory = TEN_GIB / current_num_threads; - eprintln!("A maximum of {max_memory} bytes will be used for each of the {current_num_threads} threads"); + const TEN_GIB: usize = 10 * 1024 * 1024 * 1024; + let current_num_threads = rayon::current_num_threads(); + let max_memory = TEN_GIB / current_num_threads; + eprintln!("A maximum of {max_memory} bytes will be used for each of the {current_num_threads} threads"); - let grenad_parameters = GrenadParameters { - max_memory: Some(max_memory), - ..GrenadParameters::default() - }; + let grenad_parameters = GrenadParameters { + max_memory: Some(max_memory), + ..GrenadParameters::default() + }; - let facet_field_ids_delta; + let facet_field_ids_delta; { let span = tracing::trace_span!(target: "indexing::documents::extract", "faceted"); @@ -324,65 +323,65 @@ where )?; } - 'vectors: { - let span = tracing::trace_span!(target: "indexing::documents::extract", "vectors"); - let _entered = span.enter(); + 'vectors: { + let span = tracing::trace_span!(target: "indexing::documents::extract", "vectors"); + let _entered = span.enter(); - let index_embeddings = index.embedding_configs(&rtxn)?; - if index_embeddings.is_empty() { - break 'vectors; - } + let index_embeddings = index.embedding_configs(&rtxn)?; + if index_embeddings.is_empty() { + break 'vectors; + } - let embedding_sender = extractor_sender.embeddings(); - let extractor = EmbeddingExtractor::new(embedders, &embedding_sender, field_distribution, request_threads()); - let datastore = ThreadLocal::with_capacity(pool.current_num_threads()); - let (finished_steps, step_name) = steps::extract_embeddings(); + let embedding_sender = extractor_sender.embeddings(); + let extractor = EmbeddingExtractor::new(embedders, &embedding_sender, field_distribution, request_threads()); + let datastore = ThreadLocal::with_capacity(pool.current_num_threads()); + let (finished_steps, step_name) = steps::extract_embeddings(); - extract(document_changes, &extractor, indexing_context, &mut extractor_allocs, &datastore, finished_steps, total_steps, step_name)?; + extract(document_changes, &extractor, indexing_context, &mut extractor_allocs, &datastore, finished_steps, total_steps, step_name)?; - let mut user_provided = HashMap::new(); - for data in datastore { - let data = data.into_inner().0; - for (embedder, deladd) in data.into_iter() { - let user_provided = user_provided.entry(embedder).or_insert(Default::default()); - if let Some(del) = deladd.del { - *user_provided -= del; - } - if let Some(add) = deladd.add { - *user_provided |= add; - } + let mut user_provided = HashMap::new(); + for data in datastore { + let data = data.into_inner().0; + for (embedder, deladd) in data.into_iter() { + let user_provided = 
user_provided.entry(embedder).or_insert(Default::default()); + if let Some(del) = deladd.del { + *user_provided -= del; + } + if let Some(add) = deladd.add { + *user_provided |= add; } } - - embedding_sender.finish(user_provided).unwrap(); } - { - let span = tracing::trace_span!(target: "indexing::documents::extract", "FINISH"); - let _entered = span.enter(); - let (finished_steps, step_name) = steps::write_db(); - (indexing_context.send_progress)(Progress { finished_steps, total_steps, step_name, finished_total_documents: None }); - } + embedding_sender.finish(user_provided).unwrap(); + } - // TODO THIS IS TOO MUCH - // - [ ] Extract fieldid docid facet number - // - [ ] Extract fieldid docid facet string - // - [ ] Extract facetid string fst - // - [ ] Extract facetid normalized string strings + { + let span = tracing::trace_span!(target: "indexing::documents::extract", "FINISH"); + let _entered = span.enter(); + let (finished_steps, step_name) = steps::write_db(); + (indexing_context.send_progress)(Progress { finished_steps, total_steps, step_name, finished_total_documents: None }); + } - // TODO Inverted Indexes again - // - [x] Extract fieldid facet isempty docids - // - [x] Extract fieldid facet isnull docids - // - [x] Extract fieldid facet exists docids + // TODO THIS IS TOO MUCH + // - [ ] Extract fieldid docid facet number + // - [ ] Extract fieldid docid facet string + // - [ ] Extract facetid string fst + // - [ ] Extract facetid normalized string strings - // TODO This is the normal system - // - [x] Extract fieldid facet number docids - // - [x] Extract fieldid facet string docids + // TODO Inverted Indexes again + // - [x] Extract fieldid facet isempty docids + // - [x] Extract fieldid facet isnull docids + // - [x] Extract fieldid facet exists docids - Result::Ok(facet_field_ids_delta) - }) + // TODO This is the normal system + // - [x] Extract fieldid facet number docids + // - [x] Extract fieldid facet string docids + + Result::Ok(facet_field_ids_delta) + }) })?; let global_fields_ids_map = GlobalFieldsIdsMap::new(&new_fields_ids_map); From 700757c01f54ec4e084539cc235e23d183c777c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 7 Nov 2024 15:32:04 +0100 Subject: [PATCH 198/247] Adding a new step --- crates/milli/src/update/new/indexer/mod.rs | 50 ++++++++++++++++------ 1 file changed, 37 insertions(+), 13 deletions(-) diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index e7f0cc825..001f59fe4 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -58,6 +58,7 @@ mod steps { "extracting embeddings", "writing to database", "writing embeddings to database", + "waiting for extractors", "post-processing facets", "post-processing words", "finalizing", @@ -99,16 +100,21 @@ mod steps { step(6) } - pub const fn post_processing_facets() -> (u16, &'static str) { + pub const fn waiting_extractors() -> (u16, &'static str) { step(7) } - pub const fn post_processing_words() -> (u16, &'static str) { + + pub const fn post_processing_facets() -> (u16, &'static str) { step(8) } - pub const fn finalizing() -> (u16, &'static str) { + pub const fn post_processing_words() -> (u16, &'static str) { step(9) } + + pub const fn finalizing() -> (u16, &'static str) { + step(10) + } } /// This is the main function of this crate. 
@@ -169,7 +175,7 @@ where let document_ids = &mut document_ids; // TODO manage the errors correctly let extractor_handle = Builder::new().name(S("indexer-extractors")).spawn_scoped(s, move || { - pool.in_place_scope(|_s| { + let result = pool.in_place_scope(|_s| { let span = tracing::trace_span!(target: "indexing::documents", parent: &indexer_span, "extract"); let _entered = span.enter(); @@ -231,7 +237,15 @@ where exact_word_docids, word_position_docids, fid_word_count_docids, - } = WordDocidsExtractors::run_extraction(grenad_parameters, document_changes, indexing_context, &mut extractor_allocs, finished_steps, total_steps, step_name)?; + } = WordDocidsExtractors::run_extraction( + grenad_parameters, + document_changes, + indexing_context, + &mut extractor_allocs, + finished_steps, + total_steps, + step_name, + )?; // TODO Word Docids Merger // extractor_sender.send_searchable::(word_docids).unwrap(); @@ -358,13 +372,6 @@ where embedding_sender.finish(user_provided).unwrap(); } - { - let span = tracing::trace_span!(target: "indexing::documents::extract", "FINISH"); - let _entered = span.enter(); - let (finished_steps, step_name) = steps::write_db(); - (indexing_context.send_progress)(Progress { finished_steps, total_steps, step_name, finished_total_documents: None }); - } - // TODO THIS IS TOO MUCH // - [ ] Extract fieldid docid facet number // - [ ] Extract fieldid docid facet string @@ -381,7 +388,16 @@ where // - [x] Extract fieldid facet string docids Result::Ok(facet_field_ids_delta) - }) + }); + + { + let span = tracing::trace_span!(target: "indexing::documents::extract", "FINISH"); + let _entered = span.enter(); + let (finished_steps, step_name) = steps::write_db(); + (indexing_context.send_progress)(Progress { finished_steps, total_steps, step_name, finished_total_documents: None }); + } + + result })?; let global_fields_ids_map = GlobalFieldsIdsMap::new(&new_fields_ids_map); @@ -494,6 +510,14 @@ where } } + let (finished_steps, step_name) = steps::waiting_extractors(); + (indexing_context.send_progress)(Progress { + finished_steps, + total_steps, + step_name, + finished_total_documents: None, + }); + let facet_field_ids_delta = extractor_handle.join().unwrap()?; let (finished_steps, step_name) = steps::post_processing_facets(); From 786453058911866f57c9ee80451b2d0984539e60 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 7 Nov 2024 16:39:14 +0100 Subject: [PATCH 199/247] Make the word prefix integer multi-threaded --- .../src/update/new/words_prefix_docids.rs | 115 ++++++++++++++---- 1 file changed, 88 insertions(+), 27 deletions(-) diff --git a/crates/milli/src/update/new/words_prefix_docids.rs b/crates/milli/src/update/new/words_prefix_docids.rs index ffc0c5048..5454d815e 100644 --- a/crates/milli/src/update/new/words_prefix_docids.rs +++ b/crates/milli/src/update/new/words_prefix_docids.rs @@ -4,7 +4,7 @@ use std::io::{BufReader, BufWriter, Read, Seek, Write}; use hashbrown::HashMap; use heed::types::Bytes; -use heed::{BytesDecode, Database, RoTxn, RwTxn}; +use heed::{BytesDecode, Database, Error, RoTxn, RwTxn}; use rayon::iter::{IntoParallelIterator, ParallelIterator as _}; use roaring::MultiOps; use tempfile::tempfile; @@ -171,35 +171,54 @@ impl WordPrefixIntegerDocids { prefixes: &HashSet, ) -> Result<()> { // We fetch the docids associated to the newly added word prefix fst only. - // We use a HashMap to store the docids associated to each position, may be RAM consuming. 
- let mut integer_docids = HashMap::new(); - let mut key_buffer = Vec::new(); - for prefix in prefixes { - let prefix = prefix.as_bytes(); - for result in self.database.prefix_iter(wtxn, prefix)? { - let (key, data) = result?; - let (_word, pos) = - StrBEU16Codec::bytes_decode(key).map_err(heed::Error::Decoding)?; + // And collect the CboRoaringBitmaps pointers in an HashMap. + let frozen = FrozenPrefixIntegerBitmaps::from_prefixes(self.database, wtxn, prefixes)?; - match integer_docids.get_mut(&pos) { - Some(docids) => { - *docids |= &data; - } - None => { - integer_docids.insert(pos, data); - } - } + // We access this HashMap in parallel to compute the *union* of all + // of them and *serialize* them into files. There is one file by CPU. + let local_entries = ThreadLocal::with_capacity(rayon::current_num_threads()); + prefixes.into_par_iter().map(AsRef::as_ref).try_for_each(|prefix| { + let refcell = local_entries.get_or_try(|| { + tempfile().map(BufWriter::new).map(|f| RefCell::new((Vec::new(), f, Vec::new()))) + })?; + + let mut refmut = refcell.borrow_mut_or_yield(); + let (ref mut index, ref mut file, ref mut buffer) = *refmut; + + for (&pos, bitmaps_bytes) in frozen.bitmaps(prefix).unwrap() { + let output = bitmaps_bytes + .iter() + .map(|bytes| CboRoaringBitmapCodec::deserialize_from(bytes)) + .union()?; + + buffer.clear(); + CboRoaringBitmapCodec::serialize_into(&output, buffer); + index.push(PrefixIntegerEntry { prefix, pos, serialized_length: buffer.len() }); + file.write_all(buffer)?; } - for (pos, docids) in integer_docids.iter_mut() { - if !docids.is_empty() { - key_buffer.clear(); - key_buffer.extend_from_slice(prefix); - key_buffer.push(0); - key_buffer.extend_from_slice(&pos.to_be_bytes()); - self.prefix_database.put(wtxn, &key_buffer, docids)?; - } - docids.clear(); + Result::Ok(()) + })?; + + drop(frozen); + + // We iterate over all the collected and serialized bitmaps through + // the files and entries to eventually put them in the final database. + let mut key_buffer = Vec::new(); + for refcell in local_entries { + let (index, file, mut buffer) = refcell.into_inner(); + let mut file = file.into_inner().map_err(|e| e.into_error())?; + file.rewind()?; + let mut file = BufReader::new(file); + for PrefixIntegerEntry { prefix, pos, serialized_length } in index { + buffer.resize(serialized_length, 0); + file.read_exact(&mut buffer)?; + + key_buffer.clear(); + key_buffer.extend_from_slice(prefix.as_bytes()); + key_buffer.push(0); + key_buffer.extend_from_slice(&pos.to_be_bytes()); + self.prefix_database.remap_data_type::().put(wtxn, &key_buffer, &buffer)?; } } @@ -207,6 +226,48 @@ impl WordPrefixIntegerDocids { } } +/// Represents a prefix and the lenght the bitmap takes on disk. +struct PrefixIntegerEntry<'a> { + prefix: &'a str, + pos: u16, + serialized_length: usize, +} + +/// TODO doc +struct FrozenPrefixIntegerBitmaps<'a, 'rtxn> { + prefixes_bitmaps: HashMap<&'a str, HashMap>>, +} + +impl<'a, 'rtxn> FrozenPrefixIntegerBitmaps<'a, 'rtxn> { + #[tracing::instrument(level = "trace", skip_all, target = "indexing::prefix")] + pub fn from_prefixes( + database: Database, + rtxn: &'rtxn RoTxn, + prefixes: &'a HashSet, + ) -> heed::Result { + let database = database.remap_data_type::(); + + let mut prefixes_bitmaps = HashMap::new(); + for prefix in prefixes { + let mut positions = HashMap::new(); + for result in database.prefix_iter(rtxn, prefix.as_bytes())? 
{ + let (key, bytes) = result?; + let (_word, pos) = StrBEU16Codec::bytes_decode(key).map_err(Error::Decoding)?; + positions.entry(pos).or_insert_with(Vec::new).push(bytes); + } + assert!(prefixes_bitmaps.insert(prefix.as_str(), positions).is_none()); + } + + Ok(Self { prefixes_bitmaps }) + } + + pub fn bitmaps(&self, key: &'a str) -> Option<&HashMap>> { + self.prefixes_bitmaps.get(&key) + } +} + +unsafe impl<'a, 'rtxn> Sync for FrozenPrefixIntegerBitmaps<'a, 'rtxn> {} + #[tracing::instrument(level = "trace", skip_all, target = "indexing::prefix")] fn delete_prefixes( wtxn: &mut RwTxn, From 1f5d80127162db8ab6f7b4004d32dc1b09a57898 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 7 Nov 2024 17:22:30 +0100 Subject: [PATCH 200/247] Fix crashes in facet search indexing --- .../src/update/new/extract/faceted/mod.rs | 1 - .../src/update/new/facet_search_builder.rs | 23 +++++-------------- crates/milli/src/update/new/indexer/mod.rs | 6 ++--- 3 files changed, 8 insertions(+), 22 deletions(-) diff --git a/crates/milli/src/update/new/extract/faceted/mod.rs b/crates/milli/src/update/new/extract/faceted/mod.rs index bfe8efd03..0c012d739 100644 --- a/crates/milli/src/update/new/extract/faceted/mod.rs +++ b/crates/milli/src/update/new/extract/faceted/mod.rs @@ -28,7 +28,6 @@ impl From for FacetKind { impl FacetKind { pub fn extract_from_key(key: &[u8]) -> (FacetKind, &[u8]) { - debug_assert!(key.len() > 3); (FacetKind::from(key[0]), &key[1..]) } } diff --git a/crates/milli/src/update/new/facet_search_builder.rs b/crates/milli/src/update/new/facet_search_builder.rs index 839120540..7eaec95a5 100644 --- a/crates/milli/src/update/new/facet_search_builder.rs +++ b/crates/milli/src/update/new/facet_search_builder.rs @@ -54,23 +54,12 @@ impl<'indexer> FacetSearchBuilder<'indexer> { } } - fn extract_key_data<'k>(&self, key: &'k [u8]) -> Result>> { - match FacetKind::from(key[0]) { - // Only strings are searchable - FacetKind::String => Ok(Some( - FacetGroupKeyCodec::::bytes_decode(&key[1..]) - .map_err(heed::Error::Encoding)?, - )), - _ => Ok(None), - } - } - - pub fn register_from_key(&mut self, deladd: DelAdd, facet_key: &[u8]) -> Result<()> { - let Some(FacetGroupKey { field_id, level: _level, left_bound }) = - self.extract_key_data(facet_key)? 
- else { - return Ok(()); - }; + pub fn register_from_key( + &mut self, + deladd: DelAdd, + facet_key: FacetGroupKey<&str>, + ) -> Result<()> { + let FacetGroupKey { field_id, level: _level, left_bound } = facet_key; if deladd == DelAdd::Addition { self.registered_facets.entry(field_id).and_modify(|count| *count += 1).or_insert(1); diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 001f59fe4..70ac7f959 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -681,13 +681,11 @@ fn compute_facet_search_database( } EitherOrBoth::Left(result) => { let (key, _) = result?; - facet_search_builder - .register_from_key(DelAdd::Deletion, key.left_bound.as_ref())?; + facet_search_builder.register_from_key(DelAdd::Deletion, key)?; } EitherOrBoth::Right(result) => { let (key, _) = result?; - facet_search_builder - .register_from_key(DelAdd::Addition, key.left_bound.as_ref())?; + facet_search_builder.register_from_key(DelAdd::Addition, key)?; } } } From d97af4d8e6823c67f743b37c2ffdb65deb319445 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 7 Nov 2024 16:04:23 +0100 Subject: [PATCH 201/247] fix field order of JSON documents --- Cargo.lock | 2 ++ crates/meilisearch-types/Cargo.toml | 3 ++- crates/meilisearch-types/src/document_formats.rs | 10 +++++++++- crates/milli/src/update/new/indexer/mod.rs | 2 +- 4 files changed, 14 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b0e5978b5..c3222c7fd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3538,6 +3538,7 @@ version = "1.11.0" dependencies = [ "actix-web", "anyhow", + "bumpalo", "convert_case 0.6.0", "csv", "deserr", @@ -3550,6 +3551,7 @@ dependencies = [ "meili-snap", "memmap2", "milli", + "raw-collections", "roaring", "serde", "serde-cs", diff --git a/crates/meilisearch-types/Cargo.toml b/crates/meilisearch-types/Cargo.toml index 0dae024f2..3bd368e7c 100644 --- a/crates/meilisearch-types/Cargo.toml +++ b/crates/meilisearch-types/Cargo.toml @@ -13,6 +13,7 @@ license.workspace = true [dependencies] actix-web = { version = "4.8.0", default-features = false } anyhow = "1.0.86" +bumpalo = "3.16.0" convert_case = "0.6.0" csv = "1.3.0" deserr = { version = "0.6.2", features = ["actix-web"] } @@ -23,6 +24,7 @@ flate2 = "1.0.30" fst = "0.4.7" memmap2 = "0.9.4" milli = { path = "../milli" } +raw-collections = { git = "https://github.com/dureuill/raw-collections.git", version = "0.1.0" } roaring = { version = "0.10.6", features = ["serde"] } serde = { version = "1.0.204", features = ["derive"] } serde-cs = "0.2.4" @@ -70,4 +72,3 @@ swedish-recomposition = ["milli/swedish-recomposition"] german = ["milli/german"] # allow turkish normalization turkish = ["milli/turkish"] - diff --git a/crates/meilisearch-types/src/document_formats.rs b/crates/meilisearch-types/src/document_formats.rs index b40c4d0b6..db893f880 100644 --- a/crates/meilisearch-types/src/document_formats.rs +++ b/crates/meilisearch-types/src/document_formats.rs @@ -3,13 +3,16 @@ use std::fs::File; use std::io::{self, BufWriter}; use std::marker::PhantomData; +use bumpalo::Bump; use memmap2::Mmap; use milli::documents::Error; use milli::update::new::TopLevelMap; use milli::Object; +use raw_collections::RawMap; use serde::de::{SeqAccess, Visitor}; use serde::{Deserialize, Deserializer}; use serde_json::error::Category; +use serde_json::value::RawValue; use serde_json::{to_writer, Map, Value}; use crate::error::{Code, ErrorCode}; @@ -213,10 +216,15 @@ pub fn 
read_json(input: &File, output: impl io::Write) -> Result { // We memory map to be able to deserailize into a TopLevelMap<'pl> that // does not allocate when possible and only materialize the first/top level. let input = unsafe { Mmap::map(input).map_err(DocumentFormatError::Io)? }; + let mut doc_alloc = Bump::with_capacity(1024 * 1024 * 1024); // 1MiB let mut out = BufWriter::new(output); let mut deserializer = serde_json::Deserializer::from_slice(&input); - let count = match array_each(&mut deserializer, |obj: TopLevelMap| to_writer(&mut out, &obj)) { + let count = match array_each(&mut deserializer, |obj: &RawValue| { + doc_alloc.reset(); + let map = RawMap::from_raw_value(obj, &doc_alloc)?; + to_writer(&mut out, &map) + }) { // The json data has been deserialized and does not need to be processed again. // The data has been transferred to the writer during the deserialization process. Ok(Ok(count)) => count, diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 70ac7f959..3b66c2ec0 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -198,7 +198,7 @@ where document_extractor_data.docids_delta.apply_to(document_ids); } - field_distribution.retain(|_, v| *v == 0); + field_distribution.retain(|_, v| *v != 0); const TEN_GIB: usize = 10 * 1024 * 1024 * 1024; let current_num_threads = rayon::current_num_threads(); From 4706a0eb49e4e1861f3ed37773b17abf33c93067 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 7 Nov 2024 22:35:06 +0100 Subject: [PATCH 202/247] Fix vector parsing --- .../milli/src/update/new/vector_document.rs | 4 +- crates/milli/src/vector/parsed_vectors.rs | 91 ++++++++++++++++++- 2 files changed, 90 insertions(+), 5 deletions(-) diff --git a/crates/milli/src/update/new/vector_document.rs b/crates/milli/src/update/new/vector_document.rs index dc73c5268..4a27361a9 100644 --- a/crates/milli/src/update/new/vector_document.rs +++ b/crates/milli/src/update/new/vector_document.rs @@ -167,7 +167,7 @@ fn entry_from_raw_value( value: &RawValue, has_configured_embedder: bool, ) -> std::result::Result, serde_json::Error> { - let value: RawVectors = serde_json::from_str(value.get())?; + let value: RawVectors = RawVectors::from_raw_value(value)?; Ok(match value { RawVectors::Explicit(raw_explicit_vectors) => VectorEntry { @@ -177,7 +177,7 @@ fn entry_from_raw_value( }, RawVectors::ImplicitlyUserProvided(value) => VectorEntry { has_configured_embedder, - embeddings: Some(Embeddings::FromJsonImplicityUserProvided(value)), + embeddings: value.map(Embeddings::FromJsonImplicityUserProvided), regenerate: false, }, }) diff --git a/crates/milli/src/vector/parsed_vectors.rs b/crates/milli/src/vector/parsed_vectors.rs index 40e823f17..6ae6c1c9e 100644 --- a/crates/milli/src/vector/parsed_vectors.rs +++ b/crates/milli/src/vector/parsed_vectors.rs @@ -12,11 +12,96 @@ use crate::{DocumentId, FieldId, InternalError, UserError}; pub const RESERVED_VECTORS_FIELD_NAME: &str = "_vectors"; -#[derive(serde::Serialize, serde::Deserialize, Debug)] +#[derive(serde::Serialize, Debug)] #[serde(untagged)] pub enum RawVectors<'doc> { Explicit(#[serde(borrow)] RawExplicitVectors<'doc>), - ImplicitlyUserProvided(#[serde(borrow)] &'doc RawValue), + ImplicitlyUserProvided(#[serde(borrow)] Option<&'doc RawValue>), +} + +impl<'doc> RawVectors<'doc> { + pub fn from_raw_value(raw: &'doc RawValue) -> Result { + use serde::de::Deserializer as _; + Ok(match raw.deserialize_any(RawVectorsVisitor)? 
{ + RawVectorsVisitorValue::ImplicitNone => RawVectors::ImplicitlyUserProvided(None), + RawVectorsVisitorValue::Implicit => RawVectors::ImplicitlyUserProvided(Some(raw)), + RawVectorsVisitorValue::Explicit { regenerate, embeddings } => { + RawVectors::Explicit(RawExplicitVectors { embeddings, regenerate }) + } + }) + } +} + +struct RawVectorsVisitor; + +enum RawVectorsVisitorValue<'doc> { + ImplicitNone, + Implicit, + Explicit { regenerate: bool, embeddings: Option<&'doc RawValue> }, +} + +impl<'doc> serde::de::Visitor<'doc> for RawVectorsVisitor { + type Value = RawVectorsVisitorValue<'doc>; + + fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(formatter, "a map containing at least `regenerate`, or an array of floats`") + } + + fn visit_none(self) -> Result + where + E: serde::de::Error, + { + Ok(RawVectorsVisitorValue::ImplicitNone) + } + + fn visit_some(self, deserializer: D) -> Result + where + D: serde::Deserializer<'doc>, + { + deserializer.deserialize_any(self) + } + + fn visit_unit(self) -> Result + where + E: serde::de::Error, + { + Ok(RawVectorsVisitorValue::ImplicitNone) + } + + fn visit_seq(self, mut seq: A) -> Result + where + A: serde::de::SeqAccess<'doc>, + { + // must consume all elements or parsing fails + while let Some(_) = seq.next_element::<&RawValue>()? {} + Ok(RawVectorsVisitorValue::Implicit) + } + + fn visit_map(self, mut map: A) -> Result + where + A: serde::de::MapAccess<'doc>, + { + use serde::de::Error as _; + let mut regenerate = None; + let mut embeddings = None; + while let Some(s) = map.next_key()? { + match s { + "regenerate" => { + let value: bool = map.next_value()?; + regenerate = Some(value); + } + "embeddings" => { + let value: &RawValue = map.next_value()?; + embeddings = Some(value); + } + other => return Err(A::Error::unknown_field(other, &["regenerate", "embeddings"])), + } + } + let Some(regenerate) = regenerate else { + return Err(A::Error::missing_field("regenerate")); + }; + Ok(RawVectorsVisitorValue::Explicit { regenerate, embeddings }) + } } #[derive(serde::Serialize, Debug)] @@ -86,7 +171,7 @@ impl<'doc> RawVectors<'doc> { } pub fn embeddings(&self) -> Option<&'doc RawValue> { match self { - RawVectors::ImplicitlyUserProvided(embeddings) => Some(embeddings), + RawVectors::ImplicitlyUserProvided(embeddings) => *embeddings, RawVectors::Explicit(RawExplicitVectors { embeddings, regenerate: _ }) => *embeddings, } } From 8a314ab81d4998d0a18a7b00a41263b05bed5674 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Fri, 8 Nov 2024 00:05:12 +0100 Subject: [PATCH 203/247] Fix primary key fid order --- Cargo.lock | 1 + crates/index-scheduler/Cargo.toml | 1 + crates/index-scheduler/src/batch.rs | 8 +++++++- crates/milli/src/update/new/indexer/mod.rs | 17 ++++++++++++----- 4 files changed, 21 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c3222c7fd..30b1102b5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2623,6 +2623,7 @@ dependencies = [ "meilisearch-types", "memmap2", "page_size", + "raw-collections", "rayon", "roaring", "serde", diff --git a/crates/index-scheduler/Cargo.toml b/crates/index-scheduler/Cargo.toml index 4a2913083..deaded910 100644 --- a/crates/index-scheduler/Cargo.toml +++ b/crates/index-scheduler/Cargo.toml @@ -22,6 +22,7 @@ flate2 = "1.0.30" meilisearch-auth = { path = "../meilisearch-auth" } meilisearch-types = { path = "../meilisearch-types" } page_size = "0.6.0" +raw-collections = { git = "https://github.com/dureuill/raw-collections.git", version = "0.1.0" } rayon = 
"1.10.0" roaring = { version = "0.10.6", features = ["serde"] } serde = { version = "1.0.204", features = ["derive"] } diff --git a/crates/index-scheduler/src/batch.rs b/crates/index-scheduler/src/batch.rs index bd307b19e..4ae8c7d46 100644 --- a/crates/index-scheduler/src/batch.rs +++ b/crates/index-scheduler/src/batch.rs @@ -43,6 +43,7 @@ use meilisearch_types::milli::{self, Filter}; use meilisearch_types::settings::{apply_settings_to_builder, Settings, Unchecked}; use meilisearch_types::tasks::{Details, IndexSwap, Kind, KindWithContent, Status, Task}; use meilisearch_types::{compression, Index, VERSION_FILE_NAME}; +use raw_collections::RawMap; use roaring::RoaringBitmap; use time::macros::format_description; use time::OffsetDateTime; @@ -1318,7 +1319,12 @@ impl IndexScheduler { index, &mut new_fields_ids_map, primary_key.as_deref(), - first_document.as_ref(), + first_document + .map(|raw| RawMap::from_raw_value(raw, &indexer_alloc)) + .transpose() + .map_err(|error| { + milli::Error::UserError(milli::UserError::SerdeJson(error)) + })?, )? .map_err(milli::Error::from)?; diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 3b66c2ec0..ca61a9b7b 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -12,6 +12,7 @@ use heed::{RoTxn, RwTxn}; use itertools::{merge_join_by, EitherOrBoth}; pub use partial_dump::PartialDump; use rand::SeedableRng as _; +use raw_collections::RawMap; use rayon::ThreadPool; use time::OffsetDateTime; pub use update_by_function::UpdateByFunction; @@ -24,7 +25,7 @@ use super::word_fst_builder::{PrefixData, PrefixDelta, WordFstBuilder}; use super::words_prefix_docids::{ compute_word_prefix_docids, compute_word_prefix_fid_docids, compute_word_prefix_position_docids, }; -use super::{StdResult, TopLevelMap}; +use super::StdResult; use crate::documents::{PrimaryKey, DEFAULT_PRIMARY_KEY}; use crate::facet::FacetType; use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder}; @@ -733,7 +734,7 @@ pub fn retrieve_or_guess_primary_key<'a>( index: &Index, new_fields_ids_map: &mut FieldsIdsMap, primary_key_from_op: Option<&'a str>, - first_document: Option<&'a TopLevelMap<'a>>, + first_document: Option>, ) -> Result, bool), UserError>> { // make sure that we have a declared primary key, either fetching it from the index or attempting to guess it. @@ -769,12 +770,18 @@ pub fn retrieve_or_guess_primary_key<'a>( None => return Ok(Err(UserError::NoPrimaryKeyCandidateFound)), }; - let mut guesses: Vec<&str> = first_document + let guesses: Result> = first_document .keys() - .map(AsRef::as_ref) - .filter(|name| name.to_lowercase().ends_with(DEFAULT_PRIMARY_KEY)) + .filter_map(|name| { + let Some(_) = new_fields_ids_map.insert(name) else { + return Some(Err(UserError::AttributeLimitReached.into())); + }; + name.to_lowercase().ends_with(DEFAULT_PRIMARY_KEY).then_some(Ok(name)) + }) .collect(); + let mut guesses = guesses?; + // sort the keys in lexicographical order, so that fields are always in the same order. 
guesses.sort_unstable(); From 5185aa21b8a5b31bf7b9df61f3b43447e2221faa Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Fri, 8 Nov 2024 00:05:36 +0100 Subject: [PATCH 204/247] Know if your vectors are implicit when writing them back in documents + don't write empty _vectors --- crates/milli/src/update/new/document.rs | 18 +++++++++++++----- crates/milli/src/update/new/vector_document.rs | 4 ++++ 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/crates/milli/src/update/new/document.rs b/crates/milli/src/update/new/document.rs index f43eb63e4..692277597 100644 --- a/crates/milli/src/update/new/document.rs +++ b/crates/milli/src/update/new/document.rs @@ -332,14 +332,22 @@ where } vectors.insert( name, - serde_json::json!({ - "regenerate": entry.regenerate, - // TODO: consider optimizing the shape of embedders here to store an array of f32 rather than a JSON object - "embeddings": entry.embeddings, - }), + if entry.implicit { + serde_json::json!(entry.embeddings) + } else { + serde_json::json!({ + "regenerate": entry.regenerate, + // TODO: consider optimizing the shape of embedders here to store an array of f32 rather than a JSON object + "embeddings": entry.embeddings, + }) + }, ); } + if vectors.is_empty() { + break 'inject_vectors; + } + vectors_value = serde_json::value::to_raw_value(&vectors).unwrap(); unordered_field_buffer.push((vectors_fid, &vectors_value)); } diff --git a/crates/milli/src/update/new/vector_document.rs b/crates/milli/src/update/new/vector_document.rs index 4a27361a9..e96e29053 100644 --- a/crates/milli/src/update/new/vector_document.rs +++ b/crates/milli/src/update/new/vector_document.rs @@ -71,6 +71,7 @@ pub struct VectorEntry<'doc> { pub has_configured_embedder: bool, pub embeddings: Option>, pub regenerate: bool, + pub implicit: bool, } pub trait VectorDocument<'doc> { @@ -125,6 +126,7 @@ impl<'t> VectorDocumentFromDb<'t> { has_configured_embedder: true, embeddings: Some(Embeddings::FromDb(vectors)), regenerate: !config.user_provided.contains(self.docid), + implicit: false, }) } } @@ -174,11 +176,13 @@ fn entry_from_raw_value( has_configured_embedder, embeddings: raw_explicit_vectors.embeddings.map(Embeddings::FromJsonExplicit), regenerate: raw_explicit_vectors.regenerate, + implicit: false, }, RawVectors::ImplicitlyUserProvided(value) => VectorEntry { has_configured_embedder, embeddings: value.map(Embeddings::FromJsonImplicityUserProvided), regenerate: false, + implicit: true, }, }) } From e32677999f0c02e6f3d1165205a74ab7b66d7deb Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Fri, 8 Nov 2024 00:06:33 +0100 Subject: [PATCH 205/247] Adapt some snapshots --- .../lib.rs/import_vectors/Intel to kefir.snap | 4 +- .../import_vectors/adding Intel succeeds.snap | 4 +- .../documents after initial push.snap | 4 +- crates/meilisearch/tests/search/mod.rs | 24 +-- crates/meilisearch/tests/search/multi.rs | 198 +++++++++--------- 5 files changed, 117 insertions(+), 117 deletions(-) diff --git a/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/Intel to kefir.snap b/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/Intel to kefir.snap index e6d0d8232..b8b204935 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/Intel to kefir.snap +++ b/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/Intel to kefir.snap @@ -1,5 +1,5 @@ --- -source: index-scheduler/src/lib.rs +source: crates/index-scheduler/src/lib.rs --- ### Autobatching Enabled = true ### Processing Tasks: @@ -22,7 +22,7 @@ succeeded [0,1,] doggos [0,1,2,] 
---------------------------------------------------------------------- ### Index Mapper: -doggos: { number_of_documents: 1, field_distribution: {"_vectors": 1, "breed": 1, "doggo": 1, "id": 1} } +doggos: { number_of_documents: 1, field_distribution: {"breed": 1, "doggo": 1, "id": 1} } ---------------------------------------------------------------------- ### Canceled By: diff --git a/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/adding Intel succeeds.snap b/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/adding Intel succeeds.snap index bd4cf0c09..cead3f781 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/adding Intel succeeds.snap +++ b/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/adding Intel succeeds.snap @@ -1,5 +1,5 @@ --- -source: index-scheduler/src/lib.rs +source: crates/index-scheduler/src/lib.rs --- ### Autobatching Enabled = true ### Processing Tasks: @@ -21,7 +21,7 @@ succeeded [0,1,] doggos [0,1,] ---------------------------------------------------------------------- ### Index Mapper: -doggos: { number_of_documents: 1, field_distribution: {"_vectors": 1, "breed": 1, "doggo": 1, "id": 1} } +doggos: { number_of_documents: 1, field_distribution: {"breed": 1, "doggo": 1, "id": 1} } ---------------------------------------------------------------------- ### Canceled By: diff --git a/crates/index-scheduler/src/snapshots/lib.rs/import_vectors_first_and_embedder_later/documents after initial push.snap b/crates/index-scheduler/src/snapshots/lib.rs/import_vectors_first_and_embedder_later/documents after initial push.snap index d2473d00a..e06d09464 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/import_vectors_first_and_embedder_later/documents after initial push.snap +++ b/crates/index-scheduler/src/snapshots/lib.rs/import_vectors_first_and_embedder_later/documents after initial push.snap @@ -1,4 +1,4 @@ --- -source: index-scheduler/src/lib.rs +source: crates/index-scheduler/src/lib.rs --- 
-[{"id":0,"doggo":"kefir"},{"id":1,"doggo":"intel","_vectors":{"my_doggo_embedder":[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0],"unknown embedder":[1.0,2.0,3.0]}},{"id":2,"doggo":"max","_vectors":{"my_doggo_embedder":{"embeddings":[2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0],"regenerate":false},"unknown 
embedder":[4.0,5.0]}},{"id":3,"doggo":"marcel","_vectors":{"my_doggo_embedder":{"embeddings":[3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0],"regenerate":true}}},{"id":4,"doggo":"sora","_vectors":{"my_doggo_embedder":{"embeddings":null,"regenerate":true}}}] +[{"id":0,"doggo":"kefir"},{"id":1,"doggo":"intel","_vectors":{"my_doggo_embedder":[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1],"unknown embedder":[1,2,3]}},{"id":2,"doggo":"max","_vectors":{"my_doggo_embedder":{"regenerate":false,"embeddings":[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]},"unknown 
embedder":[4,5]}},{"id":3,"doggo":"marcel","_vectors":{"my_doggo_embedder":{"regenerate":true,"embeddings":[3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]}}},{"id":4,"doggo":"sora","_vectors":{"my_doggo_embedder":{"regenerate":true,"embeddings":null}}}] diff --git a/crates/meilisearch/tests/search/mod.rs b/crates/meilisearch/tests/search/mod.rs index d1091d944..afac667bb 100644 --- a/crates/meilisearch/tests/search/mod.rs +++ b/crates/meilisearch/tests/search/mod.rs @@ -750,9 +750,9 @@ async fn test_score_details() { ], "_vectors": { "manual": [ - -100.0, - 231.0, - 32.0 + -100, + 231, + 32 ] }, "_rankingScoreDetails": { @@ -1543,9 +1543,9 @@ async fn simple_search_with_strange_synonyms() { ], "_vectors": { "manual": [ - -100.0, - 231.0, - 32.0 + -100, + 231, + 32 ] } } @@ -1568,9 +1568,9 @@ async fn simple_search_with_strange_synonyms() { ], "_vectors": { "manual": [ - -100.0, - 231.0, - 32.0 + -100, + 231, + 32 ] } } @@ -1593,9 +1593,9 @@ async fn simple_search_with_strange_synonyms() { ], "_vectors": { "manual": [ - -100.0, - 231.0, - 32.0 + -100, + 231, + 32 ] } } diff --git a/crates/meilisearch/tests/search/multi.rs b/crates/meilisearch/tests/search/multi.rs index eaa1da15f..932751b49 100644 --- a/crates/meilisearch/tests/search/multi.rs +++ b/crates/meilisearch/tests/search/multi.rs @@ -113,9 +113,9 @@ async fn simple_search_single_index() { ], "_vectors": { "manual": [ - -100.0, - 340.0, - 90.0 + -100, + 340, + 90 ] } } @@ -138,9 +138,9 @@ async fn simple_search_single_index() { ], "_vectors": { "manual": [ - 1.0, - 2.0, - 54.0 + 1, + 2, + 54 ] } } @@ -182,9 +182,9 @@ async fn federation_single_search_single_index() { ], "_vectors": { "manual": [ - -100.0, - 340.0, - 90.0 + -100, + 340, + 90 ] }, "_federation": { @@ -305,9 +305,9 @@ async fn federation_two_search_single_index() { ], "_vectors": { "manual": [ - -100.0, - 340.0, - 90.0 + -100, + 340, + 90 ] }, "_federation": { @@ -325,9 +325,9 @@ async fn federation_two_search_single_index() { ], "_vectors": { "manual": [ - 1.0, - 2.0, - 54.0 + 1, + 2, + 54 ] }, "_federation": { @@ -480,9 +480,9 @@ async fn simple_search_two_indexes() { ], "_vectors": { "manual": [ - -100.0, - 340.0, - 90.0 + -100, + 340, + 90 ] } } @@ -513,9 +513,9 @@ async fn simple_search_two_indexes() { "cattos": "pésti", "_vectors": { "manual": [ - 1.0, - 2.0, - 3.0 + 1, + 2, + 3 ] } }, @@ -535,9 +535,9 @@ async fn simple_search_two_indexes() { ], "_vectors": { "manual": [ - 1.0, - 2.0, - 54.0 + 1, + 2, + 54 ] } } @@ -1393,9 +1393,9 @@ async fn federation_sort_same_indexes_same_criterion_same_direction() { "cattos": "pésti", "_vectors": { "manual": [ - 1.0, - 2.0, - 3.0 + 1, + 2, + 3 ] }, "_federation": { @@ -1414,9 +1414,9 @@ async fn federation_sort_same_indexes_same_criterion_same_direction() { ], "_vectors": { "manual": [ - 10.0, - 23.0, - 32.0 + 10, 
+ 23, + 32 ] }, "_federation": { @@ -1442,9 +1442,9 @@ async fn federation_sort_same_indexes_same_criterion_same_direction() { ], "_vectors": { "manual": [ - 1.0, - 2.0, - 54.0 + 1, + 2, + 54 ] }, "_federation": { @@ -1474,9 +1474,9 @@ async fn federation_sort_same_indexes_same_criterion_same_direction() { ], "_vectors": { "manual": [ - 10.0, - 23.0, - 32.0 + 10, + 23, + 32 ] }, "_federation": { @@ -1716,9 +1716,9 @@ async fn federation_sort_same_indexes_different_criterion_same_direction() { "cattos": "pésti", "_vectors": { "manual": [ - 1.0, - 2.0, - 3.0 + 1, + 2, + 3 ] }, "_federation": { @@ -1748,9 +1748,9 @@ async fn federation_sort_same_indexes_different_criterion_same_direction() { ], "_vectors": { "manual": [ - 10.0, - 23.0, - 32.0 + 10, + 23, + 32 ] }, "_federation": { @@ -1769,9 +1769,9 @@ async fn federation_sort_same_indexes_different_criterion_same_direction() { ], "_vectors": { "manual": [ - 10.0, - 23.0, - 32.0 + 10, + 23, + 32 ] }, "_federation": { @@ -1797,9 +1797,9 @@ async fn federation_sort_same_indexes_different_criterion_same_direction() { ], "_vectors": { "manual": [ - 1.0, - 2.0, - 54.0 + 1, + 2, + 54 ] }, "_federation": { @@ -2103,9 +2103,9 @@ async fn federation_sort_different_indexes_same_criterion_same_direction() { ], "_vectors": { "manual": [ - 1.0, - 2.0, - 54.0 + 1, + 2, + 54 ] }, "_federation": { @@ -2124,9 +2124,9 @@ async fn federation_sort_different_indexes_same_criterion_same_direction() { ], "_vectors": { "manual": [ - 10.0, - -23.0, - 32.0 + 10, + -23, + 32 ] }, "_federation": { @@ -2145,9 +2145,9 @@ async fn federation_sort_different_indexes_same_criterion_same_direction() { ], "_vectors": { "manual": [ - -100.0, - 340.0, - 90.0 + -100, + 340, + 90 ] }, "_federation": { @@ -2166,9 +2166,9 @@ async fn federation_sort_different_indexes_same_criterion_same_direction() { ], "_vectors": { "manual": [ - -100.0, - 231.0, - 32.0 + -100, + 231, + 32 ] }, "_federation": { @@ -2187,9 +2187,9 @@ async fn federation_sort_different_indexes_same_criterion_same_direction() { ], "_vectors": { "manual": [ - 1.0, - 2.0, - 3.0 + 1, + 2, + 3 ] }, "_federation": { @@ -2228,9 +2228,9 @@ async fn federation_sort_different_indexes_same_criterion_same_direction() { ], "_vectors": { "manual": [ - 1.0, - 2.0, - 54.0 + 1, + 2, + 54 ] }, "_federation": { @@ -2415,9 +2415,9 @@ async fn federation_sort_different_ranking_rules() { ], "_vectors": { "manual": [ - 1.0, - 2.0, - 54.0 + 1, + 2, + 54 ] }, "_federation": { @@ -2436,9 +2436,9 @@ async fn federation_sort_different_ranking_rules() { ], "_vectors": { "manual": [ - 10.0, - -23.0, - 32.0 + 10, + -23, + 32 ] }, "_federation": { @@ -2457,9 +2457,9 @@ async fn federation_sort_different_ranking_rules() { ], "_vectors": { "manual": [ - -100.0, - 340.0, - 90.0 + -100, + 340, + 90 ] }, "_federation": { @@ -2478,9 +2478,9 @@ async fn federation_sort_different_ranking_rules() { ], "_vectors": { "manual": [ - -100.0, - 231.0, - 32.0 + -100, + 231, + 32 ] }, "_federation": { @@ -2499,9 +2499,9 @@ async fn federation_sort_different_ranking_rules() { ], "_vectors": { "manual": [ - 1.0, - 2.0, - 3.0 + 1, + 2, + 3 ] }, "_federation": { @@ -2716,9 +2716,9 @@ async fn federation_sort_different_indexes_different_criterion_same_direction() ], "_vectors": { "manual": [ - 1.0, - 2.0, - 54.0 + 1, + 2, + 54 ] }, "_federation": { @@ -2757,9 +2757,9 @@ async fn federation_sort_different_indexes_different_criterion_same_direction() ], "_vectors": { "manual": [ - 10.0, - -23.0, - 32.0 + 10, + -23, + 32 ] }, "_federation": { @@ -2778,9 +2778,9 @@ async fn 
federation_sort_different_indexes_different_criterion_same_direction() ], "_vectors": { "manual": [ - -100.0, - 340.0, - 90.0 + -100, + 340, + 90 ] }, "_federation": { @@ -2799,9 +2799,9 @@ async fn federation_sort_different_indexes_different_criterion_same_direction() ], "_vectors": { "manual": [ - -100.0, - 231.0, - 32.0 + -100, + 231, + 32 ] }, "_federation": { @@ -2820,9 +2820,9 @@ async fn federation_sort_different_indexes_different_criterion_same_direction() ], "_vectors": { "manual": [ - 1.0, - 2.0, - 3.0 + 1, + 2, + 3 ] }, "_federation": { @@ -2881,9 +2881,9 @@ async fn federation_sort_different_indexes_different_criterion_same_direction() ], "_vectors": { "manual": [ - 1.0, - 2.0, - 54.0 + 1, + 2, + 54 ] }, "_federation": { From bef8fc6cf14ae3078ec5d0af19fb352637cda443 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Fri, 8 Nov 2024 13:10:17 +0100 Subject: [PATCH 206/247] Fix hf embedder --- crates/milli/src/vector/hf.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/crates/milli/src/vector/hf.rs b/crates/milli/src/vector/hf.rs index ea892ca57..3fe28e53a 100644 --- a/crates/milli/src/vector/hf.rs +++ b/crates/milli/src/vector/hf.rs @@ -183,14 +183,17 @@ impl Embedder { let token_ids = if token_ids.len() > 512 { &token_ids[..512] } else { token_ids }; let token_ids = Tensor::new(token_ids, &self.model.device).map_err(EmbedError::tensor_shape)?; + let token_ids = Tensor::stack(&[token_ids], 0).map_err(EmbedError::tensor_shape)?; let token_type_ids = token_ids.zeros_like().map_err(EmbedError::tensor_shape)?; let embeddings = self.model.forward(&token_ids, &token_type_ids).map_err(EmbedError::model_forward)?; // Apply some avg-pooling by taking the mean embedding value for all tokens (including padding) - let (n_tokens, _hidden_size) = embeddings.dims2().map_err(EmbedError::tensor_shape)?; - let embedding = (embeddings.sum(0).map_err(EmbedError::tensor_value)? / (n_tokens as f64)) + let (_n_sentence, n_tokens, _hidden_size) = + embeddings.dims3().map_err(EmbedError::tensor_shape)?; + let embedding = (embeddings.sum(1).map_err(EmbedError::tensor_value)? 
/ (n_tokens as f64)) .map_err(EmbedError::tensor_shape)?; + let embedding = embedding.squeeze(0).map_err(EmbedError::tensor_shape)?; let embedding: Embedding = embedding.to_vec1().map_err(EmbedError::tensor_shape)?; Ok(embedding) } From 6094bb299ab2ab72087670cb5e28e133b853e280 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 12 Nov 2024 10:15:55 +0100 Subject: [PATCH 207/247] Fix user_provided vectors --- crates/milli/src/update/new/channel.rs | 12 +++---- .../src/update/new/extract/vectors/mod.rs | 13 +++++--- crates/milli/src/update/new/indexer/mod.rs | 33 +++++-------------- 3 files changed, 22 insertions(+), 36 deletions(-) diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index a4896ee3f..5b91ae77f 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -2,12 +2,11 @@ use std::marker::PhantomData; use std::sync::atomic::{AtomicUsize, Ordering}; use crossbeam_channel::{IntoIter, Receiver, SendError, Sender}; -use hashbrown::HashMap; use heed::types::Bytes; -use roaring::RoaringBitmap; use super::extract::FacetKind; use super::StdResult; +use crate::index::IndexEmbeddingConfig; use crate::update::new::KvReaderFieldId; use crate::vector::Embedding; use crate::{DocumentId, Index}; @@ -87,7 +86,7 @@ pub enum ArroyOperation { embedding: Embedding, }, Finish { - user_provided: HashMap, + configs: Vec, }, } @@ -418,12 +417,9 @@ impl EmbeddingSender<'_> { } /// Marks all embedders as "to be built" - pub fn finish( - self, - user_provided: HashMap, - ) -> StdResult<(), SendError<()>> { + pub fn finish(self, configs: Vec) -> StdResult<(), SendError<()>> { self.0 - .send(WriterOperation::ArroyOperation(ArroyOperation::Finish { user_provided })) + .send(WriterOperation::ArroyOperation(ArroyOperation::Finish { configs })) .map_err(|_| SendError(())) } } diff --git a/crates/milli/src/update/new/extract/vectors/mod.rs b/crates/milli/src/update/new/extract/vectors/mod.rs index 55121fb14..df8e2ed09 100644 --- a/crates/milli/src/update/new/extract/vectors/mod.rs +++ b/crates/milli/src/update/new/extract/vectors/mod.rs @@ -85,8 +85,13 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { for change in changes { let change = change?; match change { - DocumentChange::Deletion(_deletion) => { - // handled by document sender + DocumentChange::Deletion(deletion) => { + // vector deletion is handled by document sender, + // we still need to accomodate deletion from user_provided + for chunks in &mut all_chunks { + // regenerate: true means we delete from user_provided + chunks.set_regenerate(deletion.docid(), true); + } } DocumentChange::Update(update) => { let old_vectors = update.current_vectors( @@ -423,9 +428,9 @@ impl<'a, 'extractor> Chunks<'a, 'extractor> { let user_provided = user_provided.0.entry_ref(self.embedder_name).or_default(); if regenerate { // regenerate == !user_provided - user_provided.del.get_or_insert(Default::default()).insert(docid); + user_provided.insert_del_u32(docid); } else { - user_provided.add.get_or_insert(Default::default()).insert(docid); + user_provided.insert_add_u32(docid); } } diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index ca61a9b7b..6d1d0eea8 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -342,35 +342,28 @@ where let span = tracing::trace_span!(target: "indexing::documents::extract", "vectors"); let _entered = span.enter(); - let 
index_embeddings = index.embedding_configs(&rtxn)?; + let mut index_embeddings = index.embedding_configs(&rtxn)?; if index_embeddings.is_empty() { break 'vectors; } let embedding_sender = extractor_sender.embeddings(); let extractor = EmbeddingExtractor::new(embedders, &embedding_sender, field_distribution, request_threads()); - let datastore = ThreadLocal::with_capacity(pool.current_num_threads()); + let mut datastore = ThreadLocal::with_capacity(pool.current_num_threads()); let (finished_steps, step_name) = steps::extract_embeddings(); extract(document_changes, &extractor, indexing_context, &mut extractor_allocs, &datastore, finished_steps, total_steps, step_name)?; - - let mut user_provided = HashMap::new(); - for data in datastore { - let data = data.into_inner().0; - for (embedder, deladd) in data.into_iter() { - let user_provided = user_provided.entry(embedder).or_insert(Default::default()); - if let Some(del) = deladd.del { - *user_provided -= del; - } - if let Some(add) = deladd.add { - *user_provided |= add; - } + for config in &mut index_embeddings { + 'data: for data in datastore.iter_mut() { + let data = &mut data.get_mut().0; + let Some(deladd) = data.remove(&config.name) else { continue 'data; }; + deladd.apply_to(&mut config.user_provided); } } - embedding_sender.finish(user_provided).unwrap(); + embedding_sender.finish(index_embeddings).unwrap(); } // TODO THIS IS TOO MUCH @@ -472,7 +465,7 @@ where writer.del_items(wtxn, *dimensions, docid)?; writer.add_item(wtxn, docid, &embedding)?; } - ArroyOperation::Finish { mut user_provided } => { + ArroyOperation::Finish { configs } => { let span = tracing::trace_span!(target: "indexing::vectors", parent: &indexer_span, "build"); let _entered = span.enter(); @@ -497,14 +490,6 @@ where )?; } - let mut configs = index.embedding_configs(wtxn)?; - - for config in &mut configs { - if let Some(user_provided) = user_provided.remove(&config.name) { - config.user_provided = user_provided; - } - } - index.put_embedding_configs(wtxn, configs)?; } }, From 1fcd5f091ec158e8f0bb2c2942d727ac0506c89b Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 12 Nov 2024 12:23:13 +0100 Subject: [PATCH 208/247] Remove progress from task --- crates/index-scheduler/src/insta_snapshot.rs | 1 - crates/index-scheduler/src/lib.rs | 10 +--------- crates/index-scheduler/src/utils.rs | 2 -- crates/meilisearch-types/src/task_view.rs | 7 +------ crates/meilisearch-types/src/tasks.rs | 2 -- 5 files changed, 2 insertions(+), 20 deletions(-) diff --git a/crates/index-scheduler/src/insta_snapshot.rs b/crates/index-scheduler/src/insta_snapshot.rs index f63a289eb..f295e35b6 100644 --- a/crates/index-scheduler/src/insta_snapshot.rs +++ b/crates/index-scheduler/src/insta_snapshot.rs @@ -148,7 +148,6 @@ pub fn snapshot_task(task: &Task) -> String { enqueued_at: _, started_at: _, finished_at: _, - progress: _, error, canceled_by, details, diff --git a/crates/index-scheduler/src/lib.rs b/crates/index-scheduler/src/lib.rs index 9e78d4b48..b57a0fe9f 100644 --- a/crates/index-scheduler/src/lib.rs +++ b/crates/index-scheduler/src/lib.rs @@ -978,12 +978,7 @@ impl IndexScheduler { Ok(( ret.map(|task| { if processing.contains(task.uid) { - Task { - status: Status::Processing, - progress: progress.clone(), - started_at: Some(started_at), - ..task - } + Task { status: Status::Processing, started_at: Some(started_at), ..task } } else { task } @@ -1025,7 +1020,6 @@ impl IndexScheduler { enqueued_at: OffsetDateTime::now_utc(), started_at: None, finished_at: None, - progress: None, error: 
None, canceled_by: None, details: kind.default_details(), @@ -1606,8 +1600,6 @@ impl<'a> Dump<'a> { enqueued_at: task.enqueued_at, started_at: task.started_at, finished_at: task.finished_at, - /// FIXME: should we update dump to contain progress information? 🤔 - progress: None, error: task.error, canceled_by: task.canceled_by, details: task.details, diff --git a/crates/index-scheduler/src/utils.rs b/crates/index-scheduler/src/utils.rs index 7ae419495..788a70fb8 100644 --- a/crates/index-scheduler/src/utils.rs +++ b/crates/index-scheduler/src/utils.rs @@ -345,8 +345,6 @@ impl IndexScheduler { enqueued_at, started_at, finished_at, - /// FIXME: assert something here? ask tamo 🤔 - progress: _, error: _, canceled_by, details, diff --git a/crates/meilisearch-types/src/task_view.rs b/crates/meilisearch-types/src/task_view.rs index fd9367bf4..3075fa899 100644 --- a/crates/meilisearch-types/src/task_view.rs +++ b/crates/meilisearch-types/src/task_view.rs @@ -4,9 +4,7 @@ use time::{Duration, OffsetDateTime}; use crate::error::ResponseError; use crate::settings::{Settings, Unchecked}; -use crate::tasks::{ - serialize_duration, Details, IndexSwap, Kind, Status, Task, TaskId, TaskProgress, -}; +use crate::tasks::{serialize_duration, Details, IndexSwap, Kind, Status, Task, TaskId}; #[derive(Debug, Clone, PartialEq, Eq, Serialize)] #[serde(rename_all = "camelCase")] @@ -29,8 +27,6 @@ pub struct TaskView { pub started_at: Option, #[serde(with = "time::serde::rfc3339::option", default)] pub finished_at: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub progress: Option, } impl TaskView { @@ -47,7 +43,6 @@ impl TaskView { enqueued_at: task.enqueued_at, started_at: task.started_at, finished_at: task.finished_at, - progress: task.progress.clone(), } } } diff --git a/crates/meilisearch-types/src/tasks.rs b/crates/meilisearch-types/src/tasks.rs index 56d839432..70e6ad294 100644 --- a/crates/meilisearch-types/src/tasks.rs +++ b/crates/meilisearch-types/src/tasks.rs @@ -31,8 +31,6 @@ pub struct Task { #[serde(with = "time::serde::rfc3339::option")] pub finished_at: Option, - pub progress: Option, - pub error: Option, pub canceled_by: Option, pub details: Option
, From 980921e078f05b35c9907a38843d406b4851f95c Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 12 Nov 2024 16:31:22 +0100 Subject: [PATCH 209/247] Vector fixes --- crates/milli/src/update/new/extract/vectors/mod.rs | 7 ++----- crates/milli/src/update/new/vector_document.rs | 8 +++++++- crates/milli/src/vector/ollama.rs | 2 +- crates/milli/src/vector/openai.rs | 2 +- crates/milli/src/vector/rest.rs | 2 +- 5 files changed, 12 insertions(+), 9 deletions(-) diff --git a/crates/milli/src/update/new/extract/vectors/mod.rs b/crates/milli/src/update/new/extract/vectors/mod.rs index df8e2ed09..514791a65 100644 --- a/crates/milli/src/update/new/extract/vectors/mod.rs +++ b/crates/milli/src/update/new/extract/vectors/mod.rs @@ -115,11 +115,8 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { new_vectors.vectors_for_key(embedder_name).transpose() }) { let new_vectors = new_vectors?; - match (old_vectors.regenerate, new_vectors.regenerate) { - (true, true) | (false, false) => todo!(), - _ => { - chunks.set_regenerate(update.docid(), new_vectors.regenerate); - } + if old_vectors.regenerate != new_vectors.regenerate { + chunks.set_regenerate(update.docid(), new_vectors.regenerate); } // do we have set embeddings? if let Some(embeddings) = new_vectors.embeddings { diff --git a/crates/milli/src/update/new/vector_document.rs b/crates/milli/src/update/new/vector_document.rs index e96e29053..381c4dab6 100644 --- a/crates/milli/src/update/new/vector_document.rs +++ b/crates/milli/src/update/new/vector_document.rs @@ -180,7 +180,13 @@ fn entry_from_raw_value( }, RawVectors::ImplicitlyUserProvided(value) => VectorEntry { has_configured_embedder, - embeddings: value.map(Embeddings::FromJsonImplicityUserProvided), + // implicitly user provided always provide embeddings + // `None` here means that there are no embeddings + embeddings: Some( + value + .map(Embeddings::FromJsonImplicityUserProvided) + .unwrap_or(Embeddings::FromDb(Default::default())), + ), regenerate: false, implicit: true, }, diff --git a/crates/milli/src/vector/ollama.rs b/crates/milli/src/vector/ollama.rs index 65fd05416..263d9d3c9 100644 --- a/crates/milli/src/vector/ollama.rs +++ b/crates/milli/src/vector/ollama.rs @@ -113,7 +113,7 @@ impl Embedder { threads .install(move || { let embeddings: Result>, _> = texts - .par_chunks(self.chunk_count_hint()) + .par_chunks(self.prompt_count_in_chunk_hint()) .map(move |chunk| self.embed(chunk)) .collect(); diff --git a/crates/milli/src/vector/openai.rs b/crates/milli/src/vector/openai.rs index 466fd1660..375b2878a 100644 --- a/crates/milli/src/vector/openai.rs +++ b/crates/milli/src/vector/openai.rs @@ -266,7 +266,7 @@ impl Embedder { threads .install(move || { let embeddings: Result>, _> = texts - .par_chunks(self.chunk_count_hint()) + .par_chunks(self.prompt_count_in_chunk_hint()) .map(move |chunk| self.embed(chunk)) .collect(); diff --git a/crates/milli/src/vector/rest.rs b/crates/milli/src/vector/rest.rs index dc2ab95f9..eeb5b16af 100644 --- a/crates/milli/src/vector/rest.rs +++ b/crates/milli/src/vector/rest.rs @@ -193,7 +193,7 @@ impl Embedder { threads .install(move || { let embeddings: Result>, _> = texts - .par_chunks(self.chunk_count_hint()) + .par_chunks(self.prompt_count_in_chunk_hint()) .map(move |chunk| self.embed_ref(chunk)) .collect(); From 68bbf674c9fe3641b33b867a6f43abf95c7fbe07 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 12 Nov 2024 16:31:31 +0100 Subject: [PATCH 210/247] Make REST mock thread independent --- 
crates/meilisearch/tests/vector/rest.rs | 118 +++++++++++++++++------- 1 file changed, 83 insertions(+), 35 deletions(-) diff --git a/crates/meilisearch/tests/vector/rest.rs b/crates/meilisearch/tests/vector/rest.rs index 2748d0846..09188595c 100644 --- a/crates/meilisearch/tests/vector/rest.rs +++ b/crates/meilisearch/tests/vector/rest.rs @@ -1,5 +1,4 @@ use std::collections::BTreeMap; -use std::sync::atomic::{AtomicUsize, Ordering}; use meili_snap::{json_string, snapshot}; use reqwest::IntoUrl; @@ -13,13 +12,22 @@ use crate::vector::{get_server_vector, GetAllDocumentsOptions}; async fn create_mock() -> (MockServer, Value) { let mock_server = MockServer::start().await; - let counter = AtomicUsize::new(0); + let text_to_embedding: BTreeMap<_, _> = vec![ + // text -> embedding + ("kefir", [0.0, 0.0, 0.0]), + ("intel", [1.0, 1.0, 1.0]), + ] + // turn into btree + .into_iter() + .collect(); Mock::given(method("POST")) .and(path("/")) - .respond_with(move |_req: &Request| { - let counter = counter.fetch_add(1, Ordering::Relaxed); - ResponseTemplate::new(200).set_body_json(json!({ "data": vec![counter; 3] })) + .respond_with(move |req: &Request| { + let text: String = req.body_json().unwrap(); + ResponseTemplate::new(200).set_body_json( + json!({ "data": text_to_embedding.get(text.as_str()).unwrap_or(&[99., 99., 99.]) }), + ) }) .mount(&mock_server) .await; @@ -32,13 +40,14 @@ async fn create_mock() -> (MockServer, Value) { "request": "{{text}}", "response": { "data": "{{embedding}}" - } + }, + "documentTemplate": "{{doc.name}}", }); (mock_server, embedder_settings) } -async fn create_mock_map() -> (MockServer, Value) { +async fn create_mock_default_template() -> (MockServer, Value) { let mock_server = MockServer::start().await; let text_to_embedding: BTreeMap<_, _> = vec![ @@ -97,7 +106,14 @@ struct SingleResponse { async fn create_mock_multiple() -> (MockServer, Value) { let mock_server = MockServer::start().await; - let counter = AtomicUsize::new(0); + let text_to_embedding: BTreeMap<_, _> = vec![ + // text -> embedding + ("kefir", [0.0, 0.0, 0.0]), + ("intel", [1.0, 1.0, 1.0]), + ] + // turn into btree + .into_iter() + .collect(); Mock::given(method("POST")) .and(path("/")) @@ -115,8 +131,11 @@ async fn create_mock_multiple() -> (MockServer, Value) { .input .into_iter() .map(|text| SingleResponse { + embedding: text_to_embedding + .get(text.as_str()) + .unwrap_or(&[99., 99., 99.]) + .to_vec(), text, - embedding: vec![counter.fetch_add(1, Ordering::Relaxed) as f32; 3], }) .collect(); @@ -142,7 +161,8 @@ async fn create_mock_multiple() -> (MockServer, Value) { }, "{{..}}" ] - } + }, + "documentTemplate": "{{doc.name}}" }); (mock_server, embedder_settings) @@ -156,7 +176,14 @@ struct SingleRequest { async fn create_mock_single_response_in_array() -> (MockServer, Value) { let mock_server = MockServer::start().await; - let counter = AtomicUsize::new(0); + let text_to_embedding: BTreeMap<_, _> = vec![ + // text -> embedding + ("kefir", [0.0, 0.0, 0.0]), + ("intel", [1.0, 1.0, 1.0]), + ] + // turn into btree + .into_iter() + .collect(); Mock::given(method("POST")) .and(path("/")) @@ -171,8 +198,11 @@ async fn create_mock_single_response_in_array() -> (MockServer, Value) { }; let output = vec![SingleResponse { + embedding: text_to_embedding + .get(req.input.as_str()) + .unwrap_or(&[99., 99., 99.]) + .to_vec(), text: req.input, - embedding: vec![counter.fetch_add(1, Ordering::Relaxed) as f32; 3], }]; let response = MultipleResponse { output }; @@ -196,7 +226,8 @@ async fn 
create_mock_single_response_in_array() -> (MockServer, Value) { "embedding": "{{embedding}}" } ] - } + }, + "documentTemplate": "{{doc.name}}" }); (mock_server, embedder_settings) @@ -205,7 +236,14 @@ async fn create_mock_single_response_in_array() -> (MockServer, Value) { async fn create_mock_raw_with_custom_header() -> (MockServer, Value) { let mock_server = MockServer::start().await; - let counter = AtomicUsize::new(0); + let text_to_embedding: BTreeMap<_, _> = vec![ + // text -> embedding + ("kefir", [0.0, 0.0, 0.0]), + ("intel", [1.0, 1.0, 1.0]), + ] + // turn into btree + .into_iter() + .collect(); Mock::given(method("POST")) .and(path("/")) @@ -223,7 +261,7 @@ async fn create_mock_raw_with_custom_header() -> (MockServer, Value) { } } - let _req: String = match req.body_json() { + let req: String = match req.body_json() { Ok(req) => req, Err(error) => { return ResponseTemplate::new(400).set_body_json(json!({ @@ -232,7 +270,7 @@ async fn create_mock_raw_with_custom_header() -> (MockServer, Value) { } }; - let output = vec![counter.fetch_add(1, Ordering::Relaxed) as f32; 3]; + let output = text_to_embedding.get(req.as_str()).unwrap_or(&[99., 99., 99.]).to_vec(); ResponseTemplate::new(200).set_body_json(output) }) @@ -245,7 +283,8 @@ async fn create_mock_raw_with_custom_header() -> (MockServer, Value) { "url": url, "request": "{{text}}", "response": "{{embedding}}", - "headers": {"my-nonstandard-auth": "bearer of the ring"} + "headers": {"my-nonstandard-auth": "bearer of the ring"}, + "documentTemplate": "{{doc.name}}" }); (mock_server, embedder_settings) @@ -254,12 +293,19 @@ async fn create_mock_raw_with_custom_header() -> (MockServer, Value) { async fn create_mock_raw() -> (MockServer, Value) { let mock_server = MockServer::start().await; - let counter = AtomicUsize::new(0); + let text_to_embedding: BTreeMap<_, _> = vec![ + // text -> embedding + ("kefir", [0.0, 0.0, 0.0]), + ("intel", [1.0, 1.0, 1.0]), + ] + // turn into btree + .into_iter() + .collect(); Mock::given(method("POST")) .and(path("/")) .respond_with(move |req: &Request| { - let _req: String = match req.body_json() { + let req: String = match req.body_json() { Ok(req) => req, Err(error) => { return ResponseTemplate::new(400).set_body_json(json!({ @@ -268,7 +314,7 @@ async fn create_mock_raw() -> (MockServer, Value) { } }; - let output = vec![counter.fetch_add(1, Ordering::Relaxed) as f32; 3]; + let output = text_to_embedding.get(req.as_str()).unwrap_or(&[99., 99., 99.]).to_vec(); ResponseTemplate::new(200).set_body_json(output) }) @@ -281,29 +327,30 @@ async fn create_mock_raw() -> (MockServer, Value) { "url": url, "dimensions": 3, "request": "{{text}}", - "response": "{{embedding}}" + "response": "{{embedding}}", + "documentTemplate": "{{doc.name}}" }); (mock_server, embedder_settings) } -pub async fn post(url: T) -> reqwest::Result { - reqwest::Client::builder().build()?.post(url).send().await +pub async fn post(url: T, text: &str) -> reqwest::Result { + reqwest::Client::builder().build()?.post(url).json(&json!(text)).send().await } #[actix_rt::test] async fn dummy_testing_the_mock() { let (mock, _setting) = create_mock().await; - let body = post(&mock.uri()).await.unwrap().text().await.unwrap(); - snapshot!(body, @r###"{"data":[0,0,0]}"###); - let body = post(&mock.uri()).await.unwrap().text().await.unwrap(); - snapshot!(body, @r###"{"data":[1,1,1]}"###); - let body = post(&mock.uri()).await.unwrap().text().await.unwrap(); - snapshot!(body, @r###"{"data":[2,2,2]}"###); - let body = 
post(&mock.uri()).await.unwrap().text().await.unwrap(); - snapshot!(body, @r###"{"data":[3,3,3]}"###); - let body = post(&mock.uri()).await.unwrap().text().await.unwrap(); - snapshot!(body, @r###"{"data":[4,4,4]}"###); + let body = post(&mock.uri(), "kefir").await.unwrap().text().await.unwrap(); + snapshot!(body, @r###"{"data":[0.0,0.0,0.0]}"###); + let body = post(&mock.uri(), "intel").await.unwrap().text().await.unwrap(); + snapshot!(body, @r###"{"data":[1.0,1.0,1.0]}"###); + let body = post(&mock.uri(), "kefir").await.unwrap().text().await.unwrap(); + snapshot!(body, @r###"{"data":[0.0,0.0,0.0]}"###); + let body = post(&mock.uri(), "kefir").await.unwrap().text().await.unwrap(); + snapshot!(body, @r###"{"data":[0.0,0.0,0.0]}"###); + let body = post(&mock.uri(), "intel").await.unwrap().text().await.unwrap(); + snapshot!(body, @r###"{"data":[1.0,1.0,1.0]}"###); } #[actix_rt::test] @@ -953,7 +1000,7 @@ async fn bad_settings() { let (response, code) = index .update_settings(json!({ "embedders": { - "rest": json!({ "source": "rest", "url": mock.uri(), "request": "{{text}}", "response": { "data": "{{embedding}}" }, "dimensions": 2 }), + "rest": json!({ "source": "rest", "url": mock.uri(), "request": "{{text}}", "response": { "data": "{{embedding}}" }, "dimensions": 2, "documentTemplate": "{{doc.name}}" }), }, })) .await; @@ -1920,6 +1967,7 @@ async fn server_custom_header() { "embedders": { "rest": { "source": "rest", + "documentTemplate": "{{doc.name}}", "url": "[url]", "request": "{{text}}", "response": "{{embedding}}", @@ -1940,7 +1988,7 @@ async fn server_custom_header() { #[actix_rt::test] async fn searchable_reindex() { - let (_mock, setting) = create_mock_map().await; + let (_mock, setting) = create_mock_default_template().await; let server = get_server_vector().await; let index = server.index("doggo"); From 8a6e61c77f15413bdb2f6244bf67c763c84116f9 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 12 Nov 2024 22:47:57 +0100 Subject: [PATCH 211/247] InvalidVectorsEmbedderConf error takes a String rather than a deserr error --- crates/milli/src/error.rs | 2 +- crates/milli/src/vector/parsed_vectors.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/milli/src/error.rs b/crates/milli/src/error.rs index 53d9827ac..3a9c81e10 100644 --- a/crates/milli/src/error.rs +++ b/crates/milli/src/error.rs @@ -122,7 +122,7 @@ and can not be more than 512 bytes.", .document_id.to_string() #[error("The `_vectors` field in the document with id: `{document_id}` is not an object. Was expecting an object with a key for each embedder with manually provided vectors, but instead got `{value}`")] InvalidVectorsMapType { document_id: String, value: Value }, #[error("Bad embedder configuration in the document with id: `{document_id}`. 
{error}")] - InvalidVectorsEmbedderConf { document_id: String, error: deserr::errors::JsonError }, + InvalidVectorsEmbedderConf { document_id: String, error: String }, #[error("{0}")] InvalidFilter(String), #[error("Invalid type for filter subexpression: expected: {}, found: {1}.", .0.join(", "))] diff --git a/crates/milli/src/vector/parsed_vectors.rs b/crates/milli/src/vector/parsed_vectors.rs index 6ae6c1c9e..5f8b30f1f 100644 --- a/crates/milli/src/vector/parsed_vectors.rs +++ b/crates/milli/src/vector/parsed_vectors.rs @@ -343,7 +343,7 @@ impl Error { Error::InvalidEmbedderConf { error } => { crate::Error::UserError(UserError::InvalidVectorsEmbedderConf { document_id, - error, + error: error.to_string(), }) } Error::InternalSerdeJson(error) => { From c4e9f761e95f900a8d6648c34cdea7668b4acb6b Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 12 Nov 2024 22:49:22 +0100 Subject: [PATCH 212/247] Emit better error messages when parsing vectors --- .../milli/src/update/new/document_change.rs | 20 +- .../src/update/new/extract/vectors/mod.rs | 4 +- .../milli/src/update/new/vector_document.rs | 69 +++++-- crates/milli/src/vector/parsed_vectors.rs | 186 ++++++++++++++++-- 4 files changed, 242 insertions(+), 37 deletions(-) diff --git a/crates/milli/src/update/new/document_change.rs b/crates/milli/src/update/new/document_change.rs index 4a61c110d..899655db1 100644 --- a/crates/milli/src/update/new/document_change.rs +++ b/crates/milli/src/update/new/document_change.rs @@ -97,7 +97,7 @@ impl<'doc> Insertion<'doc> { doc_alloc: &'doc Bump, embedders: &'doc EmbeddingConfigs, ) -> Result>> { - VectorDocumentFromVersions::new(&self.new, doc_alloc, embedders) + VectorDocumentFromVersions::new(self.external_document_id, &self.new, doc_alloc, embedders) } } @@ -169,7 +169,7 @@ impl<'doc> Update<'doc> { doc_alloc: &'doc Bump, embedders: &'doc EmbeddingConfigs, ) -> Result>> { - VectorDocumentFromVersions::new(&self.new, doc_alloc, embedders) + VectorDocumentFromVersions::new(self.external_document_id, &self.new, doc_alloc, embedders) } pub fn merged_vectors( @@ -181,10 +181,22 @@ impl<'doc> Update<'doc> { embedders: &'doc EmbeddingConfigs, ) -> Result>> { if self.has_deletion { - MergedVectorDocument::without_db(&self.new, doc_alloc, embedders) + MergedVectorDocument::without_db( + self.external_document_id, + &self.new, + doc_alloc, + embedders, + ) } else { MergedVectorDocument::with_db( - self.docid, index, rtxn, mapper, &self.new, doc_alloc, embedders, + self.docid, + self.external_document_id, + index, + rtxn, + mapper, + &self.new, + doc_alloc, + embedders, ) } } diff --git a/crates/milli/src/update/new/extract/vectors/mod.rs b/crates/milli/src/update/new/extract/vectors/mod.rs index 514791a65..efb02b2ab 100644 --- a/crates/milli/src/update/new/extract/vectors/mod.rs +++ b/crates/milli/src/update/new/extract/vectors/mod.rs @@ -126,7 +126,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { .into_vec(&context.doc_alloc, embedder_name) .map_err(|error| UserError::InvalidVectorsEmbedderConf { document_id: update.external_document_id().to_string(), - error, + error: error.to_string(), })?, ); } else if new_vectors.regenerate { @@ -210,7 +210,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { document_id: insertion .external_document_id() .to_string(), - error, + error: error.to_string(), })?, ); } else if new_vectors.regenerate { diff --git a/crates/milli/src/update/new/vector_document.rs b/crates/milli/src/update/new/vector_document.rs index 
381c4dab6..736456f0f 100644 --- a/crates/milli/src/update/new/vector_document.rs +++ b/crates/milli/src/update/new/vector_document.rs @@ -12,7 +12,7 @@ use super::indexer::de::DeserrRawValue; use crate::documents::FieldIdMapper; use crate::index::IndexEmbeddingConfig; use crate::vector::parsed_vectors::{ - RawVectors, VectorOrArrayOfVectors, RESERVED_VECTORS_FIELD_NAME, + RawVectors, RawVectorsError, VectorOrArrayOfVectors, RESERVED_VECTORS_FIELD_NAME, }; use crate::vector::{ArroyWrapper, Embedding, EmbeddingConfigs}; use crate::{DocumentId, Index, InternalError, Result, UserError}; @@ -143,7 +143,14 @@ impl<'t> VectorDocument<'t> for VectorDocumentFromDb<'t> { Ok((&*config_name, entry)) }) .chain(self.vectors_field.iter().flat_map(|map| map.iter()).map(|(name, value)| { - Ok((name, entry_from_raw_value(value, false).map_err(InternalError::SerdeJson)?)) + Ok(( + name, + entry_from_raw_value(value, false).map_err(|_| { + InternalError::Serialization(crate::SerializationError::Decoding { + db_name: Some(crate::index::db_name::VECTOR_ARROY), + }) + })?, + )) })) } @@ -155,20 +162,38 @@ impl<'t> VectorDocument<'t> for VectorDocumentFromDb<'t> { Some(self.entry_from_db(embedder_id, config)?) } None => match self.vectors_field.as_ref().and_then(|obkv| obkv.get(key)) { - Some(embedding_from_doc) => Some( - entry_from_raw_value(embedding_from_doc, false) - .map_err(InternalError::SerdeJson)?, - ), + Some(embedding_from_doc) => { + Some(entry_from_raw_value(embedding_from_doc, false).map_err(|_| { + InternalError::Serialization(crate::SerializationError::Decoding { + db_name: Some(crate::index::db_name::VECTOR_ARROY), + }) + })?) + } None => None, }, }) } } +fn entry_from_raw_value_user<'doc>( + external_docid: &str, + embedder_name: &str, + value: &'doc RawValue, + has_configured_embedder: bool, +) -> Result> { + entry_from_raw_value(value, has_configured_embedder).map_err(|error| { + UserError::InvalidVectorsEmbedderConf { + document_id: external_docid.to_string(), + error: error.msg(embedder_name), + } + .into() + }) +} + fn entry_from_raw_value( value: &RawValue, has_configured_embedder: bool, -) -> std::result::Result, serde_json::Error> { +) -> std::result::Result, RawVectorsError> { let value: RawVectors = RawVectors::from_raw_value(value)?; Ok(match value { @@ -194,12 +219,14 @@ fn entry_from_raw_value( } pub struct VectorDocumentFromVersions<'doc> { + external_document_id: &'doc str, vectors: RawMap<'doc>, embedders: &'doc EmbeddingConfigs, } impl<'doc> VectorDocumentFromVersions<'doc> { pub fn new( + external_document_id: &'doc str, versions: &Versions<'doc>, bump: &'doc Bump, embedders: &'doc EmbeddingConfigs, @@ -208,7 +235,7 @@ impl<'doc> VectorDocumentFromVersions<'doc> { if let Some(vectors_field) = document.vectors_field()? 
{ let vectors = RawMap::from_raw_value(vectors_field, bump).map_err(UserError::SerdeJson)?; - Ok(Some(Self { vectors, embedders })) + Ok(Some(Self { external_document_id, vectors, embedders })) } else { Ok(None) } @@ -218,16 +245,24 @@ impl<'doc> VectorDocumentFromVersions<'doc> { impl<'doc> VectorDocument<'doc> for VectorDocumentFromVersions<'doc> { fn iter_vectors(&self) -> impl Iterator)>> { self.vectors.iter().map(|(embedder, vectors)| { - let vectors = entry_from_raw_value(vectors, self.embedders.contains(embedder)) - .map_err(UserError::SerdeJson)?; + let vectors = entry_from_raw_value_user( + self.external_document_id, + embedder, + vectors, + self.embedders.contains(embedder), + )?; Ok((embedder, vectors)) }) } fn vectors_for_key(&self, key: &str) -> Result>> { let Some(vectors) = self.vectors.get(key) else { return Ok(None) }; - let vectors = entry_from_raw_value(vectors, self.embedders.contains(key)) - .map_err(UserError::SerdeJson)?; + let vectors = entry_from_raw_value_user( + self.external_document_id, + key, + vectors, + self.embedders.contains(key), + )?; Ok(Some(vectors)) } } @@ -238,8 +273,10 @@ pub struct MergedVectorDocument<'doc> { } impl<'doc> MergedVectorDocument<'doc> { + #[allow(clippy::too_many_arguments)] pub fn with_db( docid: DocumentId, + external_document_id: &'doc str, index: &'doc Index, rtxn: &'doc RoTxn, db_fields_ids_map: &'doc Mapper, @@ -248,16 +285,20 @@ impl<'doc> MergedVectorDocument<'doc> { embedders: &'doc EmbeddingConfigs, ) -> Result> { let db = VectorDocumentFromDb::new(docid, index, rtxn, db_fields_ids_map, doc_alloc)?; - let new_doc = VectorDocumentFromVersions::new(versions, doc_alloc, embedders)?; + let new_doc = + VectorDocumentFromVersions::new(&external_document_id, versions, doc_alloc, embedders)?; Ok(if db.is_none() && new_doc.is_none() { None } else { Some(Self { new_doc, db }) }) } pub fn without_db( + external_document_id: &'doc str, versions: &Versions<'doc>, doc_alloc: &'doc Bump, embedders: &'doc EmbeddingConfigs, ) -> Result> { - let Some(new_doc) = VectorDocumentFromVersions::new(versions, doc_alloc, embedders)? else { + let Some(new_doc) = + VectorDocumentFromVersions::new(external_document_id, versions, doc_alloc, embedders)? 
+ else { return Ok(None); }; Ok(Some(Self { new_doc: Some(new_doc), db: None })) diff --git a/crates/milli/src/vector/parsed_vectors.rs b/crates/milli/src/vector/parsed_vectors.rs index 5f8b30f1f..a45729abd 100644 --- a/crates/milli/src/vector/parsed_vectors.rs +++ b/crates/milli/src/vector/parsed_vectors.rs @@ -19,10 +19,54 @@ pub enum RawVectors<'doc> { ImplicitlyUserProvided(#[serde(borrow)] Option<&'doc RawValue>), } +pub enum RawVectorsError { + DeserializeSeq { index: usize, error: String }, + DeserializeKey { error: String }, + DeserializeRegenerate { error: String }, + DeserializeEmbeddings { error: String }, + UnknownField { field: String }, + MissingRegenerate, + WrongKind { kind: &'static str, value: String }, + Parsing(serde_json::Error), +} + +impl RawVectorsError { + pub fn msg(self, embedder_name: &str) -> String { + match self { + RawVectorsError::DeserializeSeq { index, error } => format!( + "Could not parse `._vectors.{embedder_name}[{index}]`: {error}" + ), + RawVectorsError::DeserializeKey { error } => format!( + "Could not parse a field at `._vectors.{embedder_name}`: {error}" + ), + RawVectorsError::DeserializeRegenerate { error } => format!( + "Could not parse `._vectors.{embedder_name}.regenerate`: {error}" + ), + RawVectorsError::DeserializeEmbeddings { error } => format!( + "Could not parse `._vectors.{embedder_name}.embeddings`: {error}" + ), + RawVectorsError::UnknownField { field } => format!( + "Unexpected field `._vectors.{embedder_name}.{field}`\n \ + \t - note: the allowed fields are `regenerate` and `embeddings`" + ), + RawVectorsError::MissingRegenerate => format!( + "Missing field `._vectors.{embedder_name}.regenerate`\n \ + \t - note: `._vectors.{embedder_name}` must be an array of floats, an array of arrays of floats, or an object with field `regenerate`" + ), + RawVectorsError::WrongKind { kind, value } => format!( + "Expected `._vectors.{embedder_name}` to be an array of floats, an array of arrays of floats, or an object with at least the field `regenerate`, but got the {kind} `{value}`" + ), + RawVectorsError::Parsing(error) => format!( + "Could not parse `._vectors.{embedder_name}`: {error}" + ), + } + } +} + impl<'doc> RawVectors<'doc> { - pub fn from_raw_value(raw: &'doc RawValue) -> Result { + pub fn from_raw_value(raw: &'doc RawValue) -> Result { use serde::de::Deserializer as _; - Ok(match raw.deserialize_any(RawVectorsVisitor)? { + Ok(match raw.deserialize_any(RawVectorsVisitor).map_err(RawVectorsError::Parsing)?? 
{ RawVectorsVisitorValue::ImplicitNone => RawVectors::ImplicitlyUserProvided(None), RawVectorsVisitorValue::Implicit => RawVectors::ImplicitlyUserProvided(Some(raw)), RawVectorsVisitorValue::Explicit { regenerate, embeddings } => { @@ -41,7 +85,7 @@ enum RawVectorsVisitorValue<'doc> { } impl<'doc> serde::de::Visitor<'doc> for RawVectorsVisitor { - type Value = RawVectorsVisitorValue<'doc>; + type Value = std::result::Result, RawVectorsError>; fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { write!(formatter, "a map containing at least `regenerate`, or an array of floats`") @@ -51,7 +95,7 @@ impl<'doc> serde::de::Visitor<'doc> for RawVectorsVisitor { where E: serde::de::Error, { - Ok(RawVectorsVisitorValue::ImplicitNone) + Ok(Ok(RawVectorsVisitorValue::ImplicitNone)) } fn visit_some(self, deserializer: D) -> Result @@ -65,42 +109,150 @@ impl<'doc> serde::de::Visitor<'doc> for RawVectorsVisitor { where E: serde::de::Error, { - Ok(RawVectorsVisitorValue::ImplicitNone) + Ok(Ok(RawVectorsVisitorValue::ImplicitNone)) } fn visit_seq(self, mut seq: A) -> Result where A: serde::de::SeqAccess<'doc>, { + let mut index = 0; // must consume all elements or parsing fails - while let Some(_) = seq.next_element::<&RawValue>()? {} - Ok(RawVectorsVisitorValue::Implicit) + loop { + match seq.next_element::<&RawValue>() { + Ok(Some(_)) => index += 1, + Err(error) => { + return Ok(Err(RawVectorsError::DeserializeSeq { + index, + error: error.to_string(), + })) + } + Ok(None) => break, + }; + } + Ok(Ok(RawVectorsVisitorValue::Implicit)) } fn visit_map(self, mut map: A) -> Result where A: serde::de::MapAccess<'doc>, { - use serde::de::Error as _; let mut regenerate = None; let mut embeddings = None; - while let Some(s) = map.next_key()? { - match s { - "regenerate" => { - let value: bool = map.next_value()?; + loop { + match map.next_key::<&str>() { + Ok(Some("regenerate")) => { + let value: bool = match map.next_value() { + Ok(value) => value, + Err(error) => { + return Ok(Err(RawVectorsError::DeserializeRegenerate { + error: error.to_string(), + })) + } + }; regenerate = Some(value); } - "embeddings" => { - let value: &RawValue = map.next_value()?; + Ok(Some("embeddings")) => { + let value: &RawValue = match map.next_value() { + Ok(value) => value, + Err(error) => { + return Ok(Err(RawVectorsError::DeserializeEmbeddings { + error: error.to_string(), + })) + } + }; embeddings = Some(value); } - other => return Err(A::Error::unknown_field(other, &["regenerate", "embeddings"])), + Ok(Some(other)) => { + return Ok(Err(RawVectorsError::UnknownField { field: other.to_string() })) + } + Ok(None) => break, + Err(error) => { + return Ok(Err(RawVectorsError::DeserializeKey { error: error.to_string() })) + } } } let Some(regenerate) = regenerate else { - return Err(A::Error::missing_field("regenerate")); + return Ok(Err(RawVectorsError::MissingRegenerate)); }; - Ok(RawVectorsVisitorValue::Explicit { regenerate, embeddings }) + Ok(Ok(RawVectorsVisitorValue::Explicit { regenerate, embeddings })) + } + + fn visit_bool(self, v: bool) -> Result + where + E: serde::de::Error, + { + Ok(Err(RawVectorsError::WrongKind { kind: "boolean", value: v.to_string() })) + } + + fn visit_i64(self, v: i64) -> Result + where + E: serde::de::Error, + { + Ok(Err(RawVectorsError::WrongKind { kind: "integer", value: v.to_string() })) + } + + fn visit_i128(self, v: i128) -> Result + where + E: serde::de::Error, + { + Ok(Err(RawVectorsError::WrongKind { kind: "integer", value: v.to_string() })) + } + + fn 
visit_u64(self, v: u64) -> Result + where + E: serde::de::Error, + { + Ok(Err(RawVectorsError::WrongKind { kind: "integer", value: v.to_string() })) + } + + fn visit_u128(self, v: u128) -> Result + where + E: serde::de::Error, + { + Ok(Err(RawVectorsError::WrongKind { kind: "integer", value: v.to_string() })) + } + + fn visit_f64(self, v: f64) -> Result + where + E: serde::de::Error, + { + Ok(Err(RawVectorsError::WrongKind { kind: "number", value: v.to_string() })) + } + + fn visit_str(self, v: &str) -> Result + where + E: serde::de::Error, + { + Ok(Err(RawVectorsError::WrongKind { kind: "string", value: v.to_string() })) + } + + fn visit_string(self, v: String) -> Result + where + E: serde::de::Error, + { + Ok(Err(RawVectorsError::WrongKind { kind: "string", value: v })) + } + + fn visit_bytes(self, v: &[u8]) -> Result + where + E: serde::de::Error, + { + Ok(Err(RawVectorsError::WrongKind { kind: "bytes", value: format!("{v:?}") })) + } + + fn visit_newtype_struct(self, deserializer: D) -> Result + where + D: serde::Deserializer<'doc>, + { + deserializer.deserialize_any(self) + } + + fn visit_enum(self, _data: A) -> Result + where + A: serde::de::EnumAccess<'doc>, + { + Ok(Err(RawVectorsError::WrongKind { kind: "enum", value: "a variant".to_string() })) } } From 1d13e804f7909eaf5b87e3541ed65034ca2bbab8 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 12 Nov 2024 22:49:40 +0100 Subject: [PATCH 213/247] Adjust test snapshots --- crates/meilisearch/tests/vector/mod.rs | 27 ++++++++++++-------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/crates/meilisearch/tests/vector/mod.rs b/crates/meilisearch/tests/vector/mod.rs index 47d0c1051..17912fbf5 100644 --- a/crates/meilisearch/tests/vector/mod.rs +++ b/crates/meilisearch/tests/vector/mod.rs @@ -249,7 +249,7 @@ async fn user_provided_embeddings_error() { "indexedDocuments": 0 }, "error": { - "message": "Bad embedder configuration in the document with id: `\"0\"`. Missing field `regenerate` inside `.manual`", + "message": "Bad embedder configuration in the document with id: `0`. Missing field `._vectors.manual.regenerate`\n - note: `._vectors.manual` must be an array of floats, an array of arrays of floats, or an object with field `regenerate`", "code": "invalid_vectors_type", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" @@ -278,7 +278,7 @@ async fn user_provided_embeddings_error() { "indexedDocuments": 0 }, "error": { - "message": "Bad embedder configuration in the document with id: `\"0\"`. Missing field `regenerate` inside `.manual`", + "message": "Bad embedder configuration in the document with id: `0`. Missing field `._vectors.manual.regenerate`\n - note: `._vectors.manual` must be an array of floats, an array of arrays of floats, or an object with field `regenerate`", "code": "invalid_vectors_type", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" @@ -308,7 +308,7 @@ async fn user_provided_embeddings_error() { "indexedDocuments": 0 }, "error": { - "message": "Bad embedder configuration in the document with id: `\"0\"`. Invalid value type at `.manual.regenerate`: expected a boolean, but found a string: `\"yes please\"`", + "message": "Bad embedder configuration in the document with id: `0`. 
Could not parse `._vectors.manual.regenerate`: invalid type: string \"yes please\", expected a boolean at line 1 column 26", "code": "invalid_vectors_type", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" @@ -320,8 +320,7 @@ async fn user_provided_embeddings_error() { } "###); - let documents = - json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "embeddings": true }}}); + let documents = json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "embeddings": true, "regenerate": true }}}); let (value, code) = index.add_documents(documents, None).await; snapshot!(code, @"202 Accepted"); let task = index.wait_task(value.uid()).await; @@ -337,7 +336,7 @@ async fn user_provided_embeddings_error() { "indexedDocuments": 0 }, "error": { - "message": "Bad embedder configuration in the document with id: `\"0\"`. Invalid value type at `.manual.embeddings`: expected null or an array, but found a boolean: `true`", + "message": "Bad embedder configuration in the document with id: `0`. Invalid value type at `._vectors.manual.embeddings`: expected null or an array, but found a boolean: `true`", "code": "invalid_vectors_type", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" @@ -349,8 +348,7 @@ async fn user_provided_embeddings_error() { } "###); - let documents = - json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "embeddings": [true] }}}); + let documents = json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "embeddings": [true], "regenerate": true }}}); let (value, code) = index.add_documents(documents, None).await; snapshot!(code, @"202 Accepted"); let task = index.wait_task(value.uid()).await; @@ -366,7 +364,7 @@ async fn user_provided_embeddings_error() { "indexedDocuments": 0 }, "error": { - "message": "Bad embedder configuration in the document with id: `\"0\"`. Invalid value type at `.manual.embeddings[0]`: expected a number or an array, but found a boolean: `true`", + "message": "Bad embedder configuration in the document with id: `0`. Invalid value type at `._vectors.manual.embeddings[0]`: expected a number or an array, but found a boolean: `true`", "code": "invalid_vectors_type", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" @@ -378,8 +376,7 @@ async fn user_provided_embeddings_error() { } "###); - let documents = - json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "embeddings": [[true]] }}}); + let documents = json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "embeddings": [[true]], "regenerate": false }}}); let (value, code) = index.add_documents(documents, None).await; snapshot!(code, @"202 Accepted"); let task = index.wait_task(value.uid()).await; @@ -395,7 +392,7 @@ async fn user_provided_embeddings_error() { "indexedDocuments": 0 }, "error": { - "message": "Bad embedder configuration in the document with id: `\"0\"`. Invalid value type at `.manual.embeddings[0][0]`: expected a number, but found a boolean: `true`", + "message": "Bad embedder configuration in the document with id: `0`. Invalid value type at `._vectors.manual.embeddings[0][0]`: expected a number, but found a boolean: `true`", "code": "invalid_vectors_type", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" @@ -436,7 +433,7 @@ async fn user_provided_embeddings_error() { "indexedDocuments": 0 }, "error": { - "message": "Bad embedder configuration in the document with id: `\"0\"`. 
Invalid value type at `.manual.embeddings[1]`: expected a number, but found an array: `[0.2,0.3]`", + "message": "Bad embedder configuration in the document with id: `0`. Invalid value type at `._vectors.manual.embeddings[1]`: expected a number, but found an array: `[0.2,0.3]`", "code": "invalid_vectors_type", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" @@ -464,7 +461,7 @@ async fn user_provided_embeddings_error() { "indexedDocuments": 0 }, "error": { - "message": "Bad embedder configuration in the document with id: `\"0\"`. Invalid value type at `.manual.embeddings[1]`: expected an array, but found a number: `0.3`", + "message": "Bad embedder configuration in the document with id: `0`. Invalid value type at `._vectors.manual.embeddings[1]`: expected an array, but found a number: `0.3`", "code": "invalid_vectors_type", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" @@ -492,7 +489,7 @@ async fn user_provided_embeddings_error() { "indexedDocuments": 0 }, "error": { - "message": "Bad embedder configuration in the document with id: `\"0\"`. Invalid value type at `.manual.embeddings[0][1]`: expected a number, but found a boolean: `true`", + "message": "Bad embedder configuration in the document with id: `0`. Invalid value type at `._vectors.manual.embeddings[0][1]`: expected a number, but found a boolean: `true`", "code": "invalid_vectors_type", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" From bfdcd1cf3302d9989d5e15edbf47a4bf0b639998 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 12 Nov 2024 22:52:45 +0100 Subject: [PATCH 214/247] Space changes --- crates/milli/src/vector/parsed_vectors.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/crates/milli/src/vector/parsed_vectors.rs b/crates/milli/src/vector/parsed_vectors.rs index a45729abd..da41d1771 100644 --- a/crates/milli/src/vector/parsed_vectors.rs +++ b/crates/milli/src/vector/parsed_vectors.rs @@ -46,12 +46,12 @@ impl RawVectorsError { "Could not parse `._vectors.{embedder_name}.embeddings`: {error}" ), RawVectorsError::UnknownField { field } => format!( - "Unexpected field `._vectors.{embedder_name}.{field}`\n \ - \t - note: the allowed fields are `regenerate` and `embeddings`" + "Unexpected field `._vectors.{embedder_name}.{field}`\n \ + - note: the allowed fields are `regenerate` and `embeddings`" ), RawVectorsError::MissingRegenerate => format!( - "Missing field `._vectors.{embedder_name}.regenerate`\n \ - \t - note: `._vectors.{embedder_name}` must be an array of floats, an array of arrays of floats, or an object with field `regenerate`" + "Missing field `._vectors.{embedder_name}.regenerate`\n \ + - note: `._vectors.{embedder_name}` must be an array of floats, an array of arrays of floats, or an object with field `regenerate`" ), RawVectorsError::WrongKind { kind, value } => format!( "Expected `._vectors.{embedder_name}` to be an array of floats, an array of arrays of floats, or an object with at least the field `regenerate`, but got the {kind} `{value}`" From 3b0cb5b48738e24543f9c4810b5614597161dc00 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 12 Nov 2024 23:26:16 +0100 Subject: [PATCH 215/247] Fix vector error messages --- .../src/update/new/extract/vectors/mod.rs | 53 +++++++++++++++++++ crates/milli/src/vector/mod.rs | 2 +- 2 files changed, 54 insertions(+), 1 deletion(-) diff --git a/crates/milli/src/update/new/extract/vectors/mod.rs 
b/crates/milli/src/update/new/extract/vectors/mod.rs index efb02b2ab..3a73ff82f 100644 --- a/crates/milli/src/update/new/extract/vectors/mod.rs +++ b/crates/milli/src/update/new/extract/vectors/mod.rs @@ -151,6 +151,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { if new_rendered != old_rendered { chunks.set_autogenerated( update.docid(), + update.external_document_id(), new_rendered, &unused_vectors_distribution, )?; @@ -178,6 +179,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { if new_rendered != old_rendered { chunks.set_autogenerated( update.docid(), + update.external_document_id(), new_rendered, &unused_vectors_distribution, )?; @@ -221,6 +223,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { )?; chunks.set_autogenerated( insertion.docid(), + insertion.external_document_id(), rendered, &unused_vectors_distribution, )?; @@ -233,6 +236,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { )?; chunks.set_autogenerated( insertion.docid(), + insertion.external_document_id(), rendered, &unused_vectors_distribution, )?; @@ -268,6 +272,7 @@ struct Chunks<'a, 'extractor> { user_provided: &'a RefCell>, threads: &'a ThreadPoolNoAbort, sender: &'a EmbeddingSender<'a>, + has_manual_generation: Option<&'a str>, } impl<'a, 'extractor> Chunks<'a, 'extractor> { @@ -297,15 +302,22 @@ impl<'a, 'extractor> Chunks<'a, 'extractor> { embedder_id, embedder_name, user_provided, + has_manual_generation: None, } } pub fn set_autogenerated( &mut self, docid: DocumentId, + external_docid: &'a str, rendered: &'a str, unused_vectors_distribution: &UnusedVectorsDistributionBump, ) -> Result<()> { + let is_manual = matches!(&self.embedder, &Embedder::UserProvided(_)); + if is_manual { + self.has_manual_generation.get_or_insert(external_docid); + } + if self.texts.len() < self.texts.capacity() { self.texts.push(rendered); self.ids.push(docid); @@ -322,6 +334,7 @@ impl<'a, 'extractor> Chunks<'a, 'extractor> { unused_vectors_distribution, self.threads, self.sender, + self.has_manual_generation.take(), ) } @@ -339,6 +352,7 @@ impl<'a, 'extractor> Chunks<'a, 'extractor> { unused_vectors_distribution, self.threads, self.sender, + self.has_manual_generation, ); // optimization: don't run bvec dtors as they only contain bumpalo allocated stuff std::mem::forget(self); @@ -356,7 +370,46 @@ impl<'a, 'extractor> Chunks<'a, 'extractor> { unused_vectors_distribution: &UnusedVectorsDistributionBump, threads: &ThreadPoolNoAbort, sender: &EmbeddingSender<'a>, + has_manual_generation: Option<&'a str>, ) -> Result<()> { + if let Some(external_docid) = has_manual_generation { + let mut msg = format!( + r"While embedding documents for embedder `{embedder_name}`: no vectors provided for document `{}`{}", + external_docid, + if ids.len() > 1 { + format!(" and at least {} other document(s)", ids.len() - 1) + } else { + "".to_string() + } + ); + + msg += &format!("\n- Note: `{embedder_name}` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.{embedder_name}`."); + + let mut hint_count = 0; + + for (vector_misspelling, count) in possible_embedding_mistakes.vector_mistakes().take(2) + { + msg += &format!("\n- Hint: try replacing `{vector_misspelling}` by `_vectors` in {count} document(s)."); + hint_count += 1; + } + + for (embedder_misspelling, count) in possible_embedding_mistakes + .embedder_mistakes_bump(embedder_name, unused_vectors_distribution) + .take(2) + { + msg += &format!("\n- Hint: try 
replacing `_vectors.{embedder_misspelling}` by `_vectors.{embedder_name}` in {count} document(s)."); + hint_count += 1; + } + + if hint_count == 0 { + msg += &format!( + "\n- Hint: opt-out for a document with `_vectors.{embedder_name}: null`" + ); + } + + return Err(crate::Error::UserError(crate::UserError::DocumentEmbeddingError(msg))); + } + let res = match embedder.embed_chunks_ref(texts.as_slice(), threads) { Ok(embeddings) => { for (docid, embedding) in ids.into_iter().zip(embeddings) { diff --git a/crates/milli/src/vector/mod.rs b/crates/milli/src/vector/mod.rs index 57da50580..24ea77541 100644 --- a/crates/milli/src/vector/mod.rs +++ b/crates/milli/src/vector/mod.rs @@ -648,7 +648,7 @@ impl Embedder { Embedder::HuggingFace(embedder) => embedder.chunk_count_hint(), Embedder::OpenAi(embedder) => embedder.chunk_count_hint(), Embedder::Ollama(embedder) => embedder.chunk_count_hint(), - Embedder::UserProvided(_) => 1, + Embedder::UserProvided(_) => 100, Embedder::Rest(embedder) => embedder.chunk_count_hint(), } } From cb1d6613dded747322f8780093dca475cf187f4a Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 12 Nov 2024 23:26:30 +0100 Subject: [PATCH 216/247] Adjust snapshots --- crates/meilisearch/tests/vector/mod.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/crates/meilisearch/tests/vector/mod.rs b/crates/meilisearch/tests/vector/mod.rs index 17912fbf5..8f4e9cc70 100644 --- a/crates/meilisearch/tests/vector/mod.rs +++ b/crates/meilisearch/tests/vector/mod.rs @@ -529,7 +529,7 @@ async fn user_provided_vectors_error() { "indexedDocuments": 0 }, "error": { - "message": "While embedding documents for embedder `manual`: no vectors provided for document \"40\" and at least 4 other document(s)\n- Note: `manual` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.manual`.\n- Hint: opt-out for a document with `_vectors.manual: null`", + "message": "While embedding documents for embedder `manual`: no vectors provided for document `40` and at least 4 other document(s)\n- Note: `manual` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.manual`.\n- Hint: opt-out for a document with `_vectors.manual: null`", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -558,7 +558,7 @@ async fn user_provided_vectors_error() { "indexedDocuments": 0 }, "error": { - "message": "While embedding documents for embedder `manual`: no vectors provided for document \"42\"\n- Note: `manual` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.manual`.\n- Hint: try replacing `_vector` by `_vectors` in 1 document(s).", + "message": "While embedding documents for embedder `manual`: no vectors provided for document `42`\n- Note: `manual` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.manual`.\n- Hint: try replacing `_vector` by `_vectors` in 1 document(s).", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -587,7 +587,7 @@ async fn user_provided_vectors_error() { "indexedDocuments": 0 }, "error": { - "message": "While embedding documents for embedder `manual`: no vectors provided for document \"42\"\n- Note: `manual` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.manual`.\n- Hint: try replacing `_vectors.manaul` by 
`_vectors.manual` in 1 document(s).", + "message": "While embedding documents for embedder `manual`: no vectors provided for document `42`\n- Note: `manual` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.manual`.\n- Hint: try replacing `_vectors.manaul` by `_vectors.manual` in 1 document(s).", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" From 82dcaba6ca71230fb896b3724971944c6a0480ee Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 12 Nov 2024 23:58:25 +0100 Subject: [PATCH 217/247] Fix test: somehow on main vectors where displayed even though retrieveVectors: false --- crates/meilisearch/tests/search/multi.rs | 194 ++++++++++++++++++++--- 1 file changed, 173 insertions(+), 21 deletions(-) diff --git a/crates/meilisearch/tests/search/multi.rs b/crates/meilisearch/tests/search/multi.rs index 932751b49..942a87a79 100644 --- a/crates/meilisearch/tests/search/multi.rs +++ b/crates/meilisearch/tests/search/multi.rs @@ -4346,10 +4346,10 @@ async fn federation_vector_two_indexes() { let (response, code) = server .multi_search(json!({"federation": {}, "queries": [ - {"indexUid" : "vectors-animal", "vector": [1.0, 0.0, 0.5], "hybrid": {"semanticRatio": 1.0, "embedder": "animal"}}, + {"indexUid" : "vectors-animal", "vector": [1.0, 0.0, 0.5], "hybrid": {"semanticRatio": 1.0, "embedder": "animal"}, "retrieveVectors": true}, // joyful and energetic first - {"indexUid": "vectors-sentiment", "vector": [0.8, 0.6], "hybrid": {"semanticRatio": 1.0, "embedder": "sentiment"}}, - {"indexUid": "vectors-sentiment", "q": "dog"}, + {"indexUid": "vectors-sentiment", "vector": [0.8, 0.6], "hybrid": {"semanticRatio": 1.0, "embedder": "sentiment"}, "retrieveVectors": true}, + {"indexUid": "vectors-sentiment", "q": "dog", "retrieveVectors": true}, ]})) .await; snapshot!(code, @"200 OK"); @@ -4364,7 +4364,16 @@ async fn federation_vector_two_indexes() { 0.8, 0.09, 0.8 - ] + ], + "sentiment": { + "embeddings": [ + [ + 0.800000011920929, + 0.30000001192092896 + ] + ], + "regenerate": false + } }, "_federation": { "indexUid": "vectors-sentiment", @@ -4379,7 +4388,17 @@ async fn federation_vector_two_indexes() { "sentiment": [ 0.8, 0.3 - ] + ], + "animal": { + "embeddings": [ + [ + 0.800000011920929, + 0.09000000357627869, + 0.800000011920929 + ] + ], + "regenerate": false + } }, "_federation": { "indexUid": "vectors-animal", @@ -4394,7 +4413,17 @@ async fn federation_vector_two_indexes() { "sentiment": [ -1.0, 0.1 - ] + ], + "animal": { + "embeddings": [ + [ + 0.8500000238418579, + 0.019999999552965164, + 0.10000000149011612 + ] + ], + "regenerate": false + } }, "_federation": { "indexUid": "vectors-animal", @@ -4410,7 +4439,16 @@ async fn federation_vector_two_indexes() { 0.9, 0.8, 0.05 - ] + ], + "sentiment": { + "embeddings": [ + [ + -0.10000000149011612, + 0.550000011920929 + ] + ], + "regenerate": false + } }, "_federation": { "indexUid": "vectors-sentiment", @@ -4426,7 +4464,16 @@ async fn federation_vector_two_indexes() { 0.85, 0.02, 0.1 - ] + ], + "sentiment": { + "embeddings": [ + [ + -1.0, + 0.10000000149011612 + ] + ], + "regenerate": false + } }, "_federation": { "indexUid": "vectors-sentiment", @@ -4441,7 +4488,17 @@ async fn federation_vector_two_indexes() { "sentiment": [ -0.2, 0.65 - ] + ], + "animal": { + "embeddings": [ + [ + 0.800000011920929, + 0.8999999761581421, + 0.5 + ] + ], + "regenerate": false + } }, "_federation": { "indexUid": "vectors-animal", @@ -4456,7 
+4513,17 @@ async fn federation_vector_two_indexes() { "sentiment": [ -0.1, 0.55 - ] + ], + "animal": { + "embeddings": [ + [ + 0.8999999761581421, + 0.800000011920929, + 0.05000000074505806 + ] + ], + "regenerate": false + } }, "_federation": { "indexUid": "vectors-animal", @@ -4472,7 +4539,16 @@ async fn federation_vector_two_indexes() { 0.8, 0.9, 0.5 - ] + ], + "sentiment": { + "embeddings": [ + [ + -0.20000000298023224, + 0.6499999761581421 + ] + ], + "regenerate": false + } }, "_federation": { "indexUid": "vectors-sentiment", @@ -4492,8 +4568,8 @@ async fn federation_vector_two_indexes() { // hybrid search, distinct embedder let (response, code) = server .multi_search(json!({"federation": {}, "queries": [ - {"indexUid" : "vectors-animal", "vector": [1.0, 0.0, 0.5], "hybrid": {"semanticRatio": 1.0, "embedder": "animal"}, "showRankingScore": true}, - {"indexUid": "vectors-sentiment", "vector": [-1, 0.6], "q": "beagle", "hybrid": {"semanticRatio": 1.0, "embedder": "sentiment"}, "showRankingScore": true}, + {"indexUid" : "vectors-animal", "vector": [1.0, 0.0, 0.5], "hybrid": {"semanticRatio": 1.0, "embedder": "animal"}, "showRankingScore": true, "retrieveVectors": true}, + {"indexUid": "vectors-sentiment", "vector": [-1, 0.6], "q": "beagle", "hybrid": {"semanticRatio": 1.0, "embedder": "sentiment"}, "showRankingScore": true, "retrieveVectors": true,}, ]})) .await; snapshot!(code, @"200 OK"); @@ -4507,7 +4583,17 @@ async fn federation_vector_two_indexes() { "sentiment": [ 0.8, 0.3 - ] + ], + "animal": { + "embeddings": [ + [ + 0.800000011920929, + 0.09000000357627869, + 0.800000011920929 + ] + ], + "regenerate": false + } }, "_federation": { "indexUid": "vectors-animal", @@ -4523,7 +4609,17 @@ async fn federation_vector_two_indexes() { "sentiment": [ -1.0, 0.1 - ] + ], + "animal": { + "embeddings": [ + [ + 0.8500000238418579, + 0.019999999552965164, + 0.10000000149011612 + ] + ], + "regenerate": false + } }, "_federation": { "indexUid": "vectors-animal", @@ -4540,7 +4636,16 @@ async fn federation_vector_two_indexes() { 0.85, 0.02, 0.1 - ] + ], + "sentiment": { + "embeddings": [ + [ + -1.0, + 0.10000000149011612 + ] + ], + "regenerate": false + } }, "_federation": { "indexUid": "vectors-sentiment", @@ -4557,7 +4662,16 @@ async fn federation_vector_two_indexes() { 0.8, 0.9, 0.5 - ] + ], + "sentiment": { + "embeddings": [ + [ + -0.20000000298023224, + 0.6499999761581421 + ] + ], + "regenerate": false + } }, "_federation": { "indexUid": "vectors-sentiment", @@ -4573,7 +4687,17 @@ async fn federation_vector_two_indexes() { "sentiment": [ -0.2, 0.65 - ] + ], + "animal": { + "embeddings": [ + [ + 0.800000011920929, + 0.8999999761581421, + 0.5 + ] + ], + "regenerate": false + } }, "_federation": { "indexUid": "vectors-animal", @@ -4589,7 +4713,17 @@ async fn federation_vector_two_indexes() { "sentiment": [ -0.1, 0.55 - ] + ], + "animal": { + "embeddings": [ + [ + 0.8999999761581421, + 0.800000011920929, + 0.05000000074505806 + ] + ], + "regenerate": false + } }, "_federation": { "indexUid": "vectors-animal", @@ -4606,7 +4740,16 @@ async fn federation_vector_two_indexes() { 0.9, 0.8, 0.05 - ] + ], + "sentiment": { + "embeddings": [ + [ + -0.10000000149011612, + 0.550000011920929 + ] + ], + "regenerate": false + } }, "_federation": { "indexUid": "vectors-sentiment", @@ -4623,7 +4766,16 @@ async fn federation_vector_two_indexes() { 0.8, 0.09, 0.8 - ] + ], + "sentiment": { + "embeddings": [ + [ + 0.800000011920929, + 0.30000001192092896 + ] + ], + "regenerate": false + } }, "_federation": { 
"indexUid": "vectors-sentiment", From 7accfea62433b9c0ab8923f2abde47b54ca530c3 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 13 Nov 2024 10:33:59 +0100 Subject: [PATCH 218/247] Don't short circuit when we encounter a semantic error while extracting fields and external docid --- crates/milli/src/update/new/indexer/de.rs | 29 +++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/crates/milli/src/update/new/indexer/de.rs b/crates/milli/src/update/new/indexer/de.rs index 832aadd43..c9808360e 100644 --- a/crates/milli/src/update/new/indexer/de.rs +++ b/crates/milli/src/update/new/indexer/de.rs @@ -41,6 +41,11 @@ impl<'de, 'p, 'indexer: 'de, Mapper: MutFieldIdMapper> Visitor<'de> where A: serde::de::MapAccess<'de>, { + // We need to remember if we encountered a semantic error, because raw values don't like to be parsed partially + // (trying to do so results in parsing errors). + // So we'll exhaust all keys and values even if we encounter an error, and we'll then return any error we detected. + let mut attribute_limit_reached = false; + let mut document_id_extraction_error = None; let mut docid = None; while let Some(((level_name, right), (fid, fields_ids_map))) = @@ -49,20 +54,36 @@ impl<'de, 'p, 'indexer: 'de, Mapper: MutFieldIdMapper> Visitor<'de> visitor: MutFieldIdMapVisitor(self.fields_ids_map), })? { - let Some(_fid) = fid else { - return Ok(Err(crate::UserError::AttributeLimitReached)); - }; self.fields_ids_map = fields_ids_map; let value: &'de RawValue = map.next_value()?; + if attribute_limit_reached || document_id_extraction_error.is_some() { + continue; + } + + let Some(_fid) = fid else { + attribute_limit_reached = true; + continue; + }; match match_component(level_name, right, value, self.indexer, &mut docid) { ControlFlow::Continue(()) => continue, ControlFlow::Break(Err(err)) => return Err(serde::de::Error::custom(err)), - ControlFlow::Break(Ok(err)) => return Ok(Ok(Err(err))), + ControlFlow::Break(Ok(err)) => { + document_id_extraction_error = Some(err); + continue; + } } } + // return previously detected errors + if attribute_limit_reached { + return Ok(Err(UserError::AttributeLimitReached)); + } + if let Some(document_id_extraction_error) = document_id_extraction_error { + return Ok(Ok(Err(document_id_extraction_error))); + } + Ok(Ok(match docid { Some(docid) => Ok(docid), None => Err(DocumentIdExtractionError::MissingDocumentId), From a01bc7b454558fbc9d30ae9ebf3cc53149a37ce6 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 13 Nov 2024 10:34:54 +0100 Subject: [PATCH 219/247] Fix error_document_field_limit_reached_in_one_document test --- .../meilisearch/tests/documents/add_documents.rs | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/crates/meilisearch/tests/documents/add_documents.rs b/crates/meilisearch/tests/documents/add_documents.rs index c37b3a5e3..0209a6d57 100644 --- a/crates/meilisearch/tests/documents/add_documents.rs +++ b/crates/meilisearch/tests/documents/add_documents.rs @@ -1335,7 +1335,6 @@ async fn error_add_documents_missing_document_id() { } #[actix_rt::test] -#[should_panic] async fn error_document_field_limit_reached_in_one_document() { let server = Server::new().await; let index = server.index("test"); @@ -1352,7 +1351,7 @@ async fn error_document_field_limit_reached_in_one_document() { let documents = json!([big_object]); let (response, code) = index.update_documents(documents, Some("id")).await; - snapshot!(code, @"500 Internal Server Error"); + snapshot!(code, @"202 
Accepted"); let response = index.wait_task(response.uid()).await; snapshot!(code, @"202 Accepted"); @@ -1360,16 +1359,21 @@ async fn error_document_field_limit_reached_in_one_document() { snapshot!(response, @r###" { - "uid": 1, + "uid": "[uid]", "indexUid": "test", - "status": "succeeded", + "status": "failed", "type": "documentAdditionOrUpdate", "canceledBy": null, "details": { "receivedDocuments": 1, - "indexedDocuments": 1 + "indexedDocuments": 0 + }, + "error": { + "message": "A document cannot contain more than 65,535 fields.", + "code": "max_fields_limit_exceeded", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#max_fields_limit_exceeded" }, - "error": null, "duration": "[duration]", "enqueuedAt": "[date]", "startedAt": "[date]", From b17896d899ecce691c9b7e47b35d6e48fc5a7d32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 7 Nov 2024 15:05:20 +0100 Subject: [PATCH 220/247] Finialize the GeoExtractor --- Cargo.lock | 10 + .../tests/documents/add_documents.rs | 4 +- crates/milli/Cargo.toml | 1 + .../src/update/index_documents/typed_chunk.rs | 2 +- crates/milli/src/update/new/channel.rs | 63 +++- crates/milli/src/update/new/document.rs | 6 + .../milli/src/update/new/extract/documents.rs | 8 +- .../new/extract/faceted/extract_facets.rs | 2 +- .../new/extract/faceted/facet_document.rs | 3 +- .../milli/src/update/new/extract/geo/mod.rs | 302 ++++++++++++++++++ crates/milli/src/update/new/extract/mod.rs | 2 + .../extract/searchable/extract_word_docids.rs | 2 +- .../extract_word_pair_proximity_docids.rs | 2 +- .../src/update/new/extract/vectors/mod.rs | 22 +- .../update/new/indexer/document_changes.rs | 4 +- .../update/new/indexer/document_deletion.rs | 2 +- crates/milli/src/update/new/indexer/mod.rs | 62 +++- .../update/new/indexer/update_by_function.rs | 2 +- crates/milli/src/update/new/merger.rs | 91 +++--- 19 files changed, 497 insertions(+), 93 deletions(-) create mode 100644 crates/milli/src/update/new/extract/geo/mod.rs diff --git a/Cargo.lock b/Cargo.lock index 30b1102b5..fd01352a9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3664,6 +3664,7 @@ dependencies = [ "time", "tokenizers", "tracing", + "uell", "ureq", "url", "uuid", @@ -5792,6 +5793,15 @@ version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed646292ffc8188ef8ea4d1e0e0150fb15a5c2e12ad9b8fc191ae7a8a7f3c4b9" +[[package]] +name = "uell" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40de5982e28612e20330e77d81f1559b74f66caf3c7fc10b19ada4843f4b4fd7" +dependencies = [ + "bumpalo", +] + [[package]] name = "unescaper" version = "0.1.5" diff --git a/crates/meilisearch/tests/documents/add_documents.rs b/crates/meilisearch/tests/documents/add_documents.rs index 0209a6d57..8c9601e0f 100644 --- a/crates/meilisearch/tests/documents/add_documents.rs +++ b/crates/meilisearch/tests/documents/add_documents.rs @@ -2201,7 +2201,7 @@ async fn add_invalid_geo_and_then_settings() { let index = server.index("test"); index.create(Some("id")).await; - // _geo is not an object + // _geo is not a correct object let documents = json!([ { "id": "11", @@ -2230,7 +2230,7 @@ async fn add_invalid_geo_and_then_settings() { } "###); - let (ret, code) = index.update_settings(json!({"sortableAttributes": ["_geo"]})).await; + let (ret, code) = index.update_settings(json!({ "sortableAttributes": ["_geo"] })).await; snapshot!(code, @"202 Accepted"); let ret = index.wait_task(ret.uid()).await; snapshot!(ret, @r###" diff 
--git a/crates/milli/Cargo.toml b/crates/milli/Cargo.toml index 005393411..622292e8a 100644 --- a/crates/milli/Cargo.toml +++ b/crates/milli/Cargo.toml @@ -100,6 +100,7 @@ bumpalo = "3.16.0" thread_local = "1.1.8" allocator-api2 = "0.2.18" rustc-hash = "2.0.0" +uell = "0.1.0" [dev-dependencies] mimalloc = { version = "0.1.43", default-features = false } diff --git a/crates/milli/src/update/index_documents/typed_chunk.rs b/crates/milli/src/update/index_documents/typed_chunk.rs index 2c30220bc..a97569800 100644 --- a/crates/milli/src/update/index_documents/typed_chunk.rs +++ b/crates/milli/src/update/index_documents/typed_chunk.rs @@ -737,7 +737,7 @@ pub(crate) fn write_typed_chunk_into_index( } /// Converts the latitude and longitude back to an xyz GeoPoint. -fn extract_geo_point(value: &[u8], docid: DocumentId) -> GeoPoint { +pub fn extract_geo_point(value: &[u8], docid: DocumentId) -> GeoPoint { let (lat, tail) = helpers::try_split_array_at::(value).unwrap(); let (lng, _) = helpers::try_split_array_at::(tail).unwrap(); let point = [f64::from_ne_bytes(lat), f64::from_ne_bytes(lng)]; diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index 5b91ae77f..2027b4db8 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -3,9 +3,12 @@ use std::sync::atomic::{AtomicUsize, Ordering}; use crossbeam_channel::{IntoIter, Receiver, SendError, Sender}; use heed::types::Bytes; +use memmap2::Mmap; +use roaring::RoaringBitmap; use super::extract::FacetKind; use super::StdResult; +use crate::index::main_key::{GEO_FACETED_DOCUMENTS_IDS_KEY, GEO_RTREE_KEY}; use crate::index::IndexEmbeddingConfig; use crate::update::new::KvReaderFieldId; use crate::vector::Embedding; @@ -25,9 +28,9 @@ pub fn extractor_writer_channel(cap: usize) -> (ExtractorSender, WriterReceiver) ) } -pub struct KeyValueEntry { - pub key_length: usize, - pub data: Box<[u8]>, +pub enum KeyValueEntry { + Small { key_length: usize, data: Box<[u8]> }, + Large { key_entry: KeyEntry, data: Mmap }, } impl KeyValueEntry { @@ -35,14 +38,25 @@ impl KeyValueEntry { let mut data = Vec::with_capacity(key.len() + value.len()); data.extend_from_slice(key); data.extend_from_slice(value); - KeyValueEntry { key_length: key.len(), data: data.into_boxed_slice() } + KeyValueEntry::Small { key_length: key.len(), data: data.into_boxed_slice() } } + + fn from_large_key_value(key: &[u8], value: Mmap) -> Self { + KeyValueEntry::Large { key_entry: KeyEntry::from_key(key), data: value } + } + pub fn key(&self) -> &[u8] { - &self.data[..self.key_length] + match self { + KeyValueEntry::Small { key_length, data } => &data[..*key_length], + KeyValueEntry::Large { key_entry, data: _ } => key_entry.entry(), + } } pub fn value(&self) -> &[u8] { - &self.data[self.key_length..] 
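+        // Small entries inline the key and the value in a single boxed slice;
+        // Large entries keep the value behind a memory map so oversized payloads
+        // (for instance the serialized geo rtree sent via `GeoSender::set_rtree`)
+        // are not copied into an in-memory buffer.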
+ match self { + KeyValueEntry::Small { key_length, data } => &data[*key_length..], + KeyValueEntry::Large { key_entry: _, data } => &data[..], + } } } @@ -97,6 +111,7 @@ pub struct DbOperation { #[derive(Debug)] pub enum Database { + Main, Documents, ExternalDocumentsIds, ExactWordDocids, @@ -115,6 +130,7 @@ pub enum Database { impl Database { pub fn database(&self, index: &Index) -> heed::Database { match self { + Database::Main => index.main.remap_types(), Database::Documents => index.documents.remap_types(), Database::ExternalDocumentsIds => index.external_documents_ids.remap_types(), Database::ExactWordDocids => index.exact_word_docids.remap_types(), @@ -207,6 +223,10 @@ impl ExtractorSender { EmbeddingSender(&self.sender) } + pub fn geo(&self) -> GeoSender<'_> { + GeoSender(&self.sender) + } + fn send_delete_vector(&self, docid: DocumentId) -> StdResult<(), SendError<()>> { match self .sender @@ -423,3 +443,34 @@ impl EmbeddingSender<'_> { .map_err(|_| SendError(())) } } + +pub struct GeoSender<'a>(&'a Sender); + +impl GeoSender<'_> { + pub fn set_rtree(&self, value: Mmap) -> StdResult<(), SendError<()>> { + self.0 + .send(WriterOperation::DbOperation(DbOperation { + database: Database::Main, + entry: EntryOperation::Write(KeyValueEntry::from_large_key_value( + GEO_RTREE_KEY.as_bytes(), + value, + )), + })) + .map_err(|_| SendError(())) + } + + pub fn set_geo_faceted(&self, bitmap: &RoaringBitmap) -> StdResult<(), SendError<()>> { + let mut buffer = Vec::new(); + bitmap.serialize_into(&mut buffer).unwrap(); + + self.0 + .send(WriterOperation::DbOperation(DbOperation { + database: Database::Main, + entry: EntryOperation::Write(KeyValueEntry::from_small_key_value( + GEO_FACETED_DOCUMENTS_IDS_KEY.as_bytes(), + &buffer, + )), + })) + .map_err(|_| SendError(())) + } +} diff --git a/crates/milli/src/update/new/document.rs b/crates/milli/src/update/new/document.rs index 692277597..8d4e3b0a9 100644 --- a/crates/milli/src/update/new/document.rs +++ b/crates/milli/src/update/new/document.rs @@ -352,6 +352,11 @@ where unordered_field_buffer.push((vectors_fid, &vectors_value)); } + if let Some(geo_value) = document.geo_field()? 
{ + let fid = fields_ids_map.id_or_insert("_geo").ok_or(UserError::AttributeLimitReached)?; + unordered_field_buffer.push((fid, geo_value)); + } + unordered_field_buffer.sort_by_key(|(fid, _)| *fid); for (fid, value) in unordered_field_buffer.iter() { writer.insert(*fid, value.get().as_bytes()).unwrap(); @@ -406,6 +411,7 @@ impl<'doc> Versions<'doc> { pub fn is_empty(&self) -> bool { self.data.is_empty() } + pub fn top_level_field(&self, k: &str) -> Option<&'doc RawValue> { if k == RESERVED_VECTORS_FIELD_NAME || k == "_geo" { return None; diff --git a/crates/milli/src/update/new/extract/documents.rs b/crates/milli/src/update/new/extract/documents.rs index 2c93a5def..b76fe207a 100644 --- a/crates/milli/src/update/new/extract/documents.rs +++ b/crates/milli/src/update/new/extract/documents.rs @@ -54,7 +54,7 @@ impl<'a, 'extractor> Extractor<'extractor> for DocumentsExtractor<'a> { DocumentChange::Deletion(deletion) => { let docid = deletion.docid(); let content = deletion.current( - &context.txn, + &context.rtxn, context.index, &context.db_fields_ids_map, )?; @@ -72,7 +72,7 @@ impl<'a, 'extractor> Extractor<'extractor> for DocumentsExtractor<'a> { DocumentChange::Update(update) => { let docid = update.docid(); let content = - update.current(&context.txn, context.index, &context.db_fields_ids_map)?; + update.current(&context.rtxn, context.index, &context.db_fields_ids_map)?; for res in content.iter_top_level_fields() { let (f, _) = res?; let entry = document_extractor_data @@ -92,9 +92,9 @@ impl<'a, 'extractor> Extractor<'extractor> for DocumentsExtractor<'a> { } let content = - update.merged(&context.txn, context.index, &context.db_fields_ids_map)?; + update.merged(&context.rtxn, context.index, &context.db_fields_ids_map)?; let vector_content = update.merged_vectors( - &context.txn, + &context.rtxn, context.index, &context.db_fields_ids_map, &context.doc_alloc, diff --git a/crates/milli/src/update/new/extract/faceted/extract_facets.rs b/crates/milli/src/update/new/extract/faceted/extract_facets.rs index 11dc8f3c7..d0dc425ae 100644 --- a/crates/milli/src/update/new/extract/faceted/extract_facets.rs +++ b/crates/milli/src/update/new/extract/faceted/extract_facets.rs @@ -63,7 +63,7 @@ impl FacetedDocidsExtractor { document_change: DocumentChange, ) -> Result<()> { let index = &context.index; - let rtxn = &context.txn; + let rtxn = &context.rtxn; let mut new_fields_ids_map = context.new_fields_ids_map.borrow_mut_or_yield(); let mut cached_sorter = context.data.borrow_mut_or_yield(); match document_change { diff --git a/crates/milli/src/update/new/extract/faceted/facet_document.rs b/crates/milli/src/update/new/extract/faceted/facet_document.rs index cf8984f9c..4308d0aa5 100644 --- a/crates/milli/src/update/new/extract/faceted/facet_document.rs +++ b/crates/milli/src/update/new/extract/faceted/facet_document.rs @@ -10,7 +10,8 @@ pub fn extract_document_facets<'doc>( field_id_map: &mut GlobalFieldsIdsMap, facet_fn: &mut impl FnMut(FieldId, &Value) -> Result<()>, ) -> Result<()> { - for res in document.iter_top_level_fields() { + let geo = document.geo_field().transpose().map(|res| res.map(|rval| ("_geo", rval))); + for res in document.iter_top_level_fields().chain(geo) { let (field_name, value) = res?; let mut tokenize_field = |name: &str, value: &Value| match field_id_map.id_or_insert(name) { diff --git a/crates/milli/src/update/new/extract/geo/mod.rs b/crates/milli/src/update/new/extract/geo/mod.rs new file mode 100644 index 000000000..180611eee --- /dev/null +++ 
b/crates/milli/src/update/new/extract/geo/mod.rs @@ -0,0 +1,302 @@ +use std::cell::RefCell; +use std::fs::File; +use std::io::{self, BufReader, BufWriter, ErrorKind, Read, Write as _}; +use std::{iter, mem, result}; + +use bumpalo::Bump; +use bytemuck::{bytes_of, from_bytes, pod_read_unaligned, Pod, Zeroable}; +use heed::RoTxn; +use serde_json::value::RawValue; +use serde_json::Value; + +use crate::error::GeoError; +use crate::update::new::document::Document; +use crate::update::new::indexer::document_changes::{DocumentChangeContext, Extractor, MostlySend}; +use crate::update::new::ref_cell_ext::RefCellExt as _; +use crate::update::new::DocumentChange; +use crate::update::GrenadParameters; +use crate::{lat_lng_to_xyz, DocumentId, GeoPoint, Index, InternalError, Object, Result}; + +pub struct GeoExtractor { + grenad_parameters: GrenadParameters, +} + +impl GeoExtractor { + pub fn new( + rtxn: &RoTxn, + index: &Index, + grenad_parameters: GrenadParameters, + ) -> Result> { + let is_sortable = index.sortable_fields(rtxn)?.contains("_geo"); + let is_filterable = index.filterable_fields(rtxn)?.contains("_geo"); + if is_sortable || is_filterable { + Ok(Some(GeoExtractor { grenad_parameters })) + } else { + Ok(None) + } + } +} + +#[derive(Pod, Zeroable, Copy, Clone)] +#[repr(C, packed)] +pub struct ExtractedGeoPoint { + pub docid: DocumentId, + pub lat_lng: [f64; 2], +} + +impl From for GeoPoint { + /// Converts the latitude and longitude back to an xyz GeoPoint. + fn from(value: ExtractedGeoPoint) -> Self { + let [lat, lng] = value.lat_lng; + let point = [lat, lng]; + let xyz_point = lat_lng_to_xyz(&point); + GeoPoint::new(xyz_point, (value.docid, point)) + } +} + +pub struct GeoExtractorData<'extractor> { + /// The set of documents ids that were removed. If a document sees its geo + /// point being updated, we first put it in the deleted and then in the inserted. 
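+    /// Once the extractor exceeds its memory budget, additional points are
+    /// written to `spilled_removed` instead of growing this vector.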
+ removed: bumpalo::collections::Vec<'extractor, ExtractedGeoPoint>, + inserted: bumpalo::collections::Vec<'extractor, ExtractedGeoPoint>, + /// TODO Do the doc + spilled_removed: Option>, + /// TODO Do the doc + spilled_inserted: Option>, +} + +impl<'extractor> GeoExtractorData<'extractor> { + pub fn freeze(self) -> Result> { + let GeoExtractorData { removed, inserted, spilled_removed, spilled_inserted } = self; + + Ok(FrozenGeoExtractorData { + removed: removed.into_bump_slice(), + inserted: inserted.into_bump_slice(), + spilled_removed: spilled_removed + .map(|bw| bw.into_inner().map(BufReader::new).map_err(|iie| iie.into_error())) + .transpose()?, + spilled_inserted: spilled_inserted + .map(|bw| bw.into_inner().map(BufReader::new).map_err(|iie| iie.into_error())) + .transpose()?, + }) + } +} + +unsafe impl MostlySend for GeoExtractorData<'_> {} + +pub struct FrozenGeoExtractorData<'extractor> { + pub removed: &'extractor [ExtractedGeoPoint], + pub inserted: &'extractor [ExtractedGeoPoint], + pub spilled_removed: Option>, + pub spilled_inserted: Option>, +} + +impl<'extractor> FrozenGeoExtractorData<'extractor> { + pub fn iter_and_clear_removed( + &mut self, + ) -> impl IntoIterator> + '_ { + mem::take(&mut self.removed) + .iter() + .copied() + .map(Ok) + .chain(iterator_over_spilled_geopoints(&mut self.spilled_removed)) + } + + pub fn iter_and_clear_inserted( + &mut self, + ) -> impl IntoIterator> + '_ { + mem::take(&mut self.inserted) + .iter() + .copied() + .map(Ok) + .chain(iterator_over_spilled_geopoints(&mut self.spilled_inserted)) + } +} + +fn iterator_over_spilled_geopoints( + spilled: &mut Option>, +) -> impl IntoIterator> + '_ { + let mut spilled = spilled.take(); + iter::from_fn(move || match &mut spilled { + Some(file) => { + let geopoint_bytes = &mut [0u8; mem::size_of::()]; + match file.read_exact(geopoint_bytes) { + Ok(()) => Some(Ok(pod_read_unaligned(geopoint_bytes))), + Err(e) if e.kind() == ErrorKind::UnexpectedEof => None, + Err(e) => Some(Err(e)), + } + } + None => None, + }) +} + +impl<'extractor> Extractor<'extractor> for GeoExtractor { + type Data = RefCell>; + + fn init_data<'doc>(&'doc self, extractor_alloc: &'extractor Bump) -> Result { + Ok(RefCell::new(GeoExtractorData { + removed: bumpalo::collections::Vec::new_in(extractor_alloc), + // inserted: Uell::new_in(extractor_alloc), + inserted: bumpalo::collections::Vec::new_in(extractor_alloc), + spilled_inserted: None, + spilled_removed: None, + })) + } + + fn process<'doc>( + &'doc self, + changes: impl Iterator>>, + context: &'doc DocumentChangeContext, + ) -> Result<()> { + let rtxn = &context.rtxn; + let index = context.index; + let max_memory = self.grenad_parameters.max_memory; + let db_fields_ids_map = context.db_fields_ids_map; + let mut data_ref = context.data.borrow_mut_or_yield(); + + for change in changes { + if max_memory.map_or(false, |mm| context.extractor_alloc.allocated_bytes() >= mm) { + // We must spill as we allocated too much memory + data_ref.spilled_removed = tempfile::tempfile().map(BufWriter::new).map(Some)?; + data_ref.spilled_inserted = tempfile::tempfile().map(BufWriter::new).map(Some)?; + } + + match change? { + DocumentChange::Deletion(deletion) => { + let docid = deletion.docid(); + let external_id = deletion.external_document_id(); + let current = deletion.current(rtxn, index, db_fields_ids_map)?; + let current_geo = current + .geo_field()? 
+ .map(|geo| extract_geo_coordinates(external_id, geo)) + .transpose()?; + + if let Some(lat_lng) = current_geo.flatten() { + let geopoint = ExtractedGeoPoint { docid, lat_lng }; + match &mut data_ref.spilled_removed { + Some(file) => file.write_all(bytes_of(&geopoint))?, + None => data_ref.removed.push(geopoint), + } + } + } + DocumentChange::Update(update) => { + let current = update.current(rtxn, index, db_fields_ids_map)?; + let external_id = update.external_document_id(); + let docid = update.docid(); + + let current_geo = current + .geo_field()? + .map(|geo| extract_geo_coordinates(external_id, geo)) + .transpose()?; + + let updated_geo = update + .updated() + .geo_field()? + .map(|geo| extract_geo_coordinates(external_id, geo)) + .transpose()?; + + if current_geo != updated_geo { + // If the current and new geo points are different it means that + // we need to replace the current by the new point and therefore + // delete the current point from the RTree. + if let Some(lat_lng) = current_geo.flatten() { + let geopoint = ExtractedGeoPoint { docid, lat_lng }; + match &mut data_ref.spilled_removed { + Some(file) => file.write_all(bytes_of(&geopoint))?, + None => data_ref.removed.push(geopoint), + } + } + + if let Some(lat_lng) = updated_geo.flatten() { + let geopoint = ExtractedGeoPoint { docid, lat_lng }; + match &mut data_ref.spilled_inserted { + Some(file) => file.write_all(bytes_of(&geopoint))?, + None => data_ref.inserted.push(geopoint), + } + } + } + } + DocumentChange::Insertion(insertion) => { + let external_id = insertion.external_document_id(); + let docid = insertion.docid(); + + let inserted_geo = insertion + .inserted() + .geo_field()? + .map(|geo| extract_geo_coordinates(external_id, geo)) + .transpose()?; + + if let Some(lat_lng) = inserted_geo.flatten() { + let geopoint = ExtractedGeoPoint { docid, lat_lng }; + match &mut data_ref.spilled_inserted { + Some(file) => file.write_all(bytes_of(&geopoint))?, + None => data_ref.inserted.push(geopoint), + } + } + } + } + } + + Ok(()) + } +} + +/// Extracts and validate the latitude and latitude from a document geo field. +/// +/// It can be of the form `{ "lat": 0.0, "lng": "1.0" }`. +fn extract_geo_coordinates(external_id: &str, raw_value: &RawValue) -> Result> { + let mut geo = match serde_json::from_str(raw_value.get()).map_err(InternalError::SerdeJson)? { + Value::Null => return Ok(None), + Value::Object(map) => map, + value => { + return Err( + GeoError::NotAnObject { document_id: Value::from(external_id), value }.into() + ) + } + }; + + let [lat, lng] = match (geo.remove("lat"), geo.remove("lng")) { + (Some(lat), Some(lng)) => [lat, lng], + (Some(_), None) => { + return Err(GeoError::MissingLatitude { document_id: Value::from(external_id) }.into()) + } + (None, Some(_)) => { + return Err(GeoError::MissingLongitude { document_id: Value::from(external_id) }.into()) + } + (None, None) => { + return Err(GeoError::MissingLatitudeAndLongitude { + document_id: Value::from(external_id), + } + .into()) + } + }; + + let lat = extract_finite_float_from_value(lat) + .map_err(|value| GeoError::BadLatitude { document_id: Value::from(external_id), value })?; + + let lng = extract_finite_float_from_value(lng) + .map_err(|value| GeoError::BadLongitude { document_id: Value::from(external_id), value })?; + + Ok(Some([lat, lng])) +} + +/// Extracts and validate that a serde JSON Value is actually a finite f64. 
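+///
+/// A minimal usage sketch (hypothetical doc-test, `ignore`d on purpose):
+///
+/// ```ignore
+/// use serde_json::json;
+///
+/// assert_eq!(extract_finite_float_from_value(json!(12)), Ok(12.0));
+/// assert_eq!(extract_finite_float_from_value(json!("1.5")), Ok(1.5));
+/// // Non-numeric strings and non-finite numbers are handed back as `Err(value)`.
+/// assert!(extract_finite_float_from_value(json!("north")).is_err());
+/// assert!(extract_finite_float_from_value(json!("NaN")).is_err());
+/// ```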
+pub fn extract_finite_float_from_value(value: Value) -> result::Result { + let number = match value { + Value::Number(ref n) => match n.as_f64() { + Some(number) => number, + None => return Err(value), + }, + Value::String(ref s) => match s.parse::() { + Ok(number) => number, + Err(_) => return Err(value), + }, + value => return Err(value), + }; + + if number.is_finite() { + Ok(number) + } else { + Err(value) + } +} diff --git a/crates/milli/src/update/new/extract/mod.rs b/crates/milli/src/update/new/extract/mod.rs index af6a29d07..14cfa83cb 100644 --- a/crates/milli/src/update/new/extract/mod.rs +++ b/crates/milli/src/update/new/extract/mod.rs @@ -1,6 +1,7 @@ mod cache; mod documents; mod faceted; +mod geo; mod searchable; mod vectors; @@ -8,6 +9,7 @@ use bumpalo::Bump; pub use cache::{merge_caches, transpose_and_freeze_caches, BalancedCaches, DelAddRoaringBitmap}; pub use documents::*; pub use faceted::*; +pub use geo::*; pub use searchable::*; pub use vectors::EmbeddingExtractor; diff --git a/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs b/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs index 89583bd93..0223895e6 100644 --- a/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs +++ b/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs @@ -326,7 +326,7 @@ impl WordDocidsExtractors { document_change: DocumentChange, ) -> Result<()> { let index = &context.index; - let rtxn = &context.txn; + let rtxn = &context.rtxn; let mut cached_sorter_ref = context.data.borrow_mut_or_yield(); let cached_sorter = cached_sorter_ref.as_mut().unwrap(); let mut new_fields_ids_map = context.new_fields_ids_map.borrow_mut_or_yield(); diff --git a/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs b/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs index 7f9fff38f..f637cff49 100644 --- a/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs +++ b/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs @@ -39,7 +39,7 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { let doc_alloc = &context.doc_alloc; let index = context.index; - let rtxn = &context.txn; + let rtxn = &context.rtxn; let mut key_buffer = bumpalo::collections::Vec::new_in(doc_alloc); let mut del_word_pair_proximity = bumpalo::collections::Vec::new_in(doc_alloc); diff --git a/crates/milli/src/update/new/extract/vectors/mod.rs b/crates/milli/src/update/new/extract/vectors/mod.rs index 3a73ff82f..2fb717c71 100644 --- a/crates/milli/src/update/new/extract/vectors/mod.rs +++ b/crates/milli/src/update/new/extract/vectors/mod.rs @@ -2,13 +2,13 @@ use std::cell::RefCell; use bumpalo::collections::Vec as BVec; use bumpalo::Bump; -use hashbrown::HashMap; +use hashbrown::{DefaultHashBuilder, HashMap}; use super::cache::DelAddRoaringBitmap; use crate::error::FaultSource; use crate::prompt::Prompt; use crate::update::new::channel::EmbeddingSender; -use crate::update::new::indexer::document_changes::{Extractor, MostlySend}; +use crate::update::new::indexer::document_changes::{DocumentChangeContext, Extractor, MostlySend}; use crate::update::new::vector_document::VectorDocument; use crate::update::new::DocumentChange; use crate::vector::error::{ @@ -37,7 +37,7 @@ impl<'a> EmbeddingExtractor<'a> { } pub struct EmbeddingExtractorData<'extractor>( - pub HashMap, + pub HashMap, ); unsafe impl MostlySend for EmbeddingExtractorData<'_> {} @@ 
-52,9 +52,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { fn process<'doc>( &'doc self, changes: impl Iterator>>, - context: &'doc crate::update::new::indexer::document_changes::DocumentChangeContext< - Self::Data, - >, + context: &'doc DocumentChangeContext, ) -> crate::Result<()> { let embedders = self.embedders.inner_as_ref(); let mut unused_vectors_distribution = @@ -63,7 +61,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { let mut all_chunks = BVec::with_capacity_in(embedders.len(), &context.doc_alloc); for (embedder_name, (embedder, prompt, _is_quantized)) in embedders { let embedder_id = - context.index.embedder_category_id.get(&context.txn, embedder_name)?.ok_or_else( + context.index.embedder_category_id.get(&context.rtxn, embedder_name)?.ok_or_else( || InternalError::DatabaseMissingEntry { db_name: "embedder_category_id", key: None, @@ -95,7 +93,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { } DocumentChange::Update(update) => { let old_vectors = update.current_vectors( - &context.txn, + &context.rtxn, context.index, context.db_fields_ids_map, &context.doc_alloc, @@ -132,7 +130,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { } else if new_vectors.regenerate { let new_rendered = prompt.render_document( update.current( - &context.txn, + &context.rtxn, context.index, context.db_fields_ids_map, )?, @@ -141,7 +139,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { )?; let old_rendered = prompt.render_document( update.merged( - &context.txn, + &context.rtxn, context.index, context.db_fields_ids_map, )?, @@ -160,7 +158,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { } else if old_vectors.regenerate { let old_rendered = prompt.render_document( update.current( - &context.txn, + &context.rtxn, context.index, context.db_fields_ids_map, )?, @@ -169,7 +167,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { )?; let new_rendered = prompt.render_document( update.merged( - &context.txn, + &context.rtxn, context.index, context.db_fields_ids_map, )?, diff --git a/crates/milli/src/update/new/indexer/document_changes.rs b/crates/milli/src/update/new/indexer/document_changes.rs index b9bf79e47..e4b088f31 100644 --- a/crates/milli/src/update/new/indexer/document_changes.rs +++ b/crates/milli/src/update/new/indexer/document_changes.rs @@ -197,7 +197,7 @@ pub struct DocumentChangeContext< /// inside of the DB. pub db_fields_ids_map: &'indexer FieldsIdsMap, /// A transaction providing data from the DB before all indexing operations - pub txn: RoTxn<'indexer>, + pub rtxn: RoTxn<'indexer>, /// Global field id map that is up to date with the current state of the indexing process. 
/// @@ -255,7 +255,7 @@ impl< let txn = index.read_txn()?; Ok(DocumentChangeContext { index, - txn, + rtxn: txn, db_fields_ids_map, new_fields_ids_map: fields_ids_map, doc_alloc, diff --git a/crates/milli/src/update/new/indexer/document_deletion.rs b/crates/milli/src/update/new/indexer/document_deletion.rs index d7648acd8..e89b04223 100644 --- a/crates/milli/src/update/new/indexer/document_deletion.rs +++ b/crates/milli/src/update/new/indexer/document_deletion.rs @@ -63,7 +63,7 @@ impl<'pl> DocumentChanges<'pl> for DocumentDeletionChanges<'pl> { where 'pl: 'doc, // the payload must survive the process calls { - let current = context.index.document(&context.txn, *docid)?; + let current = context.index.document(&context.rtxn, *docid)?; let external_document_id = self.primary_key.extract_docid_from_db( current, diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 6d1d0eea8..e3b24642e 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -33,6 +33,7 @@ use crate::index::main_key::{WORDS_FST_KEY, WORDS_PREFIXES_FST_KEY}; use crate::proximity::ProximityPrecision; use crate::update::del_add::DelAdd; use crate::update::new::extract::EmbeddingExtractor; +use crate::update::new::merger::merge_and_send_rtree; use crate::update::new::words_prefix_docids::compute_exact_word_prefix_docids; use crate::update::new::{merge_and_send_docids, merge_and_send_facet_docids, FacetDatabases}; use crate::update::settings::InnerIndexSettings; @@ -57,6 +58,7 @@ mod steps { "extracting words", "extracting word proximity", "extracting embeddings", + "writing geo points", "writing to database", "writing embeddings to database", "waiting for extractors", @@ -93,29 +95,33 @@ mod steps { step(4) } - pub const fn write_db() -> (u16, &'static str) { + pub const fn extract_geo_points() -> (u16, &'static str) { step(5) } - pub const fn write_embedding_db() -> (u16, &'static str) { + pub const fn write_db() -> (u16, &'static str) { step(6) } - pub const fn waiting_extractors() -> (u16, &'static str) { + pub const fn write_embedding_db() -> (u16, &'static str) { step(7) } - pub const fn post_processing_facets() -> (u16, &'static str) { + pub const fn waiting_extractors() -> (u16, &'static str) { step(8) } - pub const fn post_processing_words() -> (u16, &'static str) { + pub const fn post_processing_facets() -> (u16, &'static str) { step(9) } - pub const fn finalizing() -> (u16, &'static str) { + pub const fn post_processing_words() -> (u16, &'static str) { step(10) } + + pub const fn finalizing() -> (u16, &'static str) { + step(11) + } } /// This is the main function of this crate. 
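The step renumbering above follows from how the indexer reports progress: a fixed array of step names paired with one function per step returning its index, so inserting "writing geo points" in the middle shifts every later step by one. A minimal standalone sketch of that bookkeeping pattern, with a shortened, hypothetical step list rather than the real milli one:

fn step(i: u16) -> (u16, &'static str) {
    const STEP_NAMES: &[&str] =
        &["extracting documents", "writing geo points", "writing to database", "finalizing"];
    (i, STEP_NAMES[i as usize])
}

fn extract_documents() -> (u16, &'static str) { step(0) }
fn extract_geo_points() -> (u16, &'static str) { step(1) }
fn write_db() -> (u16, &'static str) { step(2) }
fn finalizing() -> (u16, &'static str) { step(3) }

fn main() {
    // Each step reports `finished_steps / total_steps` together with a human-readable name.
    let total_steps = 4;
    let (finished_steps, step_name) = extract_geo_points();
    assert_eq!((finished_steps, step_name), (1, "writing geo points"));
    println!("{step_name}: {finished_steps}/{total_steps}");
}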
@@ -144,11 +150,8 @@ where let (extractor_sender, writer_receiver) = extractor_writer_channel(10_000); let metadata_builder = MetadataBuilder::from_index(index, wtxn)?; - let new_fields_ids_map = FieldIdMapWithMetadata::new(new_fields_ids_map, metadata_builder); - let new_fields_ids_map = RwLock::new(new_fields_ids_map); - let fields_ids_map_store = ThreadLocal::with_capacity(pool.current_num_threads()); let mut extractor_allocs = ThreadLocal::with_capacity(pool.current_num_threads()); let doc_allocs = ThreadLocal::with_capacity(pool.current_num_threads()); @@ -328,7 +331,15 @@ where let (finished_steps, step_name) = steps::extract_word_proximity(); - let caches = ::run_extraction(grenad_parameters, document_changes, indexing_context, &mut extractor_allocs, finished_steps, total_steps, step_name)?; + let caches = ::run_extraction(grenad_parameters, + document_changes, + indexing_context, + &mut extractor_allocs, + finished_steps, + total_steps, + step_name, + )?; + merge_and_send_docids( caches, index.word_pair_proximity_docids.remap_types(), @@ -351,8 +362,6 @@ where let extractor = EmbeddingExtractor::new(embedders, &embedding_sender, field_distribution, request_threads()); let mut datastore = ThreadLocal::with_capacity(pool.current_num_threads()); let (finished_steps, step_name) = steps::extract_embeddings(); - - extract(document_changes, &extractor, indexing_context, &mut extractor_allocs, &datastore, finished_steps, total_steps, step_name)?; for config in &mut index_embeddings { @@ -366,6 +375,35 @@ where embedding_sender.finish(index_embeddings).unwrap(); } + 'geo: { + let span = tracing::trace_span!(target: "indexing::documents::extract", "geo"); + let _entered = span.enter(); + + // let geo_sender = extractor_sender.geo_points(); + let Some(extractor) = GeoExtractor::new(&rtxn, index, grenad_parameters)? else { + break 'geo; + }; + let datastore = ThreadLocal::with_capacity(pool.current_num_threads()); + let (finished_steps, step_name) = steps::extract_geo_points(); + extract(document_changes, + &extractor, + indexing_context, + &mut extractor_allocs, + &datastore, + finished_steps, + total_steps, + step_name, + )?; + + merge_and_send_rtree( + datastore, + &rtxn, + index, + extractor_sender.geo(), + &indexing_context.must_stop_processing, + )?; + } + // TODO THIS IS TOO MUCH // - [ ] Extract fieldid docid facet number // - [ ] Extract fieldid docid facet string diff --git a/crates/milli/src/update/new/indexer/update_by_function.rs b/crates/milli/src/update/new/indexer/update_by_function.rs index eb7252445..f6df3981d 100644 --- a/crates/milli/src/update/new/indexer/update_by_function.rs +++ b/crates/milli/src/update/new/indexer/update_by_function.rs @@ -93,7 +93,7 @@ impl<'index> DocumentChanges<'index> for UpdateByFunctionChanges<'index> { let DocumentChangeContext { index, db_fields_ids_map, - txn, + rtxn: txn, new_fields_ids_map, doc_alloc, .. 
diff --git a/crates/milli/src/update/new/merger.rs b/crates/milli/src/update/new/merger.rs index 4eca113ea..c81f84f43 100644 --- a/crates/milli/src/update/new/merger.rs +++ b/crates/milli/src/update/new/merger.rs @@ -1,68 +1,63 @@ -use std::io::{self}; +use std::cell::RefCell; +use std::io; -use bincode::ErrorKind; use hashbrown::HashSet; use heed::types::Bytes; use heed::{Database, RoTxn}; +use memmap2::Mmap; use rayon::iter::{IntoParallelIterator, ParallelIterator}; use roaring::RoaringBitmap; use super::channel::*; use super::extract::{ merge_caches, transpose_and_freeze_caches, BalancedCaches, DelAddRoaringBitmap, FacetKind, + GeoExtractorData, }; -use super::DocumentChange; -use crate::{ - CboRoaringBitmapCodec, Error, FieldId, GeoPoint, GlobalFieldsIdsMap, Index, InternalError, - Result, -}; +use crate::{CboRoaringBitmapCodec, FieldId, GeoPoint, Index, InternalError, Result}; -pub struct GeoExtractor { - rtree: Option>, -} +#[tracing::instrument(level = "trace", skip_all, target = "indexing::merge")] +pub fn merge_and_send_rtree<'extractor, MSP>( + datastore: impl IntoIterator>>, + rtxn: &RoTxn, + index: &Index, + geo_sender: GeoSender<'_>, + must_stop_processing: &MSP, +) -> Result<()> +where + MSP: Fn() -> bool + Sync, +{ + let mut rtree = index.geo_rtree(rtxn)?.unwrap_or_default(); + let mut faceted = index.geo_faceted_documents_ids(rtxn)?; -impl GeoExtractor { - pub fn new(rtxn: &RoTxn, index: &Index) -> Result> { - let is_sortable = index.sortable_fields(rtxn)?.contains("_geo"); - let is_filterable = index.filterable_fields(rtxn)?.contains("_geo"); - if is_sortable || is_filterable { - Ok(Some(GeoExtractor { rtree: index.geo_rtree(rtxn)? })) - } else { - Ok(None) + for data in datastore { + if must_stop_processing() { + return Err(InternalError::AbortedIndexation.into()); + } + + let mut frozen = data.into_inner().freeze()?; + for result in frozen.iter_and_clear_removed() { + let extracted_geo_point = result?; + debug_assert!(rtree.remove(&GeoPoint::from(extracted_geo_point)).is_some()); + debug_assert!(faceted.remove(extracted_geo_point.docid)); + } + + for result in frozen.iter_and_clear_inserted() { + let extracted_geo_point = result?; + rtree.insert(GeoPoint::from(extracted_geo_point)); + debug_assert!(faceted.insert(extracted_geo_point.docid)); } } - pub fn manage_change( - &mut self, - fidmap: &mut GlobalFieldsIdsMap, - change: &DocumentChange, - ) -> Result<()> { - match change { - DocumentChange::Deletion(_) => todo!(), - DocumentChange::Update(_) => todo!(), - DocumentChange::Insertion(_) => todo!(), - } - } + let mut file = tempfile::tempfile()?; + /// manage error + bincode::serialize_into(&mut file, dbg!(&rtree)).unwrap(); + file.sync_all()?; - pub fn serialize_rtree(self, writer: &mut W) -> Result { - match self.rtree { - Some(rtree) => { - // TODO What should I do? - bincode::serialize_into(writer, &rtree).map(|_| true).map_err(|e| match *e { - ErrorKind::Io(e) => Error::IoError(e), - ErrorKind::InvalidUtf8Encoding(_) => todo!(), - ErrorKind::InvalidBoolEncoding(_) => todo!(), - ErrorKind::InvalidCharEncoding => todo!(), - ErrorKind::InvalidTagEncoding(_) => todo!(), - ErrorKind::DeserializeAnyNotSupported => todo!(), - ErrorKind::SizeLimit => todo!(), - ErrorKind::SequenceMustHaveLength => todo!(), - ErrorKind::Custom(_) => todo!(), - }) - } - None => Ok(false), - } - } + let rtree_mmap = unsafe { Mmap::map(&file)? 
}; + geo_sender.set_rtree(rtree_mmap).unwrap(); + geo_sender.set_geo_faceted(&faceted).unwrap(); + + Ok(()) } #[tracing::instrument(level = "trace", skip_all, target = "indexing::merge")] From 51b6293738168e620f127daea0159311d30bea76 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 13 Nov 2024 11:34:49 +0100 Subject: [PATCH 221/247] Add linear facet databases --- .../src/heed_codec/facet/ordered_f64_codec.rs | 29 +++- crates/milli/src/update/new/channel.rs | 40 +++++ .../new/extract/faceted/extract_facets.rs | 154 ++++++++++++++++-- crates/milli/src/update/new/indexer/mod.rs | 2 +- 4 files changed, 202 insertions(+), 23 deletions(-) diff --git a/crates/milli/src/heed_codec/facet/ordered_f64_codec.rs b/crates/milli/src/heed_codec/facet/ordered_f64_codec.rs index 4eccdb68b..19ba7a460 100644 --- a/crates/milli/src/heed_codec/facet/ordered_f64_codec.rs +++ b/crates/milli/src/heed_codec/facet/ordered_f64_codec.rs @@ -27,17 +27,34 @@ impl heed::BytesEncode<'_> for OrderedF64Codec { fn bytes_encode(f: &Self::EItem) -> Result, BoxedError> { let mut buffer = [0u8; 16]; - // write the globally ordered float - let bytes = f64_into_bytes(*f).ok_or(InvalidGloballyOrderedFloatError { float: *f })?; - buffer[..8].copy_from_slice(&bytes[..]); - // Then the f64 value just to be able to read it back - let bytes = f.to_be_bytes(); - buffer[8..16].copy_from_slice(&bytes[..]); + encode_f64_into_ordered_bytes(*f, &mut buffer)?; Ok(Cow::Owned(buffer.to_vec())) } } +impl OrderedF64Codec { + pub fn serialize_into( + f: f64, + buffer: &mut [u8; 16], + ) -> Result<(), InvalidGloballyOrderedFloatError> { + encode_f64_into_ordered_bytes(f, buffer) + } +} + +fn encode_f64_into_ordered_bytes( + f: f64, + buffer: &mut [u8; 16], +) -> Result<(), InvalidGloballyOrderedFloatError> { + let bytes = f64_into_bytes(f).ok_or(InvalidGloballyOrderedFloatError { float: f })?; + buffer[..8].copy_from_slice(&bytes[..]); + // Then the f64 value just to be able to read it back + let bytes = f.to_be_bytes(); + buffer[8..16].copy_from_slice(&bytes[..]); + + Ok(()) +} + #[derive(Error, Debug)] #[error("the float {float} cannot be converted to a globally ordered representation")] pub struct InvalidGloballyOrderedFloatError { diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index 2027b4db8..3287a1f7f 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -3,11 +3,13 @@ use std::sync::atomic::{AtomicUsize, Ordering}; use crossbeam_channel::{IntoIter, Receiver, SendError, Sender}; use heed::types::Bytes; +use heed::BytesDecode; use memmap2::Mmap; use roaring::RoaringBitmap; use super::extract::FacetKind; use super::StdResult; +use crate::heed_codec::facet::{FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec}; use crate::index::main_key::{GEO_FACETED_DOCUMENTS_IDS_KEY, GEO_RTREE_KEY}; use crate::index::IndexEmbeddingConfig; use crate::update::new::KvReaderFieldId; @@ -125,6 +127,8 @@ pub enum Database { FacetIdExistsDocids, FacetIdF64NumberDocids, FacetIdStringDocids, + FieldIdDocidFacetStrings, + FieldIdDocidFacetF64s, } impl Database { @@ -144,6 +148,8 @@ impl Database { Database::FacetIdExistsDocids => index.facet_id_exists_docids.remap_types(), Database::FacetIdF64NumberDocids => index.facet_id_f64_docids.remap_types(), Database::FacetIdStringDocids => index.facet_id_string_docids.remap_types(), + Database::FieldIdDocidFacetStrings => index.field_id_docid_facet_strings.remap_types(), + Database::FieldIdDocidFacetF64s => 
index.field_id_docid_facet_f64s.remap_types(), } } } @@ -215,6 +221,10 @@ impl ExtractorSender { FacetDocidsSender { sender: self } } + pub fn field_id_docid_facet_sender(&self) -> FieldIdDocidFacetSender<'_> { + FieldIdDocidFacetSender(self) + } + pub fn documents(&self) -> DocumentsSender<'_> { DocumentsSender(self) } @@ -351,6 +361,36 @@ impl DocidsSender for FacetDocidsSender<'_> { } } +pub struct FieldIdDocidFacetSender<'a>(&'a ExtractorSender); + +impl FieldIdDocidFacetSender<'_> { + pub fn write_facet_string(&self, key: &[u8]) -> StdResult<(), SendError<()>> { + debug_assert!(FieldDocIdFacetStringCodec::bytes_decode(key).is_ok()); + let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value(&key, &[])); + self.0 + .send_db_operation(DbOperation { database: Database::FieldIdDocidFacetStrings, entry }) + } + + pub fn write_facet_f64(&self, key: &[u8]) -> StdResult<(), SendError<()>> { + debug_assert!(FieldDocIdFacetF64Codec::bytes_decode(key).is_ok()); + let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value(&key, &[])); + self.0.send_db_operation(DbOperation { database: Database::FieldIdDocidFacetF64s, entry }) + } + + pub fn delete_facet_string(&self, key: &[u8]) -> StdResult<(), SendError<()>> { + debug_assert!(FieldDocIdFacetStringCodec::bytes_decode(key).is_ok()); + let entry = EntryOperation::Delete(KeyEntry::from_key(key)); + self.0 + .send_db_operation(DbOperation { database: Database::FieldIdDocidFacetStrings, entry }) + } + + pub fn delete_facet_f64(&self, key: &[u8]) -> StdResult<(), SendError<()>> { + debug_assert!(FieldDocIdFacetF64Codec::bytes_decode(key).is_ok()); + let entry = EntryOperation::Delete(KeyEntry::from_key(key)); + self.0.send_db_operation(DbOperation { database: Database::FieldIdDocidFacetF64s, entry }) + } +} + pub struct DocumentsSender<'a>(&'a ExtractorSender); impl DocumentsSender<'_> { diff --git a/crates/milli/src/update/new/extract/faceted/extract_facets.rs b/crates/milli/src/update/new/extract/faceted/extract_facets.rs index d0dc425ae..0e7dcc4b9 100644 --- a/crates/milli/src/update/new/extract/faceted/extract_facets.rs +++ b/crates/milli/src/update/new/extract/faceted/extract_facets.rs @@ -1,16 +1,21 @@ use std::cell::RefCell; use std::collections::HashSet; +use std::mem::size_of; use std::ops::DerefMut as _; +use bumpalo::collections::Vec as BVec; use bumpalo::Bump; -use heed::RoTxn; +use hashbrown::HashMap; +use heed::{BytesDecode, RoTxn}; use serde_json::Value; use super::super::cache::BalancedCaches; use super::facet_document::extract_document_facets; use super::FacetKind; use crate::facet::value_encoding::f64_into_bytes; -use crate::update::new::extract::DocidsExtractor; +use crate::heed_codec::facet::OrderedF64Codec; +use crate::update::del_add::DelAdd; +use crate::update::new::channel::FieldIdDocidFacetSender; use crate::update::new::indexer::document_changes::{ extract, DocumentChangeContext, DocumentChanges, Extractor, FullySend, IndexingContext, Progress, ThreadLocal, @@ -22,6 +27,7 @@ use crate::{DocumentId, FieldId, Index, Result, MAX_FACET_VALUE_LENGTH}; pub struct FacetedExtractorData<'a> { attributes_to_extract: &'a [&'a str], + sender: &'a FieldIdDocidFacetSender<'a>, grenad_parameters: GrenadParameters, buckets: usize, } @@ -48,6 +54,7 @@ impl<'a, 'extractor> Extractor<'extractor> for FacetedExtractorData<'a> { context, self.attributes_to_extract, change, + self.sender, )? 
} Ok(()) @@ -61,12 +68,15 @@ impl FacetedDocidsExtractor { context: &DocumentChangeContext>, attributes_to_extract: &[&str], document_change: DocumentChange, + sender: &FieldIdDocidFacetSender, ) -> Result<()> { let index = &context.index; let rtxn = &context.rtxn; let mut new_fields_ids_map = context.new_fields_ids_map.borrow_mut_or_yield(); let mut cached_sorter = context.data.borrow_mut_or_yield(); - match document_change { + let mut del_add_facet_value = DelAddFacetValue::new(&context.doc_alloc); + let docid = document_change.docid(); + let res = match document_change { DocumentChange::Deletion(inner) => extract_document_facets( attributes_to_extract, inner.current(rtxn, index, context.db_fields_ids_map)?, @@ -76,7 +86,9 @@ impl FacetedDocidsExtractor { &context.doc_alloc, cached_sorter.deref_mut(), BalancedCaches::insert_del_u32, - inner.docid(), + &mut del_add_facet_value, + DelAddFacetValue::insert_del, + docid, fid, value, ) @@ -92,7 +104,9 @@ impl FacetedDocidsExtractor { &context.doc_alloc, cached_sorter.deref_mut(), BalancedCaches::insert_del_u32, - inner.docid(), + &mut del_add_facet_value, + DelAddFacetValue::insert_del, + docid, fid, value, ) @@ -108,7 +122,9 @@ impl FacetedDocidsExtractor { &context.doc_alloc, cached_sorter.deref_mut(), BalancedCaches::insert_add_u32, - inner.docid(), + &mut del_add_facet_value, + DelAddFacetValue::insert_add, + docid, fid, value, ) @@ -124,24 +140,31 @@ impl FacetedDocidsExtractor { &context.doc_alloc, cached_sorter.deref_mut(), BalancedCaches::insert_add_u32, - inner.docid(), + &mut del_add_facet_value, + DelAddFacetValue::insert_add, + docid, fid, value, ) }, ), - } + }; + + del_add_facet_value.send_data(docid, sender, &context.doc_alloc).unwrap(); + res } - fn facet_fn_with_options<'extractor>( - doc_alloc: &Bump, + fn facet_fn_with_options<'extractor, 'doc>( + doc_alloc: &'doc Bump, cached_sorter: &mut BalancedCaches<'extractor>, cache_fn: impl Fn(&mut BalancedCaches<'extractor>, &[u8], u32) -> Result<()>, + del_add_facet_value: &mut DelAddFacetValue<'doc>, + facet_fn: impl Fn(&mut DelAddFacetValue<'doc>, FieldId, BVec<'doc, u8>, FacetKind), docid: DocumentId, fid: FieldId, value: &Value, ) -> Result<()> { - let mut buffer = bumpalo::collections::Vec::new_in(doc_alloc); + let mut buffer = BVec::new_in(doc_alloc); // Exists // key: fid buffer.push(FacetKind::Exists as u8); @@ -152,15 +175,21 @@ impl FacetedDocidsExtractor { // Number // key: fid - level - orderedf64 - orignalf64 Value::Number(number) => { - if let Some((n, ordered)) = - number.as_f64().and_then(|n| f64_into_bytes(n).map(|ordered| (n, ordered))) + let mut ordered = [0u8; 16]; + if number + .as_f64() + .and_then(|n| OrderedF64Codec::serialize_into(n, &mut ordered).ok()) + .is_some() { + let mut number = BVec::with_capacity_in(16, doc_alloc); + number.extend_from_slice(&ordered); + facet_fn(del_add_facet_value, fid, number, FacetKind::Number); + buffer.clear(); buffer.push(FacetKind::Number as u8); buffer.extend_from_slice(&fid.to_be_bytes()); buffer.push(0); // level 0 buffer.extend_from_slice(&ordered); - buffer.extend_from_slice(&n.to_be_bytes()); cache_fn(cached_sorter, &buffer, docid) } else { Ok(()) @@ -169,6 +198,10 @@ impl FacetedDocidsExtractor { // String // key: fid - level - truncated_string Value::String(s) => { + let mut string = BVec::new_in(doc_alloc); + string.extend_from_slice(s.as_bytes()); + facet_fn(del_add_facet_value, fid, string, FacetKind::String); + let normalized = crate::normalize_facet(s); let truncated = truncate_str(&normalized); 
buffer.clear(); @@ -211,6 +244,84 @@ impl FacetedDocidsExtractor { } } +struct DelAddFacetValue<'doc> { + strings: HashMap<(FieldId, BVec<'doc, u8>), DelAdd, hashbrown::DefaultHashBuilder, &'doc Bump>, + f64s: HashMap<(FieldId, BVec<'doc, u8>), DelAdd, hashbrown::DefaultHashBuilder, &'doc Bump>, +} + +impl<'doc> DelAddFacetValue<'doc> { + fn new(doc_alloc: &'doc Bump) -> Self { + Self { strings: HashMap::new_in(doc_alloc), f64s: HashMap::new_in(doc_alloc) } + } + + fn insert_add(&mut self, fid: FieldId, value: BVec<'doc, u8>, kind: FacetKind) { + let cache = match kind { + FacetKind::String => &mut self.strings, + FacetKind::Number => &mut self.f64s, + _ => return, + }; + + let key = (fid, value); + if let Some(DelAdd::Deletion) = cache.get(&key) { + cache.remove(&key); + } else { + cache.insert(key, DelAdd::Addition); + } + } + + fn insert_del(&mut self, fid: FieldId, value: BVec<'doc, u8>, kind: FacetKind) { + let cache = match kind { + FacetKind::String => &mut self.strings, + FacetKind::Number => &mut self.f64s, + _ => return, + }; + + let key = (fid, value); + if let Some(DelAdd::Addition) = cache.get(&key) { + cache.remove(&key); + } else { + cache.insert(key, DelAdd::Deletion); + } + } + + fn send_data( + self, + docid: DocumentId, + sender: &FieldIdDocidFacetSender, + doc_alloc: &Bump, + ) -> std::result::Result<(), crossbeam_channel::SendError<()>> { + println!("sending FieldIdDocidFacet data"); + let mut count = 0; + let mut buffer = bumpalo::collections::Vec::new_in(doc_alloc); + for ((fid, value), deladd) in self.strings { + buffer.clear(); + buffer.extend_from_slice(&fid.to_be_bytes()); + buffer.extend_from_slice(&docid.to_be_bytes()); + buffer.extend_from_slice(&value); + match deladd { + DelAdd::Deletion => sender.delete_facet_string(&buffer)?, + DelAdd::Addition => sender.write_facet_string(&buffer)?, + } + count += 1; + } + + count = 0; + for ((fid, value), deladd) in self.f64s { + buffer.clear(); + buffer.extend_from_slice(&fid.to_be_bytes()); + buffer.extend_from_slice(&docid.to_be_bytes()); + buffer.extend_from_slice(&value); + match deladd { + DelAdd::Deletion => sender.delete_facet_f64(&buffer)?, + DelAdd::Addition => sender.write_facet_f64(&buffer)?, + } + count += 1; + } + + Ok(()) + } +} + /// Truncates a string to the biggest valid LMDB key size. 
fn truncate_str(s: &str) -> &str { let index = s @@ -223,13 +334,23 @@ fn truncate_str(s: &str) -> &str { &s[..index.unwrap_or(0)] } -impl DocidsExtractor for FacetedDocidsExtractor { +impl FacetedDocidsExtractor { #[tracing::instrument(level = "trace", skip_all, target = "indexing::extract::faceted")] - fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP, SP>( + pub fn run_extraction< + 'pl, + 'fid, + 'indexer, + 'index, + 'extractor, + DC: DocumentChanges<'pl>, + MSP, + SP, + >( grenad_parameters: GrenadParameters, document_changes: &DC, indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>, extractor_allocs: &'extractor mut ThreadLocal>, + sender: &FieldIdDocidFacetSender, finished_steps: u16, total_steps: u16, step_name: &'static str, @@ -254,6 +375,7 @@ impl DocidsExtractor for FacetedDocidsExtractor { attributes_to_extract: &attributes_to_extract, grenad_parameters, buckets: rayon::current_num_threads(), + sender, }; extract( document_changes, diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index e3b24642e..2cdeca76d 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -223,7 +223,7 @@ where let (finished_steps, step_name) = steps::extract_facets(); facet_field_ids_delta = merge_and_send_facet_docids( - FacetedDocidsExtractor::run_extraction(grenad_parameters, document_changes, indexing_context, &mut extractor_allocs, finished_steps, total_steps, step_name)?, + FacetedDocidsExtractor::run_extraction(grenad_parameters, document_changes, indexing_context, &mut extractor_allocs, &extractor_sender.field_id_docid_facet_sender(), finished_steps, total_steps, step_name)?, FacetDatabases::new(index), index, extractor_sender.facet_docids(), From e627e182ce4479d6abf33fdbc37958cb6d35402a Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 13 Nov 2024 13:50:10 +0100 Subject: [PATCH 222/247] Fix facet strings --- crates/milli/src/update/new/channel.rs | 4 ++-- .../new/extract/faceted/extract_facets.rs | 23 +++++++++---------- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index 3287a1f7f..fbf102f18 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -364,9 +364,9 @@ impl DocidsSender for FacetDocidsSender<'_> { pub struct FieldIdDocidFacetSender<'a>(&'a ExtractorSender); impl FieldIdDocidFacetSender<'_> { - pub fn write_facet_string(&self, key: &[u8]) -> StdResult<(), SendError<()>> { + pub fn write_facet_string(&self, key: &[u8], value: &[u8]) -> StdResult<(), SendError<()>> { debug_assert!(FieldDocIdFacetStringCodec::bytes_decode(key).is_ok()); - let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value(&key, &[])); + let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value(&key, &value)); self.0 .send_db_operation(DbOperation { database: Database::FieldIdDocidFacetStrings, entry }) } diff --git a/crates/milli/src/update/new/extract/faceted/extract_facets.rs b/crates/milli/src/update/new/extract/faceted/extract_facets.rs index 0e7dcc4b9..89223bc55 100644 --- a/crates/milli/src/update/new/extract/faceted/extract_facets.rs +++ b/crates/milli/src/update/new/extract/faceted/extract_facets.rs @@ -290,22 +290,22 @@ impl<'doc> DelAddFacetValue<'doc> { sender: &FieldIdDocidFacetSender, doc_alloc: &Bump, ) -> std::result::Result<(), crossbeam_channel::SendError<()>> { - 
println!("sending FieldIdDocidFacet data"); - let mut count = 0; let mut buffer = bumpalo::collections::Vec::new_in(doc_alloc); for ((fid, value), deladd) in self.strings { - buffer.clear(); - buffer.extend_from_slice(&fid.to_be_bytes()); - buffer.extend_from_slice(&docid.to_be_bytes()); - buffer.extend_from_slice(&value); - match deladd { - DelAdd::Deletion => sender.delete_facet_string(&buffer)?, - DelAdd::Addition => sender.write_facet_string(&buffer)?, + if let Ok(s) = std::str::from_utf8(&value) { + buffer.clear(); + buffer.extend_from_slice(&fid.to_be_bytes()); + buffer.extend_from_slice(&docid.to_be_bytes()); + let normalized = crate::normalize_facet(s); + let truncated = truncate_str(&normalized); + buffer.extend_from_slice(truncated.as_bytes()); + match deladd { + DelAdd::Deletion => sender.delete_facet_string(&buffer)?, + DelAdd::Addition => sender.write_facet_string(&buffer, &value)?, + } } - count += 1; } - count = 0; for ((fid, value), deladd) in self.f64s { buffer.clear(); buffer.extend_from_slice(&fid.to_be_bytes()); @@ -315,7 +315,6 @@ impl<'doc> DelAddFacetValue<'doc> { DelAdd::Deletion => sender.delete_facet_f64(&buffer)?, DelAdd::Addition => sender.write_facet_f64(&buffer)?, } - count += 1; } Ok(()) From 8e5b1a3ec198cf280fb08fcbdd988ab8ab9fd13a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 13 Nov 2024 14:15:42 +0100 Subject: [PATCH 223/247] Compute the field distribution and convert _geo into an f64s --- .../tests/documents/add_documents.rs | 30 ++++++------ crates/meilisearch/tests/search/geo.rs | 4 +- crates/milli/src/update/new/document.rs | 2 + .../milli/src/update/new/extract/documents.rs | 12 +++-- .../new/extract/faceted/extract_facets.rs | 8 ++-- .../new/extract/faceted/facet_document.rs | 19 +++++++- .../milli/src/update/new/extract/geo/mod.rs | 47 ++++++++++++++----- .../extract/searchable/extract_word_docids.rs | 2 +- .../extract_word_pair_proximity_docids.rs | 2 +- crates/milli/src/update/new/merger.rs | 2 +- 10 files changed, 86 insertions(+), 42 deletions(-) diff --git a/crates/meilisearch/tests/documents/add_documents.rs b/crates/meilisearch/tests/documents/add_documents.rs index 8c9601e0f..17b1d6697 100644 --- a/crates/meilisearch/tests/documents/add_documents.rs +++ b/crates/meilisearch/tests/documents/add_documents.rs @@ -1664,7 +1664,7 @@ async fn add_documents_invalid_geo_field() { "indexedDocuments": 0 }, "error": { - "message": "The `_geo` field in the document with the id: `11` is not an object. Was expecting an object with the `_geo.lat` and `_geo.lng` fields but instead got `\"foobar\"`.", + "message": "The `_geo` field in the document with the id: `\"11\"` is not an object. Was expecting an object with the `_geo.lat` and `_geo.lng` fields but instead got `\"foobar\"`.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" @@ -1701,7 +1701,7 @@ async fn add_documents_invalid_geo_field() { "indexedDocuments": 0 }, "error": { - "message": "Could not find latitude nor longitude in the document with the id: `11`. Was expecting `_geo.lat` and `_geo.lng` fields.", + "message": "Could not find latitude nor longitude in the document with the id: `\"11\"`. 
Was expecting `_geo.lat` and `_geo.lng` fields.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" @@ -1738,7 +1738,7 @@ async fn add_documents_invalid_geo_field() { "indexedDocuments": 0 }, "error": { - "message": "Could not find latitude nor longitude in the document with the id: `11`. Was expecting `_geo.lat` and `_geo.lng` fields.", + "message": "Could not find latitude nor longitude in the document with the id: `\"11\"`. Was expecting `_geo.lat` and `_geo.lng` fields.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" @@ -1775,7 +1775,7 @@ async fn add_documents_invalid_geo_field() { "indexedDocuments": 0 }, "error": { - "message": "Could not find longitude in the document with the id: `11`. Was expecting a `_geo.lng` field.", + "message": "Could not find longitude in the document with the id: `\"11\"`. Was expecting a `_geo.lng` field.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" @@ -1812,7 +1812,7 @@ async fn add_documents_invalid_geo_field() { "indexedDocuments": 0 }, "error": { - "message": "Could not find latitude in the document with the id: `11`. Was expecting a `_geo.lat` field.", + "message": "Could not find latitude in the document with the id: `\"11\"`. Was expecting a `_geo.lat` field.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" @@ -1849,7 +1849,7 @@ async fn add_documents_invalid_geo_field() { "indexedDocuments": 0 }, "error": { - "message": "Could not find longitude in the document with the id: `11`. Was expecting a `_geo.lng` field.", + "message": "Could not find longitude in the document with the id: `\"11\"`. Was expecting a `_geo.lng` field.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" @@ -1886,7 +1886,7 @@ async fn add_documents_invalid_geo_field() { "indexedDocuments": 0 }, "error": { - "message": "Could not find latitude in the document with the id: `11`. Was expecting a `_geo.lat` field.", + "message": "Could not find latitude in the document with the id: `\"11\"`. Was expecting a `_geo.lat` field.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" @@ -1923,7 +1923,7 @@ async fn add_documents_invalid_geo_field() { "indexedDocuments": 0 }, "error": { - "message": "Could not parse latitude nor longitude in the document with the id: `11`. Was expecting finite numbers but instead got `false` and `true`.", + "message": "Could not parse latitude nor longitude in the document with the id: `\"11\"`. Was expecting finite numbers but instead got `false` and `true`.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" @@ -1960,7 +1960,7 @@ async fn add_documents_invalid_geo_field() { "indexedDocuments": 0 }, "error": { - "message": "Could not find longitude in the document with the id: `11`. Was expecting a `_geo.lng` field.", + "message": "Could not find longitude in the document with the id: `\"11\"`. 
Was expecting a `_geo.lng` field.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" @@ -1997,7 +1997,7 @@ async fn add_documents_invalid_geo_field() { "indexedDocuments": 0 }, "error": { - "message": "Could not find latitude in the document with the id: `11`. Was expecting a `_geo.lat` field.", + "message": "Could not find latitude in the document with the id: `\"11\"`. Was expecting a `_geo.lat` field.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" @@ -2034,7 +2034,7 @@ async fn add_documents_invalid_geo_field() { "indexedDocuments": 0 }, "error": { - "message": "Could not parse latitude nor longitude in the document with the id: `11`. Was expecting finite numbers but instead got `\"doggo\"` and `\"doggo\"`.", + "message": "Could not parse latitude nor longitude in the document with the id: `\"11\"`. Was expecting finite numbers but instead got `\"doggo\"` and `\"doggo\"`.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" @@ -2071,7 +2071,7 @@ async fn add_documents_invalid_geo_field() { "indexedDocuments": 0 }, "error": { - "message": "The `_geo` field in the document with the id: `11` contains the following unexpected fields: `{\"doggo\":\"are the best\"}`.", + "message": "The `_geo` field in the document with the id: `\"11\"` contains the following unexpected fields: `{\"doggo\":\"are the best\"}`.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" @@ -2109,7 +2109,7 @@ async fn add_documents_invalid_geo_field() { "indexedDocuments": 0 }, "error": { - "message": "Could not parse longitude in the document with the id: `12`. Was expecting a finite number but instead got `null`.", + "message": "Could not parse longitude in the document with the id: `\"12\"`. Was expecting a finite number but instead got `null`.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" @@ -2145,7 +2145,7 @@ async fn add_documents_invalid_geo_field() { "indexedDocuments": 0 }, "error": { - "message": "Could not parse latitude in the document with the id: `12`. Was expecting a finite number but instead got `null`.", + "message": "Could not parse latitude in the document with the id: `\"12\"`. Was expecting a finite number but instead got `null`.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" @@ -2181,7 +2181,7 @@ async fn add_documents_invalid_geo_field() { "indexedDocuments": 0 }, "error": { - "message": "Could not parse latitude nor longitude in the document with the id: `13`. Was expecting finite numbers but instead got `null` and `null`.", + "message": "Could not parse latitude nor longitude in the document with the id: `\"13\"`. 
Was expecting finite numbers but instead got `null` and `null`.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" diff --git a/crates/meilisearch/tests/search/geo.rs b/crates/meilisearch/tests/search/geo.rs index 7804f1ad0..e92056191 100644 --- a/crates/meilisearch/tests/search/geo.rs +++ b/crates/meilisearch/tests/search/geo.rs @@ -70,8 +70,8 @@ async fn geo_bounding_box_with_string_and_number() { let documents = DOCUMENTS.clone(); index.update_settings_filterable_attributes(json!(["_geo"])).await; index.update_settings_sortable_attributes(json!(["_geo"])).await; - index.add_documents(documents, None).await; - index.wait_task(2).await; + let (ret, _code) = index.add_documents(documents, None).await; + index.wait_task(ret.uid()).await.succeeded(); index .search( diff --git a/crates/milli/src/update/new/document.rs b/crates/milli/src/update/new/document.rs index 8d4e3b0a9..ddf508ad7 100644 --- a/crates/milli/src/update/new/document.rs +++ b/crates/milli/src/update/new/document.rs @@ -354,6 +354,8 @@ where if let Some(geo_value) = document.geo_field()? { let fid = fields_ids_map.id_or_insert("_geo").ok_or(UserError::AttributeLimitReached)?; + fields_ids_map.id_or_insert("_geo.lat").ok_or(UserError::AttributeLimitReached)?; + fields_ids_map.id_or_insert("_geo.lng").ok_or(UserError::AttributeLimitReached)?; unordered_field_buffer.push((fid, geo_value)); } diff --git a/crates/milli/src/update/new/extract/documents.rs b/crates/milli/src/update/new/extract/documents.rs index b76fe207a..42fce3c3d 100644 --- a/crates/milli/src/update/new/extract/documents.rs +++ b/crates/milli/src/update/new/extract/documents.rs @@ -58,7 +58,8 @@ impl<'a, 'extractor> Extractor<'extractor> for DocumentsExtractor<'a> { context.index, &context.db_fields_ids_map, )?; - for res in content.iter_top_level_fields() { + let geo_iter = content.geo_field().transpose().map(|res| res.map(|rv| ("_geo", rv))); + for res in content.iter_top_level_fields().chain(geo_iter) { let (f, _) = res?; let entry = document_extractor_data .field_distribution_delta @@ -73,7 +74,8 @@ impl<'a, 'extractor> Extractor<'extractor> for DocumentsExtractor<'a> { let docid = update.docid(); let content = update.current(&context.rtxn, context.index, &context.db_fields_ids_map)?; - for res in content.iter_top_level_fields() { + let geo_iter = content.geo_field().transpose().map(|res| res.map(|rv| ("_geo", rv))); + for res in content.iter_top_level_fields().chain(geo_iter) { let (f, _) = res?; let entry = document_extractor_data .field_distribution_delta @@ -82,7 +84,8 @@ impl<'a, 'extractor> Extractor<'extractor> for DocumentsExtractor<'a> { *entry -= 1; } let content = update.updated(); - for res in content.iter_top_level_fields() { + let geo_iter = content.geo_field().transpose().map(|res| res.map(|rv| ("_geo", rv))); + for res in content.iter_top_level_fields().chain(geo_iter) { let (f, _) = res?; let entry = document_extractor_data .field_distribution_delta @@ -111,7 +114,8 @@ impl<'a, 'extractor> Extractor<'extractor> for DocumentsExtractor<'a> { DocumentChange::Insertion(insertion) => { let docid = insertion.docid(); let content = insertion.inserted(); - for res in content.iter_top_level_fields() { + let geo_iter = content.geo_field().transpose().map(|res| res.map(|rv| ("_geo", rv))); + for res in content.iter_top_level_fields().chain(geo_iter) { let (f, _) = res?; let entry = document_extractor_data .field_distribution_delta diff --git 
a/crates/milli/src/update/new/extract/faceted/extract_facets.rs b/crates/milli/src/update/new/extract/faceted/extract_facets.rs index 89223bc55..19e908612 100644 --- a/crates/milli/src/update/new/extract/faceted/extract_facets.rs +++ b/crates/milli/src/update/new/extract/faceted/extract_facets.rs @@ -1,18 +1,16 @@ use std::cell::RefCell; use std::collections::HashSet; -use std::mem::size_of; use std::ops::DerefMut as _; use bumpalo::collections::Vec as BVec; use bumpalo::Bump; use hashbrown::HashMap; -use heed::{BytesDecode, RoTxn}; +use heed::RoTxn; use serde_json::Value; use super::super::cache::BalancedCaches; use super::facet_document::extract_document_facets; use super::FacetKind; -use crate::facet::value_encoding::f64_into_bytes; use crate::heed_codec::facet::OrderedF64Codec; use crate::update::del_add::DelAdd; use crate::update::new::channel::FieldIdDocidFacetSender; @@ -80,6 +78,7 @@ impl FacetedDocidsExtractor { DocumentChange::Deletion(inner) => extract_document_facets( attributes_to_extract, inner.current(rtxn, index, context.db_fields_ids_map)?, + inner.external_document_id(), new_fields_ids_map.deref_mut(), &mut |fid, value| { Self::facet_fn_with_options( @@ -98,6 +97,7 @@ impl FacetedDocidsExtractor { extract_document_facets( attributes_to_extract, inner.current(rtxn, index, context.db_fields_ids_map)?, + inner.external_document_id(), new_fields_ids_map.deref_mut(), &mut |fid, value| { Self::facet_fn_with_options( @@ -116,6 +116,7 @@ impl FacetedDocidsExtractor { extract_document_facets( attributes_to_extract, inner.merged(rtxn, index, context.db_fields_ids_map)?, + inner.external_document_id(), new_fields_ids_map.deref_mut(), &mut |fid, value| { Self::facet_fn_with_options( @@ -134,6 +135,7 @@ impl FacetedDocidsExtractor { DocumentChange::Insertion(inner) => extract_document_facets( attributes_to_extract, inner.inserted(), + inner.external_document_id(), new_fields_ids_map.deref_mut(), &mut |fid, value| { Self::facet_fn_with_options( diff --git a/crates/milli/src/update/new/extract/faceted/facet_document.rs b/crates/milli/src/update/new/extract/faceted/facet_document.rs index 4308d0aa5..141af7fbe 100644 --- a/crates/milli/src/update/new/extract/faceted/facet_document.rs +++ b/crates/milli/src/update/new/extract/faceted/facet_document.rs @@ -1,17 +1,18 @@ use serde_json::Value; use crate::update::new::document::Document; +use crate::update::new::extract::geo::extract_geo_coordinates; use crate::update::new::extract::perm_json_p; use crate::{FieldId, GlobalFieldsIdsMap, InternalError, Result, UserError}; pub fn extract_document_facets<'doc>( attributes_to_extract: &[&str], document: impl Document<'doc>, + external_document_id: &str, field_id_map: &mut GlobalFieldsIdsMap, facet_fn: &mut impl FnMut(FieldId, &Value) -> Result<()>, ) -> Result<()> { - let geo = document.geo_field().transpose().map(|res| res.map(|rval| ("_geo", rval))); - for res in document.iter_top_level_fields().chain(geo) { + for res in document.iter_top_level_fields() { let (field_name, value) = res?; let mut tokenize_field = |name: &str, value: &Value| match field_id_map.id_or_insert(name) { @@ -42,5 +43,19 @@ pub fn extract_document_facets<'doc>( } } + if attributes_to_extract.contains(&"_geo") { + if let Some(geo_value) = document.geo_field()? { + if let Some([lat, lng]) = extract_geo_coordinates(external_document_id, geo_value)? 
{ + let (lat_fid, lng_fid) = field_id_map + .id_or_insert("_geo.lat") + .zip(field_id_map.id_or_insert("_geo.lng")) + .ok_or(UserError::AttributeLimitReached)?; + + facet_fn(lat_fid, &lat.into())?; + facet_fn(lng_fid, &lng.into())?; + } + } + } + Ok(()) } diff --git a/crates/milli/src/update/new/extract/geo/mod.rs b/crates/milli/src/update/new/extract/geo/mod.rs index 180611eee..e26a7dc6c 100644 --- a/crates/milli/src/update/new/extract/geo/mod.rs +++ b/crates/milli/src/update/new/extract/geo/mod.rs @@ -4,7 +4,7 @@ use std::io::{self, BufReader, BufWriter, ErrorKind, Read, Write as _}; use std::{iter, mem, result}; use bumpalo::Bump; -use bytemuck::{bytes_of, from_bytes, pod_read_unaligned, Pod, Zeroable}; +use bytemuck::{bytes_of, pod_read_unaligned, Pod, Zeroable}; use heed::RoTxn; use serde_json::value::RawValue; use serde_json::Value; @@ -15,7 +15,7 @@ use crate::update::new::indexer::document_changes::{DocumentChangeContext, Extra use crate::update::new::ref_cell_ext::RefCellExt as _; use crate::update::new::DocumentChange; use crate::update::GrenadParameters; -use crate::{lat_lng_to_xyz, DocumentId, GeoPoint, Index, InternalError, Object, Result}; +use crate::{lat_lng_to_xyz, DocumentId, GeoPoint, Index, InternalError, Result}; pub struct GeoExtractor { grenad_parameters: GrenadParameters, @@ -244,7 +244,10 @@ impl<'extractor> Extractor<'extractor> for GeoExtractor { /// Extracts and validate the latitude and latitude from a document geo field. /// /// It can be of the form `{ "lat": 0.0, "lng": "1.0" }`. -fn extract_geo_coordinates(external_id: &str, raw_value: &RawValue) -> Result> { +pub fn extract_geo_coordinates( + external_id: &str, + raw_value: &RawValue, +) -> Result> { let mut geo = match serde_json::from_str(raw_value.get()).map_err(InternalError::SerdeJson)? { Value::Null => return Ok(None), Value::Object(map) => map, @@ -256,12 +259,22 @@ fn extract_geo_coordinates(external_id: &str, raw_value: &RawValue) -> Result [lat, lng], + (Some(lat), Some(lng)) => { + if geo.is_empty() { + [lat, lng] + } else { + return Err(GeoError::UnexpectedExtraFields { + document_id: Value::from(external_id), + value: Value::from(geo), + } + .into()); + } + } (Some(_), None) => { - return Err(GeoError::MissingLatitude { document_id: Value::from(external_id) }.into()) + return Err(GeoError::MissingLongitude { document_id: Value::from(external_id) }.into()) } (None, Some(_)) => { - return Err(GeoError::MissingLongitude { document_id: Value::from(external_id) }.into()) + return Err(GeoError::MissingLatitude { document_id: Value::from(external_id) }.into()) } (None, None) => { return Err(GeoError::MissingLatitudeAndLongitude { @@ -271,13 +284,21 @@ fn extract_geo_coordinates(external_id: &str, raw_value: &RawValue) -> Result Ok(Some([lat, lng])), + (Ok(_), Err(value)) => { + Err(GeoError::BadLongitude { document_id: Value::from(external_id), value }.into()) + } + (Err(value), Ok(_)) => { + Err(GeoError::BadLatitude { document_id: Value::from(external_id), value }.into()) + } + (Err(lat), Err(lng)) => Err(GeoError::BadLatitudeAndLongitude { + document_id: Value::from(external_id), + lat, + lng, + } + .into()), + } } /// Extracts and validate that a serde JSON Value is actually a finite f64. 
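Taken together, the `_geo` hunks above make `extract_geo_coordinates` public so the facet extractor can reuse it, reject unexpected extra keys, and report the correct missing field (the missing-latitude and missing-longitude errors were previously swapped). A self-contained sketch of the accepted and rejected `_geo` shapes, using serde_json directly and hypothetical error strings in place of milli's GeoError variants:

use serde_json::{json, Value};

fn lat_lng(geo: &Value) -> Result<[f64; 2], &'static str> {
    let obj = geo.as_object().ok_or("`_geo` is not an object")?;
    if obj.keys().any(|k| !matches!(k.as_str(), "lat" | "lng")) {
        return Err("unexpected extra fields");
    }
    let lat = obj.get("lat").ok_or("missing latitude")?;
    let lng = obj.get("lng").ok_or("missing longitude")?;
    // Both JSON numbers and numeric strings are accepted, mirroring extract_finite_float_from_value.
    let as_finite_f64 = |v: &Value| -> Result<f64, &'static str> {
        let n = match v {
            Value::Number(n) => n.as_f64().ok_or("not a number")?,
            Value::String(s) => s.parse::<f64>().map_err(|_| "not a number")?,
            _ => return Err("not a number"),
        };
        if n.is_finite() { Ok(n) } else { Err("not a finite number") }
    };
    Ok([as_finite_f64(lat)?, as_finite_f64(lng)?])
}

fn main() {
    assert_eq!(lat_lng(&json!({ "lat": 12.0, "lng": "13.5" })), Ok([12.0, 13.5]));
    assert!(lat_lng(&json!({ "lat": 12.0 })).is_err()); // missing longitude
    assert!(lat_lng(&json!({ "lat": 1.0, "lng": 2.0, "doggo": "x" })).is_err()); // extra field
}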
diff --git a/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs b/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs index 0223895e6..c67fc347a 100644 --- a/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs +++ b/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs @@ -419,6 +419,6 @@ impl WordDocidsExtractors { } fn attributes_to_skip<'a>(_rtxn: &'a RoTxn, _index: &'a Index) -> Result> { - Ok(vec![]) + Ok(vec!["_geo"]) } } diff --git a/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs b/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs index f637cff49..bbc6365df 100644 --- a/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs +++ b/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs @@ -25,7 +25,7 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { } fn attributes_to_skip<'a>(_rtxn: &'a RoTxn, _index: &'a Index) -> Result> { - Ok(vec![]) + Ok(vec!["_geo"]) } // This method is reimplemented to count the number of words in the document in each field diff --git a/crates/milli/src/update/new/merger.rs b/crates/milli/src/update/new/merger.rs index c81f84f43..c0ff93901 100644 --- a/crates/milli/src/update/new/merger.rs +++ b/crates/milli/src/update/new/merger.rs @@ -50,7 +50,7 @@ where let mut file = tempfile::tempfile()?; /// manage error - bincode::serialize_into(&mut file, dbg!(&rtree)).unwrap(); + bincode::serialize_into(&mut file, &rtree).unwrap(); file.sync_all()?; let rtree_mmap = unsafe { Mmap::map(&file)? }; From 40dd25d6b2c60a70873ed828ec3b8e4404d44900 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 13 Nov 2024 22:09:36 +0100 Subject: [PATCH 224/247] Fix issue with Replace document method when adding and deleting a document in the same batch --- crates/milli/src/update/new/indexer/document_operation.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/milli/src/update/new/indexer/document_operation.rs b/crates/milli/src/update/new/indexer/document_operation.rs index c0f1ffbdd..d4f11389a 100644 --- a/crates/milli/src/update/new/indexer/document_operation.rs +++ b/crates/milli/src/update/new/indexer/document_operation.rs @@ -310,10 +310,10 @@ impl MergeChanges for MergeDocumentForReplacement { } Some(InnerDocOp::Deletion) => { return if is_new { + Ok(None) + } else { let deletion = Deletion::create(docid, external_doc); Ok(Some(DocumentChange::Deletion(deletion))) - } else { - Ok(None) }; } None => unreachable!("We must not have empty set of operations on a document"), From 695c2c6b99b650ad94eb028f4f9aed2ca853c9f7 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 14 Nov 2024 08:42:39 +0100 Subject: [PATCH 225/247] Cosmetic fix --- crates/milli/src/update/new/indexer/document_operation.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/crates/milli/src/update/new/indexer/document_operation.rs b/crates/milli/src/update/new/indexer/document_operation.rs index d4f11389a..a164d099e 100644 --- a/crates/milli/src/update/new/indexer/document_operation.rs +++ b/crates/milli/src/update/new/indexer/document_operation.rs @@ -356,11 +356,11 @@ impl MergeChanges for MergeDocumentForUpdates { let has_deletion = last_deletion.is_some(); if operations.is_empty() { - return if !is_new { + return if is_new { + Ok(None) + } else { let deletion = Deletion::create(docid, external_docid); 
Ok(Some(DocumentChange::Deletion(deletion))) - } else { - Ok(None) }; } From 0e3c5d91aba63ca4661c64bf739be9fc9bccd95d Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 14 Nov 2024 08:42:56 +0100 Subject: [PATCH 226/247] Document deletion test passes --- crates/milli/src/update/new/indexer/document_deletion.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/milli/src/update/new/indexer/document_deletion.rs b/crates/milli/src/update/new/indexer/document_deletion.rs index e89b04223..353995a59 100644 --- a/crates/milli/src/update/new/indexer/document_deletion.rs +++ b/crates/milli/src/update/new/indexer/document_deletion.rs @@ -132,7 +132,7 @@ mod test { } let mut deletions = DocumentDeletion::new(); - deletions.delete_documents_by_docids(vec![0, 2, 42].into_iter().collect()); + deletions.delete_documents_by_docids(Vec::::new().into_iter().collect()); let indexer = Bump::new(); let index = TempIndex::new(); From 9e8367f1e6fbcd79746971726877371bb2fefeb9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 14 Nov 2024 10:40:32 +0100 Subject: [PATCH 227/247] Move the rayon thread pool outside the extract method --- crates/index-scheduler/src/batch.rs | 126 +++-- crates/milli/src/search/new/matches/mod.rs | 7 +- crates/milli/src/update/new/channel.rs | 4 +- .../milli/src/update/new/extract/documents.rs | 12 +- .../src/update/new/facet_search_builder.rs | 4 +- .../src/update/new/fst_merger_builder.rs | 6 +- crates/milli/src/update/new/indexer/mod.rs | 450 +++++++++--------- crates/milli/src/update/new/merger.rs | 1 - .../milli/src/update/new/vector_document.rs | 2 +- .../milli/src/update/new/word_fst_builder.rs | 7 +- 10 files changed, 328 insertions(+), 291 deletions(-) diff --git a/crates/index-scheduler/src/batch.rs b/crates/index-scheduler/src/batch.rs index 4ae8c7d46..fb47c705a 100644 --- a/crates/index-scheduler/src/batch.rs +++ b/crates/index-scheduler/src/batch.rs @@ -39,7 +39,7 @@ use meilisearch_types::milli::update::{IndexDocumentsMethod, Settings as MilliSe use meilisearch_types::milli::vector::parsed_vectors::{ ExplicitVectors, VectorOrArrayOfVectors, RESERVED_VECTORS_FIELD_NAME, }; -use meilisearch_types::milli::{self, Filter}; +use meilisearch_types::milli::{self, Filter, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder}; use meilisearch_types::settings::{apply_settings_to_builder, Settings, Unchecked}; use meilisearch_types::tasks::{Details, IndexSwap, Kind, KindWithContent, Status, Task}; use meilisearch_types::{compression, Index, VERSION_FILE_NAME}; @@ -1277,7 +1277,6 @@ impl IndexScheduler { operations, mut tasks, } => { - let indexer_config = self.index_mapper.indexer_config(); // TODO: at some point, for better efficiency we might want to reuse the bumpalo for successive batches. // this is made difficult by the fact we're doing private clones of the index scheduler and sending it // to a fresh thread. 
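The hunks that follow all apply the same change: instead of unconditionally building a fresh rayon pool inside the batch code, the pool configured on the indexer config is reused when present, a local pool is built only as a fallback, and `indexer::index` now runs under `pool.install(..)` so the extraction code can rely on `rayon::current_num_threads()` instead of receiving a pool handle. A condensed, standalone sketch of that pattern, using the plain rayon `ThreadPoolBuilder` rather than milli's `ThreadPoolNoAbortBuilder`:

use rayon::{ThreadPool, ThreadPoolBuilder};

fn run_indexing(configured_pool: Option<&ThreadPool>) {
    // Reuse the configured pool when there is one, otherwise build a short-lived local
    // pool; `local_pool` is a separate binding so the reference can outlive the `match`.
    let local_pool;
    let pool = match configured_pool {
        Some(pool) => pool,
        None => {
            local_pool = ThreadPoolBuilder::new().build().unwrap();
            &local_pool
        }
    };

    // Everything run inside `install` (parallel iterators, current_num_threads, ...) is
    // scheduled on `pool`, so the indexing code itself no longer needs a pool argument.
    pool.install(|| {
        println!("indexing on {} threads", rayon::current_num_threads());
    });
}

fn main() {
    run_indexing(None);
}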
@@ -1386,10 +1385,16 @@ impl IndexScheduler { } if tasks.iter().any(|res| res.error.is_none()) { - /// TODO create a pool if needed - // let pool = indexer_config.thread_pool.unwrap(); - let pool = rayon::ThreadPoolBuilder::new().build().unwrap(); + let local_pool; + let pool = match &self.index_mapper.indexer_config().thread_pool { + Some(pool) => pool, + None => { + local_pool = ThreadPoolNoAbortBuilder::new().build().unwrap(); + &local_pool + } + }; + // TODO we want to multithread this let document_changes = indexer.into_changes( &indexer_alloc, index, @@ -1398,18 +1403,20 @@ impl IndexScheduler { &mut new_fields_ids_map, )?; - indexer::index( - index_wtxn, - index, - &db_fields_ids_map, - new_fields_ids_map, - primary_key_has_been_set.then_some(primary_key), - &pool, - &document_changes, - embedders, - &|| must_stop_processing.get(), - &send_progress, - )?; + pool.install(|| { + indexer::index( + index_wtxn, + index, + &db_fields_ids_map, + new_fields_ids_map, + primary_key_has_been_set.then_some(primary_key), + &document_changes, + embedders, + &|| must_stop_processing.get(), + &send_progress, + ) + }) + .unwrap()?; // tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done"); } @@ -1489,27 +1496,37 @@ impl IndexScheduler { let result_count = Ok((candidates.len(), candidates.len())) as Result<_>; if task.error.is_none() { - /// TODO create a pool if needed - // let pool = indexer_config.thread_pool.unwrap(); - let pool = rayon::ThreadPoolBuilder::new().build().unwrap(); + let local_pool; + let pool = match &self.index_mapper.indexer_config().thread_pool { + Some(pool) => pool, + None => { + local_pool = ThreadPoolNoAbortBuilder::new().build().unwrap(); + &local_pool + } + }; - let indexer = UpdateByFunction::new(candidates, context.clone(), code.clone()); - let document_changes = indexer.into_changes(&primary_key)?; - let embedders = index.embedding_configs(index_wtxn)?; - let embedders = self.embedders(embedders)?; + pool.install(|| { + let indexer = + UpdateByFunction::new(candidates, context.clone(), code.clone()); + let document_changes = indexer.into_changes(&primary_key)?; + let embedders = index.embedding_configs(index_wtxn)?; + let embedders = self.embedders(embedders)?; - indexer::index( - index_wtxn, - index, - &db_fields_ids_map, - new_fields_ids_map, - None, // cannot change primary key in DocumentEdition - &pool, - &document_changes, - embedders, - &|| must_stop_processing.get(), - &send_progress, - )?; + indexer::index( + index_wtxn, + index, + &db_fields_ids_map, + new_fields_ids_map, + None, // cannot change primary key in DocumentEdition + &document_changes, + embedders, + &|| must_stop_processing.get(), + &send_progress, + )?; + + Result::Ok(()) + }) + .unwrap()?; // tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done"); } @@ -1629,9 +1646,14 @@ impl IndexScheduler { .map_err(milli::Error::from)?; if !tasks.iter().all(|res| res.error.is_some()) { - /// TODO create a pool if needed - // let pool = indexer_config.thread_pool.unwrap(); - let pool = rayon::ThreadPoolBuilder::new().build().unwrap(); + let local_pool; + let pool = match &self.index_mapper.indexer_config().thread_pool { + Some(pool) => pool, + None => { + local_pool = ThreadPoolNoAbortBuilder::new().build().unwrap(); + &local_pool + } + }; let mut indexer = indexer::DocumentDeletion::new(); indexer.delete_documents_by_docids(to_delete); @@ -1639,18 +1661,20 @@ impl IndexScheduler { 
let embedders = index.embedding_configs(index_wtxn)?; let embedders = self.embedders(embedders)?; - indexer::index( - index_wtxn, - index, - &db_fields_ids_map, - new_fields_ids_map, - None, // document deletion never changes primary key - &pool, - &document_changes, - embedders, - &|| must_stop_processing.get(), - &send_progress, - )?; + pool.install(|| { + indexer::index( + index_wtxn, + index, + &db_fields_ids_map, + new_fields_ids_map, + None, // document deletion never changes primary key + &document_changes, + embedders, + &|| must_stop_processing.get(), + &send_progress, + ) + }) + .unwrap()?; // tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done"); } diff --git a/crates/milli/src/search/new/matches/mod.rs b/crates/milli/src/search/new/matches/mod.rs index 80e3ec7b2..ba639b7f2 100644 --- a/crates/milli/src/search/new/matches/mod.rs +++ b/crates/milli/src/search/new/matches/mod.rs @@ -3,6 +3,9 @@ mod r#match; mod matching_words; mod simple_token_kind; +use std::borrow::Cow; +use std::cmp::{max, min}; + use charabia::{Language, SeparatorKind, Token, Tokenizer}; use either::Either; pub use matching_words::MatchingWords; @@ -10,10 +13,6 @@ use matching_words::{MatchType, PartialMatch}; use r#match::{Match, MatchPosition}; use serde::Serialize; use simple_token_kind::SimpleTokenKind; -use std::{ - borrow::Cow, - cmp::{max, min}, -}; const DEFAULT_CROP_MARKER: &str = "…"; const DEFAULT_HIGHLIGHT_PREFIX: &str = ""; diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index fbf102f18..9e8039ffd 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -366,14 +366,14 @@ pub struct FieldIdDocidFacetSender<'a>(&'a ExtractorSender); impl FieldIdDocidFacetSender<'_> { pub fn write_facet_string(&self, key: &[u8], value: &[u8]) -> StdResult<(), SendError<()>> { debug_assert!(FieldDocIdFacetStringCodec::bytes_decode(key).is_ok()); - let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value(&key, &value)); + let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value(key, value)); self.0 .send_db_operation(DbOperation { database: Database::FieldIdDocidFacetStrings, entry }) } pub fn write_facet_f64(&self, key: &[u8]) -> StdResult<(), SendError<()>> { debug_assert!(FieldDocIdFacetF64Codec::bytes_decode(key).is_ok()); - let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value(&key, &[])); + let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value(key, &[])); self.0.send_db_operation(DbOperation { database: Database::FieldIdDocidFacetF64s, entry }) } diff --git a/crates/milli/src/update/new/extract/documents.rs b/crates/milli/src/update/new/extract/documents.rs index 42fce3c3d..a324d2914 100644 --- a/crates/milli/src/update/new/extract/documents.rs +++ b/crates/milli/src/update/new/extract/documents.rs @@ -58,7 +58,8 @@ impl<'a, 'extractor> Extractor<'extractor> for DocumentsExtractor<'a> { context.index, &context.db_fields_ids_map, )?; - let geo_iter = content.geo_field().transpose().map(|res| res.map(|rv| ("_geo", rv))); + let geo_iter = + content.geo_field().transpose().map(|res| res.map(|rv| ("_geo", rv))); for res in content.iter_top_level_fields().chain(geo_iter) { let (f, _) = res?; let entry = document_extractor_data @@ -74,7 +75,8 @@ impl<'a, 'extractor> Extractor<'extractor> for DocumentsExtractor<'a> { let docid = update.docid(); let content = update.current(&context.rtxn, context.index, 
&context.db_fields_ids_map)?; - let geo_iter = content.geo_field().transpose().map(|res| res.map(|rv| ("_geo", rv))); + let geo_iter = + content.geo_field().transpose().map(|res| res.map(|rv| ("_geo", rv))); for res in content.iter_top_level_fields().chain(geo_iter) { let (f, _) = res?; let entry = document_extractor_data @@ -84,7 +86,8 @@ impl<'a, 'extractor> Extractor<'extractor> for DocumentsExtractor<'a> { *entry -= 1; } let content = update.updated(); - let geo_iter = content.geo_field().transpose().map(|res| res.map(|rv| ("_geo", rv))); + let geo_iter = + content.geo_field().transpose().map(|res| res.map(|rv| ("_geo", rv))); for res in content.iter_top_level_fields().chain(geo_iter) { let (f, _) = res?; let entry = document_extractor_data @@ -114,7 +117,8 @@ impl<'a, 'extractor> Extractor<'extractor> for DocumentsExtractor<'a> { DocumentChange::Insertion(insertion) => { let docid = insertion.docid(); let content = insertion.inserted(); - let geo_iter = content.geo_field().transpose().map(|res| res.map(|rv| ("_geo", rv))); + let geo_iter = + content.geo_field().transpose().map(|res| res.map(|rv| ("_geo", rv))); for res in content.iter_top_level_fields().chain(geo_iter) { let (f, _) = res?; let entry = document_extractor_data diff --git a/crates/milli/src/update/new/facet_search_builder.rs b/crates/milli/src/update/new/facet_search_builder.rs index 7eaec95a5..0c924bff4 100644 --- a/crates/milli/src/update/new/facet_search_builder.rs +++ b/crates/milli/src/update/new/facet_search_builder.rs @@ -6,11 +6,9 @@ use grenad::Sorter; use heed::types::{Bytes, SerdeJson}; use heed::{BytesDecode, BytesEncode, RoTxn, RwTxn}; -use super::extract::FacetKind; use super::fst_merger_builder::FstMergerBuilder; use super::KvReaderDelAdd; -use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec}; -use crate::heed_codec::StrRefCodec; +use crate::heed_codec::facet::FacetGroupKey; use crate::update::del_add::{DelAdd, KvWriterDelAdd}; use crate::update::{create_sorter, MergeDeladdBtreesetString}; use crate::{ diff --git a/crates/milli/src/update/new/fst_merger_builder.rs b/crates/milli/src/update/new/fst_merger_builder.rs index 9fd259ce6..1c584ef53 100644 --- a/crates/milli/src/update/new/fst_merger_builder.rs +++ b/crates/milli/src/update/new/fst_merger_builder.rs @@ -1,10 +1,12 @@ -use std::{fs::File, io::BufWriter}; +use std::fs::File; +use std::io::BufWriter; use fst::{Set, SetBuilder, Streamer}; use memmap2::Mmap; use tempfile::tempfile; -use crate::{update::del_add::DelAdd, InternalError, Result}; +use crate::update::del_add::DelAdd; +use crate::{InternalError, Result}; pub struct FstMergerBuilder<'a> { stream: Option>, diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 2cdeca76d..1a5e4fc23 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -13,7 +13,6 @@ use itertools::{merge_join_by, EitherOrBoth}; pub use partial_dump::PartialDump; use rand::SeedableRng as _; use raw_collections::RawMap; -use rayon::ThreadPool; use time::OffsetDateTime; pub use update_by_function::UpdateByFunction; @@ -136,7 +135,6 @@ pub fn index<'pl, 'indexer, 'index, DC, MSP, SP>( db_fields_ids_map: &'indexer FieldsIdsMap, new_fields_ids_map: FieldsIdsMap, new_primary_key: Option>, - pool: &ThreadPool, document_changes: &DC, embedders: EmbeddingConfigs, must_stop_processing: &'indexer MSP, @@ -152,9 +150,9 @@ where let metadata_builder = MetadataBuilder::from_index(index, wtxn)?; let new_fields_ids_map = 
FieldIdMapWithMetadata::new(new_fields_ids_map, metadata_builder); let new_fields_ids_map = RwLock::new(new_fields_ids_map); - let fields_ids_map_store = ThreadLocal::with_capacity(pool.current_num_threads()); - let mut extractor_allocs = ThreadLocal::with_capacity(pool.current_num_threads()); - let doc_allocs = ThreadLocal::with_capacity(pool.current_num_threads()); + let fields_ids_map_store = ThreadLocal::with_capacity(rayon::current_num_threads()); + let mut extractor_allocs = ThreadLocal::with_capacity(rayon::current_num_threads()); + let doc_allocs = ThreadLocal::with_capacity(rayon::current_num_threads()); let indexing_context = IndexingContext { index, @@ -179,248 +177,260 @@ where let document_ids = &mut document_ids; // TODO manage the errors correctly let extractor_handle = Builder::new().name(S("indexer-extractors")).spawn_scoped(s, move || { - let result = pool.in_place_scope(|_s| { - let span = tracing::trace_span!(target: "indexing::documents", parent: &indexer_span, "extract"); + let span = tracing::trace_span!(target: "indexing::documents", parent: &indexer_span, "extract"); + let _entered = span.enter(); + + let rtxn = index.read_txn()?; + + // document but we need to create a function that collects and compresses documents. + let document_sender = extractor_sender.documents(); + let document_extractor = DocumentsExtractor::new(&document_sender, embedders); + let datastore = ThreadLocal::with_capacity(rayon::current_num_threads()); + let (finished_steps, step_name) = steps::extract_documents(); + extract(document_changes, + &document_extractor, + indexing_context, + &mut extractor_allocs, + &datastore, + finished_steps, + total_steps, + step_name, + )?; + + for document_extractor_data in datastore { + let document_extractor_data = document_extractor_data.0.into_inner(); + for (field, delta) in document_extractor_data.field_distribution_delta { + let current = field_distribution.entry(field).or_default(); + // adding the delta should never cause a negative result, as we are removing fields that previously existed. + *current = current.saturating_add_signed(delta); + } + document_extractor_data.docids_delta.apply_to(document_ids); + } + + field_distribution.retain(|_, v| *v != 0); + + const TEN_GIB: usize = 10 * 1024 * 1024 * 1024; + let current_num_threads = rayon::current_num_threads(); + let max_memory = TEN_GIB / current_num_threads; + eprintln!("A maximum of {max_memory} bytes will be used for each of the {current_num_threads} threads"); + + let grenad_parameters = GrenadParameters { + max_memory: Some(max_memory), + ..GrenadParameters::default() + }; + + let facet_field_ids_delta; + + { + let span = tracing::trace_span!(target: "indexing::documents::extract", "faceted"); let _entered = span.enter(); - let rtxn = index.read_txn()?; + let (finished_steps, step_name) = steps::extract_facets(); - // document but we need to create a function that collects and compresses documents. 
- let document_sender = extractor_sender.documents(); - let document_extractor = DocumentsExtractor::new(&document_sender, embedders); - let datastore = ThreadLocal::with_capacity(pool.current_num_threads()); - let (finished_steps, step_name) = steps::extract_documents(); - extract(document_changes, &document_extractor, indexing_context, &mut extractor_allocs, &datastore, finished_steps, total_steps, step_name)?; - - for document_extractor_data in datastore { - let document_extractor_data = document_extractor_data.0.into_inner(); - for (field, delta) in document_extractor_data.field_distribution_delta { - let current = field_distribution.entry(field).or_default(); - // adding the delta should never cause a negative result, as we are removing fields that previously existed. - *current = current.saturating_add_signed(delta); - } - document_extractor_data.docids_delta.apply_to(document_ids); - } - - field_distribution.retain(|_, v| *v != 0); - - const TEN_GIB: usize = 10 * 1024 * 1024 * 1024; - let current_num_threads = rayon::current_num_threads(); - let max_memory = TEN_GIB / current_num_threads; - eprintln!("A maximum of {max_memory} bytes will be used for each of the {current_num_threads} threads"); - - let grenad_parameters = GrenadParameters { - max_memory: Some(max_memory), - ..GrenadParameters::default() - }; - - let facet_field_ids_delta; - - { - let span = tracing::trace_span!(target: "indexing::documents::extract", "faceted"); - let _entered = span.enter(); - - let (finished_steps, step_name) = steps::extract_facets(); - - facet_field_ids_delta = merge_and_send_facet_docids( - FacetedDocidsExtractor::run_extraction(grenad_parameters, document_changes, indexing_context, &mut extractor_allocs, &extractor_sender.field_id_docid_facet_sender(), finished_steps, total_steps, step_name)?, - FacetDatabases::new(index), - index, - extractor_sender.facet_docids(), - )?; - } - - { - let span = tracing::trace_span!(target: "indexing::documents::extract", "word_docids"); - let _entered = span.enter(); - let (finished_steps, step_name) = steps::extract_words(); - - let WordDocidsCaches { - word_docids, - word_fid_docids, - exact_word_docids, - word_position_docids, - fid_word_count_docids, - } = WordDocidsExtractors::run_extraction( - grenad_parameters, + facet_field_ids_delta = merge_and_send_facet_docids( + FacetedDocidsExtractor::run_extraction(grenad_parameters, document_changes, indexing_context, &mut extractor_allocs, + &extractor_sender.field_id_docid_facet_sender(), finished_steps, total_steps, step_name, - )?; + )?, + FacetDatabases::new(index), + index, + extractor_sender.facet_docids(), + )?; + } - // TODO Word Docids Merger - // extractor_sender.send_searchable::(word_docids).unwrap(); - { - let span = tracing::trace_span!(target: "indexing::documents::merge", "word_docids"); - let _entered = span.enter(); - merge_and_send_docids( - word_docids, - index.word_docids.remap_types(), - index, - extractor_sender.docids::(), - &indexing_context.must_stop_processing, - )?; - } + { + let span = tracing::trace_span!(target: "indexing::documents::extract", "word_docids"); + let _entered = span.enter(); + let (finished_steps, step_name) = steps::extract_words(); - // Word Fid Docids Merging - // extractor_sender.send_searchable::(word_fid_docids).unwrap(); - { - let span = tracing::trace_span!(target: "indexing::documents::merge", "word_fid_docids"); - let _entered = span.enter(); - merge_and_send_docids( - word_fid_docids, - index.word_fid_docids.remap_types(), - index, - 
extractor_sender.docids::(), - &indexing_context.must_stop_processing, - )?; - } + let WordDocidsCaches { + word_docids, + word_fid_docids, + exact_word_docids, + word_position_docids, + fid_word_count_docids, + } = WordDocidsExtractors::run_extraction( + grenad_parameters, + document_changes, + indexing_context, + &mut extractor_allocs, + finished_steps, + total_steps, + step_name, + )?; - // Exact Word Docids Merging - // extractor_sender.send_searchable::(exact_word_docids).unwrap(); - { - let span = tracing::trace_span!(target: "indexing::documents::merge", "exact_word_docids"); - let _entered = span.enter(); - merge_and_send_docids( - exact_word_docids, - index.exact_word_docids.remap_types(), - index, - extractor_sender.docids::(), - &indexing_context.must_stop_processing, - )?; - } - - // Word Position Docids Merging - // extractor_sender.send_searchable::(word_position_docids).unwrap(); - { - let span = tracing::trace_span!(target: "indexing::documents::merge", "word_position_docids"); - let _entered = span.enter(); - merge_and_send_docids( - word_position_docids, - index.word_position_docids.remap_types(), - index, - extractor_sender.docids::(), - &indexing_context.must_stop_processing, - )?; - } - - // Fid Word Count Docids Merging - // extractor_sender.send_searchable::(fid_word_count_docids).unwrap(); - { - let span = tracing::trace_span!(target: "indexing::documents::merge", "fid_word_count_docids"); - let _entered = span.enter(); - merge_and_send_docids( - fid_word_count_docids, - index.field_id_word_count_docids.remap_types(), - index, - extractor_sender.docids::(), - &indexing_context.must_stop_processing, - )?; - } - } - - // run the proximity extraction only if the precision is by word - // this works only if the settings didn't change during this transaction. 
- let proximity_precision = index.proximity_precision(&rtxn)?.unwrap_or_default(); - if proximity_precision == ProximityPrecision::ByWord { - let span = tracing::trace_span!(target: "indexing::documents::extract", "word_pair_proximity_docids"); + // TODO Word Docids Merger + // extractor_sender.send_searchable::(word_docids).unwrap(); + { + let span = tracing::trace_span!(target: "indexing::documents::merge", "word_docids"); let _entered = span.enter(); - - let (finished_steps, step_name) = steps::extract_word_proximity(); - - let caches = ::run_extraction(grenad_parameters, - document_changes, - indexing_context, - &mut extractor_allocs, - finished_steps, - total_steps, - step_name, - )?; - merge_and_send_docids( - caches, - index.word_pair_proximity_docids.remap_types(), + word_docids, + index.word_docids.remap_types(), index, - extractor_sender.docids::(), + extractor_sender.docids::(), &indexing_context.must_stop_processing, )?; } - 'vectors: { - let span = tracing::trace_span!(target: "indexing::documents::extract", "vectors"); + // Word Fid Docids Merging + // extractor_sender.send_searchable::(word_fid_docids).unwrap(); + { + let span = tracing::trace_span!(target: "indexing::documents::merge", "word_fid_docids"); let _entered = span.enter(); - - let mut index_embeddings = index.embedding_configs(&rtxn)?; - if index_embeddings.is_empty() { - break 'vectors; - } - - let embedding_sender = extractor_sender.embeddings(); - let extractor = EmbeddingExtractor::new(embedders, &embedding_sender, field_distribution, request_threads()); - let mut datastore = ThreadLocal::with_capacity(pool.current_num_threads()); - let (finished_steps, step_name) = steps::extract_embeddings(); - extract(document_changes, &extractor, indexing_context, &mut extractor_allocs, &datastore, finished_steps, total_steps, step_name)?; - - for config in &mut index_embeddings { - 'data: for data in datastore.iter_mut() { - let data = &mut data.get_mut().0; - let Some(deladd) = data.remove(&config.name) else { continue 'data; }; - deladd.apply_to(&mut config.user_provided); - } - } - - embedding_sender.finish(index_embeddings).unwrap(); - } - - 'geo: { - let span = tracing::trace_span!(target: "indexing::documents::extract", "geo"); - let _entered = span.enter(); - - // let geo_sender = extractor_sender.geo_points(); - let Some(extractor) = GeoExtractor::new(&rtxn, index, grenad_parameters)? 
else { - break 'geo; - }; - let datastore = ThreadLocal::with_capacity(pool.current_num_threads()); - let (finished_steps, step_name) = steps::extract_geo_points(); - extract(document_changes, - &extractor, - indexing_context, - &mut extractor_allocs, - &datastore, - finished_steps, - total_steps, - step_name, - )?; - - merge_and_send_rtree( - datastore, - &rtxn, + merge_and_send_docids( + word_fid_docids, + index.word_fid_docids.remap_types(), index, - extractor_sender.geo(), + extractor_sender.docids::(), &indexing_context.must_stop_processing, )?; } - // TODO THIS IS TOO MUCH - // - [ ] Extract fieldid docid facet number - // - [ ] Extract fieldid docid facet string - // - [ ] Extract facetid string fst - // - [ ] Extract facetid normalized string strings + // Exact Word Docids Merging + // extractor_sender.send_searchable::(exact_word_docids).unwrap(); + { + let span = tracing::trace_span!(target: "indexing::documents::merge", "exact_word_docids"); + let _entered = span.enter(); + merge_and_send_docids( + exact_word_docids, + index.exact_word_docids.remap_types(), + index, + extractor_sender.docids::(), + &indexing_context.must_stop_processing, + )?; + } - // TODO Inverted Indexes again - // - [x] Extract fieldid facet isempty docids - // - [x] Extract fieldid facet isnull docids - // - [x] Extract fieldid facet exists docids + // Word Position Docids Merging + // extractor_sender.send_searchable::(word_position_docids).unwrap(); + { + let span = tracing::trace_span!(target: "indexing::documents::merge", "word_position_docids"); + let _entered = span.enter(); + merge_and_send_docids( + word_position_docids, + index.word_position_docids.remap_types(), + index, + extractor_sender.docids::(), + &indexing_context.must_stop_processing, + )?; + } - // TODO This is the normal system - // - [x] Extract fieldid facet number docids - // - [x] Extract fieldid facet string docids + // Fid Word Count Docids Merging + // extractor_sender.send_searchable::(fid_word_count_docids).unwrap(); + { + let span = tracing::trace_span!(target: "indexing::documents::merge", "fid_word_count_docids"); + let _entered = span.enter(); + merge_and_send_docids( + fid_word_count_docids, + index.field_id_word_count_docids.remap_types(), + index, + extractor_sender.docids::(), + &indexing_context.must_stop_processing, + )?; + } + } - Result::Ok(facet_field_ids_delta) - }); + // run the proximity extraction only if the precision is by word + // this works only if the settings didn't change during this transaction. 
+ let proximity_precision = index.proximity_precision(&rtxn)?.unwrap_or_default(); + if proximity_precision == ProximityPrecision::ByWord { + let span = tracing::trace_span!(target: "indexing::documents::extract", "word_pair_proximity_docids"); + let _entered = span.enter(); + + let (finished_steps, step_name) = steps::extract_word_proximity(); + + let caches = ::run_extraction(grenad_parameters, + document_changes, + indexing_context, + &mut extractor_allocs, + finished_steps, + total_steps, + step_name, + )?; + + merge_and_send_docids( + caches, + index.word_pair_proximity_docids.remap_types(), + index, + extractor_sender.docids::(), + &indexing_context.must_stop_processing, + )?; + } + + 'vectors: { + let span = tracing::trace_span!(target: "indexing::documents::extract", "vectors"); + let _entered = span.enter(); + + let mut index_embeddings = index.embedding_configs(&rtxn)?; + if index_embeddings.is_empty() { + break 'vectors; + } + + let embedding_sender = extractor_sender.embeddings(); + let extractor = EmbeddingExtractor::new(embedders, &embedding_sender, field_distribution, request_threads()); + let mut datastore = ThreadLocal::with_capacity(rayon::current_num_threads()); + let (finished_steps, step_name) = steps::extract_embeddings(); + extract(document_changes, &extractor, indexing_context, &mut extractor_allocs, &datastore, finished_steps, total_steps, step_name)?; + + for config in &mut index_embeddings { + 'data: for data in datastore.iter_mut() { + let data = &mut data.get_mut().0; + let Some(deladd) = data.remove(&config.name) else { continue 'data; }; + deladd.apply_to(&mut config.user_provided); + } + } + + embedding_sender.finish(index_embeddings).unwrap(); + } + + 'geo: { + let span = tracing::trace_span!(target: "indexing::documents::extract", "geo"); + let _entered = span.enter(); + + // let geo_sender = extractor_sender.geo_points(); + let Some(extractor) = GeoExtractor::new(&rtxn, index, grenad_parameters)? 
else { + break 'geo; + }; + let datastore = ThreadLocal::with_capacity(rayon::current_num_threads()); + let (finished_steps, step_name) = steps::extract_geo_points(); + extract(document_changes, + &extractor, + indexing_context, + &mut extractor_allocs, + &datastore, + finished_steps, + total_steps, + step_name, + )?; + + merge_and_send_rtree( + datastore, + &rtxn, + index, + extractor_sender.geo(), + &indexing_context.must_stop_processing, + )?; + } + + // TODO THIS IS TOO MUCH + // - [ ] Extract fieldid docid facet number + // - [ ] Extract fieldid docid facet string + // - [ ] Extract facetid string fst + // - [ ] Extract facetid normalized string strings + + // TODO Inverted Indexes again + // - [x] Extract fieldid facet isempty docids + // - [x] Extract fieldid facet isnull docids + // - [x] Extract fieldid facet exists docids + + // TODO This is the normal system + // - [x] Extract fieldid facet number docids + // - [x] Extract fieldid facet string docids { let span = tracing::trace_span!(target: "indexing::documents::extract", "FINISH"); @@ -429,7 +439,7 @@ where (indexing_context.send_progress)(Progress { finished_steps, total_steps, step_name, finished_total_documents: None }); } - result + Result::Ok(facet_field_ids_delta) })?; let global_fields_ids_map = GlobalFieldsIdsMap::new(&new_fields_ids_map); diff --git a/crates/milli/src/update/new/merger.rs b/crates/milli/src/update/new/merger.rs index c0ff93901..9d0d8e176 100644 --- a/crates/milli/src/update/new/merger.rs +++ b/crates/milli/src/update/new/merger.rs @@ -1,5 +1,4 @@ use std::cell::RefCell; -use std::io; use hashbrown::HashSet; use heed::types::Bytes; diff --git a/crates/milli/src/update/new/vector_document.rs b/crates/milli/src/update/new/vector_document.rs index 736456f0f..319730db0 100644 --- a/crates/milli/src/update/new/vector_document.rs +++ b/crates/milli/src/update/new/vector_document.rs @@ -286,7 +286,7 @@ impl<'doc> MergedVectorDocument<'doc> { ) -> Result> { let db = VectorDocumentFromDb::new(docid, index, rtxn, db_fields_ids_map, doc_alloc)?; let new_doc = - VectorDocumentFromVersions::new(&external_document_id, versions, doc_alloc, embedders)?; + VectorDocumentFromVersions::new(external_document_id, versions, doc_alloc, embedders)?; Ok(if db.is_none() && new_doc.is_none() { None } else { Some(Self { new_doc, db }) }) } diff --git a/crates/milli/src/update/new/word_fst_builder.rs b/crates/milli/src/update/new/word_fst_builder.rs index 834266045..2b1c4604b 100644 --- a/crates/milli/src/update/new/word_fst_builder.rs +++ b/crates/milli/src/update/new/word_fst_builder.rs @@ -1,13 +1,14 @@ +use std::collections::HashSet; use std::io::BufWriter; use fst::{Set, SetBuilder, Streamer}; use memmap2::Mmap; -use std::collections::HashSet; use tempfile::tempfile; -use crate::{index::PrefixSettings, update::del_add::DelAdd, InternalError, Prefix, Result}; - use super::fst_merger_builder::FstMergerBuilder; +use crate::index::PrefixSettings; +use crate::update::del_add::DelAdd; +use crate::{InternalError, Prefix, Result}; pub struct WordFstBuilder<'a> { word_fst_builder: FstMergerBuilder<'a>, From 91c58cfa382640c0098e20497665dd6b28b2f622 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 14 Nov 2024 11:40:12 +0100 Subject: [PATCH 228/247] Fix positional databases --- .../new/extract/searchable/tokenize_document.rs | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/crates/milli/src/update/new/extract/searchable/tokenize_document.rs 
b/crates/milli/src/update/new/extract/searchable/tokenize_document.rs index 793e3a249..e6c8776c6 100644 --- a/crates/milli/src/update/new/extract/searchable/tokenize_document.rs +++ b/crates/milli/src/update/new/extract/searchable/tokenize_document.rs @@ -3,7 +3,6 @@ use std::collections::HashMap; use charabia::{SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder}; use serde_json::Value; -use crate::proximity::MAX_DISTANCE; use crate::update::new::document::Document; use crate::update::new::extract::perm_json_p::{ seek_leaf_values_in_array, seek_leaf_values_in_object, select_field, @@ -13,6 +12,9 @@ use crate::{ MAX_WORD_LENGTH, }; +// todo: should be crate::proximity::MAX_DISTANCE but it has been forgotten +const MAX_DISTANCE: u32 = 8; + pub struct DocumentTokenizer<'a> { pub tokenizer: &'a Tokenizer<'a>, pub attribute_to_extract: Option<&'a [&'a str]>, @@ -251,22 +253,22 @@ mod test { ]: "doggo", [ 2, - MAX_DISTANCE, + 8, ]: "doggo", [ 2, 16, ]: "catto", [ - 3, + 5, 0, ]: "10", [ - 4, + 6, 0, ]: "pesti", [ - 5, + 7, 0, ]: "23", } From 4ff2b3c2ee792186563db040f6fa04595825992e Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 14 Nov 2024 15:45:04 +0100 Subject: [PATCH 229/247] Fix test on locales --- .../update/new/extract/searchable/tokenize_document.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/crates/milli/src/update/new/extract/searchable/tokenize_document.rs b/crates/milli/src/update/new/extract/searchable/tokenize_document.rs index e6c8776c6..bc7a2acd3 100644 --- a/crates/milli/src/update/new/extract/searchable/tokenize_document.rs +++ b/crates/milli/src/update/new/extract/searchable/tokenize_document.rs @@ -35,8 +35,8 @@ impl<'a> DocumentTokenizer<'a> { for entry in document.iter_top_level_fields() { let (field_name, value) = entry?; - let mut tokenize_field = |name: &str, value: &Value| { - let Some(field_id) = field_id_map.id_or_insert(name) else { + let mut tokenize_field = |field_name: &str, value: &Value| { + let Some(field_id) = field_id_map.id_or_insert(field_name) else { return Err(UserError::AttributeLimitReached.into()); }; @@ -52,7 +52,7 @@ impl<'a> DocumentTokenizer<'a> { Value::Number(n) => { let token = n.to_string(); if let Ok(position) = (*position).try_into() { - token_fn(name, field_id, position, token.as_str())?; + token_fn(field_name, field_id, position, token.as_str())?; } Ok(()) @@ -76,7 +76,7 @@ impl<'a> DocumentTokenizer<'a> { if !token.is_empty() && token.len() <= MAX_WORD_LENGTH { *position = index; if let Ok(position) = (*position).try_into() { - token_fn(name, field_id, position, token)?; + token_fn(field_name, field_id, position, token)?; } } } From 83865d2ebd4793f8ccf7aafc90f0063a1ce238fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 14 Nov 2024 16:00:11 +0100 Subject: [PATCH 230/247] Expose intermediate errors when processing batches --- crates/index-scheduler/src/batch.rs | 75 ++- .../update/new/indexer/document_operation.rs | 548 ++++++++++++------ crates/milli/src/update/new/indexer/mod.rs | 2 +- 3 files changed, 409 insertions(+), 216 deletions(-) diff --git a/crates/index-scheduler/src/batch.rs b/crates/index-scheduler/src/batch.rs index fb47c705a..fb9cfbe6c 100644 --- a/crates/index-scheduler/src/batch.rs +++ b/crates/index-scheduler/src/batch.rs @@ -39,7 +39,7 @@ use meilisearch_types::milli::update::{IndexDocumentsMethod, Settings as MilliSe use meilisearch_types::milli::vector::parsed_vectors::{ ExplicitVectors, VectorOrArrayOfVectors, RESERVED_VECTORS_FIELD_NAME, }; 
-use meilisearch_types::milli::{self, Filter, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder}; +use meilisearch_types::milli::{self, Filter, ThreadPoolNoAbortBuilder}; use meilisearch_types::settings::{apply_settings_to_builder, Settings, Unchecked}; use meilisearch_types::tasks::{Details, IndexSwap, Kind, KindWithContent, Status, Task}; use meilisearch_types::{compression, Index, VERSION_FILE_NAME}; @@ -1331,55 +1331,19 @@ impl IndexScheduler { let mut indexer = indexer::DocumentOperation::new(method); let embedders = index.embedding_configs(index_wtxn)?; let embedders = self.embedders(embedders)?; - for (operation, task) in operations.into_iter().zip(tasks.iter_mut()) { + for operation in operations { match operation { DocumentOperation::Add(_content_uuid) => { let mmap = content_files_iter.next().unwrap(); - let stats = indexer.add_documents(mmap)?; + indexer.add_documents(mmap)?; // builder = builder.with_embedders(embedders.clone()); - - let received_documents = - if let Some(Details::DocumentAdditionOrUpdate { - received_documents, - .. - }) = task.details - { - received_documents - } else { - // In the case of a `documentAdditionOrUpdate` the details MUST be set - unreachable!(); - }; - - task.status = Status::Succeeded; - task.details = Some(Details::DocumentAdditionOrUpdate { - received_documents, - indexed_documents: Some(stats.document_count as u64), - }) } DocumentOperation::Delete(document_ids) => { - let count = document_ids.len(); let document_ids: bumpalo::collections::vec::Vec<_> = document_ids .iter() .map(|s| &*indexer_alloc.alloc_str(s)) .collect_in(&indexer_alloc); indexer.delete_documents(document_ids.into_bump_slice()); - // Uses Invariant: remove documents actually always returns Ok for the inner result - // let count = user_result.unwrap(); - let provided_ids = - if let Some(Details::DocumentDeletion { provided_ids, .. }) = - task.details - { - provided_ids - } else { - // In the case of a `documentAdditionOrUpdate` the details MUST be set - unreachable!(); - }; - - task.status = Status::Succeeded; - task.details = Some(Details::DocumentDeletion { - provided_ids, - deleted_documents: Some(count as u64), - }); } } } @@ -1394,8 +1358,7 @@ impl IndexScheduler { } }; - // TODO we want to multithread this - let document_changes = indexer.into_changes( + let (document_changes, operation_stats) = indexer.into_changes( &indexer_alloc, index, &rtxn, @@ -1403,6 +1366,36 @@ impl IndexScheduler { &mut new_fields_ids_map, )?; + for (stats, task) in operation_stats.into_iter().zip(&mut tasks) { + match stats.error { + Some(error) => { + task.status = Status::Failed; + task.error = Some(milli::Error::UserError(error).into()); + } + None => task.status = Status::Succeeded, + } + + task.details = match task.details { + Some(Details::DocumentAdditionOrUpdate { + received_documents, .. + }) => Some(Details::DocumentAdditionOrUpdate { + received_documents, + indexed_documents: Some(stats.document_count), + }), + Some(Details::DocumentDeletion { provided_ids, .. 
}) => { + Some(Details::DocumentDeletion { + provided_ids, + deleted_documents: Some(stats.document_count), + }) + } + _ => { + // In the case of a `documentAdditionOrUpdate` or `DocumentDeletion` + // the details MUST be set to either addition or deletion + unreachable!(); + } + } + } + pool.install(|| { indexer::index( index_wtxn, diff --git a/crates/milli/src/update/new/indexer/document_operation.rs b/crates/milli/src/update/new/indexer/document_operation.rs index a164d099e..0b586a795 100644 --- a/crates/milli/src/update/new/indexer/document_operation.rs +++ b/crates/milli/src/update/new/indexer/document_operation.rs @@ -1,10 +1,11 @@ use bumpalo::collections::CollectIn; use bumpalo::Bump; +use hashbrown::hash_map::Entry; use heed::RoTxn; use memmap2::Mmap; use rayon::slice::ParallelSlice; use serde_json::value::RawValue; -use IndexDocumentsMethod as Idm; +use serde_json::Deserializer; use super::super::document_change::DocumentChange; use super::document_changes::{DocumentChangeContext, DocumentChanges, MostlySend}; @@ -12,55 +13,24 @@ use crate::documents::PrimaryKey; use crate::update::new::document::Versions; use crate::update::new::{Deletion, Insertion, Update}; use crate::update::{AvailableIds, IndexDocumentsMethod}; -use crate::{DocumentId, Error, FieldsIdsMap, Index, Result, UserError}; +use crate::{DocumentId, Error, FieldsIdsMap, Index, InternalError, Result, UserError}; pub struct DocumentOperation<'pl> { operations: Vec>, - index_documents_method: IndexDocumentsMethod, -} - -pub struct DocumentOperationChanges<'pl> { - docids_version_offsets: &'pl [(&'pl str, ((u32, bool), &'pl [InnerDocOp<'pl>]))], - index_documents_method: IndexDocumentsMethod, -} - -pub enum Payload<'pl> { - Addition(&'pl [u8]), - Deletion(&'pl [&'pl str]), -} - -pub struct PayloadStats { - pub document_count: usize, - pub bytes: u64, -} - -#[derive(Clone)] -pub enum InnerDocOp<'pl> { - Addition(DocumentOffset<'pl>), - Deletion, -} - -/// Represents an offset where a document lives -/// in an mmapped grenad reader file. -#[derive(Clone)] -pub struct DocumentOffset<'pl> { - /// The mmapped payload files. 
- pub content: &'pl [u8], + method: MergeMethod, } impl<'pl> DocumentOperation<'pl> { pub fn new(method: IndexDocumentsMethod) -> Self { - Self { operations: Default::default(), index_documents_method: method } + Self { operations: Default::default(), method: MergeMethod::from(method) } } /// TODO please give me a type /// The payload is expected to be in the grenad format - pub fn add_documents(&mut self, payload: &'pl Mmap) -> Result { + pub fn add_documents(&mut self, payload: &'pl Mmap) -> Result<()> { payload.advise(memmap2::Advice::Sequential)?; - let document_count = - memchr::memmem::find_iter(&payload[..], "}{").count().saturating_add(1); self.operations.push(Payload::Addition(&payload[..])); - Ok(PayloadStats { bytes: payload.len() as u64, document_count }) + Ok(()) } pub fn delete_documents(&mut self, to_delete: &'pl [&'pl str]) { @@ -74,141 +44,239 @@ impl<'pl> DocumentOperation<'pl> { rtxn: &RoTxn, primary_key: &PrimaryKey, new_fields_ids_map: &mut FieldsIdsMap, - ) -> Result> { - // will contain nodes from the intermediate hashmap - let document_changes_alloc = Bump::with_capacity(1024 * 1024 * 1024); // 1 MiB + ) -> Result<(DocumentOperationChanges<'pl>, Vec)> { + let Self { operations, method } = self; let documents_ids = index.documents_ids(rtxn)?; + let mut operations_stats = Vec::new(); let mut available_docids = AvailableIds::new(&documents_ids); - let mut docids_version_offsets = - hashbrown::HashMap::<&'pl str, _, _, _>::new_in(&document_changes_alloc); + let mut docids_version_offsets = hashbrown::HashMap::new(); - for operation in self.operations { - match operation { - Payload::Addition(payload) => { - let mut iter = - serde_json::Deserializer::from_slice(payload).into_iter::<&RawValue>(); + for operation in operations { + let (bytes, document_count, result) = match operation { + Payload::Addition(payload) => extract_addition_payload_changes( + indexer, + index, + rtxn, + primary_key, + new_fields_ids_map, + &mut available_docids, + &docids_version_offsets, + method, + payload, + ), + Payload::Deletion(to_delete) => extract_deletion_payload_changes( + index, + rtxn, + &mut available_docids, + &docids_version_offsets, + method, + to_delete, + ), + }; - /// TODO manage the error - let mut previous_offset = 0; - while let Some(document) = - iter.next().transpose().map_err(UserError::SerdeJson)? - { - let external_document_id = primary_key.extract_fields_and_docid( - document, - new_fields_ids_map, - indexer, - )?; - - let external_document_id = external_document_id.to_de(); - - let current_offset = iter.byte_offset(); - let document_operation = InnerDocOp::Addition(DocumentOffset { - content: &payload[previous_offset..current_offset], - }); - - match docids_version_offsets.get_mut(external_document_id) { - None => { - let (docid, is_new) = match index - .external_documents_ids() - .get(rtxn, external_document_id)? 
- { - Some(docid) => (docid, false), - None => ( - available_docids.next().ok_or(Error::UserError( - UserError::DocumentLimitReached, - ))?, - true, - ), - }; - - docids_version_offsets.insert( - external_document_id, - ( - (docid, is_new), - bumpalo::vec![in indexer; document_operation], - ), - ); - } - Some((_, offsets)) => { - let useless_previous_addition = match self.index_documents_method { - IndexDocumentsMethod::ReplaceDocuments => { - MergeDocumentForReplacement::USELESS_PREVIOUS_CHANGES - } - IndexDocumentsMethod::UpdateDocuments => { - MergeDocumentForUpdates::USELESS_PREVIOUS_CHANGES - } - }; - - if useless_previous_addition { - offsets.clear(); - } - - offsets.push(document_operation); - } - } - - previous_offset = iter.byte_offset(); - } + let error = match result { + Ok(new_docids_version_offsets) => { + // If we don't have any error then we can merge the content of this payload + // into to main payload. Else we just drop this payload extraction. + merge_version_offsets(&mut docids_version_offsets, new_docids_version_offsets); + None } - Payload::Deletion(to_delete) => { - for external_document_id in to_delete { - match docids_version_offsets.get_mut(external_document_id) { - None => { - let (docid, is_new) = match index - .external_documents_ids() - .get(rtxn, external_document_id)? - { - Some(docid) => (docid, false), - None => ( - available_docids.next().ok_or(Error::UserError( - UserError::DocumentLimitReached, - ))?, - true, - ), - }; + Err(Error::UserError(user_error)) => Some(user_error), + Err(e) => return Err(e), + }; - docids_version_offsets.insert( - external_document_id, - ( - (docid, is_new), - bumpalo::vec![in indexer; InnerDocOp::Deletion], - ), - ); - } - Some((_, offsets)) => { - offsets.clear(); - offsets.push(InnerDocOp::Deletion); - } - } - } - } - } + operations_stats.push(PayloadStats { document_count, bytes, error }); } // TODO We must drain the HashMap into a Vec because rayon::hash_map::IntoIter: !Clone - let mut docids_version_offsets: bumpalo::collections::vec::Vec<_> = docids_version_offsets - .drain() - .map(|(item, (docid, v))| (item, (docid, v.into_bump_slice()))) - .collect_in(indexer); + let mut docids_version_offsets: bumpalo::collections::vec::Vec<_> = + docids_version_offsets.drain().collect_in(indexer); + // Reorder the offsets to make sure we iterate on the file sequentially - let sort_function_key = match self.index_documents_method { - Idm::ReplaceDocuments => MergeDocumentForReplacement::sort_key, - Idm::UpdateDocuments => MergeDocumentForUpdates::sort_key, + // And finally sort them + docids_version_offsets.sort_unstable_by_key(|(_, po)| method.sort_key(&po.operations)); + + let docids_version_offsets = docids_version_offsets.into_bump_slice(); + Ok((DocumentOperationChanges { docids_version_offsets }, operations_stats)) + } +} + +fn extract_addition_payload_changes<'s, 'pl: 's>( + indexer: &'pl Bump, + index: &Index, + rtxn: &RoTxn, + primary_key: &PrimaryKey, + fields_ids_map: &mut FieldsIdsMap, + available_docids: &mut AvailableIds, + main_docids_version_offsets: &hashbrown::HashMap<&'s str, PayloadOperations<'pl>>, + method: MergeMethod, + payload: &'pl [u8], +) -> (u64, u64, Result>>) { + let mut new_docids_version_offsets = hashbrown::HashMap::<&str, PayloadOperations<'pl>>::new(); + + /// TODO manage the error + let mut previous_offset = 0; + let mut iter = Deserializer::from_slice(payload).into_iter::<&RawValue>(); + loop { + let doc = match iter.next().transpose() { + Ok(Some(doc)) => doc, + Ok(None) => break, + Err(e) => { 
+ return ( + payload.len() as u64, + new_docids_version_offsets.len() as u64, + Err(InternalError::SerdeJson(e).into()), + ) + } }; - // And finally sort them - docids_version_offsets.sort_unstable_by_key(|(_, (_, docops))| sort_function_key(docops)); - let docids_version_offsets = docids_version_offsets.into_bump_slice(); - Ok(DocumentOperationChanges { - docids_version_offsets, - index_documents_method: self.index_documents_method, - }) + let external_id = match primary_key.extract_fields_and_docid(doc, fields_ids_map, indexer) { + Ok(edi) => edi, + Err(e) => { + return (payload.len() as u64, new_docids_version_offsets.len() as u64, Err(e)) + } + }; + + let external_id = external_id.to_de(); + let current_offset = iter.byte_offset(); + let document_offset = DocumentOffset { content: &payload[previous_offset..current_offset] }; + + match main_docids_version_offsets.get(external_id) { + None => { + let (docid, is_new) = match index.external_documents_ids().get(rtxn, external_id) { + Ok(Some(docid)) => (docid, false), + Ok(None) => ( + match available_docids.next() { + Some(docid) => docid, + None => { + return ( + payload.len() as u64, + new_docids_version_offsets.len() as u64, + Err(UserError::DocumentLimitReached.into()), + ) + } + }, + true, + ), + Err(e) => { + return ( + payload.len() as u64, + new_docids_version_offsets.len() as u64, + Err(e.into()), + ) + } + }; + + match new_docids_version_offsets.entry(external_id) { + Entry::Occupied(mut entry) => entry.get_mut().push_addition(document_offset), + Entry::Vacant(entry) => { + entry.insert(PayloadOperations::new_addition( + method, + docid, + is_new, + document_offset, + )); + } + } + } + Some(payload_operations) => match new_docids_version_offsets.entry(external_id) { + Entry::Occupied(mut entry) => entry.get_mut().push_addition(document_offset), + Entry::Vacant(entry) => { + entry.insert(PayloadOperations::new_addition( + method, + payload_operations.docid, + payload_operations.is_new, + document_offset, + )); + } + }, + } + + previous_offset = iter.byte_offset(); + } + + (payload.len() as u64, new_docids_version_offsets.len() as u64, Ok(new_docids_version_offsets)) +} + +fn extract_deletion_payload_changes<'s, 'pl: 's>( + index: &Index, + rtxn: &RoTxn, + available_docids: &mut AvailableIds, + main_docids_version_offsets: &hashbrown::HashMap<&'s str, PayloadOperations<'pl>>, + method: MergeMethod, + to_delete: &'pl [&'pl str], +) -> (u64, u64, Result>>) { + let mut new_docids_version_offsets = hashbrown::HashMap::<&str, PayloadOperations<'pl>>::new(); + let mut document_count = 0; + + for external_id in to_delete { + match main_docids_version_offsets.get(external_id) { + None => { + let (docid, is_new) = match index.external_documents_ids().get(rtxn, external_id) { + Ok(Some(docid)) => (docid, false), + Ok(None) => ( + match available_docids.next() { + Some(docid) => docid, + None => { + return ( + 0, + new_docids_version_offsets.len() as u64, + Err(UserError::DocumentLimitReached.into()), + ) + } + }, + true, + ), + Err(e) => return (0, new_docids_version_offsets.len() as u64, Err(e.into())), + }; + + match new_docids_version_offsets.entry(external_id) { + Entry::Occupied(mut entry) => entry.get_mut().push_deletion(), + Entry::Vacant(entry) => { + entry.insert(PayloadOperations::new_deletion(method, docid, is_new)); + } + } + } + Some(payload_operations) => match new_docids_version_offsets.entry(external_id) { + Entry::Occupied(mut entry) => entry.get_mut().push_deletion(), + Entry::Vacant(entry) => { + 
entry.insert(PayloadOperations::new_deletion( + method, + payload_operations.docid, + payload_operations.is_new, + )); + } + }, + } + document_count += 1; + } + + (0, document_count, Ok(new_docids_version_offsets)) +} + +fn merge_version_offsets<'s, 'pl>( + main: &mut hashbrown::HashMap<&'s str, PayloadOperations<'pl>>, + new: hashbrown::HashMap<&'s str, PayloadOperations<'pl>>, +) { + // We cannot swap like nothing because documents + // operations must be in the right order. + if main.is_empty() { + return *main = new; + } + + for (key, new_payload) in new { + match main.entry(key) { + Entry::Occupied(mut entry) => entry.get_mut().append_operations(new_payload.operations), + Entry::Vacant(entry) => { + entry.insert(new_payload); + } + } } } impl<'pl> DocumentChanges<'pl> for DocumentOperationChanges<'pl> { - type Item = (&'pl str, ((u32, bool), &'pl [InnerDocOp<'pl>])); + type Item = (&'pl str, PayloadOperations<'pl>); fn iter( &self, @@ -225,21 +293,14 @@ impl<'pl> DocumentChanges<'pl> for DocumentOperationChanges<'pl> { where 'pl: 'doc, { - let document_merge_function = match self.index_documents_method { - Idm::ReplaceDocuments => MergeDocumentForReplacement::merge, - Idm::UpdateDocuments => MergeDocumentForUpdates::merge, - }; - - let (external_doc, ((internal_docid, is_new), operations)) = *item; - - let change = document_merge_function( - internal_docid, + let (external_doc, payload_operations) = item; + payload_operations.merge_method.merge( + payload_operations.docid, external_doc, - is_new, + payload_operations.is_new, &context.doc_alloc, - operations, - )?; - Ok(change) + &payload_operations.operations[..], + ) } fn len(&self) -> usize { @@ -247,14 +308,92 @@ impl<'pl> DocumentChanges<'pl> for DocumentOperationChanges<'pl> { } } +pub struct DocumentOperationChanges<'pl> { + docids_version_offsets: &'pl [(&'pl str, PayloadOperations<'pl>)], +} + +pub enum Payload<'pl> { + Addition(&'pl [u8]), + Deletion(&'pl [&'pl str]), +} + +pub struct PayloadStats { + pub bytes: u64, + pub document_count: u64, + pub error: Option, +} + +pub struct PayloadOperations<'pl> { + /// The internal document id of the document. + pub docid: DocumentId, + /// Wether this document is not in the current database (visible by the rtxn). + pub is_new: bool, + /// The operations to perform, in order, on this document. + pub operations: Vec>, + /// The merge method we are using to merge payloads and documents. 
+ merge_method: MergeMethod, +} + +impl<'pl> PayloadOperations<'pl> { + fn new_deletion(merge_method: MergeMethod, docid: DocumentId, is_new: bool) -> Self { + Self { docid, is_new, operations: vec![InnerDocOp::Deletion], merge_method } + } + + fn new_addition( + merge_method: MergeMethod, + docid: DocumentId, + is_new: bool, + offset: DocumentOffset<'pl>, + ) -> Self { + Self { docid, is_new, operations: vec![InnerDocOp::Addition(offset)], merge_method } + } +} + +impl<'pl> PayloadOperations<'pl> { + fn push_addition(&mut self, offset: DocumentOffset<'pl>) { + if self.merge_method.useless_previous_changes() { + self.operations.clear(); + } + self.operations.push(InnerDocOp::Addition(offset)) + } + + fn push_deletion(&mut self) { + self.operations.clear(); + self.operations.push(InnerDocOp::Deletion); + } + + fn append_operations(&mut self, mut operations: Vec>) { + debug_assert!(!operations.is_empty()); + if self.merge_method.useless_previous_changes() { + self.operations.clear(); + } + self.operations.append(&mut operations); + } +} + +#[derive(Clone)] +pub enum InnerDocOp<'pl> { + Addition(DocumentOffset<'pl>), + Deletion, +} + +/// Represents an offset where a document lives +/// in an mmapped grenad reader file. +#[derive(Clone)] +pub struct DocumentOffset<'pl> { + /// The mmapped payload files. + pub content: &'pl [u8], +} + trait MergeChanges { /// Whether the payloads in the list of operations are useless or not. - const USELESS_PREVIOUS_CHANGES: bool; + fn useless_previous_changes(&self) -> bool; /// Returns a key that is used to order the payloads the right way. - fn sort_key(docops: &[InnerDocOp]) -> usize; + fn sort_key(&self, docops: &[InnerDocOp]) -> usize; fn merge<'doc>( + &self, docid: DocumentId, external_docid: &'doc str, is_new: bool, @@ -263,13 +402,69 @@ trait MergeChanges { ) -> Result>>; } +#[derive(Debug, Clone, Copy)] +enum MergeMethod { + ForReplacement(MergeDocumentForReplacement), + ForUpdates(MergeDocumentForUpdates), +} + +impl MergeChanges for MergeMethod { + fn useless_previous_changes(&self) -> bool { + match self { + MergeMethod::ForReplacement(merge) => merge.useless_previous_changes(), + MergeMethod::ForUpdates(merge) => merge.useless_previous_changes(), + } + } + + fn sort_key(&self, docops: &[InnerDocOp]) -> usize { + match self { + MergeMethod::ForReplacement(merge) => merge.sort_key(docops), + MergeMethod::ForUpdates(merge) => merge.sort_key(docops), + } + } + + fn merge<'doc>( + &self, + docid: DocumentId, + external_docid: &'doc str, + is_new: bool, + doc_alloc: &'doc Bump, + operations: &'doc [InnerDocOp], + ) -> Result>> { + match self { + MergeMethod::ForReplacement(merge) => { + merge.merge(docid, external_docid, is_new, doc_alloc, operations) + } + MergeMethod::ForUpdates(merge) => { + merge.merge(docid, external_docid, is_new, doc_alloc, operations) + } + } + } +} + +impl From for MergeMethod { + fn from(method: IndexDocumentsMethod) -> Self { + match method { + IndexDocumentsMethod::ReplaceDocuments => { + MergeMethod::ForReplacement(MergeDocumentForReplacement) + } + IndexDocumentsMethod::UpdateDocuments => { + MergeMethod::ForUpdates(MergeDocumentForUpdates) + } + } + } +} + +#[derive(Debug, Clone, Copy)] struct MergeDocumentForReplacement; impl MergeChanges for MergeDocumentForReplacement { - const USELESS_PREVIOUS_CHANGES: bool = true; + fn useless_previous_changes(&self) -> bool { + true + } /// Reorders to read only the last change. 
- fn sort_key(docops: &[InnerDocOp]) -> usize { + fn sort_key(&self, docops: &[InnerDocOp]) -> usize { let f = |ido: &_| match ido { InnerDocOp::Addition(add) => Some(add.content.as_ptr() as usize), InnerDocOp::Deletion => None, @@ -281,6 +476,7 @@ impl MergeChanges for MergeDocumentForReplacement { /// /// This function is only meant to be used when doing a replacement and not an update. fn merge<'doc>( + &self, docid: DocumentId, external_doc: &'doc str, is_new: bool, @@ -321,13 +517,16 @@ impl MergeChanges for MergeDocumentForReplacement { } } +#[derive(Debug, Clone, Copy)] struct MergeDocumentForUpdates; impl MergeChanges for MergeDocumentForUpdates { - const USELESS_PREVIOUS_CHANGES: bool = false; + fn useless_previous_changes(&self) -> bool { + false + } /// Reorders to read the first changes first so that it's faster to read the first one and then the rest. - fn sort_key(docops: &[InnerDocOp]) -> usize { + fn sort_key(&self, docops: &[InnerDocOp]) -> usize { let f = |ido: &_| match ido { InnerDocOp::Addition(add) => Some(add.content.as_ptr() as usize), InnerDocOp::Deletion => None, @@ -340,6 +539,7 @@ impl MergeChanges for MergeDocumentForUpdates { /// /// This function is only meant to be used when doing an update and not a replacement. fn merge<'doc>( + &self, docid: DocumentId, external_docid: &'doc str, is_new: bool, diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 1a5e4fc23..0906c5b89 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -5,7 +5,7 @@ use std::thread::{self, Builder}; use big_s::S; use document_changes::{extract, DocumentChanges, IndexingContext, Progress, ThreadLocal}; pub use document_deletion::DocumentDeletion; -pub use document_operation::DocumentOperation; +pub use document_operation::{DocumentOperation, PayloadStats}; use hashbrown::HashMap; use heed::types::{Bytes, DecodeIgnore, Str}; use heed::{RoTxn, RwTxn}; From bd31ea217434fdc0d783af35d676a2d184822dfd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 14 Nov 2024 16:13:38 +0100 Subject: [PATCH 231/247] Check for at least one valid task after setting their statuses --- crates/index-scheduler/src/batch.rs | 93 +++++++++++++++-------------- 1 file changed, 47 insertions(+), 46 deletions(-) diff --git a/crates/index-scheduler/src/batch.rs b/crates/index-scheduler/src/batch.rs index fb9cfbe6c..43c5e5df6 100644 --- a/crates/index-scheduler/src/batch.rs +++ b/crates/index-scheduler/src/batch.rs @@ -1229,9 +1229,7 @@ impl IndexScheduler { const PRINT_SECS_DELTA: u64 = 1; let processing_tasks = self.processing_tasks.clone(); - let must_stop_processing = self.must_stop_processing.clone(); - let send_progress = |progress| { let now = std::time::Instant::now(); let elapsed = secs_since_started_processing_at.load(atomic::Ordering::Relaxed); @@ -1327,6 +1325,7 @@ impl IndexScheduler { )? 
.map_err(milli::Error::from)?; + let indexer_config = self.index_mapper.indexer_config(); let mut content_files_iter = content_files.iter(); let mut indexer = indexer::DocumentOperation::new(method); let embedders = index.embedding_configs(index_wtxn)?; @@ -1348,54 +1347,56 @@ impl IndexScheduler { } } - if tasks.iter().any(|res| res.error.is_none()) { - let local_pool; - let pool = match &self.index_mapper.indexer_config().thread_pool { - Some(pool) => pool, - None => { - local_pool = ThreadPoolNoAbortBuilder::new().build().unwrap(); - &local_pool - } - }; + let local_pool; + let pool = match &indexer_config.thread_pool { + Some(pool) => pool, + None => { + local_pool = ThreadPoolNoAbortBuilder::new().build().unwrap(); + &local_pool + } + }; - let (document_changes, operation_stats) = indexer.into_changes( - &indexer_alloc, - index, - &rtxn, - &primary_key, - &mut new_fields_ids_map, - )?; + let (document_changes, operation_stats) = indexer.into_changes( + &indexer_alloc, + index, + &rtxn, + &primary_key, + &mut new_fields_ids_map, + )?; - for (stats, task) in operation_stats.into_iter().zip(&mut tasks) { - match stats.error { - Some(error) => { - task.status = Status::Failed; - task.error = Some(milli::Error::UserError(error).into()); - } - None => task.status = Status::Succeeded, - } - - task.details = match task.details { - Some(Details::DocumentAdditionOrUpdate { - received_documents, .. - }) => Some(Details::DocumentAdditionOrUpdate { - received_documents, - indexed_documents: Some(stats.document_count), - }), - Some(Details::DocumentDeletion { provided_ids, .. }) => { - Some(Details::DocumentDeletion { - provided_ids, - deleted_documents: Some(stats.document_count), - }) - } - _ => { - // In the case of a `documentAdditionOrUpdate` or `DocumentDeletion` - // the details MUST be set to either addition or deletion - unreachable!(); - } + let mut addition = 0; + for (stats, task) in operation_stats.into_iter().zip(&mut tasks) { + addition += stats.document_count; + match stats.error { + Some(error) => { + task.status = Status::Failed; + task.error = Some(milli::Error::UserError(error).into()); } + None => task.status = Status::Succeeded, } + task.details = match task.details { + Some(Details::DocumentAdditionOrUpdate { received_documents, .. }) => { + Some(Details::DocumentAdditionOrUpdate { + received_documents, + indexed_documents: Some(stats.document_count), + }) + } + Some(Details::DocumentDeletion { provided_ids, .. }) => { + Some(Details::DocumentDeletion { + provided_ids, + deleted_documents: Some(stats.document_count), + }) + } + _ => { + // In the case of a `documentAdditionOrUpdate` or `DocumentDeletion` + // the details MUST be set to either addition or deletion + unreachable!(); + } + } + } + + if tasks.iter().any(|res| res.error.is_none()) { pool.install(|| { indexer::index( index_wtxn, @@ -1411,7 +1412,7 @@ impl IndexScheduler { }) .unwrap()?; - // tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done"); + tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done"); } // else if primary_key_has_been_set { // // Everything failed but we've set a primary key. 
From 677d7293f59cf6deb1ff2d25e40e2c6e0a7ace9e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 14 Nov 2024 18:16:42 +0100 Subject: [PATCH 232/247] Fix a lot of primary key related tests --- crates/index-scheduler/src/batch.rs | 43 +-------- crates/index-scheduler/src/lib.rs | 9 +- .../update/new/indexer/document_operation.rs | 90 +++++++++++++++---- 3 files changed, 84 insertions(+), 58 deletions(-) diff --git a/crates/index-scheduler/src/batch.rs b/crates/index-scheduler/src/batch.rs index 43c5e5df6..0ebd2d120 100644 --- a/crates/index-scheduler/src/batch.rs +++ b/crates/index-scheduler/src/batch.rs @@ -32,9 +32,7 @@ use meilisearch_types::error::Code; use meilisearch_types::heed::{RoTxn, RwTxn}; use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader, PrimaryKey}; use meilisearch_types::milli::heed::CompactionOption; -use meilisearch_types::milli::update::new::indexer::{ - self, retrieve_or_guess_primary_key, UpdateByFunction, -}; +use meilisearch_types::milli::update::new::indexer::{self, UpdateByFunction}; use meilisearch_types::milli::update::{IndexDocumentsMethod, Settings as MilliSettings}; use meilisearch_types::milli::vector::parsed_vectors::{ ExplicitVectors, VectorOrArrayOfVectors, RESERVED_VECTORS_FIELD_NAME, @@ -43,7 +41,6 @@ use meilisearch_types::milli::{self, Filter, ThreadPoolNoAbortBuilder}; use meilisearch_types::settings::{apply_settings_to_builder, Settings, Unchecked}; use meilisearch_types::tasks::{Details, IndexSwap, Kind, KindWithContent, Status, Task}; use meilisearch_types::{compression, Index, VERSION_FILE_NAME}; -use raw_collections::RawMap; use roaring::RoaringBitmap; use time::macros::format_description; use time::OffsetDateTime; @@ -1278,16 +1275,6 @@ impl IndexScheduler { // TODO: at some point, for better efficiency we might want to reuse the bumpalo for successive batches. // this is made difficult by the fact we're doing private clones of the index scheduler and sending it // to a fresh thread. - - /// TODO manage errors correctly - let first_addition_uuid = operations - .iter() - .find_map(|op| match op { - DocumentOperation::Add(content_uuid) => Some(content_uuid), - _ => None, - }) - .unwrap(); - let mut content_files = Vec::new(); for operation in &operations { if let DocumentOperation::Add(content_uuid) = operation { @@ -1303,28 +1290,6 @@ impl IndexScheduler { let db_fields_ids_map = index.fields_ids_map(&rtxn)?; let mut new_fields_ids_map = db_fields_ids_map.clone(); - let first_document = match content_files.first() { - Some(mmap) => { - let mut iter = serde_json::Deserializer::from_slice(mmap).into_iter(); - iter.next().transpose().map_err(|e| e.into()).map_err(Error::IoError)? - } - None => None, - }; - - let (primary_key, primary_key_has_been_set) = retrieve_or_guess_primary_key( - &rtxn, - index, - &mut new_fields_ids_map, - primary_key.as_deref(), - first_document - .map(|raw| RawMap::from_raw_value(raw, &indexer_alloc)) - .transpose() - .map_err(|error| { - milli::Error::UserError(milli::UserError::SerdeJson(error)) - })?, - )? 
- .map_err(milli::Error::from)?; - let indexer_config = self.index_mapper.indexer_config(); let mut content_files_iter = content_files.iter(); let mut indexer = indexer::DocumentOperation::new(method); @@ -1356,11 +1321,11 @@ impl IndexScheduler { } }; - let (document_changes, operation_stats) = indexer.into_changes( + let (document_changes, operation_stats, primary_key) = indexer.into_changes( &indexer_alloc, index, &rtxn, - &primary_key, + primary_key.as_deref(), &mut new_fields_ids_map, )?; @@ -1403,7 +1368,7 @@ impl IndexScheduler { index, &db_fields_ids_map, new_fields_ids_map, - primary_key_has_been_set.then_some(primary_key), + primary_key, &document_changes, embedders, &|| must_stop_processing.get(), diff --git a/crates/index-scheduler/src/lib.rs b/crates/index-scheduler/src/lib.rs index b57a0fe9f..83431d45c 100644 --- a/crates/index-scheduler/src/lib.rs +++ b/crates/index-scheduler/src/lib.rs @@ -4296,11 +4296,11 @@ mod tests { snapshot!(snapshot_index_scheduler(&index_scheduler), name: "only_first_task_succeed"); // The second batch should fail. - handle.advance_one_failed_batch(); + handle.advance_one_successful_batch(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "second_task_fails"); // The second batch should fail. - handle.advance_one_failed_batch(); + handle.advance_one_successful_batch(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "third_task_fails"); // Is the primary key still what we expect? @@ -4361,7 +4361,7 @@ mod tests { snapshot!(snapshot_index_scheduler(&index_scheduler), name: "only_first_task_succeed"); // The second batch should fail and contains two tasks. - handle.advance_one_failed_batch(); + handle.advance_one_successful_batch(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "second_and_third_tasks_fails"); // Is the primary key still what we expect? @@ -4440,7 +4440,8 @@ mod tests { snapshot!(primary_key, @"id"); // We're trying to `bork` again, but now there is already a primary key set for this index. - handle.advance_one_failed_batch(); + // NOTE: it's marked as successful because the batch didn't fails, it's the individual tasks that failed. + handle.advance_one_successful_batch(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "fourth_task_fails"); // Finally the last task should succeed since its primary key is the same as the valid one. 
diff --git a/crates/milli/src/update/new/indexer/document_operation.rs b/crates/milli/src/update/new/indexer/document_operation.rs index 0b586a795..634a7f207 100644 --- a/crates/milli/src/update/new/indexer/document_operation.rs +++ b/crates/milli/src/update/new/indexer/document_operation.rs @@ -3,12 +3,14 @@ use bumpalo::Bump; use hashbrown::hash_map::Entry; use heed::RoTxn; use memmap2::Mmap; +use raw_collections::RawMap; use rayon::slice::ParallelSlice; use serde_json::value::RawValue; use serde_json::Deserializer; use super::super::document_change::DocumentChange; use super::document_changes::{DocumentChangeContext, DocumentChanges, MostlySend}; +use super::retrieve_or_guess_primary_key; use crate::documents::PrimaryKey; use crate::update::new::document::Versions; use crate::update::new::{Deletion, Insertion, Update}; @@ -41,16 +43,17 @@ impl<'pl> DocumentOperation<'pl> { self, indexer: &'pl Bump, index: &Index, - rtxn: &RoTxn, - primary_key: &PrimaryKey, + rtxn: &'pl RoTxn<'pl>, + primary_key_from_op: Option<&'pl str>, new_fields_ids_map: &mut FieldsIdsMap, - ) -> Result<(DocumentOperationChanges<'pl>, Vec)> { + ) -> Result<(DocumentOperationChanges<'pl>, Vec, Option>)> { let Self { operations, method } = self; let documents_ids = index.documents_ids(rtxn)?; let mut operations_stats = Vec::new(); let mut available_docids = AvailableIds::new(&documents_ids); let mut docids_version_offsets = hashbrown::HashMap::new(); + let mut primary_key = None; for operation in operations { let (bytes, document_count, result) = match operation { @@ -58,7 +61,8 @@ impl<'pl> DocumentOperation<'pl> { indexer, index, rtxn, - primary_key, + primary_key_from_op, + &mut primary_key, new_fields_ids_map, &mut available_docids, &docids_version_offsets, @@ -98,30 +102,30 @@ impl<'pl> DocumentOperation<'pl> { docids_version_offsets.sort_unstable_by_key(|(_, po)| method.sort_key(&po.operations)); let docids_version_offsets = docids_version_offsets.into_bump_slice(); - Ok((DocumentOperationChanges { docids_version_offsets }, operations_stats)) + Ok((DocumentOperationChanges { docids_version_offsets }, operations_stats, primary_key)) } } -fn extract_addition_payload_changes<'s, 'pl: 's>( +fn extract_addition_payload_changes<'r, 'pl: 'r>( indexer: &'pl Bump, index: &Index, - rtxn: &RoTxn, - primary_key: &PrimaryKey, - fields_ids_map: &mut FieldsIdsMap, + rtxn: &'r RoTxn<'r>, + primary_key_from_op: Option<&'r str>, + primary_key: &mut Option>, + new_fields_ids_map: &mut FieldsIdsMap, available_docids: &mut AvailableIds, - main_docids_version_offsets: &hashbrown::HashMap<&'s str, PayloadOperations<'pl>>, + main_docids_version_offsets: &hashbrown::HashMap<&'pl str, PayloadOperations<'pl>>, method: MergeMethod, payload: &'pl [u8], -) -> (u64, u64, Result>>) { +) -> (u64, u64, Result>>) { let mut new_docids_version_offsets = hashbrown::HashMap::<&str, PayloadOperations<'pl>>::new(); /// TODO manage the error let mut previous_offset = 0; let mut iter = Deserializer::from_slice(payload).into_iter::<&RawValue>(); loop { - let doc = match iter.next().transpose() { - Ok(Some(doc)) => doc, - Ok(None) => break, + let optdoc = match iter.next().transpose() { + Ok(optdoc) => optdoc, Err(e) => { return ( payload.len() as u64, @@ -131,7 +135,63 @@ fn extract_addition_payload_changes<'s, 'pl: 's>( } }; - let external_id = match primary_key.extract_fields_and_docid(doc, fields_ids_map, indexer) { + // Only guess the primary key if it is the first document + let retrieved_primary_key = if previous_offset == 0 { + let optdoc = match 
optdoc { + Some(doc) => match RawMap::from_raw_value(doc, indexer) { + Ok(docmap) => Some(docmap), + Err(error) => { + return ( + payload.len() as u64, + new_docids_version_offsets.len() as u64, + Err(Error::UserError(UserError::SerdeJson(error))), + ) + } + }, + None => None, + }; + + let result = retrieve_or_guess_primary_key( + rtxn, + index, + new_fields_ids_map, + primary_key_from_op, + optdoc, + ); + + let (pk, _has_been_changed) = match result { + Ok(Ok(pk)) => pk, + Ok(Err(user_error)) => { + return ( + payload.len() as u64, + new_docids_version_offsets.len() as u64, + Err(Error::UserError(user_error)), + ) + } + Err(error) => { + return ( + payload.len() as u64, + new_docids_version_offsets.len() as u64, + Err(error), + ) + } + }; + + primary_key.get_or_insert(pk) + } else { + primary_key.as_ref().unwrap() + }; + + let doc = match optdoc { + Some(doc) => doc, + None => break, + }; + + let external_id = match retrieved_primary_key.extract_fields_and_docid( + doc, + new_fields_ids_map, + indexer, + ) { Ok(edi) => edi, Err(e) => { return (payload.len() as u64, new_docids_version_offsets.len() as u64, Err(e)) From c202f3dbe2c7b6d00c9dc217bf768695bd0c3004 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 18 Nov 2024 10:13:04 +0100 Subject: [PATCH 233/247] fix tests and revert change in behavior when primary_key_from_op != primary_key_from_db && index.is_empty() --- crates/index-scheduler/src/lib.rs | 2 +- crates/milli/src/update/new/indexer/mod.rs | 12 +++--------- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/crates/index-scheduler/src/lib.rs b/crates/index-scheduler/src/lib.rs index 83431d45c..df8870470 100644 --- a/crates/index-scheduler/src/lib.rs +++ b/crates/index-scheduler/src/lib.rs @@ -4601,7 +4601,7 @@ mod tests { snapshot!(primary_key.is_none(), @"false"); // The second batch should contains only one task that fails because it tries to update the primary key to `bork`. - handle.advance_one_failed_batch(); + handle.advance_one_successful_batch(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "second_task_fails"); // The third batch should succeed and only contains one task. diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 0906c5b89..00cc2d2c1 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -777,15 +777,9 @@ pub fn retrieve_or_guess_primary_key<'a>( match primary_key_from_op { // we did, and it is different from the DB one Some(primary_key_from_op) if primary_key_from_op != primary_key_from_db => { - // is the index empty? - if index.number_of_documents(rtxn)? 
== 0 { - // change primary key - (primary_key_from_op, true) - } else { - return Ok(Err(UserError::PrimaryKeyCannotBeChanged( - primary_key_from_db.to_string(), - ))); - } + return Ok(Err(UserError::PrimaryKeyCannotBeChanged( + primary_key_from_db.to_string(), + ))); } _ => (primary_key_from_db, false), } From 9150c8f0522bf6d3537b09e255cae5d05e409853 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 18 Nov 2024 11:04:57 +0100 Subject: [PATCH 234/247] Accept changes to vector format --- crates/meilisearch/tests/search/multi.rs | 96 ++++++++++++------------ 1 file changed, 48 insertions(+), 48 deletions(-) diff --git a/crates/meilisearch/tests/search/multi.rs b/crates/meilisearch/tests/search/multi.rs index 942a87a79..8d7340f0d 100644 --- a/crates/meilisearch/tests/search/multi.rs +++ b/crates/meilisearch/tests/search/multi.rs @@ -585,9 +585,9 @@ async fn federation_two_search_two_indexes() { ], "_vectors": { "manual": [ - -100.0, - 340.0, - 90.0 + -100, + 340, + 90 ] }, "_federation": { @@ -613,9 +613,9 @@ async fn federation_two_search_two_indexes() { "cattos": "pésti", "_vectors": { "manual": [ - 1.0, - 2.0, - 3.0 + 1, + 2, + 3 ] }, "_federation": { @@ -640,9 +640,9 @@ async fn federation_two_search_two_indexes() { ], "_vectors": { "manual": [ - 1.0, - 2.0, - 54.0 + 1, + 2, + 54 ] }, "_federation": { @@ -707,9 +707,9 @@ async fn federation_multiple_search_multiple_indexes() { ], "_vectors": { "manual": [ - -100.0, - 340.0, - 90.0 + -100, + 340, + 90 ] }, "_federation": { @@ -735,9 +735,9 @@ async fn federation_multiple_search_multiple_indexes() { "cattos": "pésti", "_vectors": { "manual": [ - 1.0, - 2.0, - 3.0 + 1, + 2, + 3 ] }, "_federation": { @@ -773,9 +773,9 @@ async fn federation_multiple_search_multiple_indexes() { ], "_vectors": { "manual": [ - 1.0, - 2.0, - 54.0 + 1, + 2, + 54 ] }, "_federation": { @@ -793,9 +793,9 @@ async fn federation_multiple_search_multiple_indexes() { ], "_vectors": { "manual": [ - 10.0, - -23.0, - 32.0 + 10, + -23, + 32 ] }, "_federation": { @@ -824,9 +824,9 @@ async fn federation_multiple_search_multiple_indexes() { ], "_vectors": { "manual": [ - 10.0, - 23.0, - 32.0 + 10, + 23, + 32 ] }, "_federation": { @@ -869,9 +869,9 @@ async fn federation_multiple_search_multiple_indexes() { ], "_vectors": { "manual": [ - 1.0, - 2.0, - 54.0 + 1, + 2, + 54 ] }, "_federation": { @@ -898,9 +898,9 @@ async fn federation_multiple_search_multiple_indexes() { ], "_vectors": { "manual": [ - -100.0, - 231.0, - 32.0 + -100, + 231, + 32 ] }, "_federation": { @@ -1522,9 +1522,9 @@ async fn federation_sort_same_indexes_same_criterion_same_direction() { "cattos": "pésti", "_vectors": { "manual": [ - 1.0, - 2.0, - 3.0 + 1, + 2, + 3 ] }, "_federation": { @@ -1550,9 +1550,9 @@ async fn federation_sort_same_indexes_same_criterion_same_direction() { ], "_vectors": { "manual": [ - 1.0, - 2.0, - 54.0 + 1, + 2, + 54 ] }, "_federation": { @@ -1582,9 +1582,9 @@ async fn federation_sort_same_indexes_same_criterion_same_direction() { ], "_vectors": { "manual": [ - 10.0, - 23.0, - 32.0 + 10, + 23, + 32 ] }, "_federation": { @@ -1845,9 +1845,9 @@ async fn federation_sort_same_indexes_different_criterion_same_direction() { ], "_vectors": { "manual": [ - 1.0, - 2.0, - 54.0 + 1, + 2, + 54 ] }, "_federation": { @@ -1874,9 +1874,9 @@ async fn federation_sort_same_indexes_different_criterion_same_direction() { "cattos": "pésti", "_vectors": { "manual": [ - 1.0, - 2.0, - 3.0 + 1, + 2, + 3 ] }, "_federation": { @@ -1906,9 +1906,9 @@ async fn 
federation_sort_same_indexes_different_criterion_same_direction() { ], "_vectors": { "manual": [ - 10.0, - 23.0, - 32.0 + 10, + 23, + 32 ] }, "_federation": { From 5b4c06c24cdc67c3c4dbf8575203302fd814ea6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 18 Nov 2024 11:25:37 +0100 Subject: [PATCH 235/247] Plug the grenad max memory parameter --- crates/index-scheduler/src/batch.rs | 11 +++++++--- .../index_documents/helpers/grenad_helpers.rs | 6 +----- crates/milli/src/update/indexer_config.rs | 12 +++++++++++ .../new/extract/faceted/extract_facets.rs | 2 +- .../milli/src/update/new/extract/geo/mod.rs | 2 +- .../extract/searchable/extract_word_docids.rs | 2 +- .../src/update/new/extract/searchable/mod.rs | 2 +- crates/milli/src/update/new/indexer/mod.rs | 20 +++++++------------ 8 files changed, 32 insertions(+), 25 deletions(-) diff --git a/crates/index-scheduler/src/batch.rs b/crates/index-scheduler/src/batch.rs index 0ebd2d120..1ad25b422 100644 --- a/crates/index-scheduler/src/batch.rs +++ b/crates/index-scheduler/src/batch.rs @@ -1290,7 +1290,6 @@ impl IndexScheduler { let db_fields_ids_map = index.fields_ids_map(&rtxn)?; let mut new_fields_ids_map = db_fields_ids_map.clone(); - let indexer_config = self.index_mapper.indexer_config(); let mut content_files_iter = content_files.iter(); let mut indexer = indexer::DocumentOperation::new(method); let embedders = index.embedding_configs(index_wtxn)?; @@ -1313,6 +1312,7 @@ impl IndexScheduler { } let local_pool; + let indexer_config = self.index_mapper.indexer_config(); let pool = match &indexer_config.thread_pool { Some(pool) => pool, None => { @@ -1366,6 +1366,7 @@ impl IndexScheduler { indexer::index( index_wtxn, index, + indexer_config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, primary_key, @@ -1456,7 +1457,8 @@ impl IndexScheduler { if task.error.is_none() { let local_pool; - let pool = match &self.index_mapper.indexer_config().thread_pool { + let indexer_config = self.index_mapper.indexer_config(); + let pool = match &indexer_config.thread_pool { Some(pool) => pool, None => { local_pool = ThreadPoolNoAbortBuilder::new().build().unwrap(); @@ -1474,6 +1476,7 @@ impl IndexScheduler { indexer::index( index_wtxn, index, + indexer_config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, None, // cannot change primary key in DocumentEdition @@ -1606,7 +1609,8 @@ impl IndexScheduler { if !tasks.iter().all(|res| res.error.is_some()) { let local_pool; - let pool = match &self.index_mapper.indexer_config().thread_pool { + let indexer_config = self.index_mapper.indexer_config(); + let pool = match &indexer_config.thread_pool { Some(pool) => pool, None => { local_pool = ThreadPoolNoAbortBuilder::new().build().unwrap(); @@ -1624,6 +1628,7 @@ impl IndexScheduler { indexer::index( index_wtxn, index, + indexer_config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, None, // document deletion never changes primary key diff --git a/crates/milli/src/update/index_documents/helpers/grenad_helpers.rs b/crates/milli/src/update/index_documents/helpers/grenad_helpers.rs index b7da39878..62dc40edc 100644 --- a/crates/milli/src/update/index_documents/helpers/grenad_helpers.rs +++ b/crates/milli/src/update/index_documents/helpers/grenad_helpers.rs @@ -119,12 +119,8 @@ impl GrenadParameters { /// /// This should be called inside of a rayon thread pool, /// otherwise, it will take the global number of threads. - /// - /// The max memory cannot exceed a given reasonable value. 
pub fn max_memory_by_thread(&self) -> Option { - self.max_memory.map(|max_memory| { - (max_memory / rayon::current_num_threads()).min(MAX_GRENAD_SORTER_USAGE) - }) + self.max_memory.map(|max_memory| (max_memory / rayon::current_num_threads())) } } diff --git a/crates/milli/src/update/indexer_config.rs b/crates/milli/src/update/indexer_config.rs index 115059a1d..6fb33ad78 100644 --- a/crates/milli/src/update/indexer_config.rs +++ b/crates/milli/src/update/indexer_config.rs @@ -1,5 +1,6 @@ use grenad::CompressionType; +use super::GrenadParameters; use crate::thread_pool_no_abort::ThreadPoolNoAbort; #[derive(Debug)] @@ -15,6 +16,17 @@ pub struct IndexerConfig { pub skip_index_budget: bool, } +impl IndexerConfig { + pub fn grenad_parameters(&self) -> GrenadParameters { + GrenadParameters { + chunk_compression_type: self.chunk_compression_type, + chunk_compression_level: self.chunk_compression_level, + max_memory: self.max_memory, + max_nb_chunks: self.max_nb_chunks, + } + } +} + impl Default for IndexerConfig { fn default() -> Self { Self { diff --git a/crates/milli/src/update/new/extract/faceted/extract_facets.rs b/crates/milli/src/update/new/extract/faceted/extract_facets.rs index 19e908612..d30a50c52 100644 --- a/crates/milli/src/update/new/extract/faceted/extract_facets.rs +++ b/crates/milli/src/update/new/extract/faceted/extract_facets.rs @@ -36,7 +36,7 @@ impl<'a, 'extractor> Extractor<'extractor> for FacetedExtractorData<'a> { fn init_data(&self, extractor_alloc: &'extractor Bump) -> Result { Ok(RefCell::new(BalancedCaches::new_in( self.buckets, - self.grenad_parameters.max_memory, + self.grenad_parameters.max_memory_by_thread(), extractor_alloc, ))) } diff --git a/crates/milli/src/update/new/extract/geo/mod.rs b/crates/milli/src/update/new/extract/geo/mod.rs index e26a7dc6c..e883a04cc 100644 --- a/crates/milli/src/update/new/extract/geo/mod.rs +++ b/crates/milli/src/update/new/extract/geo/mod.rs @@ -150,7 +150,7 @@ impl<'extractor> Extractor<'extractor> for GeoExtractor { ) -> Result<()> { let rtxn = &context.rtxn; let index = context.index; - let max_memory = self.grenad_parameters.max_memory; + let max_memory = self.grenad_parameters.max_memory_by_thread(); let db_fields_ids_map = context.db_fields_ids_map; let mut data_ref = context.data.borrow_mut_or_yield(); diff --git a/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs b/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs index c67fc347a..dfb55853f 100644 --- a/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs +++ b/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs @@ -214,7 +214,7 @@ impl<'a, 'extractor> Extractor<'extractor> for WordDocidsExtractorData<'a> { fn init_data(&self, extractor_alloc: &'extractor Bump) -> Result { Ok(RefCell::new(Some(WordDocidsBalancedCaches::new_in( self.buckets, - self.grenad_parameters.max_memory, + self.grenad_parameters.max_memory_by_thread(), extractor_alloc, )))) } diff --git a/crates/milli/src/update/new/extract/searchable/mod.rs b/crates/milli/src/update/new/extract/searchable/mod.rs index b75a01cd2..46a05be4e 100644 --- a/crates/milli/src/update/new/extract/searchable/mod.rs +++ b/crates/milli/src/update/new/extract/searchable/mod.rs @@ -36,7 +36,7 @@ impl<'a, 'extractor, EX: SearchableExtractor + Sync> Extractor<'extractor> fn init_data(&self, extractor_alloc: &'extractor Bump) -> Result { Ok(RefCell::new(BalancedCaches::new_in( self.buckets, - self.grenad_parameters.max_memory, + 
self.grenad_parameters.max_memory_by_thread(), extractor_alloc, ))) } diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 00cc2d2c1..71fcdd204 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -132,6 +132,7 @@ mod steps { pub fn index<'pl, 'indexer, 'index, DC, MSP, SP>( wtxn: &mut RwTxn, index: &'index Index, + grenad_parameters: GrenadParameters, db_fields_ids_map: &'indexer FieldsIdsMap, new_fields_ids_map: FieldsIdsMap, new_primary_key: Option>, @@ -209,16 +210,6 @@ where field_distribution.retain(|_, v| *v != 0); - const TEN_GIB: usize = 10 * 1024 * 1024 * 1024; - let current_num_threads = rayon::current_num_threads(); - let max_memory = TEN_GIB / current_num_threads; - eprintln!("A maximum of {max_memory} bytes will be used for each of the {current_num_threads} threads"); - - let grenad_parameters = GrenadParameters { - max_memory: Some(max_memory), - ..GrenadParameters::default() - }; - let facet_field_ids_delta; { @@ -228,7 +219,8 @@ where let (finished_steps, step_name) = steps::extract_facets(); facet_field_ids_delta = merge_and_send_facet_docids( - FacetedDocidsExtractor::run_extraction(grenad_parameters, + FacetedDocidsExtractor::run_extraction( + grenad_parameters, document_changes, indexing_context, &mut extractor_allocs, @@ -344,7 +336,8 @@ where let (finished_steps, step_name) = steps::extract_word_proximity(); - let caches = ::run_extraction(grenad_parameters, + let caches = ::run_extraction( + grenad_parameters, document_changes, indexing_context, &mut extractor_allocs, @@ -398,7 +391,8 @@ where }; let datastore = ThreadLocal::with_capacity(rayon::current_num_threads()); let (finished_steps, step_name) = steps::extract_geo_points(); - extract(document_changes, + extract( + document_changes, &extractor, indexing_context, &mut extractor_allocs, From 6570da3bcbb62a3e49df4e343030e19947858773 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 14 Nov 2024 15:28:44 +0100 Subject: [PATCH 236/247] Retry in case where the JSON deserialization fails --- crates/milli/src/vector/error.rs | 2 +- crates/milli/src/vector/rest.rs | 29 +++++++++++++++++++---------- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/crates/milli/src/vector/error.rs b/crates/milli/src/vector/error.rs index 41765f6ab..97bbe5d68 100644 --- a/crates/milli/src/vector/error.rs +++ b/crates/milli/src/vector/error.rs @@ -60,7 +60,7 @@ pub enum EmbedErrorKind { ManualEmbed(String), #[error("model not found. 
Meilisearch will not automatically download models from the Ollama library, please pull the model manually{}", option_info(.0.as_deref(), "server replied with "))] OllamaModelNotFoundError(Option), - #[error("error deserialization the response body as JSON:\n - {0}")] + #[error("error deserializing the response body as JSON:\n - {0}")] RestResponseDeserialization(std::io::Error), #[error("expected a response containing {0} embeddings, got only {1}")] RestResponseEmbeddingCount(usize, usize), diff --git a/crates/milli/src/vector/rest.rs b/crates/milli/src/vector/rest.rs index eeb5b16af..81ca6598d 100644 --- a/crates/milli/src/vector/rest.rs +++ b/crates/milli/src/vector/rest.rs @@ -257,12 +257,12 @@ where for attempt in 0..10 { let response = request.clone().send_json(&body); - let result = check_response(response, data.configuration_source); + let result = check_response(response, data.configuration_source).and_then(|response| { + response_to_embedding(response, data, expected_count, expected_dimension) + }); let retry_duration = match result { - Ok(response) => { - return response_to_embedding(response, data, expected_count, expected_dimension) - } + Ok(response) => return Ok(response), Err(retry) => { tracing::warn!("Failed: {}", retry.error); retry.into_duration(attempt) @@ -283,6 +283,7 @@ where let result = check_response(response, data.configuration_source); result.map_err(Retry::into_error).and_then(|response| { response_to_embedding(response, data, expected_count, expected_dimension) + .map_err(Retry::into_error) }) } @@ -324,20 +325,28 @@ fn response_to_embedding( data: &EmbedderData, expected_count: usize, expected_dimensions: Option, -) -> Result, EmbedError> { - let response: serde_json::Value = - response.into_json().map_err(EmbedError::rest_response_deserialization)?; +) -> Result, Retry> { + let response: serde_json::Value = response + .into_json() + .map_err(EmbedError::rest_response_deserialization) + .map_err(Retry::retry_later)?; - let embeddings = data.response.extract_embeddings(response)?; + let embeddings = data.response.extract_embeddings(response).map_err(Retry::give_up)?; if embeddings.len() != expected_count { - return Err(EmbedError::rest_response_embedding_count(expected_count, embeddings.len())); + return Err(Retry::give_up(EmbedError::rest_response_embedding_count( + expected_count, + embeddings.len(), + ))); } if let Some(dimensions) = expected_dimensions { for embedding in &embeddings { if embedding.len() != dimensions { - return Err(EmbedError::rest_unexpected_dimension(dimensions, embedding.len())); + return Err(Retry::give_up(EmbedError::rest_unexpected_dimension( + dimensions, + embedding.len(), + ))); } } } From a05e448cf86b1883b63a3ed840388179bef11aeb Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 6 Nov 2024 09:25:41 +0100 Subject: [PATCH 237/247] Add test --- crates/meilisearch/tests/vector/openai.rs | 141 ++++++++++++++++++++-- 1 file changed, 130 insertions(+), 11 deletions(-) diff --git a/crates/meilisearch/tests/vector/openai.rs b/crates/meilisearch/tests/vector/openai.rs index 04c068c40..94291ebea 100644 --- a/crates/meilisearch/tests/vector/openai.rs +++ b/crates/meilisearch/tests/vector/openai.rs @@ -137,13 +137,14 @@ fn long_text() -> &'static str { } async fn create_mock_tokenized() -> (MockServer, Value) { - create_mock_with_template("{{doc.text}}", ModelDimensions::Large, false).await + create_mock_with_template("{{doc.text}}", ModelDimensions::Large, false, false).await } async fn create_mock_with_template( document_template: 
&str, model_dimensions: ModelDimensions, fallible: bool, + slow: bool, ) -> (MockServer, Value) { let mock_server = MockServer::start().await; const API_KEY: &str = "my-api-key"; @@ -154,7 +155,11 @@ async fn create_mock_with_template( Mock::given(method("POST")) .and(path("/")) .respond_with(move |req: &Request| { - // 0. maybe return 500 + // 0. wait for a long time + if slow { + std::thread::sleep(std::time::Duration::from_secs(1)); + } + // 1. maybe return 500 if fallible { let attempt = attempt.fetch_add(1, Ordering::Relaxed); let failed = matches!(attempt % 4, 0 | 1 | 3); @@ -167,7 +172,7 @@ async fn create_mock_with_template( })) } } - // 1. check API key + // 3. check API key match req.headers.get("Authorization") { Some(api_key) if api_key == API_KEY_BEARER => { {} @@ -202,7 +207,7 @@ async fn create_mock_with_template( ) } } - // 2. parse text inputs + // 3. parse text inputs let query: serde_json::Value = match req.body_json() { Ok(query) => query, Err(_error) => return ResponseTemplate::new(400).set_body_json( @@ -223,7 +228,7 @@ async fn create_mock_with_template( panic!("Expected {model_dimensions:?}, got {query_model_dimensions:?}") } - // 3. for each text, find embedding in responses + // 4. for each text, find embedding in responses let serde_json::Value::Array(inputs) = &query["input"] else { panic!("Unexpected `input` value") }; @@ -283,7 +288,7 @@ async fn create_mock_with_template( "embedding": embedding, })).collect(); - // 4. produce output from embeddings + // 5. produce output from embeddings ResponseTemplate::new(200).set_body_json(json!({ "object": "list", "data": data, @@ -317,23 +322,27 @@ const DOGGO_TEMPLATE: &str = r#"{%- if doc.gender == "F" -%}Une chienne nommée {%- endif %}, de race {{doc.breed}}."#; async fn create_mock() -> (MockServer, Value) { - create_mock_with_template(DOGGO_TEMPLATE, ModelDimensions::Large, false).await + create_mock_with_template(DOGGO_TEMPLATE, ModelDimensions::Large, false, false).await } async fn create_mock_dimensions() -> (MockServer, Value) { - create_mock_with_template(DOGGO_TEMPLATE, ModelDimensions::Large512, false).await + create_mock_with_template(DOGGO_TEMPLATE, ModelDimensions::Large512, false, false).await } async fn create_mock_small_embedding_model() -> (MockServer, Value) { - create_mock_with_template(DOGGO_TEMPLATE, ModelDimensions::Small, false).await + create_mock_with_template(DOGGO_TEMPLATE, ModelDimensions::Small, false, false).await } async fn create_mock_legacy_embedding_model() -> (MockServer, Value) { - create_mock_with_template(DOGGO_TEMPLATE, ModelDimensions::Ada, false).await + create_mock_with_template(DOGGO_TEMPLATE, ModelDimensions::Ada, false, false).await } async fn create_fallible_mock() -> (MockServer, Value) { - create_mock_with_template(DOGGO_TEMPLATE, ModelDimensions::Large, true).await + create_mock_with_template(DOGGO_TEMPLATE, ModelDimensions::Large, true, false).await +} + +async fn create_slow_mock() -> (MockServer, Value) { + create_mock_with_template(DOGGO_TEMPLATE, ModelDimensions::Large, true, true).await } // basic test "it works" @@ -1873,4 +1882,114 @@ async fn it_still_works() { ] "###); } + +// test with a server that responds 500 on 3 out of 4 calls +#[actix_rt::test] +async fn timeout() { + let (_mock, setting) = create_slow_mock().await; + let server = get_server_vector().await; + let index = server.index("doggo"); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "default": setting, + }, + })) + .await; + snapshot!(code, @"202 Accepted"); + let 
task = server.wait_task(response.uid()).await; + snapshot!(task["status"], @r###""succeeded""###); + let documents = json!([ + {"id": 0, "name": "kefir", "gender": "M", "birthyear": 2023, "breed": "Patou"}, + ]); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + let task = index.wait_task(value.uid()).await; + snapshot!(task, @r###" + { + "uid": "[uid]", + "indexUid": "doggo", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + let (documents, _code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(json_string!(documents, {".results.*._vectors.default.embeddings" => "[vector]"}), @r###" + { + "results": [ + { + "id": 0, + "name": "kefir", + "gender": "M", + "birthyear": 2023, + "breed": "Patou", + "_vectors": { + "default": { + "embeddings": "[vector]", + "regenerate": true + } + } + } + ], + "offset": 0, + "limit": 20, + "total": 1 + } + "###); + + let (response, code) = index + .search_post(json!({ + "q": "grand chien de berger des montagnes", + "hybrid": {"semanticRatio": 0.99, "embedder": "default"} + })) + .await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["semanticHitCount"]), @"0"); + snapshot!(json_string!(response["hits"]), @"[]"); + + let (response, code) = index + .search_post(json!({ + "q": "grand chien de berger des montagnes", + "hybrid": {"semanticRatio": 0.99, "embedder": "default"} + })) + .await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["semanticHitCount"]), @"1"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 0, + "name": "kefir", + "gender": "M", + "birthyear": 2023, + "breed": "Patou" + } + ] + "###); + + let (response, code) = index + .search_post(json!({ + "q": "grand chien de berger des montagnes", + "hybrid": {"semanticRatio": 0.99, "embedder": "default"} + })) + .await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["semanticHitCount"]), @"0"); + snapshot!(json_string!(response["hits"]), @"[]"); +} + // test with a server that wrongly responds 400 From e9d17136b29d7e6145d6cd8c11dcda2bb3431aad Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 6 Nov 2024 09:24:51 +0100 Subject: [PATCH 238/247] Add deadline of 3 seconds to embedding requests made in the context of hybrid search --- crates/index-scheduler/src/lib.rs | 7 +++-- crates/meilisearch/src/search/mod.rs | 4 ++- crates/milli/src/search/hybrid.rs | 4 ++- crates/milli/src/vector/mod.rs | 21 +++++++++---- crates/milli/src/vector/ollama.rs | 9 ++++-- crates/milli/src/vector/openai.rs | 21 ++++++++----- crates/milli/src/vector/rest.rs | 45 +++++++++++++++++++++------- 7 files changed, 80 insertions(+), 31 deletions(-) diff --git a/crates/index-scheduler/src/lib.rs b/crates/index-scheduler/src/lib.rs index df8870470..16ad3f194 100644 --- a/crates/index-scheduler/src/lib.rs +++ b/crates/index-scheduler/src/lib.rs @@ -5214,9 +5214,10 @@ mod tests { let configs = index_scheduler.embedders(configs).unwrap(); let (hf_embedder, _, _) = configs.get(&simple_hf_name).unwrap(); - let beagle_embed = hf_embedder.embed_one(S("Intel the beagle best doggo")).unwrap(); - let lab_embed = hf_embedder.embed_one(S("Max the lab best doggo")).unwrap(); - let patou_embed = 
hf_embedder.embed_one(S("kefir the patou best doggo")).unwrap(); + let beagle_embed = + hf_embedder.embed_one(S("Intel the beagle best doggo"), None).unwrap(); + let lab_embed = hf_embedder.embed_one(S("Max the lab best doggo"), None).unwrap(); + let patou_embed = hf_embedder.embed_one(S("kefir the patou best doggo"), None).unwrap(); (fakerest_name, simple_hf_name, beagle_embed, lab_embed, patou_embed) }; diff --git a/crates/meilisearch/src/search/mod.rs b/crates/meilisearch/src/search/mod.rs index c873ab387..ec36b01bb 100644 --- a/crates/meilisearch/src/search/mod.rs +++ b/crates/meilisearch/src/search/mod.rs @@ -796,8 +796,10 @@ fn prepare_search<'t>( let span = tracing::trace_span!(target: "search::vector", "embed_one"); let _entered = span.enter(); + let deadline = std::time::Instant::now() + std::time::Duration::from_secs(10); + embedder - .embed_one(query.q.clone().unwrap()) + .embed_one(query.q.clone().unwrap(), Some(deadline)) .map_err(milli::vector::Error::from) .map_err(milli::Error::from)? } diff --git a/crates/milli/src/search/hybrid.rs b/crates/milli/src/search/hybrid.rs index 8b274804c..5187b572b 100644 --- a/crates/milli/src/search/hybrid.rs +++ b/crates/milli/src/search/hybrid.rs @@ -201,7 +201,9 @@ impl<'a> Search<'a> { let span = tracing::trace_span!(target: "search::hybrid", "embed_one"); let _entered = span.enter(); - match embedder.embed_one(query) { + let deadline = std::time::Instant::now() + std::time::Duration::from_secs(3); + + match embedder.embed_one(query, Some(deadline)) { Ok(embedding) => embedding, Err(error) => { tracing::error!(error=%error, "Embedding failed"); diff --git a/crates/milli/src/vector/mod.rs b/crates/milli/src/vector/mod.rs index 24ea77541..3047e6dfc 100644 --- a/crates/milli/src/vector/mod.rs +++ b/crates/milli/src/vector/mod.rs @@ -1,5 +1,6 @@ use std::collections::HashMap; use std::sync::Arc; +use std::time::Instant; use arroy::distances::{BinaryQuantizedCosine, Cosine}; use arroy::ItemId; @@ -595,18 +596,26 @@ impl Embedder { /// Embed one or multiple texts. /// /// Each text can be embedded as one or multiple embeddings. 
- pub fn embed(&self, texts: Vec) -> std::result::Result, EmbedError> { + pub fn embed( + &self, + texts: Vec, + deadline: Option, + ) -> std::result::Result, EmbedError> { match self { Embedder::HuggingFace(embedder) => embedder.embed(texts), - Embedder::OpenAi(embedder) => embedder.embed(&texts), - Embedder::Ollama(embedder) => embedder.embed(&texts), + Embedder::OpenAi(embedder) => embedder.embed(&texts, deadline), + Embedder::Ollama(embedder) => embedder.embed(&texts, deadline), Embedder::UserProvided(embedder) => embedder.embed(&texts), - Embedder::Rest(embedder) => embedder.embed(texts), + Embedder::Rest(embedder) => embedder.embed(texts, deadline), } } - pub fn embed_one(&self, text: String) -> std::result::Result { - let mut embedding = self.embed(vec![text])?; + pub fn embed_one( + &self, + text: String, + deadline: Option, + ) -> std::result::Result { + let mut embedding = self.embed(vec![text], deadline)?; let embedding = embedding.pop().ok_or_else(EmbedError::missing_embedding)?; Ok(embedding) } diff --git a/crates/milli/src/vector/ollama.rs b/crates/milli/src/vector/ollama.rs index 263d9d3c9..7ee775cbf 100644 --- a/crates/milli/src/vector/ollama.rs +++ b/crates/milli/src/vector/ollama.rs @@ -1,3 +1,5 @@ +use std::time::Instant; + use rayon::iter::{IntoParallelIterator as _, ParallelIterator as _}; use rayon::slice::ParallelSlice as _; @@ -80,8 +82,9 @@ impl Embedder { pub fn embed + serde::Serialize>( &self, texts: &[S], + deadline: Option, ) -> Result, EmbedError> { - match self.rest_embedder.embed_ref(texts) { + match self.rest_embedder.embed_ref(texts, deadline) { Ok(embeddings) => Ok(embeddings), Err(EmbedError { kind: EmbedErrorKind::RestOtherStatusCode(404, error), fault: _ }) => { Err(EmbedError::ollama_model_not_found(error)) @@ -97,7 +100,7 @@ impl Embedder { ) -> Result>, EmbedError> { threads .install(move || { - text_chunks.into_par_iter().map(move |chunk| self.embed(&chunk)).collect() + text_chunks.into_par_iter().map(move |chunk| self.embed(&chunk, None)).collect() }) .map_err(|error| EmbedError { kind: EmbedErrorKind::PanicInThreadPool(error), @@ -114,7 +117,7 @@ impl Embedder { .install(move || { let embeddings: Result>, _> = texts .par_chunks(self.prompt_count_in_chunk_hint()) - .map(move |chunk| self.embed(chunk)) + .map(move |chunk| self.embed(chunk, None)) .collect(); let embeddings = embeddings?; diff --git a/crates/milli/src/vector/openai.rs b/crates/milli/src/vector/openai.rs index 375b2878a..7262bfef8 100644 --- a/crates/milli/src/vector/openai.rs +++ b/crates/milli/src/vector/openai.rs @@ -1,3 +1,5 @@ +use std::time::Instant; + use ordered_float::OrderedFloat; use rayon::iter::{IntoParallelIterator, ParallelIterator as _}; use rayon::slice::ParallelSlice as _; @@ -211,18 +213,23 @@ impl Embedder { pub fn embed + serde::Serialize>( &self, texts: &[S], + deadline: Option, ) -> Result, EmbedError> { - match self.rest_embedder.embed_ref(texts) { + match self.rest_embedder.embed_ref(texts, deadline) { Ok(embeddings) => Ok(embeddings), Err(EmbedError { kind: EmbedErrorKind::RestBadRequest(error, _), fault: _ }) => { tracing::warn!(error=?error, "OpenAI: received `BAD_REQUEST`. Input was maybe too long, retrying on tokenized version. 
For best performance, limit the size of your document template."); - self.try_embed_tokenized(texts) + self.try_embed_tokenized(texts, deadline) } Err(error) => Err(error), } } - fn try_embed_tokenized>(&self, text: &[S]) -> Result, EmbedError> { + fn try_embed_tokenized>( + &self, + text: &[S], + deadline: Option, + ) -> Result, EmbedError> { let mut all_embeddings = Vec::with_capacity(text.len()); for text in text { let text = text.as_ref(); @@ -230,13 +237,13 @@ impl Embedder { let encoded = self.tokenizer.encode_ordinary(text); let len = encoded.len(); if len < max_token_count { - all_embeddings.append(&mut self.rest_embedder.embed_ref(&[text])?); + all_embeddings.append(&mut self.rest_embedder.embed_ref(&[text], deadline)?); continue; } let tokens = &encoded.as_slice()[0..max_token_count]; - let embedding = self.rest_embedder.embed_tokens(tokens)?; + let embedding = self.rest_embedder.embed_tokens(tokens, deadline)?; all_embeddings.push(embedding); } @@ -250,7 +257,7 @@ impl Embedder { ) -> Result>, EmbedError> { threads .install(move || { - text_chunks.into_par_iter().map(move |chunk| self.embed(&chunk)).collect() + text_chunks.into_par_iter().map(move |chunk| self.embed(&chunk, None)).collect() }) .map_err(|error| EmbedError { kind: EmbedErrorKind::PanicInThreadPool(error), @@ -267,7 +274,7 @@ impl Embedder { .install(move || { let embeddings: Result>, _> = texts .par_chunks(self.prompt_count_in_chunk_hint()) - .map(move |chunk| self.embed(chunk)) + .map(move |chunk| self.embed(chunk, None)) .collect(); let embeddings = embeddings?; diff --git a/crates/milli/src/vector/rest.rs b/crates/milli/src/vector/rest.rs index 81ca6598d..98be311d4 100644 --- a/crates/milli/src/vector/rest.rs +++ b/crates/milli/src/vector/rest.rs @@ -1,4 +1,5 @@ use std::collections::BTreeMap; +use std::time::Instant; use deserr::Deserr; use rand::Rng; @@ -153,19 +154,31 @@ impl Embedder { Ok(Self { data, dimensions, distribution: options.distribution }) } - pub fn embed(&self, texts: Vec) -> Result, EmbedError> { - embed(&self.data, texts.as_slice(), texts.len(), Some(self.dimensions)) + pub fn embed( + &self, + texts: Vec, + deadline: Option, + ) -> Result, EmbedError> { + embed(&self.data, texts.as_slice(), texts.len(), Some(self.dimensions), deadline) } - pub fn embed_ref(&self, texts: &[S]) -> Result, EmbedError> + pub fn embed_ref( + &self, + texts: &[S], + deadline: Option, + ) -> Result, EmbedError> where S: AsRef + Serialize, { - embed(&self.data, texts, texts.len(), Some(self.dimensions)) + embed(&self.data, texts, texts.len(), Some(self.dimensions), deadline) } - pub fn embed_tokens(&self, tokens: &[usize]) -> Result { - let mut embeddings = embed(&self.data, tokens, 1, Some(self.dimensions))?; + pub fn embed_tokens( + &self, + tokens: &[usize], + deadline: Option, + ) -> Result { + let mut embeddings = embed(&self.data, tokens, 1, Some(self.dimensions), deadline)?; // unwrap: guaranteed that embeddings.len() == 1, otherwise the previous line terminated in error Ok(embeddings.pop().unwrap()) } @@ -177,7 +190,7 @@ impl Embedder { ) -> Result>, EmbedError> { threads .install(move || { - text_chunks.into_par_iter().map(move |chunk| self.embed(chunk)).collect() + text_chunks.into_par_iter().map(move |chunk| self.embed(chunk, None)).collect() }) .map_err(|error| EmbedError { kind: EmbedErrorKind::PanicInThreadPool(error), @@ -194,7 +207,7 @@ impl Embedder { .install(move || { let embeddings: Result>, _> = texts .par_chunks(self.prompt_count_in_chunk_hint()) - .map(move |chunk| self.embed_ref(chunk)) + 
.map(move |chunk| self.embed_ref(chunk, None)) .collect(); let embeddings = embeddings?; @@ -227,7 +240,7 @@ impl Embedder { } fn infer_dimensions(data: &EmbedderData) -> Result { - let v = embed(data, ["test"].as_slice(), 1, None) + let v = embed(data, ["test"].as_slice(), 1, None, None) .map_err(NewEmbedderError::could_not_determine_dimension)?; // unwrap: guaranteed that v.len() == 1, otherwise the previous line terminated in error Ok(v.first().unwrap().len()) @@ -238,6 +251,7 @@ fn embed( inputs: &[S], expected_count: usize, expected_dimension: Option, + deadline: Option, ) -> Result, EmbedError> where S: Serialize, @@ -265,7 +279,18 @@ where Ok(response) => return Ok(response), Err(retry) => { tracing::warn!("Failed: {}", retry.error); - retry.into_duration(attempt) + if let Some(deadline) = deadline { + let now = std::time::Instant::now(); + if now > deadline { + tracing::warn!("Could not embed due to deadline"); + return Err(retry.into_error()); + } + + let duration_to_deadline = deadline - now; + retry.into_duration(attempt).map(|duration| duration.min(duration_to_deadline)) + } else { + retry.into_duration(attempt) + } } }?; From e736a74729c3e7353b2af5d40e839a6a7fe61236 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 18 Nov 2024 12:41:51 +0100 Subject: [PATCH 239/247] Remove infinite loop in `import_vectors` --- crates/milli/src/update/new/document.rs | 30 +++++++++++++------------ 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/crates/milli/src/update/new/document.rs b/crates/milli/src/update/new/document.rs index ddf508ad7..ae9aa9de9 100644 --- a/crates/milli/src/update/new/document.rs +++ b/crates/milli/src/update/new/document.rs @@ -70,28 +70,30 @@ impl<'t, Mapper: FieldIdMapper> Document<'t> for DocumentFromDb<'t, Mapper> { fn iter_top_level_fields(&self) -> impl Iterator> { let mut it = self.content.iter(); - std::iter::from_fn(move || { + std::iter::from_fn(move || loop { let (fid, value) = it.next()?; + let name = match self.fields_ids_map.name(fid).ok_or( + InternalError::FieldIdMapMissingEntry(crate::FieldIdMapMissingEntry::FieldId { + field_id: fid, + process: "getting current document", + }), + ) { + Ok(name) => name, + Err(error) => return Some(Err(error.into())), + }; - let res = (|| loop { - let name = self.fields_ids_map.name(fid).ok_or( - InternalError::FieldIdMapMissingEntry(crate::FieldIdMapMissingEntry::FieldId { - field_id: fid, - process: "getting current document", - }), - )?; - - if name == RESERVED_VECTORS_FIELD_NAME || name == "_geo" { - continue; - } + if name == RESERVED_VECTORS_FIELD_NAME || name == "_geo" { + continue; + } + let res = (|| { let value = serde_json::from_slice(value).map_err(crate::InternalError::SerdeJson)?; - return Ok((name, value)); + Ok((name, value)) })(); - Some(res) + return Some(res); }) } From 1f8b01a59818af24c60f803c129e2cfa394f2c4f Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 18 Nov 2024 12:42:03 +0100 Subject: [PATCH 240/247] Fix snap since `_vectors` is no longer part of the field distributions --- .../lib.rs/import_vectors/Intel to kefir succeeds.snap | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/Intel to kefir succeeds.snap b/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/Intel to kefir succeeds.snap index 41cfcfdab..fed7be6e9 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/Intel to kefir succeeds.snap +++ 
b/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/Intel to kefir succeeds.snap @@ -1,5 +1,5 @@ --- -source: index-scheduler/src/lib.rs +source: crates/index-scheduler/src/lib.rs --- ### Autobatching Enabled = true ### Processing Tasks: @@ -22,7 +22,7 @@ succeeded [0,1,2,] doggos [0,1,2,] ---------------------------------------------------------------------- ### Index Mapper: -doggos: { number_of_documents: 1, field_distribution: {"_vectors": 1, "breed": 1, "doggo": 1, "id": 1} } +doggos: { number_of_documents: 1, field_distribution: {"breed": 1, "doggo": 1, "id": 1} } ---------------------------------------------------------------------- ### Canceled By: From 0a21d9bfb3f4f0bb83982f12ce1349f86d2ff501 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 18 Nov 2024 15:56:01 +0100 Subject: [PATCH 241/247] Fix double borrow of new fields id map --- crates/milli/src/update/new/extract/documents.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/crates/milli/src/update/new/extract/documents.rs b/crates/milli/src/update/new/extract/documents.rs index a324d2914..c0a2e3d6a 100644 --- a/crates/milli/src/update/new/extract/documents.rs +++ b/crates/milli/src/update/new/extract/documents.rs @@ -43,10 +43,12 @@ impl<'a, 'extractor> Extractor<'extractor> for DocumentsExtractor<'a> { let mut document_buffer = bumpalo::collections::Vec::new_in(&context.doc_alloc); let mut document_extractor_data = context.data.0.borrow_mut_or_yield(); - let mut new_fields_ids_map = context.new_fields_ids_map.borrow_mut_or_yield(); - for change in changes { let change = change?; + // **WARNING**: the exclusive borrow on `new_fields_ids_map` needs to be taken **inside** of the `for change in changes` loop + // Otherwise, `BorrowMutError` will occur for document changes that also need the new_fields_ids_map (e.g.: UpdateByFunction) + let mut new_fields_ids_map = context.new_fields_ids_map.borrow_mut_or_yield(); + let external_docid = change.external_docid().to_owned(); // document but we need to create a function that collects and compresses documents. From 5f93651cefceccec3fb26d757c141493f2892674 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 18 Nov 2024 16:23:11 +0100 Subject: [PATCH 242/247] fixes --- crates/index-scheduler/src/batch.rs | 20 ++++++++-------- crates/index-scheduler/src/lib.rs | 2 ++ .../meilisearch-types/src/document_formats.rs | 5 ++-- .../new/extract/faceted/extract_facets.rs | 2 ++ .../update/new/indexer/document_operation.rs | 1 + crates/milli/src/update/new/ref_cell_ext.rs | 23 +------------------ 6 files changed, 20 insertions(+), 33 deletions(-) diff --git a/crates/index-scheduler/src/batch.rs b/crates/index-scheduler/src/batch.rs index 1ad25b422..d94c8d9a5 100644 --- a/crates/index-scheduler/src/batch.rs +++ b/crates/index-scheduler/src/batch.rs @@ -1395,15 +1395,17 @@ impl IndexScheduler { Ok(tasks) } IndexOperation::DocumentEdition { mut task, .. } => { - let (filter, context, code) = - if let KindWithContent::DocumentEdition { - filter_expr, context, function, .. - } = &task.kind - { - (filter_expr, context, function) - } else { - unreachable!() - }; + let (filter, code) = if let KindWithContent::DocumentEdition { + filter_expr, + context: _, + function, + .. 
+ } = &task.kind + { + (filter_expr, function) + } else { + unreachable!() + }; let candidates = match filter.as_ref().map(Filter::from_json) { Some(Ok(Some(filter))) => { diff --git a/crates/index-scheduler/src/lib.rs b/crates/index-scheduler/src/lib.rs index 16ad3f194..5508eabab 100644 --- a/crates/index-scheduler/src/lib.rs +++ b/crates/index-scheduler/src/lib.rs @@ -971,6 +971,8 @@ impl IndexScheduler { let ProcessingTasks { started_at, processing, progress, .. } = self.processing_tasks.read().map_err(|_| Error::CorruptedTaskQueue)?.clone(); + let _ = progress; + let ret = tasks.into_iter(); if processing.is_empty() { Ok((ret.collect(), total)) diff --git a/crates/meilisearch-types/src/document_formats.rs b/crates/meilisearch-types/src/document_formats.rs index db893f880..311fcccf4 100644 --- a/crates/meilisearch-types/src/document_formats.rs +++ b/crates/meilisearch-types/src/document_formats.rs @@ -220,11 +220,12 @@ pub fn read_json(input: &File, output: impl io::Write) -> Result { let mut out = BufWriter::new(output); let mut deserializer = serde_json::Deserializer::from_slice(&input); - let count = match array_each(&mut deserializer, |obj: &RawValue| { + let res = array_each(&mut deserializer, |obj: &RawValue| { doc_alloc.reset(); let map = RawMap::from_raw_value(obj, &doc_alloc)?; to_writer(&mut out, &map) - }) { + }); + let count = match res { // The json data has been deserialized and does not need to be processed again. // The data has been transferred to the writer during the deserialization process. Ok(Ok(count)) => count, diff --git a/crates/milli/src/update/new/extract/faceted/extract_facets.rs b/crates/milli/src/update/new/extract/faceted/extract_facets.rs index d30a50c52..14b1b1bdd 100644 --- a/crates/milli/src/update/new/extract/faceted/extract_facets.rs +++ b/crates/milli/src/update/new/extract/faceted/extract_facets.rs @@ -156,6 +156,7 @@ impl FacetedDocidsExtractor { res } + #[allow(clippy::too_many_arguments)] fn facet_fn_with_options<'extractor, 'doc>( doc_alloc: &'doc Bump, cached_sorter: &mut BalancedCaches<'extractor>, @@ -336,6 +337,7 @@ fn truncate_str(s: &str) -> &str { } impl FacetedDocidsExtractor { + #[allow(clippy::too_many_arguments)] #[tracing::instrument(level = "trace", skip_all, target = "indexing::extract::faceted")] pub fn run_extraction< 'pl, diff --git a/crates/milli/src/update/new/indexer/document_operation.rs b/crates/milli/src/update/new/indexer/document_operation.rs index 634a7f207..604dd1786 100644 --- a/crates/milli/src/update/new/indexer/document_operation.rs +++ b/crates/milli/src/update/new/indexer/document_operation.rs @@ -106,6 +106,7 @@ impl<'pl> DocumentOperation<'pl> { } } +#[allow(clippy::too_many_arguments)] fn extract_addition_payload_changes<'r, 'pl: 'r>( indexer: &'pl Bump, index: &Index, diff --git a/crates/milli/src/update/new/ref_cell_ext.rs b/crates/milli/src/update/new/ref_cell_ext.rs index b147c00e5..c66f4af0a 100644 --- a/crates/milli/src/update/new/ref_cell_ext.rs +++ b/crates/milli/src/update/new/ref_cell_ext.rs @@ -1,37 +1,16 @@ -use std::cell::{Ref, RefCell, RefMut}; +use std::cell::{RefCell, RefMut}; pub trait RefCellExt { - fn try_borrow_or_yield(&self) -> std::result::Result, std::cell::BorrowError>; fn try_borrow_mut_or_yield( &self, ) -> std::result::Result, std::cell::BorrowMutError>; - fn borrow_or_yield(&self) -> Ref<'_, T> { - self.try_borrow_or_yield().unwrap() - } - fn borrow_mut_or_yield(&self) -> RefMut<'_, T> { self.try_borrow_mut_or_yield().unwrap() } } impl RefCellExt for RefCell { - fn 
try_borrow_or_yield(&self) -> std::result::Result, std::cell::BorrowError> { - /// TODO: move this trait and impl elsewhere - loop { - match self.try_borrow() { - Ok(borrow) => break Ok(borrow), - Err(error) => { - tracing::warn!("dynamic borrow failed, yielding to local tasks"); - match rayon::yield_local() { - Some(rayon::Yield::Executed) => continue, - _ => return Err(error), - } - } - } - } - } - fn try_borrow_mut_or_yield( &self, ) -> std::result::Result, std::cell::BorrowMutError> { From 04c38220cac3ea13557fa18d30aaf6a6dd47d52a Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 18 Nov 2024 16:43:05 +0100 Subject: [PATCH 243/247] Move MostlySend, ThreadLocal, FullySend to their own commit --- crates/milli/src/update/new/extract/cache.rs | 2 +- .../milli/src/update/new/extract/documents.rs | 3 +- .../new/extract/faceted/extract_facets.rs | 4 +- .../milli/src/update/new/extract/geo/mod.rs | 3 +- crates/milli/src/update/new/extract/mod.rs | 5 +- .../extract/searchable/extract_word_docids.rs | 4 +- .../src/update/new/extract/searchable/mod.rs | 4 +- .../src/update/new/extract/vectors/mod.rs | 3 +- .../update/new/indexer/document_changes.rs | 175 +----------------- .../update/new/indexer/document_deletion.rs | 6 +- .../update/new/indexer/document_operation.rs | 3 +- crates/milli/src/update/new/indexer/mod.rs | 3 +- .../src/update/new/indexer/partial_dump.rs | 3 +- .../update/new/indexer/update_by_function.rs | 3 +- crates/milli/src/update/new/mod.rs | 1 + crates/milli/src/update/new/thread_local.rs | 174 +++++++++++++++++ 16 files changed, 203 insertions(+), 193 deletions(-) create mode 100644 crates/milli/src/update/new/thread_local.rs diff --git a/crates/milli/src/update/new/extract/cache.rs b/crates/milli/src/update/new/extract/cache.rs index dd43feefb..9c864372d 100644 --- a/crates/milli/src/update/new/extract/cache.rs +++ b/crates/milli/src/update/new/extract/cache.rs @@ -79,7 +79,7 @@ use roaring::RoaringBitmap; use rustc_hash::FxBuildHasher; use crate::update::del_add::{DelAdd, KvWriterDelAdd}; -use crate::update::new::indexer::document_changes::MostlySend; +use crate::update::new::thread_local::MostlySend; use crate::update::new::KvReaderDelAdd; use crate::update::MergeDeladdCboRoaringBitmaps; use crate::{CboRoaringBitmapCodec, Result}; diff --git a/crates/milli/src/update/new/extract/documents.rs b/crates/milli/src/update/new/extract/documents.rs index c0a2e3d6a..23d93a2c2 100644 --- a/crates/milli/src/update/new/extract/documents.rs +++ b/crates/milli/src/update/new/extract/documents.rs @@ -6,8 +6,9 @@ use hashbrown::HashMap; use super::DelAddRoaringBitmap; use crate::update::new::channel::DocumentsSender; use crate::update::new::document::{write_to_obkv, Document as _}; -use crate::update::new::indexer::document_changes::{DocumentChangeContext, Extractor, FullySend}; +use crate::update::new::indexer::document_changes::{DocumentChangeContext, Extractor}; use crate::update::new::ref_cell_ext::RefCellExt as _; +use crate::update::new::thread_local::FullySend; use crate::update::new::DocumentChange; use crate::vector::EmbeddingConfigs; use crate::Result; diff --git a/crates/milli/src/update/new/extract/faceted/extract_facets.rs b/crates/milli/src/update/new/extract/faceted/extract_facets.rs index 14b1b1bdd..acf211d63 100644 --- a/crates/milli/src/update/new/extract/faceted/extract_facets.rs +++ b/crates/milli/src/update/new/extract/faceted/extract_facets.rs @@ -15,10 +15,10 @@ use crate::heed_codec::facet::OrderedF64Codec; use crate::update::del_add::DelAdd; use 
crate::update::new::channel::FieldIdDocidFacetSender; use crate::update::new::indexer::document_changes::{ - extract, DocumentChangeContext, DocumentChanges, Extractor, FullySend, IndexingContext, - Progress, ThreadLocal, + extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext, Progress, }; use crate::update::new::ref_cell_ext::RefCellExt as _; +use crate::update::new::thread_local::{FullySend, ThreadLocal}; use crate::update::new::DocumentChange; use crate::update::GrenadParameters; use crate::{DocumentId, FieldId, Index, Result, MAX_FACET_VALUE_LENGTH}; diff --git a/crates/milli/src/update/new/extract/geo/mod.rs b/crates/milli/src/update/new/extract/geo/mod.rs index e883a04cc..c3ea76c42 100644 --- a/crates/milli/src/update/new/extract/geo/mod.rs +++ b/crates/milli/src/update/new/extract/geo/mod.rs @@ -11,8 +11,9 @@ use serde_json::Value; use crate::error::GeoError; use crate::update::new::document::Document; -use crate::update::new::indexer::document_changes::{DocumentChangeContext, Extractor, MostlySend}; +use crate::update::new::indexer::document_changes::{DocumentChangeContext, Extractor}; use crate::update::new::ref_cell_ext::RefCellExt as _; +use crate::update::new::thread_local::MostlySend; use crate::update::new::DocumentChange; use crate::update::GrenadParameters; use crate::{lat_lng_to_xyz, DocumentId, GeoPoint, Index, InternalError, Result}; diff --git a/crates/milli/src/update/new/extract/mod.rs b/crates/milli/src/update/new/extract/mod.rs index 14cfa83cb..3b2bd77ce 100644 --- a/crates/milli/src/update/new/extract/mod.rs +++ b/crates/milli/src/update/new/extract/mod.rs @@ -13,9 +13,8 @@ pub use geo::*; pub use searchable::*; pub use vectors::EmbeddingExtractor; -use super::indexer::document_changes::{ - DocumentChanges, FullySend, IndexingContext, Progress, ThreadLocal, -}; +use super::indexer::document_changes::{DocumentChanges, IndexingContext, Progress}; +use super::thread_local::{FullySend, ThreadLocal}; use crate::update::GrenadParameters; use crate::Result; diff --git a/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs b/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs index dfb55853f..9822570d0 100644 --- a/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs +++ b/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs @@ -11,10 +11,10 @@ use super::tokenize_document::{tokenizer_builder, DocumentTokenizer}; use crate::update::new::extract::cache::BalancedCaches; use crate::update::new::extract::perm_json_p::contained_in; use crate::update::new::indexer::document_changes::{ - extract, DocumentChangeContext, DocumentChanges, Extractor, FullySend, IndexingContext, - MostlySend, Progress, ThreadLocal, + extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext, Progress, }; use crate::update::new::ref_cell_ext::RefCellExt as _; +use crate::update::new::thread_local::{FullySend, MostlySend, ThreadLocal}; use crate::update::new::DocumentChange; use crate::update::GrenadParameters; use crate::{bucketed_position, DocumentId, FieldId, Index, Result, MAX_POSITION_PER_ATTRIBUTE}; diff --git a/crates/milli/src/update/new/extract/searchable/mod.rs b/crates/milli/src/update/new/extract/searchable/mod.rs index 46a05be4e..2a9078d6e 100644 --- a/crates/milli/src/update/new/extract/searchable/mod.rs +++ b/crates/milli/src/update/new/extract/searchable/mod.rs @@ -14,9 +14,9 @@ use tokenize_document::{tokenizer_builder, DocumentTokenizer}; use super::cache::BalancedCaches; use 
super::DocidsExtractor; use crate::update::new::indexer::document_changes::{ - extract, DocumentChangeContext, DocumentChanges, Extractor, FullySend, IndexingContext, - Progress, ThreadLocal, + extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext, Progress, }; +use crate::update::new::thread_local::{FullySend, ThreadLocal}; use crate::update::new::DocumentChange; use crate::update::GrenadParameters; use crate::{Index, Result, MAX_POSITION_PER_ATTRIBUTE}; diff --git a/crates/milli/src/update/new/extract/vectors/mod.rs b/crates/milli/src/update/new/extract/vectors/mod.rs index 2fb717c71..8ac73a8d7 100644 --- a/crates/milli/src/update/new/extract/vectors/mod.rs +++ b/crates/milli/src/update/new/extract/vectors/mod.rs @@ -8,7 +8,8 @@ use super::cache::DelAddRoaringBitmap; use crate::error::FaultSource; use crate::prompt::Prompt; use crate::update::new::channel::EmbeddingSender; -use crate::update::new::indexer::document_changes::{DocumentChangeContext, Extractor, MostlySend}; +use crate::update::new::indexer::document_changes::{DocumentChangeContext, Extractor}; +use crate::update::new::thread_local::MostlySend; use crate::update::new::vector_document::VectorDocument; use crate::update::new::DocumentChange; use crate::vector::error::{ diff --git a/crates/milli/src/update/new/indexer/document_changes.rs b/crates/milli/src/update/new/indexer/document_changes.rs index e4b088f31..308582002 100644 --- a/crates/milli/src/update/new/indexer/document_changes.rs +++ b/crates/milli/src/update/new/indexer/document_changes.rs @@ -8,182 +8,9 @@ use rayon::iter::IndexedParallelIterator; use super::super::document_change::DocumentChange; use crate::fields_ids_map::metadata::FieldIdMapWithMetadata; use crate::update::new::parallel_iterator_ext::ParallelIteratorExt as _; +use crate::update::new::thread_local::{FullySend, MostlySend, ThreadLocal}; use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, Result}; -/// A trait for types that are **not** [`Send`] only because they would then allow concurrent access to a type that is not [`Sync`]. -/// -/// The primary example of such a type is `&T`, with `T: !Sync`. -/// -/// In the authors' understanding, a type can be `!Send` for two distinct reasons: -/// -/// 1. Because it contains data that *genuinely* cannot be moved between threads, such as thread-local data. -/// 2. Because sending the type would allow concurrent access to a `!Sync` type, which is undefined behavior. -/// -/// `MostlySend` exists to be used in bounds where you need a type whose data is **not** *attached* to a thread -/// because you might access it from a different thread, but where you will never access the type **concurrently** from -/// multiple threads. -/// -/// Like [`Send`], `MostlySend` assumes properties on types that cannot be verified by the compiler, which is why implementing -/// this trait is unsafe. -/// -/// # Safety -/// -/// Implementers of this trait promises that the following properties hold on the implementing type: -/// -/// 1. Its data can be accessed from any thread and will be the same regardless of the thread accessing it. -/// 2. Any operation that can be performed on the type does not depend on the thread that executes it. -/// -/// As these properties are subtle and are not generally tracked by the Rust type system, great care should be taken before -/// implementing `MostlySend` on a type, especially a foreign type. -/// -/// - An example of a type that verifies (1) and (2) is [`std::rc::Rc`] (when `T` is `Send` and `Sync`). 
-/// - An example of a type that doesn't verify (1) is thread-local data. -/// - An example of a type that doesn't verify (2) is [`std::sync::MutexGuard`]: a lot of mutex implementations require that -/// a lock is returned to the operating system on the same thread that initially locked the mutex, failing to uphold this -/// invariant will cause Undefined Behavior -/// (see last § in [the nomicon](https://doc.rust-lang.org/nomicon/send-and-sync.html)). -/// -/// It is **always safe** to implement this trait on a type that is `Send`, but no placeholder impl is provided due to limitations in -/// coherency. Use the [`FullySend`] wrapper in this situation. -pub unsafe trait MostlySend {} - -#[derive(Debug, Clone, Copy, Default, Eq, PartialEq, Ord, PartialOrd, Hash)] -pub struct FullySend(pub T); - -// SAFETY: a type **fully** send is always mostly send as well. -unsafe impl MostlySend for FullySend where T: Send {} - -unsafe impl MostlySend for RefCell where T: MostlySend {} - -unsafe impl MostlySend for Option where T: MostlySend {} - -impl FullySend { - pub fn into(self) -> T { - self.0 - } -} - -impl From for FullySend { - fn from(value: T) -> Self { - Self(value) - } -} - -#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, PartialOrd, Ord, Hash)] -struct MostlySendWrapper(T); - -impl MostlySendWrapper { - /// # Safety - /// - /// - (P1) Users of this type will never access the type concurrently from multiple threads without synchronization - unsafe fn new(t: T) -> Self { - Self(t) - } - - fn as_ref(&self) -> &T { - &self.0 - } - - fn as_mut(&mut self) -> &mut T { - &mut self.0 - } - - fn into_inner(self) -> T { - self.0 - } -} - -/// # Safety -/// -/// 1. `T` is [`MostlySend`], so by its safety contract it can be accessed by any thread and all of its operations are available -/// from any thread. -/// 2. (P1) of `MostlySendWrapper::new` forces the user to never access the value from multiple threads concurrently. -unsafe impl Send for MostlySendWrapper {} - -/// A wrapper around [`thread_local::ThreadLocal`] that accepts [`MostlySend`] `T`s. 
-#[derive(Default)] -pub struct ThreadLocal { - inner: thread_local::ThreadLocal>, - // FIXME: this should be necessary - //_no_send: PhantomData<*mut ()>, -} - -impl ThreadLocal { - pub fn new() -> Self { - Self { inner: thread_local::ThreadLocal::new() } - } - - pub fn with_capacity(capacity: usize) -> Self { - Self { inner: thread_local::ThreadLocal::with_capacity(capacity) } - } - - pub fn clear(&mut self) { - self.inner.clear() - } - - pub fn get(&self) -> Option<&T> { - self.inner.get().map(|t| t.as_ref()) - } - - pub fn get_or(&self, create: F) -> &T - where - F: FnOnce() -> T, - { - /// TODO: move ThreadLocal, MostlySend, FullySend to a dedicated file - self.inner.get_or(|| unsafe { MostlySendWrapper::new(create()) }).as_ref() - } - - pub fn get_or_try(&self, create: F) -> std::result::Result<&T, E> - where - F: FnOnce() -> std::result::Result, - { - self.inner - .get_or_try(|| unsafe { Ok(MostlySendWrapper::new(create()?)) }) - .map(MostlySendWrapper::as_ref) - } - - pub fn get_or_default(&self) -> &T - where - T: Default, - { - self.inner.get_or_default().as_ref() - } - - pub fn iter_mut(&mut self) -> IterMut { - IterMut(self.inner.iter_mut()) - } -} - -impl IntoIterator for ThreadLocal { - type Item = T; - - type IntoIter = IntoIter; - - fn into_iter(self) -> Self::IntoIter { - IntoIter(self.inner.into_iter()) - } -} - -pub struct IterMut<'a, T: MostlySend>(thread_local::IterMut<'a, MostlySendWrapper>); - -impl<'a, T: MostlySend> Iterator for IterMut<'a, T> { - type Item = &'a mut T; - - fn next(&mut self) -> Option { - self.0.next().map(|t| t.as_mut()) - } -} - -pub struct IntoIter(thread_local::IntoIter>); - -impl Iterator for IntoIter { - type Item = T; - - fn next(&mut self) -> Option { - self.0.next().map(|t| t.into_inner()) - } -} - pub struct DocumentChangeContext< 'doc, // covariant lifetime of a single `process` call 'extractor: 'doc, // invariant lifetime of the extractor_allocs diff --git a/crates/milli/src/update/new/indexer/document_deletion.rs b/crates/milli/src/update/new/indexer/document_deletion.rs index 353995a59..2e46be63d 100644 --- a/crates/milli/src/update/new/indexer/document_deletion.rs +++ b/crates/milli/src/update/new/indexer/document_deletion.rs @@ -4,8 +4,9 @@ use rayon::iter::IndexedParallelIterator; use rayon::slice::ParallelSlice as _; use roaring::RoaringBitmap; -use super::document_changes::{DocumentChangeContext, DocumentChanges, MostlySend}; +use super::document_changes::{DocumentChangeContext, DocumentChanges}; use crate::documents::PrimaryKey; +use crate::update::new::thread_local::MostlySend; use crate::update::new::{Deletion, DocumentChange}; use crate::{DocumentId, Result}; @@ -92,9 +93,10 @@ mod test { use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder}; use crate::index::tests::TempIndex; use crate::update::new::indexer::document_changes::{ - extract, DocumentChangeContext, Extractor, IndexingContext, MostlySend, ThreadLocal, + extract, DocumentChangeContext, Extractor, IndexingContext, }; use crate::update::new::indexer::DocumentDeletion; + use crate::update::new::thread_local::{MostlySend, ThreadLocal}; use crate::update::new::DocumentChange; use crate::DocumentId; diff --git a/crates/milli/src/update/new/indexer/document_operation.rs b/crates/milli/src/update/new/indexer/document_operation.rs index 604dd1786..71d410ea6 100644 --- a/crates/milli/src/update/new/indexer/document_operation.rs +++ b/crates/milli/src/update/new/indexer/document_operation.rs @@ -9,10 +9,11 @@ use serde_json::value::RawValue; use 
serde_json::Deserializer; use super::super::document_change::DocumentChange; -use super::document_changes::{DocumentChangeContext, DocumentChanges, MostlySend}; +use super::document_changes::{DocumentChangeContext, DocumentChanges}; use super::retrieve_or_guess_primary_key; use crate::documents::PrimaryKey; use crate::update::new::document::Versions; +use crate::update::new::thread_local::MostlySend; use crate::update::new::{Deletion, Insertion, Update}; use crate::update::{AvailableIds, IndexDocumentsMethod}; use crate::{DocumentId, Error, FieldsIdsMap, Index, InternalError, Result, UserError}; diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 71fcdd204..0511dd6a1 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -3,7 +3,7 @@ use std::sync::{OnceLock, RwLock}; use std::thread::{self, Builder}; use big_s::S; -use document_changes::{extract, DocumentChanges, IndexingContext, Progress, ThreadLocal}; +use document_changes::{extract, DocumentChanges, IndexingContext, Progress}; pub use document_deletion::DocumentDeletion; pub use document_operation::{DocumentOperation, PayloadStats}; use hashbrown::HashMap; @@ -20,6 +20,7 @@ use super::channel::*; use super::extract::*; use super::facet_search_builder::FacetSearchBuilder; use super::merger::FacetFieldIdsDelta; +use super::thread_local::ThreadLocal; use super::word_fst_builder::{PrefixData, PrefixDelta, WordFstBuilder}; use super::words_prefix_docids::{ compute_word_prefix_docids, compute_word_prefix_fid_docids, compute_word_prefix_position_docids, diff --git a/crates/milli/src/update/new/indexer/partial_dump.rs b/crates/milli/src/update/new/indexer/partial_dump.rs index 2da047824..8b5a8b650 100644 --- a/crates/milli/src/update/new/indexer/partial_dump.rs +++ b/crates/milli/src/update/new/indexer/partial_dump.rs @@ -3,11 +3,12 @@ use std::ops::DerefMut; use rayon::iter::IndexedParallelIterator; use serde_json::value::RawValue; -use super::document_changes::{DocumentChangeContext, DocumentChanges, MostlySend}; +use super::document_changes::{DocumentChangeContext, DocumentChanges}; use crate::documents::PrimaryKey; use crate::update::concurrent_available_ids::ConcurrentAvailableIds; use crate::update::new::document::Versions; use crate::update::new::ref_cell_ext::RefCellExt as _; +use crate::update::new::thread_local::MostlySend; use crate::update::new::{DocumentChange, Insertion}; use crate::{Error, InternalError, Result, UserError}; diff --git a/crates/milli/src/update/new/indexer/update_by_function.rs b/crates/milli/src/update/new/indexer/update_by_function.rs index f6df3981d..a8e3e38a8 100644 --- a/crates/milli/src/update/new/indexer/update_by_function.rs +++ b/crates/milli/src/update/new/indexer/update_by_function.rs @@ -4,13 +4,14 @@ use rayon::slice::ParallelSlice as _; use rhai::{Dynamic, Engine, OptimizationLevel, Scope, AST}; use roaring::RoaringBitmap; -use super::document_changes::{DocumentChangeContext, MostlySend}; +use super::document_changes::DocumentChangeContext; use super::DocumentChanges; use crate::documents::Error::InvalidDocumentFormat; use crate::documents::PrimaryKey; use crate::error::{FieldIdMapMissingEntry, InternalError}; use crate::update::new::document::Versions; use crate::update::new::ref_cell_ext::RefCellExt as _; +use crate::update::new::thread_local::MostlySend; use crate::update::new::{Deletion, DocumentChange, KvReaderFieldId, Update}; use crate::{all_obkv_to_json, Error, FieldsIdsMap, Object, 
Result, UserError}; diff --git a/crates/milli/src/update/new/mod.rs b/crates/milli/src/update/new/mod.rs index 7a749228e..edbbdf497 100644 --- a/crates/milli/src/update/new/mod.rs +++ b/crates/milli/src/update/new/mod.rs @@ -17,6 +17,7 @@ pub mod indexer; mod merger; mod parallel_iterator_ext; mod ref_cell_ext; +pub(crate) mod thread_local; mod top_level_map; pub mod vector_document; mod word_fst_builder; diff --git a/crates/milli/src/update/new/thread_local.rs b/crates/milli/src/update/new/thread_local.rs new file mode 100644 index 000000000..acdc78c7b --- /dev/null +++ b/crates/milli/src/update/new/thread_local.rs @@ -0,0 +1,174 @@ +use std::cell::RefCell; + +/// A trait for types that are **not** [`Send`] only because they would then allow concurrent access to a type that is not [`Sync`]. +/// +/// The primary example of such a type is `&T`, with `T: !Sync`. +/// +/// In the authors' understanding, a type can be `!Send` for two distinct reasons: +/// +/// 1. Because it contains data that *genuinely* cannot be moved between threads, such as thread-local data. +/// 2. Because sending the type would allow concurrent access to a `!Sync` type, which is undefined behavior. +/// +/// `MostlySend` exists to be used in bounds where you need a type whose data is **not** *attached* to a thread +/// because you might access it from a different thread, but where you will never access the type **concurrently** from +/// multiple threads. +/// +/// Like [`Send`], `MostlySend` assumes properties on types that cannot be verified by the compiler, which is why implementing +/// this trait is unsafe. +/// +/// # Safety +/// +/// Implementers of this trait promises that the following properties hold on the implementing type: +/// +/// 1. Its data can be accessed from any thread and will be the same regardless of the thread accessing it. +/// 2. Any operation that can be performed on the type does not depend on the thread that executes it. +/// +/// As these properties are subtle and are not generally tracked by the Rust type system, great care should be taken before +/// implementing `MostlySend` on a type, especially a foreign type. +/// +/// - An example of a type that verifies (1) and (2) is [`std::rc::Rc`] (when `T` is `Send` and `Sync`). +/// - An example of a type that doesn't verify (1) is thread-local data. +/// - An example of a type that doesn't verify (2) is [`std::sync::MutexGuard`]: a lot of mutex implementations require that +/// a lock is returned to the operating system on the same thread that initially locked the mutex, failing to uphold this +/// invariant will cause Undefined Behavior +/// (see last § in [the nomicon](https://doc.rust-lang.org/nomicon/send-and-sync.html)). +/// +/// It is **always safe** to implement this trait on a type that is `Send`, but no placeholder impl is provided due to limitations in +/// coherency. Use the [`FullySend`] wrapper in this situation. +pub unsafe trait MostlySend {} + +#[derive(Debug, Clone, Copy, Default, Eq, PartialEq, Ord, PartialOrd, Hash)] +pub struct FullySend(pub T); + +// SAFETY: a type **fully** send is always mostly send as well. 
+unsafe impl MostlySend for FullySend where T: Send {} + +unsafe impl MostlySend for RefCell where T: MostlySend {} + +unsafe impl MostlySend for Option where T: MostlySend {} + +impl FullySend { + pub fn into(self) -> T { + self.0 + } +} + +impl From for FullySend { + fn from(value: T) -> Self { + Self(value) + } +} + +#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, PartialOrd, Ord, Hash)] +struct MostlySendWrapper(T); + +impl MostlySendWrapper { + /// # Safety + /// + /// - (P1) Users of this type will never access the type concurrently from multiple threads without synchronization + unsafe fn new(t: T) -> Self { + Self(t) + } + + fn as_ref(&self) -> &T { + &self.0 + } + + fn as_mut(&mut self) -> &mut T { + &mut self.0 + } + + fn into_inner(self) -> T { + self.0 + } +} + +/// # Safety +/// +/// 1. `T` is [`MostlySend`], so by its safety contract it can be accessed by any thread and all of its operations are available +/// from any thread. +/// 2. (P1) of `MostlySendWrapper::new` forces the user to never access the value from multiple threads concurrently. +unsafe impl Send for MostlySendWrapper {} + +/// A wrapper around [`thread_local::ThreadLocal`] that accepts [`MostlySend`] `T`s. +#[derive(Default)] +pub struct ThreadLocal { + inner: thread_local::ThreadLocal>, + // FIXME: this should be necessary + //_no_send: PhantomData<*mut ()>, +} + +impl ThreadLocal { + pub fn new() -> Self { + Self { inner: thread_local::ThreadLocal::new() } + } + + pub fn with_capacity(capacity: usize) -> Self { + Self { inner: thread_local::ThreadLocal::with_capacity(capacity) } + } + + pub fn clear(&mut self) { + self.inner.clear() + } + + pub fn get(&self) -> Option<&T> { + self.inner.get().map(|t| t.as_ref()) + } + + pub fn get_or(&self, create: F) -> &T + where + F: FnOnce() -> T, + { + self.inner.get_or(|| unsafe { MostlySendWrapper::new(create()) }).as_ref() + } + + pub fn get_or_try(&self, create: F) -> std::result::Result<&T, E> + where + F: FnOnce() -> std::result::Result, + { + self.inner + .get_or_try(|| unsafe { Ok(MostlySendWrapper::new(create()?)) }) + .map(MostlySendWrapper::as_ref) + } + + pub fn get_or_default(&self) -> &T + where + T: Default, + { + self.inner.get_or_default().as_ref() + } + + pub fn iter_mut(&mut self) -> IterMut { + IterMut(self.inner.iter_mut()) + } +} + +impl IntoIterator for ThreadLocal { + type Item = T; + + type IntoIter = IntoIter; + + fn into_iter(self) -> Self::IntoIter { + IntoIter(self.inner.into_iter()) + } +} + +pub struct IterMut<'a, T: MostlySend>(thread_local::IterMut<'a, MostlySendWrapper>); + +impl<'a, T: MostlySend> Iterator for IterMut<'a, T> { + type Item = &'a mut T; + + fn next(&mut self) -> Option { + self.0.next().map(|t| t.as_mut()) + } +} + +pub struct IntoIter(thread_local::IntoIter>); + +impl Iterator for IntoIter { + type Item = T; + + fn next(&mut self) -> Option { + self.0.next().map(|t| t.into_inner()) + } +} From 75943a5a9b35608d1a3d1eaa43ecfd443a678271 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 18 Nov 2024 17:40:51 +0100 Subject: [PATCH 244/247] Add TODO to remember replacing steps with an enum --- crates/milli/src/update/new/indexer/mod.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 0511dd6a1..8998780fb 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -68,6 +68,7 @@ mod steps { ]; const fn step(step: u16) -> (u16, &'static str) { + /// TODO: convert to an 
enum_iterator enum of steps (step, STEPS[step as usize]) } From c782c09208f3d250ff237ab3537a10e52ad52945 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 18 Nov 2024 18:22:13 +0100 Subject: [PATCH 245/247] Move step to a dedicated mod and replace it with an enum --- Cargo.lock | 1 + crates/milli/Cargo.toml | 1 + .../new/extract/faceted/extract_facets.rs | 10 +- crates/milli/src/update/new/extract/mod.rs | 5 +- .../extract/searchable/extract_word_docids.rs | 9 +- .../src/update/new/extract/searchable/mod.rs | 17 +- .../src/update/new/facet_search_builder.rs | 7 +- .../update/new/indexer/document_changes.rs | 45 +++-- .../update/new/indexer/document_deletion.rs | 5 +- crates/milli/src/update/new/indexer/mod.rs | 154 ++---------------- crates/milli/src/update/new/mod.rs | 1 + crates/milli/src/update/new/steps.rs | 45 +++++ 12 files changed, 111 insertions(+), 189 deletions(-) create mode 100644 crates/milli/src/update/new/steps.rs diff --git a/Cargo.lock b/Cargo.lock index fd01352a9..e78372421 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3619,6 +3619,7 @@ dependencies = [ "csv", "deserr", "either", + "enum-iterator", "filter-parser", "flatten-serde-json", "fst", diff --git a/crates/milli/Cargo.toml b/crates/milli/Cargo.toml index 622292e8a..07e18ef4d 100644 --- a/crates/milli/Cargo.toml +++ b/crates/milli/Cargo.toml @@ -101,6 +101,7 @@ thread_local = "1.1.8" allocator-api2 = "0.2.18" rustc-hash = "2.0.0" uell = "0.1.0" +enum-iterator = "2.1.0" [dev-dependencies] mimalloc = { version = "0.1.43", default-features = false } diff --git a/crates/milli/src/update/new/extract/faceted/extract_facets.rs b/crates/milli/src/update/new/extract/faceted/extract_facets.rs index acf211d63..5394a6e86 100644 --- a/crates/milli/src/update/new/extract/faceted/extract_facets.rs +++ b/crates/milli/src/update/new/extract/faceted/extract_facets.rs @@ -18,6 +18,7 @@ use crate::update::new::indexer::document_changes::{ extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext, Progress, }; use crate::update::new::ref_cell_ext::RefCellExt as _; +use crate::update::new::steps::Step; use crate::update::new::thread_local::{FullySend, ThreadLocal}; use crate::update::new::DocumentChange; use crate::update::GrenadParameters; @@ -337,7 +338,6 @@ fn truncate_str(s: &str) -> &str { } impl FacetedDocidsExtractor { - #[allow(clippy::too_many_arguments)] #[tracing::instrument(level = "trace", skip_all, target = "indexing::extract::faceted")] pub fn run_extraction< 'pl, @@ -354,9 +354,7 @@ impl FacetedDocidsExtractor { indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>, extractor_allocs: &'extractor mut ThreadLocal>, sender: &FieldIdDocidFacetSender, - finished_steps: u16, - total_steps: u16, - step_name: &'static str, + step: Step, ) -> Result>> where MSP: Fn() -> bool + Sync, @@ -386,9 +384,7 @@ impl FacetedDocidsExtractor { indexing_context, extractor_allocs, &datastore, - finished_steps, - total_steps, - step_name, + step, )?; } diff --git a/crates/milli/src/update/new/extract/mod.rs b/crates/milli/src/update/new/extract/mod.rs index 3b2bd77ce..7364434ee 100644 --- a/crates/milli/src/update/new/extract/mod.rs +++ b/crates/milli/src/update/new/extract/mod.rs @@ -14,6 +14,7 @@ pub use searchable::*; pub use vectors::EmbeddingExtractor; use super::indexer::document_changes::{DocumentChanges, IndexingContext, Progress}; +use super::steps::Step; use super::thread_local::{FullySend, ThreadLocal}; use crate::update::GrenadParameters; use crate::Result; @@ -24,9 +25,7 @@ pub trait 
DocidsExtractor { document_changes: &DC, indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>, extractor_allocs: &'extractor mut ThreadLocal>, - finished_steps: u16, - total_steps: u16, - step_name: &'static str, + step: Step, ) -> Result>> where MSP: Fn() -> bool + Sync, diff --git a/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs b/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs index 9822570d0..f3d4afcb8 100644 --- a/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs +++ b/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs @@ -14,6 +14,7 @@ use crate::update::new::indexer::document_changes::{ extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext, Progress, }; use crate::update::new::ref_cell_ext::RefCellExt as _; +use crate::update::new::steps::Step; use crate::update::new::thread_local::{FullySend, MostlySend, ThreadLocal}; use crate::update::new::DocumentChange; use crate::update::GrenadParameters; @@ -249,9 +250,7 @@ impl WordDocidsExtractors { document_changes: &DC, indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>, extractor_allocs: &'extractor mut ThreadLocal>, - finished_steps: u16, - total_steps: u16, - step_name: &'static str, + step: Step, ) -> Result> where MSP: Fn() -> bool + Sync, @@ -306,9 +305,7 @@ impl WordDocidsExtractors { indexing_context, extractor_allocs, &datastore, - finished_steps, - total_steps, - step_name, + step, )?; } diff --git a/crates/milli/src/update/new/extract/searchable/mod.rs b/crates/milli/src/update/new/extract/searchable/mod.rs index 2a9078d6e..b61dfcf92 100644 --- a/crates/milli/src/update/new/extract/searchable/mod.rs +++ b/crates/milli/src/update/new/extract/searchable/mod.rs @@ -16,6 +16,7 @@ use super::DocidsExtractor; use crate::update::new::indexer::document_changes::{ extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext, Progress, }; +use crate::update::new::steps::Step; use crate::update::new::thread_local::{FullySend, ThreadLocal}; use crate::update::new::DocumentChange; use crate::update::GrenadParameters; @@ -60,9 +61,7 @@ pub trait SearchableExtractor: Sized + Sync { document_changes: &DC, indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>, extractor_allocs: &'extractor mut ThreadLocal>, - finished_steps: u16, - total_steps: u16, - step_name: &'static str, + step: Step, ) -> Result>> where MSP: Fn() -> bool + Sync, @@ -115,9 +114,7 @@ pub trait SearchableExtractor: Sized + Sync { indexing_context, extractor_allocs, &datastore, - finished_steps, - total_steps, - step_name, + step, )?; } @@ -142,9 +139,7 @@ impl DocidsExtractor for T { document_changes: &DC, indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>, extractor_allocs: &'extractor mut ThreadLocal>, - finished_steps: u16, - total_steps: u16, - step_name: &'static str, + step: Step, ) -> Result>> where MSP: Fn() -> bool + Sync, @@ -155,9 +150,7 @@ impl DocidsExtractor for T { document_changes, indexing_context, extractor_allocs, - finished_steps, - total_steps, - step_name, + step, ) } } diff --git a/crates/milli/src/update/new/facet_search_builder.rs b/crates/milli/src/update/new/facet_search_builder.rs index 0c924bff4..39e04a589 100644 --- a/crates/milli/src/update/new/facet_search_builder.rs +++ b/crates/milli/src/update/new/facet_search_builder.rs @@ -1,3 +1,4 @@ +use std::collections::hash_map::Entry; use std::collections::{BTreeSet, HashMap}; use charabia::normalizer::NormalizerOption; @@ 
-83,9 +84,9 @@ impl<'indexer> FacetSearchBuilder<'indexer> { } fn locales(&mut self, field_id: FieldId) -> Option<&[Language]> { - if !self.localized_field_ids.contains_key(&field_id) { + if let Entry::Vacant(e) = self.localized_field_ids.entry(field_id) { let Some(field_name) = self.global_fields_ids_map.name(field_id) else { - unreachable!("Field id {} not found in the global fields ids map", field_id); + unreachable!("Field id {field_id} not found in the global fields ids map"); }; let locales = self @@ -94,7 +95,7 @@ impl<'indexer> FacetSearchBuilder<'indexer> { .find(|rule| rule.match_str(field_name)) .map(|rule| rule.locales.clone()); - self.localized_field_ids.insert(field_id, locales); + e.insert(locales); } self.localized_field_ids.get(&field_id).unwrap().as_deref() diff --git a/crates/milli/src/update/new/indexer/document_changes.rs b/crates/milli/src/update/new/indexer/document_changes.rs index 308582002..4efebc586 100644 --- a/crates/milli/src/update/new/indexer/document_changes.rs +++ b/crates/milli/src/update/new/indexer/document_changes.rs @@ -8,6 +8,7 @@ use rayon::iter::IndexedParallelIterator; use super::super::document_change::DocumentChange; use crate::fields_ids_map::metadata::FieldIdMapWithMetadata; use crate::update::new::parallel_iterator_ext::ParallelIteratorExt as _; +use crate::update::new::steps::Step; use crate::update::new::thread_local::{FullySend, MostlySend, ThreadLocal}; use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, Result}; @@ -191,7 +192,6 @@ where const CHUNK_SIZE: usize = 100; -#[allow(clippy::too_many_arguments)] pub fn extract< 'pl, // covariant lifetime of the underlying payload 'extractor, // invariant lifetime of extractor_alloc @@ -217,9 +217,7 @@ pub fn extract< }: IndexingContext<'fid, 'indexer, 'index, MSP, SP>, extractor_allocs: &'extractor mut ThreadLocal>, datastore: &'data ThreadLocal, - finished_steps: u16, - total_steps: u16, - step_name: &'static str, + step: Step, ) -> Result<()> where EX: Extractor<'extractor>, @@ -233,7 +231,7 @@ where extractor_alloc.0.reset(); } - let total_documents = document_changes.len(); + let total_documents = document_changes.len() as u32; let pi = document_changes.iter(CHUNK_SIZE); pi.enumerate().try_arc_for_each_try_init( @@ -253,14 +251,13 @@ where if (must_stop_processing)() { return Err(Arc::new(InternalError::AbortedIndexation.into())); } - let finished_documents = finished_documents * CHUNK_SIZE; + let finished_documents = (finished_documents * CHUNK_SIZE) as u32; - (send_progress)(Progress { - finished_steps, - total_steps, - step_name, - finished_total_documents: Some((finished_documents as u32, total_documents as u32)), - }); + (send_progress)(Progress::from_step_documents( + step, + finished_documents, + total_documents, + )); // Clean up and reuse the document-specific allocator context.doc_alloc.reset(); @@ -279,12 +276,7 @@ where }, )?; - (send_progress)(Progress { - finished_steps, - total_steps, - step_name, - finished_total_documents: Some((total_documents as u32, total_documents as u32)), - }); + (send_progress)(Progress::from_step_documents(step, total_documents, total_documents)); Ok(()) } @@ -295,3 +287,20 @@ pub struct Progress { pub step_name: &'static str, pub finished_total_documents: Option<(u32, u32)>, } + +impl Progress { + pub fn from_step(step: Step) -> Self { + Self { + finished_steps: step.finished_steps(), + total_steps: Step::total_steps(), + step_name: step.name(), + finished_total_documents: None, + } + } + pub fn from_step_documents(step: Step, 
finished_documents: u32, total_documents: u32) -> Self { + Self { + finished_total_documents: Some((finished_documents, total_documents)), + ..Progress::from_step(step) + } + } +} diff --git a/crates/milli/src/update/new/indexer/document_deletion.rs b/crates/milli/src/update/new/indexer/document_deletion.rs index 2e46be63d..fe3f08583 100644 --- a/crates/milli/src/update/new/indexer/document_deletion.rs +++ b/crates/milli/src/update/new/indexer/document_deletion.rs @@ -96,6 +96,7 @@ mod test { extract, DocumentChangeContext, Extractor, IndexingContext, }; use crate::update::new::indexer::DocumentDeletion; + use crate::update::new::steps::Step; use crate::update::new::thread_local::{MostlySend, ThreadLocal}; use crate::update::new::DocumentChange; use crate::DocumentId; @@ -175,9 +176,7 @@ mod test { context, &mut extractor_allocs, &datastore, - 0, - 1, - "test", + Step::ExtractingDocuments, ) .unwrap(); diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 8998780fb..dfc3d9b02 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -20,6 +20,7 @@ use super::channel::*; use super::extract::*; use super::facet_search_builder::FacetSearchBuilder; use super::merger::FacetFieldIdsDelta; +use super::steps::Step; use super::thread_local::ThreadLocal; use super::word_fst_builder::{PrefixData, PrefixDelta, WordFstBuilder}; use super::words_prefix_docids::{ @@ -51,80 +52,6 @@ mod document_operation; mod partial_dump; mod update_by_function; -mod steps { - pub const STEPS: &[&str] = &[ - "extracting documents", - "extracting facets", - "extracting words", - "extracting word proximity", - "extracting embeddings", - "writing geo points", - "writing to database", - "writing embeddings to database", - "waiting for extractors", - "post-processing facets", - "post-processing words", - "finalizing", - ]; - - const fn step(step: u16) -> (u16, &'static str) { - /// TODO: convert to an enum_iterator enum of steps - (step, STEPS[step as usize]) - } - - pub const fn total_steps() -> u16 { - STEPS.len() as u16 - } - - pub const fn extract_documents() -> (u16, &'static str) { - step(0) - } - - pub const fn extract_facets() -> (u16, &'static str) { - step(1) - } - - pub const fn extract_words() -> (u16, &'static str) { - step(2) - } - - pub const fn extract_word_proximity() -> (u16, &'static str) { - step(3) - } - - pub const fn extract_embeddings() -> (u16, &'static str) { - step(4) - } - - pub const fn extract_geo_points() -> (u16, &'static str) { - step(5) - } - - pub const fn write_db() -> (u16, &'static str) { - step(6) - } - - pub const fn write_embedding_db() -> (u16, &'static str) { - step(7) - } - - pub const fn waiting_extractors() -> (u16, &'static str) { - step(8) - } - - pub const fn post_processing_facets() -> (u16, &'static str) { - step(9) - } - - pub const fn post_processing_words() -> (u16, &'static str) { - step(10) - } - - pub const fn finalizing() -> (u16, &'static str) { - step(11) - } -} - /// This is the main function of this crate. /// /// Give it the output of the [`Indexer::document_changes`] method and it will execute it in the [`rayon::ThreadPool`]. 
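As a rough illustration of how a send_progress callback might consume the Progress values built by the from_step and from_step_documents constructors introduced earlier in this patch: the sketch below is not part of the patch, it redeclares the struct locally so it stands alone, and the percentage formula is only an assumption, not something the patch defines.

// Not part of the patch: a standalone sketch of a send_progress callback.
// The Progress struct is redeclared here with the same four fields the
// patch exposes; the percentage math is an illustrative assumption.
struct Progress {
    finished_steps: u16,
    total_steps: u16,
    step_name: &'static str,
    finished_total_documents: Option<(u32, u32)>,
}

fn log_progress(p: Progress) {
    let pct = 100.0 * p.finished_steps as f32 / p.total_steps.max(1) as f32;
    match p.finished_total_documents {
        Some((done, total)) => println!("[{pct:>3.0}%] {}: {done}/{total} documents", p.step_name),
        None => println!("[{pct:>3.0}%] {}", p.step_name),
    }
}

fn main() {
    log_progress(Progress {
        finished_steps: 2,
        total_steps: 12,
        step_name: "extracting words",
        finished_total_documents: Some((100, 1_000)),
    });
}

With the sample values above this prints a line like "[ 17%] extracting words: 100/1000 documents", which is the kind of per-step, per-document reporting the Progress constructors are meant to feed.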
@@ -167,8 +94,6 @@ where send_progress, }; - let total_steps = steps::total_steps(); - let mut field_distribution = index.field_distribution(wtxn)?; let mut document_ids = index.documents_ids(wtxn)?; @@ -189,15 +114,13 @@ where let document_sender = extractor_sender.documents(); let document_extractor = DocumentsExtractor::new(&document_sender, embedders); let datastore = ThreadLocal::with_capacity(rayon::current_num_threads()); - let (finished_steps, step_name) = steps::extract_documents(); + extract(document_changes, &document_extractor, indexing_context, &mut extractor_allocs, &datastore, - finished_steps, - total_steps, - step_name, + Step::ExtractingDocuments, )?; for document_extractor_data in datastore { @@ -218,8 +141,6 @@ where let span = tracing::trace_span!(target: "indexing::documents::extract", "faceted"); let _entered = span.enter(); - let (finished_steps, step_name) = steps::extract_facets(); - facet_field_ids_delta = merge_and_send_facet_docids( FacetedDocidsExtractor::run_extraction( grenad_parameters, @@ -227,9 +148,7 @@ where indexing_context, &mut extractor_allocs, &extractor_sender.field_id_docid_facet_sender(), - finished_steps, - total_steps, - step_name, + Step::ExtractingFacets )?, FacetDatabases::new(index), index, @@ -240,7 +159,7 @@ where { let span = tracing::trace_span!(target: "indexing::documents::extract", "word_docids"); let _entered = span.enter(); - let (finished_steps, step_name) = steps::extract_words(); + let WordDocidsCaches { word_docids, @@ -253,9 +172,7 @@ where document_changes, indexing_context, &mut extractor_allocs, - finished_steps, - total_steps, - step_name, + Step::ExtractingWords )?; // TODO Word Docids Merger @@ -336,16 +253,13 @@ where let span = tracing::trace_span!(target: "indexing::documents::extract", "word_pair_proximity_docids"); let _entered = span.enter(); - let (finished_steps, step_name) = steps::extract_word_proximity(); let caches = ::run_extraction( grenad_parameters, document_changes, indexing_context, &mut extractor_allocs, - finished_steps, - total_steps, - step_name, + Step::ExtractingWordProximity, )?; merge_and_send_docids( @@ -369,8 +283,7 @@ where let embedding_sender = extractor_sender.embeddings(); let extractor = EmbeddingExtractor::new(embedders, &embedding_sender, field_distribution, request_threads()); let mut datastore = ThreadLocal::with_capacity(rayon::current_num_threads()); - let (finished_steps, step_name) = steps::extract_embeddings(); - extract(document_changes, &extractor, indexing_context, &mut extractor_allocs, &datastore, finished_steps, total_steps, step_name)?; + extract(document_changes, &extractor, indexing_context, &mut extractor_allocs, &datastore, Step::ExtractingEmbeddings)?; for config in &mut index_embeddings { 'data: for data in datastore.iter_mut() { @@ -392,16 +305,13 @@ where break 'geo; }; let datastore = ThreadLocal::with_capacity(rayon::current_num_threads()); - let (finished_steps, step_name) = steps::extract_geo_points(); extract( document_changes, &extractor, indexing_context, &mut extractor_allocs, &datastore, - finished_steps, - total_steps, - step_name, + Step::WritingGeoPoints )?; merge_and_send_rtree( @@ -431,8 +341,7 @@ where { let span = tracing::trace_span!(target: "indexing::documents::extract", "FINISH"); let _entered = span.enter(); - let (finished_steps, step_name) = steps::write_db(); - (indexing_context.send_progress)(Progress { finished_steps, total_steps, step_name, finished_total_documents: None }); + 
(indexing_context.send_progress)(Progress::from_step(Step::WritingToDatabase)); } Result::Ok(facet_field_ids_delta) @@ -513,13 +422,9 @@ where let span = tracing::trace_span!(target: "indexing::vectors", parent: &indexer_span, "build"); let _entered = span.enter(); - let (finished_steps, step_name) = steps::write_embedding_db(); - (indexing_context.send_progress)(Progress { - finished_steps, - total_steps, - step_name, - finished_total_documents: None, - }); + (indexing_context.send_progress)(Progress::from_step( + Step::WritingEmbeddingsToDatabase, + )); for (_embedder_index, (_embedder_name, _embedder, writer, dimensions)) in &mut arroy_writers @@ -540,46 +445,21 @@ where } } - let (finished_steps, step_name) = steps::waiting_extractors(); - (indexing_context.send_progress)(Progress { - finished_steps, - total_steps, - step_name, - finished_total_documents: None, - }); + (indexing_context.send_progress)(Progress::from_step(Step::WaitingForExtractors)); let facet_field_ids_delta = extractor_handle.join().unwrap()?; - let (finished_steps, step_name) = steps::post_processing_facets(); - (indexing_context.send_progress)(Progress { - finished_steps, - total_steps, - step_name, - finished_total_documents: None, - }); + (indexing_context.send_progress)(Progress::from_step(Step::PostProcessingFacets)); compute_facet_search_database(index, wtxn, global_fields_ids_map)?; compute_facet_level_database(index, wtxn, facet_field_ids_delta)?; - let (finished_steps, step_name) = steps::post_processing_words(); - (indexing_context.send_progress)(Progress { - finished_steps, - total_steps, - step_name, - finished_total_documents: None, - }); + (indexing_context.send_progress)(Progress::from_step(Step::PostProcessingWords)); if let Some(prefix_delta) = compute_word_fst(index, wtxn)? 
{ compute_prefix_database(index, wtxn, prefix_delta)?; } - - let (finished_steps, step_name) = steps::finalizing(); - (indexing_context.send_progress)(Progress { - finished_steps, - total_steps, - step_name, - finished_total_documents: None, - }); + (indexing_context.send_progress)(Progress::from_step(Step::Finalizing)); Ok(()) as Result<_> })?; diff --git a/crates/milli/src/update/new/mod.rs b/crates/milli/src/update/new/mod.rs index edbbdf497..140f4ccf0 100644 --- a/crates/milli/src/update/new/mod.rs +++ b/crates/milli/src/update/new/mod.rs @@ -17,6 +17,7 @@ pub mod indexer; mod merger; mod parallel_iterator_ext; mod ref_cell_ext; +pub(crate) mod steps; pub(crate) mod thread_local; mod top_level_map; pub mod vector_document; diff --git a/crates/milli/src/update/new/steps.rs b/crates/milli/src/update/new/steps.rs new file mode 100644 index 000000000..60a0c872b --- /dev/null +++ b/crates/milli/src/update/new/steps.rs @@ -0,0 +1,45 @@ +use enum_iterator::Sequence; + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Sequence)] +#[repr(u16)] +pub enum Step { + ExtractingDocuments, + ExtractingFacets, + ExtractingWords, + ExtractingWordProximity, + ExtractingEmbeddings, + WritingGeoPoints, + WritingToDatabase, + WritingEmbeddingsToDatabase, + WaitingForExtractors, + PostProcessingFacets, + PostProcessingWords, + Finalizing, +} + +impl Step { + pub fn name(&self) -> &'static str { + match self { + Step::ExtractingDocuments => "extracting documents", + Step::ExtractingFacets => "extracting facets", + Step::ExtractingWords => "extracting words", + Step::ExtractingWordProximity => "extracting word proximity", + Step::ExtractingEmbeddings => "extracting embeddings", + Step::WritingGeoPoints => "writing geo points", + Step::WritingToDatabase => "writing to database", + Step::WritingEmbeddingsToDatabase => "writing embeddings to database", + Step::WaitingForExtractors => "waiting for extractors", + Step::PostProcessingFacets => "post-processing facets", + Step::PostProcessingWords => "post-processing words", + Step::Finalizing => "finalizing", + } + } + + pub fn finished_steps(self) -> u16 { + self as u16 + } + + pub const fn total_steps() -> u16 { + Self::CARDINALITY as u16 + } +} From bfefaf71c24f3f3e7cba43acfd377035f3d35e37 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 19 Nov 2024 09:32:52 +0100 Subject: [PATCH 246/247] Progress displayed in logs --- crates/index-scheduler/src/batch.rs | 20 ++++++++++++++++++-- crates/index-scheduler/src/lib.rs | 4 ++-- crates/meilisearch-types/src/tasks.rs | 25 ++++++++++++++----------- 3 files changed, 34 insertions(+), 15 deletions(-) diff --git a/crates/index-scheduler/src/batch.rs b/crates/index-scheduler/src/batch.rs index d94c8d9a5..c06cb6b42 100644 --- a/crates/index-scheduler/src/batch.rs +++ b/crates/index-scheduler/src/batch.rs @@ -39,7 +39,9 @@ use meilisearch_types::milli::vector::parsed_vectors::{ }; use meilisearch_types::milli::{self, Filter, ThreadPoolNoAbortBuilder}; use meilisearch_types::settings::{apply_settings_to_builder, Settings, Unchecked}; -use meilisearch_types::tasks::{Details, IndexSwap, Kind, KindWithContent, Status, Task}; +use meilisearch_types::tasks::{ + Details, IndexSwap, Kind, KindWithContent, Status, Task, TaskProgress, +}; use meilisearch_types::{compression, Index, VERSION_FILE_NAME}; use roaring::RoaringBitmap; use time::macros::format_description; @@ -1240,7 +1242,21 @@ impl IndexScheduler { secs_since_started_processing_at .store((now - started_processing_at).as_secs(), atomic::Ordering::Relaxed); - 
processing_tasks.write().unwrap().update_progress(progress); + let TaskProgress { + current_step, + finished_steps, + total_steps, + finished_documents, + total_documents, + } = processing_tasks.write().unwrap().update_progress(progress); + + tracing::info!( + current_step, + finished_steps, + total_steps, + finished_documents, + total_documents + ); }; match operation { diff --git a/crates/index-scheduler/src/lib.rs b/crates/index-scheduler/src/lib.rs index 5508eabab..687c2aa8e 100644 --- a/crates/index-scheduler/src/lib.rs +++ b/crates/index-scheduler/src/lib.rs @@ -182,8 +182,8 @@ impl ProcessingTasks { self.processing = processing; } - fn update_progress(&mut self, progress: Progress) { - self.progress.get_or_insert_with(TaskProgress::default).update(progress); + fn update_progress(&mut self, progress: Progress) -> TaskProgress { + self.progress.get_or_insert_with(TaskProgress::default).update(progress) } /// Set the processing tasks to an empty list diff --git a/crates/meilisearch-types/src/tasks.rs b/crates/meilisearch-types/src/tasks.rs index 70e6ad294..7f4431da1 100644 --- a/crates/meilisearch-types/src/tasks.rs +++ b/crates/meilisearch-types/src/tasks.rs @@ -39,10 +39,10 @@ pub struct Task { pub kind: KindWithContent, } -#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)] #[serde(rename_all = "camelCase")] pub struct TaskProgress { - pub current_step: String, + pub current_step: &'static str, pub finished_steps: u16, pub total_steps: u16, pub finished_documents: Option, @@ -58,7 +58,7 @@ impl Default for TaskProgress { impl TaskProgress { pub fn new() -> Self { Self { - current_step: String::new(), + current_step: "start", finished_steps: 0, total_steps: 1, finished_documents: None, @@ -66,15 +66,17 @@ impl TaskProgress { } } - pub fn update(&mut self, progress: Progress) { - if self.current_step != progress.step_name { - self.current_step.clear(); - self.current_step.push_str(progress.step_name); - } - self.total_steps = progress.total_steps; + pub fn update(&mut self, progress: Progress) -> TaskProgress { if self.finished_steps > progress.finished_steps { - return; + return *self; } + + if self.current_step != progress.step_name { + self.current_step = progress.step_name + } + + self.total_steps = progress.total_steps; + if self.finished_steps < progress.finished_steps { self.finished_documents = None; self.total_documents = None; @@ -83,12 +85,13 @@ impl TaskProgress { if let Some((finished_documents, total_documents)) = progress.finished_total_documents { if let Some(task_finished_documents) = self.finished_documents { if task_finished_documents > finished_documents { - return; + return *self; } } self.finished_documents = Some(finished_documents); self.total_documents = Some(total_documents); } + *self } } From 41dbdd2d1835beb631c5cc318fa8707af4ef627a Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 19 Nov 2024 16:08:25 +0100 Subject: [PATCH 247/247] Fix filtered_placeholder_search_should_not_return_deleted_documents and word_scale_set_and_reset --- .../extract_word_pair_proximity_docids.rs | 21 ++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs b/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs index bbc6365df..945f0b8b3 100644 --- a/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs +++ 
b/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs @@ -149,6 +149,15 @@ fn word_positions_into_word_pair_proximity( } } +fn drain_word_positions( + word_positions: &mut VecDeque<(Rc, u16)>, + word_pair_proximity: &mut impl FnMut((Rc, Rc), u8), +) { + while !word_positions.is_empty() { + word_positions_into_word_pair_proximity(word_positions, word_pair_proximity); + } +} + fn process_document_tokens<'doc>( document: impl Document<'doc>, document_tokenizer: &DocumentTokenizer, @@ -156,7 +165,12 @@ fn process_document_tokens<'doc>( word_positions: &mut VecDeque<(Rc, u16)>, word_pair_proximity: &mut impl FnMut((Rc, Rc), u8), ) -> Result<()> { - let mut token_fn = |_fname: &str, _fid: FieldId, pos: u16, word: &str| { + let mut field_id = None; + let mut token_fn = |_fname: &str, fid: FieldId, pos: u16, word: &str| { + if field_id != Some(fid) { + field_id = Some(fid); + drain_word_positions(word_positions, word_pair_proximity); + } // drain the proximity window until the head word is considered close to the word we are inserting. while word_positions .front() @@ -171,9 +185,6 @@ fn process_document_tokens<'doc>( }; document_tokenizer.tokenize_document(document, fields_ids_map, &mut token_fn)?; - while !word_positions.is_empty() { - word_positions_into_word_pair_proximity(word_positions, word_pair_proximity); - } - + drain_word_positions(word_positions, word_pair_proximity); Ok(()) }
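To make the intent of this last patch concrete: the proximity window is now drained whenever the field id changes, so no word pair is ever emitted across a field boundary. Below is a minimal, self-contained sketch of that rule; the names, the MAX_DISTANCE value of 8, and the simplified pairing logic are assumptions made for illustration, not milli's actual implementation.

// Standalone sketch (not milli code) of per-field draining in a proximity window.
use std::collections::VecDeque;

const MAX_DISTANCE: u16 = 8; // assumption: a fixed proximity window size

type Pair = (String, String, u16);

// Pop the head of the window and pair it with every word still in the window.
fn pop_head_into_pairs(window: &mut VecDeque<(String, u16)>, out: &mut Vec<Pair>) {
    if let Some((head, head_pos)) = window.pop_front() {
        for (word, pos) in window.iter() {
            out.push((head.clone(), word.clone(), pos - head_pos));
        }
    }
}

fn drain(window: &mut VecDeque<(String, u16)>, out: &mut Vec<Pair>) {
    while !window.is_empty() {
        pop_head_into_pairs(window, out);
    }
}

// Tokens are (field id, position, word); positions are assumed increasing within a field.
fn word_pair_proximities(tokens: &[(u16, u16, &str)]) -> Vec<Pair> {
    let mut out = Vec::new();
    let mut window: VecDeque<(String, u16)> = VecDeque::new();
    let mut current_field = None;

    for &(fid, pos, word) in tokens {
        // The behaviour the patch adds: changing fields flushes the window,
        // so no pair ever crosses a field boundary.
        if current_field != Some(fid) {
            current_field = Some(fid);
            drain(&mut window, &mut out);
        }
        // Evict heads that are now too far from the incoming word.
        while window.front().is_some_and(|(_, p)| pos - p >= MAX_DISTANCE) {
            pop_head_into_pairs(&mut window, &mut out);
        }
        window.push_back((word.to_string(), pos));
    }
    drain(&mut window, &mut out);
    out
}

fn main() {
    // "blue" ends field 0, "cat" starts field 1: they must not form a pair.
    let tokens = [(0u16, 0u16, "blue"), (1, 0, "cat"), (1, 1, "sat")];
    let pairs = word_pair_proximities(&tokens);
    assert!(pairs.iter().all(|(a, b, _)| !(a == "blue" && b == "cat")));
    println!("{pairs:?}");
}

Here the final drain pairs "cat" with "sat" inside field 1, while "blue" never pairs with "cat" across the boundary, mirroring the effect of the drain_word_positions call that process_document_tokens now makes whenever the field id changes.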