diff --git a/milli/src/index.rs b/milli/src/index.rs index cba9b134f..2faf8d1f8 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -791,7 +791,7 @@ pub(crate) mod tests { use std::ops::Deref; use heed::EnvOpenOptions; - use maplit::hashmap; + use maplit::btreemap; use tempfile::TempDir; use crate::update::{IndexDocuments, UpdateFormat}; @@ -845,11 +845,54 @@ pub(crate) mod tests { let field_distribution = index.field_distribution(&rtxn).unwrap(); assert_eq!( field_distribution, - hashmap! { + btreemap! { "id".to_string() => 2, "name".to_string() => 2, "age".to_string() => 1, } ); + + // we add all the documents a second time. we are supposed to get the same + // field_distribution in the end + let mut wtxn = index.write_txn().unwrap(); + let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); + builder.update_format(UpdateFormat::Json); + builder.execute(content, |_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); + + let field_distribution = index.field_distribution(&rtxn).unwrap(); + assert_eq!( + field_distribution, + btreemap! { + "id".to_string() => 2, + "name".to_string() => 2, + "age".to_string() => 1, + } + ); + + // then we update a document by removing one field and another by adding one field + let content = &br#"[ + { "id": 1, "name": "kevin", "has_dog": true }, + { "id": 2, "name": "bob" } + ]"#[..]; + let mut wtxn = index.write_txn().unwrap(); + let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); + builder.update_format(UpdateFormat::Json); + builder.execute(content, |_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); + + let field_distribution = index.field_distribution(&rtxn).unwrap(); + assert_eq!( + field_distribution, + btreemap! { + "id".to_string() => 2, + "name".to_string() => 2, + "has_dog".to_string() => 1, + } + ); } } diff --git a/milli/src/lib.rs b/milli/src/lib.rs index a92e87e05..e88ac62d5 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -14,7 +14,7 @@ pub mod tree_level; pub mod update; use std::borrow::Cow; -use std::collections::HashMap; +use std::collections::{BTreeMap, HashMap}; use std::hash::BuildHasherDefault; use std::result::Result as StdResult; @@ -50,7 +50,7 @@ pub type Attribute = u32; pub type DocumentId = u32; pub type FieldId = u8; pub type Position = u32; -pub type FieldsDistribution = HashMap; +pub type FieldsDistribution = BTreeMap; type MergeFn = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> StdResult, E>; diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 4276de672..e291eb106 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -1,4 +1,4 @@ -use std::collections::hash_map::Entry; +use std::collections::btree_map::Entry; use std::collections::HashMap; use chrono::Utc; diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 0ff068ebb..074d281ba 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -1,4 +1,5 @@ use std::borrow::Cow; +use std::collections::btree_map::Entry; use std::fs::File; use std::io::{Read, Seek, SeekFrom}; use std::iter::Peekable; @@ -419,18 +420,32 @@ impl Transform<'_, '_> { // we use it and insert it in the list of replaced documents. replaced_documents_ids.insert(docid); + let key = BEU32::new(docid); + let base_obkv = self.index.documents.get(&self.rtxn, &key)?.ok_or( + InternalError::DatabaseMissingEntry { + db_name: db_name::DOCUMENTS, + key: None, + }, + )?; + + // we remove all the fields that were already counted + for (field_id, _) in base_obkv.iter() { + let field_name = fields_ids_map.name(field_id).unwrap(); + if let Entry::Occupied(mut entry) = + field_distribution.entry(field_name.to_string()) + { + match entry.get().checked_sub(1) { + Some(0) | None => entry.remove(), + Some(count) => entry.insert(count), + }; + } + } + // Depending on the update indexing method we will merge // the document update with the current document or not. match self.index_documents_method { IndexDocumentsMethod::ReplaceDocuments => (docid, update_obkv), IndexDocumentsMethod::UpdateDocuments => { - let key = BEU32::new(docid); - let base_obkv = self.index.documents.get(&self.rtxn, &key)?.ok_or( - InternalError::DatabaseMissingEntry { - db_name: db_name::DOCUMENTS, - key: None, - }, - )?; let update_obkv = obkv::KvReader::new(update_obkv); merge_two_obkvs(base_obkv, update_obkv, &mut obkv_buffer); (docid, obkv_buffer.as_slice())