convert the field_distribution to a BTreeMap and avoid counting the same documents twice

Tamo 2021-06-17 17:05:34 +02:00
parent 969adaefdf
commit d08cfda796
4 changed files with 70 additions and 12 deletions
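
A note on the motivation, inferred from the change rather than stated in it: `BTreeMap` iterates its keys in sorted order, so the field distribution serializes deterministically, while `HashMap` iteration order varies from run to run with the randomized hasher. A minimal sketch (not code from this commit) of the difference:

use std::collections::{BTreeMap, HashMap};

fn main() {
    let counts = [("name", 2u64), ("id", 2), ("age", 1)];

    // HashMap: iteration order depends on a per-process random seed,
    // so two runs may print the fields in different orders.
    let unordered: HashMap<_, _> = counts.iter().copied().collect();
    println!("{:?}", unordered);

    // BTreeMap: always sorted by key -> age, id, name,
    // which keeps snapshots and API responses stable.
    let ordered: BTreeMap<_, _> = counts.iter().copied().collect();
    println!("{:?}", ordered);
}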

View File

@@ -791,7 +791,7 @@ pub(crate) mod tests {
     use std::ops::Deref;
 
     use heed::EnvOpenOptions;
-    use maplit::hashmap;
+    use maplit::btreemap;
     use tempfile::TempDir;
 
     use crate::update::{IndexDocuments, UpdateFormat};
@@ -845,11 +845,54 @@ pub(crate) mod tests {
         let field_distribution = index.field_distribution(&rtxn).unwrap();
         assert_eq!(
             field_distribution,
-            hashmap! {
+            btreemap! {
                 "id".to_string() => 2,
                 "name".to_string() => 2,
                 "age".to_string() => 1,
             }
         );
+
+        // we add all the documents a second time. we are supposed to get the same
+        // field_distribution in the end
+        let mut wtxn = index.write_txn().unwrap();
+        let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
+        builder.update_format(UpdateFormat::Json);
+        builder.execute(content, |_, _| ()).unwrap();
+        wtxn.commit().unwrap();
+
+        let rtxn = index.read_txn().unwrap();
+
+        let field_distribution = index.field_distribution(&rtxn).unwrap();
+        assert_eq!(
+            field_distribution,
+            btreemap! {
+                "id".to_string() => 2,
+                "name".to_string() => 2,
+                "age".to_string() => 1,
+            }
+        );
+
+        // then we update a document by removing one field and another by adding one field
+        let content = &br#"[
+            { "id": 1, "name": "kevin", "has_dog": true },
+            { "id": 2, "name": "bob" }
+        ]"#[..];
+
+        let mut wtxn = index.write_txn().unwrap();
+        let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
+        builder.update_format(UpdateFormat::Json);
+        builder.execute(content, |_, _| ()).unwrap();
+        wtxn.commit().unwrap();
+
+        let rtxn = index.read_txn().unwrap();
+        let field_distribution = index.field_distribution(&rtxn).unwrap();
+        assert_eq!(
+            field_distribution,
+            btreemap! {
+                "id".to_string() => 2,
+                "name".to_string() => 2,
+                "has_dog".to_string() => 1,
+            }
+        );
     }
 }
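
Reading the final assertion (an inference from the new code path, not a comment in the commit): the third update replaces document 1 ({ id, name, age } becomes { id, name, has_dog }) and rewrites document 2 with the same fields, and replaced documents are uncounted before being recounted. So age falls to zero and its entry is dropped, has_dog enters at 1, and id and name return to 2.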

View File

@@ -14,7 +14,7 @@ pub mod tree_level;
 pub mod update;
 
 use std::borrow::Cow;
-use std::collections::HashMap;
+use std::collections::{BTreeMap, HashMap};
 use std::hash::BuildHasherDefault;
 use std::result::Result as StdResult;
@@ -50,7 +50,7 @@ pub type Attribute = u32;
 pub type DocumentId = u32;
 pub type FieldId = u8;
 pub type Position = u32;
-pub type FieldsDistribution = HashMap<String, u64>;
+pub type FieldsDistribution = BTreeMap<String, u64>;
 
 type MergeFn<E> = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> StdResult<Vec<u8>, E>;
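
To illustrate what the alias change buys (a sketch, not code from this diff): building a `FieldsDistribution` works exactly as before via the `entry` API, but iterating it now yields fields in lexicographic order:

use std::collections::BTreeMap;

// Same alias as above; the counting below is illustrative, not commit code.
type FieldsDistribution = BTreeMap<String, u64>;

fn main() {
    let mut distribution = FieldsDistribution::new();
    for field in &["name", "id", "id", "age", "name"] {
        *distribution.entry(field.to_string()).or_insert(0) += 1;
    }
    // Sorted regardless of insertion order: age=1, id=2, name=2.
    for (field, count) in &distribution {
        println!("{}: {}", field, count);
    }
}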

View File

@@ -1,4 +1,4 @@
-use std::collections::hash_map::Entry;
+use std::collections::btree_map::Entry;
 use std::collections::HashMap;
 
 use chrono::Utc;

View File

@@ -1,4 +1,5 @@
 use std::borrow::Cow;
+use std::collections::btree_map::Entry;
 use std::fs::File;
 use std::io::{Read, Seek, SeekFrom};
 use std::iter::Peekable;
@@ -419,18 +420,32 @@ impl Transform<'_, '_> {
                     // we use it and insert it in the list of replaced documents.
                     replaced_documents_ids.insert(docid);
 
+                    let key = BEU32::new(docid);
+                    let base_obkv = self.index.documents.get(&self.rtxn, &key)?.ok_or(
+                        InternalError::DatabaseMissingEntry {
+                            db_name: db_name::DOCUMENTS,
+                            key: None,
+                        },
+                    )?;
+
+                    // we remove all the fields that were already counted
+                    for (field_id, _) in base_obkv.iter() {
+                        let field_name = fields_ids_map.name(field_id).unwrap();
+                        if let Entry::Occupied(mut entry) =
+                            field_distribution.entry(field_name.to_string())
+                        {
+                            match entry.get().checked_sub(1) {
+                                Some(0) | None => entry.remove(),
+                                Some(count) => entry.insert(count),
+                            };
+                        }
+                    }
+
                     // Depending on the update indexing method we will merge
                     // the document update with the current document or not.
                     match self.index_documents_method {
                         IndexDocumentsMethod::ReplaceDocuments => (docid, update_obkv),
                         IndexDocumentsMethod::UpdateDocuments => {
-                            let key = BEU32::new(docid);
-                            let base_obkv = self.index.documents.get(&self.rtxn, &key)?.ok_or(
-                                InternalError::DatabaseMissingEntry {
-                                    db_name: db_name::DOCUMENTS,
-                                    key: None,
-                                },
-                            )?;
                             let update_obkv = obkv::KvReader::new(update_obkv);
                             merge_two_obkvs(base_obkv, update_obkv, &mut obkv_buffer);
                             (docid, obkv_buffer.as_slice())
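
The `checked_sub(1)` match above is the "avoid counting the same documents twice" half of the commit: before a replaced document is recounted, each of its old fields is decremented, and an entry whose count would reach zero is removed outright rather than left at 0. A standalone sketch of that decrement (hypothetical helper name, same logic as the loop in the diff):

use std::collections::btree_map::Entry;
use std::collections::BTreeMap;

// Hypothetical helper mirroring the loop in the diff: decrement a field's
// count, dropping the entry entirely once it would reach zero.
fn uncount_field(distribution: &mut BTreeMap<String, u64>, field: &str) {
    if let Entry::Occupied(mut entry) = distribution.entry(field.to_string()) {
        match entry.get().checked_sub(1) {
            // count was 1 (Some(0)) or somehow already 0 (None): remove the key
            Some(0) | None => {
                entry.remove();
            }
            Some(count) => {
                entry.insert(count);
            }
        }
    }
}

fn main() {
    let mut distribution: BTreeMap<String, u64> =
        vec![("id".to_string(), 2), ("age".to_string(), 1)].into_iter().collect();

    uncount_field(&mut distribution, "age");
    assert!(!distribution.contains_key("age")); // dropped, not left at 0

    uncount_field(&mut distribution, "id");
    assert_eq!(distribution.get("id"), Some(&1));
}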