Mirror of https://github.com/meilisearch/meilisearch.git, synced 2024-11-30 17:14:59 +08:00
Merge #187
187: Fix fields distribution after documents merge r=Kerollmops a=shekhirin

Resolves https://github.com/meilisearch/milli/issues/174

The problem was that the fields distribution was calculated before the merge in `output_from_sorter()`. So if you imported two documents with the same primary key value, the fields distribution counted them as two documents, while `output_from_sorter()` merged them into one.

---

```console
➜ Downloads cat short_movies.json
[
  {"id":"47474","title":"The Serpent's Egg","poster":"https://image.tmdb.org/t/p/w500/n7z0doFkXHcvo8QQWHLFnkEPXRU.jpg","overview":"The Serpent's Egg follows a week in the life of Abel Rosenberg, an out-of-work American circus acrobat living in poverty-stricken Berlin following Germany's defeat in World War I.","release_date":246844800,"genres":["Thriller","Drama","Mystery"]},
  {"id":"47474","title":"The Serpent's Egg","poster":"https://image.tmdb.org/t/p/w500/n7z0doFkXHcvo8QQWHLFnkEPXRU.jpg","overview":"The Serpent's Egg follows a week in the life of Abel Rosenberg, an out-of-work American circus acrobat living in poverty-stricken Berlin following Germany's defeat in World War I.","release_date":246844800,"genres":["Thriller","Drama","Mystery"]}
]

➜ Downloads curl -X POST -H "Content-Type: text/json" --data-binary @short_movies.json 127.0.0.1:7700/indexes/movies/documents
{"updateId":0}
```

## Before

```console
➜ Downloads curl -s 127.0.0.1:7700/indexes/movies/stats | jq
{
  "numberOfDocuments": 1,
  "isIndexing": false,
  "fieldsDistribution": {
    "release_date": 2,
    "poster": 2,
    "title": 2,
    "overview": 2,
    "genres": 2,
    "id": 2
  }
}
```

## After

```console
➜ Downloads curl -s 127.0.0.1:7700/indexes/movies/stats | jq
{
  "numberOfDocuments": 1,
  "isIndexing": false,
  "fieldsDistribution": {
    "poster": 1,
    "release_date": 1,
    "title": 1,
    "genres": 1,
    "id": 1,
    "overview": 1
  }
}
```

Co-authored-by: Alexey Shekhirin <a.shekhirin@gmail.com>
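To make the double-counting concrete, here is a minimal sketch in plain std Rust (not milli's actual types or APIs; `distribution_before_merge` and `distribution_after_merge` are hypothetical helpers): counting fields while reading the raw batch reports 2 for every field of a document sent twice, while counting after deduplicating on the primary key reports 1.

```rust
use std::collections::{BTreeMap, HashMap};

// A document is simplified here to "field name -> value".
type Document = BTreeMap<String, String>;

// Counts every field of every raw document, duplicates included ("before").
fn distribution_before_merge(docs: &[Document]) -> HashMap<String, u64> {
    let mut dist = HashMap::new();
    for doc in docs {
        for field in doc.keys() {
            *dist.entry(field.clone()).or_insert(0) += 1;
        }
    }
    dist
}

// Merges documents by primary key first (last write wins), then counts ("after").
fn distribution_after_merge(docs: &[Document], primary_key: &str) -> HashMap<String, u64> {
    let mut merged: BTreeMap<String, Document> = BTreeMap::new();
    for doc in docs {
        if let Some(id) = doc.get(primary_key) {
            merged.insert(id.clone(), doc.clone());
        }
    }
    let mut dist = HashMap::new();
    for doc in merged.values() {
        for field in doc.keys() {
            *dist.entry(field.clone()).or_insert(0) += 1;
        }
    }
    dist
}

fn main() {
    let mut doc = Document::new();
    doc.insert("id".to_string(), "47474".to_string());
    doc.insert("title".to_string(), "The Serpent's Egg".to_string());
    let docs = vec![doc.clone(), doc]; // the same primary key value sent twice

    // Counting before the merge sees two documents and reports 2 per field.
    assert_eq!(distribution_before_merge(&docs)["title"], 2);
    // Counting after the merge matches the single stored document.
    assert_eq!(distribution_after_merge(&docs, "id")["title"], 1);
}
```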
Commit 7e63e32960
```diff
@@ -566,11 +566,11 @@ pub(crate) mod tests {
         let mut wtxn = index.write_txn().unwrap();
         let content = &br#"[
-            { "name": "kevin" },
-            { "name": "bob", "age": 20 }
+            { "id": 1, "name": "kevin" },
+            { "id": 2, "name": "bob", "age": 20 },
+            { "id": 2, "name": "bob", "age": 20 }
         ]"#[..];
         let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
-        builder.enable_autogenerate_docids();
         builder.update_format(UpdateFormat::Json);
         builder.execute(content, |_, _| ()).unwrap();
         wtxn.commit().unwrap();
@@ -579,6 +579,7 @@ pub(crate) mod tests {
         let fields_distribution = index.fields_distribution(&rtxn).unwrap();
         assert_eq!(fields_distribution, hashmap! {
+            "id".to_string() => 2,
             "name".to_string() => 2,
             "age".to_string() => 1,
         });
@@ -1,5 +1,4 @@
 use std::borrow::Cow;
-use std::collections::HashMap;
 use std::fs::File;
 use std::io::{Read, Seek, SeekFrom};
 use std::iter::Peekable;
@@ -76,7 +75,6 @@ impl Transform<'_, '_> {
         F: Fn(UpdateIndexingStep) + Sync,
     {
         let mut fields_ids_map = self.index.fields_ids_map(self.rtxn)?;
-        let mut fields_distribution = self.index.fields_distribution(self.rtxn)?;
         let external_documents_ids = self.index.external_documents_ids(self.rtxn).unwrap();

         // Deserialize the whole batch of documents in memory.
@@ -106,7 +104,7 @@ impl Transform<'_, '_> {
             return Ok(TransformOutput {
                 primary_key,
                 fields_ids_map,
-                fields_distribution,
+                fields_distribution: self.index.fields_distribution(self.rtxn)?,
                 external_documents_ids: ExternalDocumentsIds::default(),
                 new_documents_ids: RoaringBitmap::new(),
                 replaced_documents_ids: RoaringBitmap::new(),
@@ -137,8 +135,6 @@ impl Transform<'_, '_> {
         let mut uuid_buffer = [0; uuid::adapter::Hyphenated::LENGTH];
         let mut documents_count = 0;

-        let mut fields_ids_distribution = HashMap::new();
-
         for result in documents {
             let document = result?;

@@ -153,9 +149,7 @@ impl Transform<'_, '_> {

             // We prepare the fields ids map with the documents keys.
             for (key, _value) in &document {
-                let field_id = fields_ids_map.insert(&key).context("field id limit reached")?;
-
-                *fields_ids_distribution.entry(field_id).or_insert(0) += 1;
+                fields_ids_map.insert(&key).context("field id limit reached")?;
             }

             // We retrieve the user id from the document based on the primary key name,
@@ -198,11 +192,6 @@ impl Transform<'_, '_> {
             documents_count += 1;
         }

-        for (field_id, count) in fields_ids_distribution {
-            let field_name = fields_ids_map.name(field_id).unwrap();
-            *fields_distribution.entry(field_name.to_string()).or_default() += count;
-        }
-
         progress_callback(UpdateIndexingStep::TransformFromUserIntoGenericFormat {
             documents_seen: documents_count,
         });
@@ -213,7 +202,6 @@ impl Transform<'_, '_> {
             sorter,
             primary_key,
             fields_ids_map,
-            fields_distribution,
             documents_count,
             external_documents_ids,
             progress_callback,
@@ -226,7 +214,6 @@ impl Transform<'_, '_> {
         F: Fn(UpdateIndexingStep) + Sync,
     {
         let mut fields_ids_map = self.index.fields_ids_map(self.rtxn)?;
-        let mut fields_distribution = self.index.fields_distribution(self.rtxn)?;
         let external_documents_ids = self.index.external_documents_ids(self.rtxn).unwrap();

         let mut csv = csv::Reader::from_reader(reader);
@@ -284,8 +271,6 @@ impl Transform<'_, '_> {
         let mut uuid_buffer = [0; uuid::adapter::Hyphenated::LENGTH];
         let mut documents_count = 0;

-        let mut fields_ids_distribution = HashMap::new();
-
         let mut record = csv::StringRecord::new();
         while csv.read_record(&mut record)? {
             obkv_buffer.clear();
@@ -324,8 +309,6 @@ impl Transform<'_, '_> {
                 json_buffer.clear();
                 serde_json::to_writer(&mut json_buffer, &field)?;
                 writer.insert(*field_id, &json_buffer)?;
-
-                *fields_ids_distribution.entry(*field_id).or_insert(0) += 1;
             }

             // We use the extracted/generated user id as the key for this document.
@@ -333,11 +316,6 @@ impl Transform<'_, '_> {
             documents_count += 1;
         }

-        for (field_id, count) in fields_ids_distribution {
-            let field_name = fields_ids_map.name(field_id).unwrap();
-            *fields_distribution.entry(field_name.to_string()).or_default() += count;
-        }
-
         progress_callback(UpdateIndexingStep::TransformFromUserIntoGenericFormat {
             documents_seen: documents_count,
         });
@@ -352,7 +330,6 @@ impl Transform<'_, '_> {
             sorter,
             primary_key_name,
             fields_ids_map,
-            fields_distribution,
             documents_count,
             external_documents_ids,
             progress_callback,
@@ -367,7 +344,6 @@ impl Transform<'_, '_> {
         sorter: grenad::Sorter<MergeFn>,
         primary_key: String,
         fields_ids_map: FieldsIdsMap,
-        fields_distribution: FieldsDistribution,
         approximate_number_of_documents: usize,
         mut external_documents_ids: ExternalDocumentsIds<'_>,
         progress_callback: F,
@@ -376,6 +352,7 @@ impl Transform<'_, '_> {
         F: Fn(UpdateIndexingStep) + Sync,
     {
         let documents_ids = self.index.documents_ids(self.rtxn)?;
+        let mut fields_distribution = self.index.fields_distribution(self.rtxn)?;
         let mut available_documents_ids = AvailableDocumentsIds::from_documents_ids(&documents_ids);

         // Once we have sort and deduplicated the documents we write them into a final file.
@@ -396,7 +373,6 @@ impl Transform<'_, '_> {
         let mut documents_count = 0;
         let mut iter = sorter.into_iter()?;
         while let Some((external_id, update_obkv)) = iter.next()? {
-
             if self.log_every_n.map_or(false, |len| documents_count % len == 0) {
                 progress_callback(UpdateIndexingStep::ComputeIdsAndMergeDocuments {
                     documents_seen: documents_count,
@@ -438,6 +414,12 @@ impl Transform<'_, '_> {
             // We insert the document under the documents ids map into the final file.
             final_sorter.insert(docid.to_be_bytes(), obkv)?;
             documents_count += 1;
+
+            let reader = obkv::KvReader::new(obkv);
+            for (field_id, _) in reader.iter() {
+                let field_name = fields_ids_map.name(field_id).unwrap();
+                *fields_distribution.entry(field_name.to_string()).or_default() += 1;
+            }
         }

         progress_callback(UpdateIndexingStep::ComputeIdsAndMergeDocuments {
```
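Taken together, the fix drops the per-batch `fields_ids_distribution` bookkeeping from the JSON and CSV transform paths and rebuilds the distribution inside `output_from_sorter()`, after documents sharing a primary key have been merged: each final obkv record's field ids are resolved to names and counted once. The sketch below restates that counting step with plain std types (the `field_names` map and `bump_distribution` helper are stand-ins, not milli's `FieldsIdsMap` or obkv reader):

```rust
use std::collections::HashMap;

// Stand-in for the counting loop added in `output_from_sorter()` above:
// one call per document coming out of the final sorter, i.e. after merging.
fn bump_distribution(
    fields_distribution: &mut HashMap<String, u64>,
    field_names: &HashMap<u8, String>,   // stand-in for FieldsIdsMap
    merged_document: &[(u8, Vec<u8>)],   // stand-in for an obkv record
) {
    for (field_id, _value) in merged_document {
        let field_name = &field_names[field_id];
        *fields_distribution.entry(field_name.clone()).or_default() += 1;
    }
}

fn main() {
    let field_names: HashMap<u8, String> =
        [(0, "id".to_string()), (1, "name".to_string())].into_iter().collect();
    // A single merged document, regardless of how many raw versions were sent.
    let merged_document = vec![(0u8, b"1".to_vec()), (1u8, b"\"kevin\"".to_vec())];

    let mut fields_distribution = HashMap::new();
    bump_distribution(&mut fields_distribution, &field_names, &merged_document);

    assert_eq!(fields_distribution["id"], 1);
    assert_eq!(fields_distribution["name"], 1);
}
```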