Introduce a parameter to stop the engine from autogenerating docids

2024-11-27 04:25:06 +08:00 · 2020-10-31 21:46:55 +01:00 · 2020-10-31 21:46:55 +01:00 · 6d52c5b2f0
commit 6d52c5b2f0
parent 21b4d60101
2 changed files with 83 additions and 4 deletions
--- a/src/update/index_documents/mod.rs
+++ b/src/update/index_documents/mod.rs
@ -202,6 +202,7 @@ pub struct IndexDocuments<'t, 'u, 'i> {
    indexing_jobs: Option<usize>,
    update_method: IndexDocumentsMethod,
    update_format: UpdateFormat,
+    autogenerate_docids: bool,
 }

 impl<'t, 'u, 'i> IndexDocuments<'t, 'u, 'i> {
@ -219,6 +220,7 @@ impl<'t, 'u, 'i> IndexDocuments<'t, 'u, 'i> {
            indexing_jobs: None,
            update_method: IndexDocumentsMethod::ReplaceDocuments,
            update_format: UpdateFormat::Json,
+            autogenerate_docids: true,
        }
    }

@ -272,6 +274,16 @@ impl<'t, 'u, 'i> IndexDocuments<'t, 'u, 'i> {
        self
    }

+    pub fn enable_autogenerate_docids(&mut self) -> &mut Self {
+        self.autogenerate_docids = true; // default: documents without an id get one generated
+        self
+    }
+
+    pub fn disable_autogenerate_docids(&mut self) -> &mut Self {
+        self.autogenerate_docids = false; // `execute` now errors on documents without an id
+        self
+    }
+
    pub fn execute<R, F>(self, reader: R, progress_callback: F) -> anyhow::Result<()>
    where
        R: io::Read,
@ -288,6 +300,7 @@ impl<'t, 'u, 'i> IndexDocuments<'t, 'u, 'i> {
            max_nb_chunks: self.max_nb_chunks,
            max_memory: self.max_memory,
            index_documents_method: self.update_method,
+            autogenerate_docids: self.autogenerate_docids,
        };

        let output = match self.update_format {
@ -636,6 +649,56 @@ mod tests {
        drop(rtxn);
    }

+    #[test]
+    fn not_auto_generated_csv_documents_ids() {
+        let path = tempfile::tempdir().unwrap();
+        let mut options = EnvOpenOptions::new();
+        options.map_size(10 * 1024 * 1024); // 10 MB
+        let index = Index::new(options, &path).unwrap();
+
+        // We send 3 CSV documents without any id column, with docid autogeneration disabled.
+        let mut wtxn = index.write_txn().unwrap();
+        let content = &b"name\nkevin\nkevina\nbenoit\n"[..];
+        let mut builder = IndexDocuments::new(&mut wtxn, &index);
+        builder.disable_autogenerate_docids();
+        builder.update_format(UpdateFormat::Csv);
+        assert!(builder.execute(content, |_, _| ()).is_err());
+        wtxn.commit().unwrap();
+
+        // Even though the transaction was committed, no document must have been indexed.
+        let rtxn = index.read_txn().unwrap();
+        let count = index.number_of_documents(&rtxn).unwrap();
+        assert_eq!(count, 0);
+        drop(rtxn);
+    }
+
+    #[test]
+    fn not_auto_generated_json_documents_ids() {
+        let path = tempfile::tempdir().unwrap();
+        let mut options = EnvOpenOptions::new();
+        options.map_size(10 * 1024 * 1024); // 10 MB
+        let index = Index::new(options, &path).unwrap();
+
+        // We send 3 documents, 2 of them without an id, with docid autogeneration disabled.
+        let mut wtxn = index.write_txn().unwrap();
+        let content = &br#"[
+            { "name": "kevina", "id": 21 },
+            { "name": "kevin" },
+            { "name": "benoit" }
+        ]"#[..];
+        let mut builder = IndexDocuments::new(&mut wtxn, &index);
+        builder.disable_autogenerate_docids();
+        builder.update_format(UpdateFormat::Json);
+        assert!(builder.execute(content, |_, _| ()).is_err());
+        wtxn.commit().unwrap();
+
+        // Even though the transaction was committed, no document must have been indexed.
+        let rtxn = index.read_txn().unwrap();
+        let count = index.number_of_documents(&rtxn).unwrap();
+        assert_eq!(count, 0);
+        drop(rtxn);
+    }
+
    #[test]
    fn simple_auto_generated_documents_ids() {
        let path = tempfile::tempdir().unwrap();
--- a/src/update/index_documents/transform.rs
+++ b/src/update/index_documents/transform.rs
@ -33,6 +33,7 @@ pub struct Transform<'t, 'i> {
    pub max_nb_chunks: Option<usize>,
    pub max_memory: Option<usize>,
    pub index_documents_method: IndexDocumentsMethod,
+    pub autogenerate_docids: bool,
 }

 impl Transform<'_, '_> {
@ -57,7 +58,14 @@ impl Transform<'_, '_> {
            None => {
                match documents.get(0).and_then(|doc| doc.keys().find(|k| k.contains("id"))) {
                    Some(key) => fields_ids_map.insert(&key).context("field id limit reached")?,
-                    None => fields_ids_map.insert("id").context("field id limit reached")?,
+                    None => {
+                        if !self.autogenerate_docids {
+                            // If there is no primary key in the current document batch, we must
+                            // return an error and not automatically generate any document id.
+                            return Err(anyhow!("missing primary key"))
+                        }
+                        fields_ids_map.insert("id").context("field id limit reached")?
+                    },
                }
            },
        };
@ -130,6 +138,9 @@ impl Transform<'_, '_> {
                    _ => return Err(anyhow!("documents ids must be either strings or numbers")),
                },
                None => {
+                    if !self.autogenerate_docids {
+                        return Err(anyhow!("missing primary key"));
+                    }
                    let uuid = uuid::Uuid::new_v4().to_hyphenated().encode_lower(&mut uuid_buffer);
                    Cow::Borrowed(uuid)
                },
@ -180,11 +191,16 @@ impl Transform<'_, '_> {
        let primary_key_field_id = match user_id_pos {
            Some(pos) => fields_ids_map.id(&headers[pos]).expect("found the primary key"),
            None => {
-                let id = fields_ids_map.insert("id").context("field id limit reached")?;
+                if !self.autogenerate_docids {
+                    // If there is no primary key in the current document batch, we must
+                    // return an error and not automatically generate any document id.
+                    return Err(anyhow!("missing primary key"))
+                }
+                let field_id = fields_ids_map.insert("id").context("field id limit reached")?;
                // We make sure to add the primary key field id to the fields ids,
                // this way it is added to the obks.
-                fields_ids.push((id, usize::max_value()));
-                id
+                fields_ids.push((field_id, usize::max_value()));
+                field_id
            },
        };