From 9d47ee52b40e5254496ee548d878db96a0a776c3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Sat, 31 Oct 2020 12:54:43 +0100
Subject: [PATCH] Generate a uuid v4 based document id when missing
---
 Cargo.lock                              | 10 ++++
 Cargo.toml                              |  1 +
 src/update/index_documents/mod.rs       | 74 +++++++++++++++++++++++++
 src/update/index_documents/transform.rs | 69 ++++++++++++++++++-----
 4 files changed, 140 insertions(+), 14 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 63c0ea192..6ae7d16ea 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1054,6 +1054,7 @@ dependencies = [
  "structopt",
  "tempfile",
  "tokio",
+ "uuid",
  "warp",
 ]
 
@@ -2259,6 +2260,15 @@ version = "0.7.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "05e42f7c18b8f902290b009cde6d651262f956c98bc51bca4cd1d511c9cd85c7"
 
+[[package]]
+name = "uuid"
+version = "0.8.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9fde2f6a4bea1d6e007c4ad38c6839fa71cbb63b6dbf5b595aa38dc9b1093c11"
+dependencies = [
+ "rand 0.7.3",
+]
+
 [[package]]
 name = "version_check"
 version = "0.1.5"
diff --git a/Cargo.toml b/Cargo.toml
index d4a67c623..27f4d3a2f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -33,6 +33,7 @@ smallstr = { version = "0.2.0", features = ["serde"] }
 smallvec = "1.4.0"
 structopt = { version = "0.3.14", default-features = false, features = ["wrap_help"] }
 tempfile = "3.1.0"
+uuid = { version = "0.8.1", features = ["v4"] }
 
 # documents words self-join
 itertools = "0.9.0"
diff --git a/src/update/index_documents/mod.rs b/src/update/index_documents/mod.rs
index 10fd83f36..bec82b01e 100644
--- a/src/update/index_documents/mod.rs
+++ b/src/update/index_documents/mod.rs
@@ -275,6 +275,7 @@ impl<'t, 'u, 'i> IndexDocuments<'t, 'u, 'i> {
         };
 
         let TransformOutput {
+            primary_key,
             fields_ids_map,
             users_ids_documents_ids,
             new_documents_ids,
@@ -415,6 +416,9 @@
         // We write the fields ids map into the main database
         self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?;
 
+        // We write the primary key field id into the main database
+        self.index.put_primary_key(self.wtxn, primary_key)?;
+
         // We write the users_ids_documents_ids into the main database.
         self.index.put_users_ids_documents_ids(self.wtxn, &users_ids_documents_ids)?;
 
@@ -597,4 +601,74 @@ mod tests {
         assert_eq!(doc_iter.next(), None);
         drop(rtxn);
     }
+
+    #[test]
+    fn simple_auto_generated_documents_ids() {
+        let path = tempfile::tempdir().unwrap();
+        let mut options = EnvOpenOptions::new();
+        options.map_size(10 * 1024 * 1024); // 10 MB
+
+        let index = Index::new(options, &path).unwrap();
+
+        // First we send 3 documents without ids, the ids must be auto-generated.
+        let mut wtxn = index.write_txn().unwrap();
+        let content = &b"name\nkevin\nkevina\nbenoit\n"[..];
+        IndexDocuments::new(&mut wtxn, &index).execute(content, |_, _| ()).unwrap();
+        wtxn.commit().unwrap();
+
+        // Check that there are 3 documents now.
+        let rtxn = index.read_txn().unwrap();
+        let count = index.number_of_documents(&rtxn).unwrap();
+        assert_eq!(count, 3);
+
+        let docs = index.documents(&rtxn, vec![0, 1, 2]).unwrap();
+        let (_id, obkv) = docs.iter().find(|(_id, kv)| kv.get(0) == Some(br#""kevin""#)).unwrap();
+        let kevin_uuid: String = serde_json::from_slice(&obkv.get(1).unwrap()).unwrap();
+        drop(rtxn);
+
+        // Second we send 1 document with the generated uuid, to replace the previous one.
+        let mut wtxn = index.write_txn().unwrap();
+        let content = format!("id,name\n{},updated kevin", kevin_uuid);
+        IndexDocuments::new(&mut wtxn, &index).execute(content.as_bytes(), |_, _| ()).unwrap();
+        wtxn.commit().unwrap();
+
+        // Check that there are **still** 3 documents.
+        let rtxn = index.read_txn().unwrap();
+        let count = index.number_of_documents(&rtxn).unwrap();
+        assert_eq!(count, 3);
+        drop(rtxn);
+    }
+
+    #[test]
+    fn reordered_auto_generated_documents_ids() {
+        let path = tempfile::tempdir().unwrap();
+        let mut options = EnvOpenOptions::new();
+        options.map_size(10 * 1024 * 1024); // 10 MB
+
+        let index = Index::new(options, &path).unwrap();
+
+        // First we send 3 documents with ids from 1 to 3.
+        let mut wtxn = index.write_txn().unwrap();
+        let content = &b"id,name\n1,kevin\n2,kevina\n3,benoit\n"[..];
+        IndexDocuments::new(&mut wtxn, &index).execute(content, |_, _| ()).unwrap();
+        wtxn.commit().unwrap();
+
+        // Check that there are 3 documents now.
+        let rtxn = index.read_txn().unwrap();
+        let count = index.number_of_documents(&rtxn).unwrap();
+        assert_eq!(count, 3);
+        drop(rtxn);
+
+        // Second we send 1 document without specifying the id.
+        let mut wtxn = index.write_txn().unwrap();
+        let content = &b"name\nnew kevin"[..];
+        IndexDocuments::new(&mut wtxn, &index).execute(content, |_, _| ()).unwrap();
+        wtxn.commit().unwrap();
+
+        // Check that there are 4 documents now.
+        let rtxn = index.read_txn().unwrap();
+        let count = index.number_of_documents(&rtxn).unwrap();
+        assert_eq!(count, 4);
+        drop(rtxn);
+    }
 }
diff --git a/src/update/index_documents/transform.rs b/src/update/index_documents/transform.rs
index b6ec88879..33f18a48a 100644
--- a/src/update/index_documents/transform.rs
+++ b/src/update/index_documents/transform.rs
@@ -14,6 +14,7 @@ use super::merge_function::merge_two_obkvs;
 use super::{create_writer, create_sorter, IndexDocumentsMethod};
 
 pub struct TransformOutput {
+    pub primary_key: u8,
     pub fields_ids_map: FieldsIdsMap,
     pub users_ids_documents_ids: fst::Map<Vec<u8>>,
     pub new_documents_ids: RoaringBitmap,
@@ -23,7 +24,7 @@
 }
 
 pub struct Transform<'t, 'i> {
-    pub rtxn: &'t heed::RoTxn<'t>,
+    pub rtxn: &'t heed::RoTxn<'i>,
     pub index: &'i Index,
     pub chunk_compression_type: CompressionType,
     pub chunk_compression_level: Option<u32>,
@@ -47,17 +48,43 @@ impl Transform<'_, '_> {
         let users_ids_documents_ids = self.index.users_ids_documents_ids(self.rtxn).unwrap();
 
         let mut csv = csv::Reader::from_reader(reader);
-        let headers = csv.headers()?.clone();
-        let user_id_pos = headers.iter().position(|h| h == "id").context(r#"missing "id" header"#)?;
+        let headers = csv.headers()?;
+        let primary_key = self.index.primary_key(self.rtxn)?;
 
         // Generate the new fields ids based on the current fields ids and this CSV headers.
         let mut fields_ids = Vec::new();
-        for header in headers.iter() {
-            let id = fields_ids_map.insert(header)
-                .context("impossible to generate a field id (limit reached)")?;
-            fields_ids.push(id);
+        for (i, header) in headers.iter().enumerate() {
+            let id = fields_ids_map.insert(header).context("field id limit reached")?;
+            fields_ids.push((id, i));
         }
 
+        // Extract the position of the primary key in the current headers, None if not found.
+        let user_id_pos = match primary_key {
+            Some(primary_key) => {
+                // The primary key is known, so we must find its position in the CSV headers.
+                let name = fields_ids_map.name(primary_key).expect("found the primary key name");
+                headers.iter().position(|h| h == name)
+            },
+            None => headers.iter().position(|h| h.contains("id")),
+        };
+
+        // Return the field id from the fields ids map, creating an "id" field
+        // in case it is not in the current headers.
+        let primary_key_field_id = match user_id_pos {
+            Some(pos) => fields_ids_map.id(&headers[pos]).expect("found the primary key"),
+            None => {
+                let id = fields_ids_map.insert("id").context("field id limit reached")?;
+                // We make sure to add the primary key field id to the fields ids,
+                // this way it is added to the obkvs.
+                fields_ids.push((id, usize::max_value()));
+                id
+            },
+        };
+
+        // We sort the fields ids by the fields ids map id, this way we are sure to iterate over
+        // the records fields in the fields ids map order and correctly generate the obkv.
+        fields_ids.sort_unstable_by_key(|(field_id, _)| *field_id);
+
         /// Only the last value associated with an id is kept.
         fn keep_latest_obkv(_key: &[u8], obkvs: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> {
             obkvs.last().context("no last value").map(|last| last.clone().into_owned())
@@ -77,24 +104,37 @@
         // based on the users ids.
         let mut json_buffer = Vec::new();
         let mut obkv_buffer = Vec::new();
+        let mut uuid_buffer = [0; uuid::adapter::Hyphenated::LENGTH];
         let mut record = csv::StringRecord::new();
         while csv.read_record(&mut record)? {
             obkv_buffer.clear();
             let mut writer = obkv::KvWriter::new(&mut obkv_buffer);
 
-            // We retrieve the field id based on the CSV header position
-            // and zip it with the record value.
-            for (key, field) in fields_ids.iter().copied().zip(&record) {
+            // We extract the user id if we know where it is or generate a UUID v4 otherwise.
+            // TODO we must validate the user id (i.e. [a-zA-Z0-9\-_]).
+            let user_id = match user_id_pos {
+                Some(pos) => &record[pos],
+                None => uuid::Uuid::new_v4().to_hyphenated().encode_lower(&mut uuid_buffer),
+            };
+
+            // When the primary_key_field_id is found in the fields ids list
+            // we return the generated document id instead of the record field.
+            let iter = fields_ids.iter()
+                .map(|(fi, i)| {
+                    let field = if *fi == primary_key_field_id { user_id } else { &record[*i] };
+                    (fi, field)
+                });
+
+            // We retrieve the field id based on the fields ids map order.
+            for (field_id, field) in iter {
                 // We serialize the attribute values as JSON strings.
                 json_buffer.clear();
                 serde_json::to_writer(&mut json_buffer, &field)?;
-                writer.insert(key, &json_buffer)?;
+                writer.insert(*field_id, &json_buffer)?;
             }
 
-            // We extract the user id and use it as the key for this document.
-            // TODO we must validate the user id (i.e. [a-zA-Z0-9\-_]).
-            let user_id = &record[user_id_pos];
+            // We use the extracted/generated user id as the key for this document.
             sorter.insert(user_id, &obkv_buffer)?;
         }
@@ -179,6 +219,7 @@
         }
 
         Ok(TransformOutput {
+            primary_key: primary_key_field_id,
            fields_ids_map,
            users_ids_documents_ids: users_ids_documents_ids_builder.into_map(),
            new_documents_ids,
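
Note: a minimal standalone sketch (not part of the patch) of the id-generation
technique used above, assuming uuid = { version = "0.8.1", features = ["v4"] }
as added in Cargo.toml. The random v4 uuid is encoded in its hyphenated,
lowercase form into a fixed-size stack buffer, so no heap allocation is needed
per generated document id:

    use uuid::adapter::Hyphenated;

    fn main() {
        // 36-byte stack buffer, exactly the length of a hyphenated uuid.
        let mut uuid_buffer = [0u8; Hyphenated::LENGTH];
        // encode_lower() writes into the buffer and returns a &str view of it,
        // which is why the patch can reuse one buffer across loop iterations.
        let user_id: &str = uuid::Uuid::new_v4().to_hyphenated().encode_lower(&mut uuid_buffer);
        println!("generated document id: {}", user_id);
    }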
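
Note: the TODO in the indexing loop leaves user id validation unimplemented.
A hypothetical helper (the name and shape are illustrative, not from the
patch) matching the [a-zA-Z0-9\-_] class mentioned there could look like:

    /// Hypothetical validator for the character class the TODO mentions;
    /// the patch itself does not ship any validation yet.
    fn is_valid_user_id(id: &str) -> bool {
        !id.is_empty()
            && id.bytes().all(|b| b.is_ascii_alphanumeric() || b == b'-' || b == b'_')
    }

    fn main() {
        // Generated hyphenated uuids always pass this check.
        assert!(is_valid_user_id("0fa9dd3e-8e67-4c4b-8c3d-2f1f0e2a9b7d"));
        // '?' falls outside the allowed class, so this id is rejected.
        assert!(!is_valid_user_id("kevin?"));
    }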