From eded5558b2ecf858e515f47916d2f5a5613fe2a4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Sun, 22 Nov 2020 11:54:04 +0100
Subject: [PATCH] Rename the users ids documents ids into external documents ids

---
 http-ui/src/main.rs                     |  4 +-
 src/index.rs                            | 16 ++---
 src/update/clear_documents.rs           |  2 +-
 src/update/delete_documents.rs          | 54 ++++++++---------
 src/update/index_documents/mod.rs       |  6 +-
 src/update/index_documents/transform.rs | 80 ++++++++++++-------------
 6 files changed, 81 insertions(+), 81 deletions(-)

diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs
index 094b2fb79..b730344f2 100644
--- a/http-ui/src/main.rs
+++ b/http-ui/src/main.rs
@@ -605,14 +605,14 @@ async fn main() -> anyhow::Result<()> {
             let index = index_cloned.clone();
             let rtxn = index.read_txn().unwrap();
 
-            let users_ids_documents_ids = index.users_ids_documents_ids(&rtxn).unwrap();
+            let external_documents_ids = index.external_documents_ids(&rtxn).unwrap();
             let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
             let displayed_fields = match index.displayed_fields(&rtxn).unwrap() {
                 Some(fields) => Cow::Borrowed(fields),
                 None => Cow::Owned(fields_ids_map.iter().map(|(id, _)| id).collect()),
             };
 
-            match users_ids_documents_ids.get(&id) {
+            match external_documents_ids.get(&id) {
                 Some(document_id) => {
                     let document_id = document_id as u32;
                     let (_, obkv) = index.documents(&rtxn, Some(document_id)).unwrap().pop().unwrap();
diff --git a/src/index.rs b/src/index.rs
index 68d7dfe5f..8da7940a7 100644
--- a/src/index.rs
+++ b/src/index.rs
@@ -22,7 +22,7 @@ pub const FACETED_FIELDS_KEY: &str = "faceted-fields";
 pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map";
 pub const PRIMARY_KEY_KEY: &str = "primary-key";
 pub const SEARCHABLE_FIELDS_KEY: &str = "searchable-fields";
-pub const USERS_IDS_DOCUMENTS_IDS_KEY: &str = "users-ids-documents-ids";
+pub const EXTERNAL_DOCUMENTS_IDS_KEY: &str = "external-documents-ids";
 pub const WORDS_FST_KEY: &str = "words-fst";
 
 #[derive(Clone)]
@@ -119,18 +119,18 @@ impl Index {
         self.main.get::<_, Str, OwnedType<u8>>(rtxn, PRIMARY_KEY_KEY)
     }
 
-    /* users ids documents ids */
+    /* external documents ids */
 
-    /// Writes the users ids documents ids, a user id is a byte slice (i.e. `[u8]`)
+    /// Writes the external documents ids, an external id is a byte slice (i.e. `[u8]`)
     /// and refers to an internal id (i.e. `u32`).
-    pub fn put_users_ids_documents_ids<A: AsRef<[u8]>>(&self, wtxn: &mut RwTxn, fst: &fst::Map<A>) -> heed::Result<()> {
-        self.main.put::<_, Str, ByteSlice>(wtxn, USERS_IDS_DOCUMENTS_IDS_KEY, fst.as_fst().as_bytes())
+    pub fn put_external_documents_ids<A: AsRef<[u8]>>(&self, wtxn: &mut RwTxn, fst: &fst::Map<A>) -> heed::Result<()> {
+        self.main.put::<_, Str, ByteSlice>(wtxn, EXTERNAL_DOCUMENTS_IDS_KEY, fst.as_fst().as_bytes())
     }
 
-    /// Returns the user ids documents ids map which associate the user ids (i.e. `[u8]`)
+    /// Returns the external documents ids map which associates the external ids (i.e. `[u8]`)
     /// with the internal ids (i.e. `u32`).
-    pub fn users_ids_documents_ids<'t>(&self, rtxn: &'t RoTxn) -> anyhow::Result<fst::Map<Cow<'t, [u8]>>> {
-        match self.main.get::<_, Str, ByteSlice>(rtxn, USERS_IDS_DOCUMENTS_IDS_KEY)? {
+    pub fn external_documents_ids<'t>(&self, rtxn: &'t RoTxn) -> anyhow::Result<fst::Map<Cow<'t, [u8]>>> {
+        match self.main.get::<_, Str, ByteSlice>(rtxn, EXTERNAL_DOCUMENTS_IDS_KEY)? {
             Some(bytes) => Ok(fst::Map::new(bytes)?.map_data(Cow::Borrowed)?),
             None => Ok(fst::Map::default().map_data(Cow::Owned)?),
         }
diff --git a/src/update/clear_documents.rs b/src/update/clear_documents.rs
index c49ae9104..0e89d43b7 100644
--- a/src/update/clear_documents.rs
+++ b/src/update/clear_documents.rs
@@ -27,7 +27,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
 
         // We clean some of the main engine datastructures.
         self.index.put_words_fst(self.wtxn, &fst::Set::default())?;
-        self.index.put_users_ids_documents_ids(self.wtxn, &fst::Map::default())?;
+        self.index.put_external_documents_ids(self.wtxn, &fst::Map::default())?;
         self.index.put_documents_ids(self.wtxn, &RoaringBitmap::default())?;
 
         // Clear the other databases.
diff --git a/src/update/delete_documents.rs b/src/update/delete_documents.rs
index d68bca81c..5ccce35f6 100644
--- a/src/update/delete_documents.rs
+++ b/src/update/delete_documents.rs
@@ -10,7 +10,7 @@ use super::ClearDocuments;
 pub struct DeleteDocuments<'t, 'u, 'i> {
     wtxn: &'t mut heed::RwTxn<'i, 'u>,
     index: &'i Index,
-    users_ids_documents_ids: fst::Map<Vec<u8>>,
+    external_documents_ids: fst::Map<Vec<u8>>,
     documents_ids: RoaringBitmap,
 }
 
@@ -20,14 +20,14 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
         index: &'i Index,
     ) -> anyhow::Result<DeleteDocuments<'t, 'u, 'i>>
     {
-        let users_ids_documents_ids = index
-            .users_ids_documents_ids(wtxn)?
+        let external_documents_ids = index
+            .external_documents_ids(wtxn)?
             .map_data(Cow::into_owned)?;
 
         Ok(DeleteDocuments {
             wtxn,
             index,
-            users_ids_documents_ids,
+            external_documents_ids,
             documents_ids: RoaringBitmap::new(),
         })
     }
@@ -40,8 +40,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
         self.documents_ids.union_with(docids);
     }
 
-    pub fn delete_user_id(&mut self, user_id: &str) -> Option<u32> {
-        let docid = self.users_ids_documents_ids.get(user_id).map(|id| u32::try_from(id).unwrap())?;
+    pub fn delete_external_id(&mut self, external_id: &str) -> Option<u32> {
+        let docid = self.external_documents_ids.get(external_id).map(|id| u32::try_from(id).unwrap())?;
         self.delete_document(docid);
         Some(docid)
     }
@@ -80,9 +80,9 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
             documents,
         } = self.index;
 
-        // Retrieve the words and the users ids contained in the documents.
+        // Retrieve the words and the external documents ids contained in the documents.
         let mut words = Vec::new();
-        let mut users_ids = Vec::new();
+        let mut external_ids = Vec::new();
         for docid in &self.documents_ids {
             // We create an iterator to be able to get the content and delete the document
             // content itself. It's faster to acquire a cursor to get and delete,
@@ -91,8 +91,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
             let mut iter = documents.range_mut(self.wtxn, &(key..=key))?;
             if let Some((_key, obkv)) = iter.next().transpose()? {
                 if let Some(content) = obkv.get(id_field) {
-                    let user_id: SmallString32 = serde_json::from_slice(content).unwrap();
-                    users_ids.push(user_id);
+                    let external_id: SmallString32 = serde_json::from_slice(content).unwrap();
+                    external_ids.push(external_id);
                 }
                 iter.del_current()?;
             }
@@ -109,30 +109,30 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
             }
         }
 
-        // We create the FST map of the users ids that we must delete.
-        users_ids.sort_unstable();
-        let users_ids_to_delete = fst::Set::from_iter(users_ids.iter().map(AsRef::as_ref))?;
-        let users_ids_to_delete = fst::Map::from(users_ids_to_delete.into_fst());
+        // We create the FST map of the external ids that we must delete.
+        external_ids.sort_unstable();
+        let external_ids_to_delete = fst::Set::from_iter(external_ids.iter().map(AsRef::as_ref))?;
+        let external_ids_to_delete = fst::Map::from(external_ids_to_delete.into_fst());
 
-        let new_users_ids_documents_ids = {
-            // We acquire the current users ids documents ids map and create
-            // a difference operation between the current and to-delete users ids.
-            let users_ids_documents_ids = self.index.users_ids_documents_ids(self.wtxn)?;
-            let difference = users_ids_documents_ids.op().add(&users_ids_to_delete).difference();
+        let new_external_documents_ids = {
+            // We acquire the current external documents ids map and create
+            // a difference operation between the current and to-delete external ids.
+            let external_documents_ids = self.index.external_documents_ids(self.wtxn)?;
+            let difference = external_documents_ids.op().add(&external_ids_to_delete).difference();
 
-            // We stream the new users ids that does no more contains the to-delete users ids.
+            // We stream the new external ids that no longer contain the to-delete external ids.
             let mut iter = difference.into_stream();
-            let mut new_users_ids_documents_ids_builder = fst::MapBuilder::memory();
-            while let Some((userid, docids)) = iter.next() {
-                new_users_ids_documents_ids_builder.insert(userid, docids[0].value)?;
+            let mut new_external_documents_ids_builder = fst::MapBuilder::memory();
+            while let Some((external_id, docids)) = iter.next() {
+                new_external_documents_ids_builder.insert(external_id, docids[0].value)?;
             }
 
             // We create an FST map from the above builder.
-            new_users_ids_documents_ids_builder.into_map()
+            new_external_documents_ids_builder.into_map()
         };
 
-        // We write the new users ids into the main database.
-        self.index.put_users_ids_documents_ids(self.wtxn, &new_users_ids_documents_ids)?;
+        // We write the new external ids into the main database.
+        self.index.put_external_documents_ids(self.wtxn, &new_external_documents_ids)?;
 
         // Maybe we can improve the get performance of the words
         // if we sort the words first, keeping the LMDB pages in cache.
@@ -169,7 +169,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
         let words_fst = self.index.words_fst(self.wtxn)?;
         let difference = words_fst.op().add(&words_to_delete).difference();
 
-        // We stream the new users ids that does no more contains the to-delete users ids.
+        // We stream the new words that no longer contain the to-delete words.
         let mut new_words_fst_builder = fst::SetBuilder::memory();
         new_words_fst_builder.extend_stream(difference.into_stream())?;
 
diff --git a/src/update/index_documents/mod.rs b/src/update/index_documents/mod.rs
index 13b725e19..fe51c6b2b 100644
--- a/src/update/index_documents/mod.rs
+++ b/src/update/index_documents/mod.rs
@@ -287,7 +287,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
         let TransformOutput {
             primary_key,
             fields_ids_map,
-            users_ids_documents_ids,
+            external_documents_ids,
             new_documents_ids,
             replaced_documents_ids,
             documents_count,
@@ -472,8 +472,8 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
         // We write the primary key field id into the main database
         self.index.put_primary_key(self.wtxn, primary_key)?;
 
-        // We write the users_ids_documents_ids into the main database.
-        self.index.put_users_ids_documents_ids(self.wtxn, &users_ids_documents_ids)?;
+        // We write the external documents ids into the main database.
+        self.index.put_external_documents_ids(self.wtxn, &external_documents_ids)?;
 
         // We merge the new documents ids with the existing ones.
         documents_ids.union_with(&new_documents_ids);
diff --git a/src/update/index_documents/transform.rs b/src/update/index_documents/transform.rs
index 1fd4a4264..3c6acd1a9 100644
--- a/src/update/index_documents/transform.rs
+++ b/src/update/index_documents/transform.rs
@@ -20,14 +20,14 @@ use super::{create_writer, create_sorter, IndexDocumentsMethod};
 pub struct TransformOutput {
     pub primary_key: u8,
     pub fields_ids_map: FieldsIdsMap,
-    pub users_ids_documents_ids: fst::Map<Vec<u8>>,
+    pub external_documents_ids: fst::Map<Vec<u8>>,
     pub new_documents_ids: RoaringBitmap,
     pub replaced_documents_ids: RoaringBitmap,
     pub documents_count: usize,
     pub documents_file: File,
 }
 
-/// Extract the users ids, deduplicate and compute the new internal documents ids
+/// Extract the external ids, deduplicate and compute the new internal documents ids
 /// and fields ids, writing all the documents under their internal ids into a final file.
 ///
 /// Outputs the new `FieldsIdsMap`, the new `UsersIdsDocumentsIds` map, the new documents ids,
@@ -74,7 +74,7 @@ impl Transform<'_, '_> {
         F: Fn(UpdateIndexingStep) + Sync,
     {
         let mut fields_ids_map = self.index.fields_ids_map(self.rtxn)?;
-        let users_ids_documents_ids = self.index.users_ids_documents_ids(self.rtxn).unwrap();
+        let external_documents_ids = self.index.external_documents_ids(self.rtxn).unwrap();
         let primary_key = self.index.primary_key(self.rtxn)?;
 
         // Deserialize the whole batch of documents in memory.
@@ -116,7 +116,7 @@ impl Transform<'_, '_> {
             return Ok(TransformOutput {
                 primary_key,
                 fields_ids_map,
-                users_ids_documents_ids: fst::Map::default(),
+                external_documents_ids: fst::Map::default(),
                 new_documents_ids: RoaringBitmap::new(),
                 replaced_documents_ids: RoaringBitmap::new(),
                 documents_count: 0,
@@ -172,7 +172,7 @@ impl Transform<'_, '_> {
 
             // We retrieve the user id from the document based on the primary key name,
             // if the document id isn't present we generate a uuid.
-            let user_id = match document.get(&primary_key_name) {
+            let external_id = match document.get(&primary_key_name) {
                 Some(value) => match value {
                     Value::String(string) => Cow::Borrowed(string.as_str()),
                     Value::Number(number) => Cow::Owned(number.to_string()),
@@ -200,19 +200,19 @@ impl Transform<'_, '_> {
                 }
                 else if field_id == primary_key {
                     // We validate the document id [a-zA-Z0-9\-_].
-                    let user_id = match validate_document_id(&user_id) {
+                    let external_id = match validate_document_id(&external_id) {
                         Some(valid) => valid,
-                        None => return Err(anyhow!("invalid document id: {:?}", user_id)),
+                        None => return Err(anyhow!("invalid document id: {:?}", external_id)),
                     };
 
                     // We serialize the document id.
-                    serde_json::to_writer(&mut json_buffer, &user_id)?;
+                    serde_json::to_writer(&mut json_buffer, &external_id)?;
                     writer.insert(field_id, &json_buffer)?;
                 }
             }
 
             // We use the extracted/generated user id as the key for this document.
-            sorter.insert(user_id.as_bytes(), &obkv_buffer)?;
+            sorter.insert(external_id.as_bytes(), &obkv_buffer)?;
             documents_count += 1;
         }
 
@@ -227,7 +227,7 @@ impl Transform<'_, '_> {
             primary_key,
             fields_ids_map,
             documents_count,
-            users_ids_documents_ids,
+            external_documents_ids,
             progress_callback,
         )
     }
@@ -238,7 +238,7 @@ impl Transform<'_, '_> {
         F: Fn(UpdateIndexingStep) + Sync,
     {
         let mut fields_ids_map = self.index.fields_ids_map(self.rtxn)?;
-        let users_ids_documents_ids = self.index.users_ids_documents_ids(self.rtxn).unwrap();
+        let external_documents_ids = self.index.external_documents_ids(self.rtxn).unwrap();
 
         let mut csv = csv::Reader::from_reader(reader);
         let headers = csv.headers()?;
@@ -252,7 +252,7 @@ impl Transform<'_, '_> {
         }
 
         // Extract the position of the primary key in the current headers, None if not found.
-        let user_id_pos = match primary_key {
+        let external_id_pos = match primary_key {
             Some(primary_key) => {
                 // Te primary key have is known so we must find the position in the CSV headers.
                 let name = fields_ids_map.name(primary_key).expect("found the primary key name");
@@ -263,7 +263,7 @@ impl Transform<'_, '_> {
 
         // Returns the field id in the fileds ids map, create an "id" field
        // in case it is not in the current headers.
-        let primary_key_field_id = match user_id_pos {
+        let primary_key_field_id = match external_id_pos {
             Some(pos) => fields_ids_map.id(&headers[pos]).expect("found the primary key"),
             None => {
                 if !self.autogenerate_docids {
@@ -294,7 +294,7 @@ impl Transform<'_, '_> {
         );
 
         // We write into the sorter to merge and deduplicate the documents
-        // based on the users ids.
+        // based on the external ids.
         let mut json_buffer = Vec::new();
         let mut obkv_buffer = Vec::new();
         let mut uuid_buffer = [0; uuid::adapter::Hyphenated::LENGTH];
@@ -312,13 +312,13 @@ impl Transform<'_, '_> {
             }
 
             // We extract the user id if we know where it is or generate an UUID V4 otherwise.
-            let user_id = match user_id_pos {
+            let external_id = match external_id_pos {
                 Some(pos) => {
-                    let user_id = &record[pos];
+                    let external_id = &record[pos];
                     // We validate the document id [a-zA-Z0-9\-_].
-                    match validate_document_id(&user_id) {
+                    match validate_document_id(&external_id) {
                         Some(valid) => valid,
-                        None => return Err(anyhow!("invalid document id: {:?}", user_id)),
+                        None => return Err(anyhow!("invalid document id: {:?}", external_id)),
                     }
                 },
                 None => uuid::Uuid::new_v4().to_hyphenated().encode_lower(&mut uuid_buffer),
@@ -328,7 +328,7 @@ impl Transform<'_, '_> {
             // we return the generated document id instead of the record field.
             let iter = fields_ids.iter()
                 .map(|(fi, i)| {
-                    let field = if *fi == primary_key_field_id { user_id } else { &record[*i] };
+                    let field = if *fi == primary_key_field_id { external_id } else { &record[*i] };
                     (fi, field)
                 });
 
@@ -341,7 +341,7 @@ impl Transform<'_, '_> {
             }
 
             // We use the extracted/generated user id as the key for this document.
-            sorter.insert(user_id, &obkv_buffer)?;
+            sorter.insert(external_id, &obkv_buffer)?;
             documents_count += 1;
         }
 
@@ -356,7 +356,7 @@ impl Transform<'_, '_> {
             primary_key_field_id,
             fields_ids_map,
             documents_count,
-            users_ids_documents_ids,
+            external_documents_ids,
             progress_callback,
         )
     }
@@ -370,7 +370,7 @@ impl Transform<'_, '_> {
         primary_key: u8,
         fields_ids_map: FieldsIdsMap,
         approximate_number_of_documents: usize,
-        users_ids_documents_ids: fst::Map<Cow<'_, [u8]>>,
+        external_documents_ids: fst::Map<Cow<'_, [u8]>>,
         progress_callback: F,
     ) -> anyhow::Result<TransformOutput>
     where
@@ -388,7 +388,7 @@ impl Transform<'_, '_> {
             self.max_nb_chunks,
             self.max_memory,
         );
-        let mut new_users_ids_documents_ids_builder = fst::MapBuilder::memory();
+        let mut new_external_documents_ids_builder = fst::MapBuilder::memory();
         let mut replaced_documents_ids = RoaringBitmap::new();
         let mut new_documents_ids = RoaringBitmap::new();
         let mut obkv_buffer = Vec::new();
@@ -396,7 +396,7 @@ impl Transform<'_, '_> {
         // While we write into final file we get or generate the internal documents ids.
         let mut documents_count = 0;
         let mut iter = sorter.into_iter()?;
-        while let Some((user_id, update_obkv)) = iter.next()? {
+        while let Some((external_id, update_obkv)) = iter.next()? {
 
             if self.log_every_n.map_or(false, |len| documents_count % len == 0) {
                 progress_callback(UpdateIndexingStep::ComputeIdsAndMergeDocuments {
@@ -405,9 +405,9 @@ impl Transform<'_, '_> {
                 });
             }
 
-            let (docid, obkv) = match users_ids_documents_ids.get(user_id) {
+            let (docid, obkv) = match external_documents_ids.get(external_id) {
                 Some(docid) => {
-                    // If we find the user id in the current users ids documents ids map
+                    // If we find the external id in the current external documents ids map
                     // we use it and insert it in the list of replaced documents.
                     let docid = u32::try_from(docid).expect("valid document id");
                     replaced_documents_ids.insert(docid);
@@ -427,11 +427,11 @@ impl Transform<'_, '_> {
                     }
                 },
                 None => {
-                    // If this user id is new we add it to the users ids documents ids map
+                    // If this external id is new we add it to the external documents ids map
                     // for new ids and into the list of new documents.
                     let new_docid = available_documents_ids.next()
                         .context("no more available documents ids")?;
-                    new_users_ids_documents_ids_builder.insert(user_id, new_docid as u64)?;
+                    new_external_documents_ids_builder.insert(external_id, new_docid as u64)?;
                     new_documents_ids.insert(new_docid);
                     (new_docid, update_obkv)
                 },
@@ -457,28 +457,28 @@ impl Transform<'_, '_> {
         let mut documents_file = writer.into_inner()?;
         documents_file.seek(SeekFrom::Start(0))?;
 
-        // We create the union between the existing users ids documents ids with the new ones.
-        let new_users_ids_documents_ids = new_users_ids_documents_ids_builder.into_map();
+        // We create the union between the existing external documents ids and the new ones.
+        let new_external_documents_ids = new_external_documents_ids_builder.into_map();
         let union_op = fst::map::OpBuilder::new()
-            .add(&users_ids_documents_ids)
-            .add(&new_users_ids_documents_ids)
+            .add(&external_documents_ids)
+            .add(&new_external_documents_ids)
             .r#union();
 
-        // We stream and merge the new users ids documents ids map with the existing one.
+        // We stream and merge the new external documents ids map with the existing one.
         let before_docids_merging = Instant::now();
-        let mut users_ids_documents_ids_builder = fst::MapBuilder::memory();
+        let mut external_documents_ids_builder = fst::MapBuilder::memory();
         let mut iter = union_op.into_stream();
-        while let Some((user_id, vals)) = iter.next() {
+        while let Some((external_id, vals)) = iter.next() {
             assert_eq!(vals.len(), 1, "there must be exactly one document id");
-            users_ids_documents_ids_builder.insert(user_id, vals[0].value)?;
+            external_documents_ids_builder.insert(external_id, vals[0].value)?;
         }
 
-        info!("Documents users ids merging took {:.02?}", before_docids_merging.elapsed());
+        info!("External documents ids merging took {:.02?}", before_docids_merging.elapsed());
 
         Ok(TransformOutput {
             primary_key,
             fields_ids_map,
-            users_ids_documents_ids: users_ids_documents_ids_builder.into_map(),
+            external_documents_ids: external_documents_ids_builder.into_map(),
             new_documents_ids,
             replaced_documents_ids,
             documents_count,
@@ -496,7 +496,7 @@ impl Transform<'_, '_> {
     ) -> anyhow::Result<TransformOutput>
     {
         let current_fields_ids_map = self.index.fields_ids_map(self.rtxn)?;
-        let users_ids_documents_ids = self.index.users_ids_documents_ids(self.rtxn)?;
+        let external_documents_ids = self.index.external_documents_ids(self.rtxn)?;
         let documents_ids = self.index.documents_ids(self.rtxn)?;
         let documents_count = documents_ids.len() as usize;
 
@@ -531,7 +531,7 @@ impl Transform<'_, '_> {
         Ok(TransformOutput {
             primary_key,
             fields_ids_map,
-            users_ids_documents_ids: users_ids_documents_ids.map_data(Cow::into_owned)?,
+            external_documents_ids: external_documents_ids.map_data(Cow::into_owned)?,
             new_documents_ids: documents_ids,
             replaced_documents_ids: RoaringBitmap::default(),
             documents_count,
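A standalone sketch, separate from the patch itself, of the two fst idioms the renamed code relies on: streaming the difference between the current external documents ids map and the ids to delete (delete_documents.rs), and streaming the union of the existing map with the newly built one (transform.rs). The data and variable names below are made up for illustration, and it assumes the fst 0.4 API that the patch appears to use; only the fst calls mirror the code above.

```rust
use std::error::Error;

use fst::{IntoStreamer, Streamer};

fn main() -> Result<(), Box<dyn Error>> {
    // External id -> internal docid map; keys must be given in lexicographic order.
    let existing = fst::Map::from_iter(vec![("doc-1", 0u64), ("doc-2", 1), ("doc-3", 2)])?;

    // Difference (as in delete_documents.rs): rebuild the map without the to-delete ids.
    let to_delete = fst::Set::from_iter(vec!["doc-2"])?;
    let to_delete = fst::Map::from(to_delete.into_fst());
    let mut builder = fst::MapBuilder::memory();
    let mut stream = existing.op().add(&to_delete).difference().into_stream();
    while let Some((external_id, values)) = stream.next() {
        builder.insert(external_id, values[0].value)?;
    }
    let after_deletion = builder.into_map();
    assert_eq!(after_deletion.get("doc-2"), None);

    // Union (as in transform.rs): merge the remaining map with newly assigned internal
    // ids, keeping exactly one value per external id.
    let new_ids = fst::Map::from_iter(vec![("doc-4", 3u64)])?;
    let mut builder = fst::MapBuilder::memory();
    let mut stream = after_deletion.op().add(&new_ids).union().into_stream();
    while let Some((external_id, values)) = stream.next() {
        builder.insert(external_id, values[0].value)?;
    }
    let merged = builder.into_map();
    assert_eq!(merged.get("doc-4"), Some(3));

    Ok(())
}
```

Both operations stream keys in lexicographic order, which is why their output can be fed straight into a fresh fst::MapBuilder without any extra sorting.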