Rename the users ids documents ids into external documents ids

This commit is contained in:
Clément Renault 2020-11-22 11:54:04 +01:00
parent f06355b0bb
commit eded5558b2
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
6 changed files with 81 additions and 81 deletions

View File

@ -605,14 +605,14 @@ async fn main() -> anyhow::Result<()> {
let index = index_cloned.clone(); let index = index_cloned.clone();
let rtxn = index.read_txn().unwrap(); let rtxn = index.read_txn().unwrap();
let users_ids_documents_ids = index.users_ids_documents_ids(&rtxn).unwrap(); let external_documents_ids = index.external_documents_ids(&rtxn).unwrap();
let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
let displayed_fields = match index.displayed_fields(&rtxn).unwrap() { let displayed_fields = match index.displayed_fields(&rtxn).unwrap() {
Some(fields) => Cow::Borrowed(fields), Some(fields) => Cow::Borrowed(fields),
None => Cow::Owned(fields_ids_map.iter().map(|(id, _)| id).collect()), None => Cow::Owned(fields_ids_map.iter().map(|(id, _)| id).collect()),
}; };
match users_ids_documents_ids.get(&id) { match external_documents_ids.get(&id) {
Some(document_id) => { Some(document_id) => {
let document_id = document_id as u32; let document_id = document_id as u32;
let (_, obkv) = index.documents(&rtxn, Some(document_id)).unwrap().pop().unwrap(); let (_, obkv) = index.documents(&rtxn, Some(document_id)).unwrap().pop().unwrap();

View File

@ -22,7 +22,7 @@ pub const FACETED_FIELDS_KEY: &str = "faceted-fields";
pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map"; pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map";
pub const PRIMARY_KEY_KEY: &str = "primary-key"; pub const PRIMARY_KEY_KEY: &str = "primary-key";
pub const SEARCHABLE_FIELDS_KEY: &str = "searchable-fields"; pub const SEARCHABLE_FIELDS_KEY: &str = "searchable-fields";
pub const USERS_IDS_DOCUMENTS_IDS_KEY: &str = "users-ids-documents-ids"; pub const EXTERNAL_DOCUMENTS_IDS_KEY: &str = "external-documents-ids";
pub const WORDS_FST_KEY: &str = "words-fst"; pub const WORDS_FST_KEY: &str = "words-fst";
#[derive(Clone)] #[derive(Clone)]
@ -119,18 +119,18 @@ impl Index {
self.main.get::<_, Str, OwnedType<u8>>(rtxn, PRIMARY_KEY_KEY) self.main.get::<_, Str, OwnedType<u8>>(rtxn, PRIMARY_KEY_KEY)
} }
/* users ids documents ids */ /* external documents ids */
/// Writes the users ids documents ids, a user id is a byte slice (i.e. `[u8]`) /// Writes the external documents ids, an external id is a byte slice (i.e. `[u8]`)
/// and refers to an internal id (i.e. `u32`). /// and refers to an internal id (i.e. `u32`).
pub fn put_users_ids_documents_ids<A: AsRef<[u8]>>(&self, wtxn: &mut RwTxn, fst: &fst::Map<A>) -> heed::Result<()> { pub fn put_external_documents_ids<A: AsRef<[u8]>>(&self, wtxn: &mut RwTxn, fst: &fst::Map<A>) -> heed::Result<()> {
self.main.put::<_, Str, ByteSlice>(wtxn, USERS_IDS_DOCUMENTS_IDS_KEY, fst.as_fst().as_bytes()) self.main.put::<_, Str, ByteSlice>(wtxn, EXTERNAL_DOCUMENTS_IDS_KEY, fst.as_fst().as_bytes())
} }
/// Returns the user ids documents ids map which associate the user ids (i.e. `[u8]`) /// Returns the external documents ids map which associates the external ids (i.e. `[u8]`)
/// with the internal ids (i.e. `u32`). /// with the internal ids (i.e. `u32`).
pub fn users_ids_documents_ids<'t>(&self, rtxn: &'t RoTxn) -> anyhow::Result<fst::Map<Cow<'t, [u8]>>> { pub fn external_documents_ids<'t>(&self, rtxn: &'t RoTxn) -> anyhow::Result<fst::Map<Cow<'t, [u8]>>> {
match self.main.get::<_, Str, ByteSlice>(rtxn, USERS_IDS_DOCUMENTS_IDS_KEY)? { match self.main.get::<_, Str, ByteSlice>(rtxn, EXTERNAL_DOCUMENTS_IDS_KEY)? {
Some(bytes) => Ok(fst::Map::new(bytes)?.map_data(Cow::Borrowed)?), Some(bytes) => Ok(fst::Map::new(bytes)?.map_data(Cow::Borrowed)?),
None => Ok(fst::Map::default().map_data(Cow::Owned)?), None => Ok(fst::Map::default().map_data(Cow::Owned)?),
} }

View File

@ -27,7 +27,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
// We clean some of the main engine datastructures. // We clean some of the main engine datastructures.
self.index.put_words_fst(self.wtxn, &fst::Set::default())?; self.index.put_words_fst(self.wtxn, &fst::Set::default())?;
self.index.put_users_ids_documents_ids(self.wtxn, &fst::Map::default())?; self.index.put_external_documents_ids(self.wtxn, &fst::Map::default())?;
self.index.put_documents_ids(self.wtxn, &RoaringBitmap::default())?; self.index.put_documents_ids(self.wtxn, &RoaringBitmap::default())?;
// Clear the other databases. // Clear the other databases.

View File

@ -10,7 +10,7 @@ use super::ClearDocuments;
pub struct DeleteDocuments<'t, 'u, 'i> { pub struct DeleteDocuments<'t, 'u, 'i> {
wtxn: &'t mut heed::RwTxn<'i, 'u>, wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index, index: &'i Index,
users_ids_documents_ids: fst::Map<Vec<u8>>, external_documents_ids: fst::Map<Vec<u8>>,
documents_ids: RoaringBitmap, documents_ids: RoaringBitmap,
} }
@ -20,14 +20,14 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
index: &'i Index, index: &'i Index,
) -> anyhow::Result<DeleteDocuments<'t, 'u, 'i>> ) -> anyhow::Result<DeleteDocuments<'t, 'u, 'i>>
{ {
let users_ids_documents_ids = index let external_documents_ids = index
.users_ids_documents_ids(wtxn)? .external_documents_ids(wtxn)?
.map_data(Cow::into_owned)?; .map_data(Cow::into_owned)?;
Ok(DeleteDocuments { Ok(DeleteDocuments {
wtxn, wtxn,
index, index,
users_ids_documents_ids, external_documents_ids,
documents_ids: RoaringBitmap::new(), documents_ids: RoaringBitmap::new(),
}) })
} }
@ -40,8 +40,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
self.documents_ids.union_with(docids); self.documents_ids.union_with(docids);
} }
pub fn delete_user_id(&mut self, user_id: &str) -> Option<u32> { pub fn delete_external_id(&mut self, external_id: &str) -> Option<u32> {
let docid = self.users_ids_documents_ids.get(user_id).map(|id| u32::try_from(id).unwrap())?; let docid = self.external_documents_ids.get(external_id).map(|id| u32::try_from(id).unwrap())?;
self.delete_document(docid); self.delete_document(docid);
Some(docid) Some(docid)
} }
@ -80,9 +80,9 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
documents, documents,
} = self.index; } = self.index;
// Retrieve the words and the users ids contained in the documents. // Retrieve the words and the external documents ids contained in the documents.
let mut words = Vec::new(); let mut words = Vec::new();
let mut users_ids = Vec::new(); let mut external_ids = Vec::new();
for docid in &self.documents_ids { for docid in &self.documents_ids {
// We create an iterator to be able to get the content and delete the document // We create an iterator to be able to get the content and delete the document
// content itself. It's faster to acquire a cursor to get and delete, // content itself. It's faster to acquire a cursor to get and delete,
@ -91,8 +91,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
let mut iter = documents.range_mut(self.wtxn, &(key..=key))?; let mut iter = documents.range_mut(self.wtxn, &(key..=key))?;
if let Some((_key, obkv)) = iter.next().transpose()? { if let Some((_key, obkv)) = iter.next().transpose()? {
if let Some(content) = obkv.get(id_field) { if let Some(content) = obkv.get(id_field) {
let user_id: SmallString32 = serde_json::from_slice(content).unwrap(); let external_id: SmallString32 = serde_json::from_slice(content).unwrap();
users_ids.push(user_id); external_ids.push(external_id);
} }
iter.del_current()?; iter.del_current()?;
} }
@ -109,30 +109,30 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
} }
} }
// We create the FST map of the users ids that we must delete. // We create the FST map of the external ids that we must delete.
users_ids.sort_unstable(); external_ids.sort_unstable();
let users_ids_to_delete = fst::Set::from_iter(users_ids.iter().map(AsRef::as_ref))?; let external_ids_to_delete = fst::Set::from_iter(external_ids.iter().map(AsRef::as_ref))?;
let users_ids_to_delete = fst::Map::from(users_ids_to_delete.into_fst()); let external_ids_to_delete = fst::Map::from(external_ids_to_delete.into_fst());
let new_users_ids_documents_ids = { let new_external_documents_ids = {
// We acquire the current users ids documents ids map and create // We acquire the current external documents ids map and create
// a difference operation between the current and to-delete users ids. // a difference operation between the current and to-delete external ids.
let users_ids_documents_ids = self.index.users_ids_documents_ids(self.wtxn)?; let external_documents_ids = self.index.external_documents_ids(self.wtxn)?;
let difference = users_ids_documents_ids.op().add(&users_ids_to_delete).difference(); let difference = external_documents_ids.op().add(&external_ids_to_delete).difference();
// We stream the new users ids that does no more contains the to-delete users ids. // We stream the new external ids that no longer contain the to-delete external ids.
let mut iter = difference.into_stream(); let mut iter = difference.into_stream();
let mut new_users_ids_documents_ids_builder = fst::MapBuilder::memory(); let mut new_external_documents_ids_builder = fst::MapBuilder::memory();
while let Some((userid, docids)) = iter.next() { while let Some((external_id, docids)) = iter.next() {
new_users_ids_documents_ids_builder.insert(userid, docids[0].value)?; new_external_documents_ids_builder.insert(external_id, docids[0].value)?;
} }
// We create an FST map from the above builder. // We create an FST map from the above builder.
new_users_ids_documents_ids_builder.into_map() new_external_documents_ids_builder.into_map()
}; };
// We write the new users ids into the main database. // We write the new external ids into the main database.
self.index.put_users_ids_documents_ids(self.wtxn, &new_users_ids_documents_ids)?; self.index.put_external_documents_ids(self.wtxn, &new_external_documents_ids)?;
// Maybe we can improve the get performance of the words // Maybe we can improve the get performance of the words
// if we sort the words first, keeping the LMDB pages in cache. // if we sort the words first, keeping the LMDB pages in cache.
@ -169,7 +169,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
let words_fst = self.index.words_fst(self.wtxn)?; let words_fst = self.index.words_fst(self.wtxn)?;
let difference = words_fst.op().add(&words_to_delete).difference(); let difference = words_fst.op().add(&words_to_delete).difference();
// We stream the new users ids that does no more contains the to-delete users ids. // We stream the new words that no longer contain the to-delete words.
let mut new_words_fst_builder = fst::SetBuilder::memory(); let mut new_words_fst_builder = fst::SetBuilder::memory();
new_words_fst_builder.extend_stream(difference.into_stream())?; new_words_fst_builder.extend_stream(difference.into_stream())?;

View File

@ -287,7 +287,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
let TransformOutput { let TransformOutput {
primary_key, primary_key,
fields_ids_map, fields_ids_map,
users_ids_documents_ids, external_documents_ids,
new_documents_ids, new_documents_ids,
replaced_documents_ids, replaced_documents_ids,
documents_count, documents_count,
@ -472,8 +472,8 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
// We write the primary key field id into the main database // We write the primary key field id into the main database
self.index.put_primary_key(self.wtxn, primary_key)?; self.index.put_primary_key(self.wtxn, primary_key)?;
// We write the users_ids_documents_ids into the main database. // We write the external documents ids into the main database.
self.index.put_users_ids_documents_ids(self.wtxn, &users_ids_documents_ids)?; self.index.put_external_documents_ids(self.wtxn, &external_documents_ids)?;
// We merge the new documents ids with the existing ones. // We merge the new documents ids with the existing ones.
documents_ids.union_with(&new_documents_ids); documents_ids.union_with(&new_documents_ids);

View File

@ -20,14 +20,14 @@ use super::{create_writer, create_sorter, IndexDocumentsMethod};
pub struct TransformOutput { pub struct TransformOutput {
pub primary_key: u8, pub primary_key: u8,
pub fields_ids_map: FieldsIdsMap, pub fields_ids_map: FieldsIdsMap,
pub users_ids_documents_ids: fst::Map<Vec<u8>>, pub external_documents_ids: fst::Map<Vec<u8>>,
pub new_documents_ids: RoaringBitmap, pub new_documents_ids: RoaringBitmap,
pub replaced_documents_ids: RoaringBitmap, pub replaced_documents_ids: RoaringBitmap,
pub documents_count: usize, pub documents_count: usize,
pub documents_file: File, pub documents_file: File,
} }
/// Extract the users ids, deduplicate and compute the new internal documents ids /// Extract the external ids, deduplicate and compute the new internal documents ids
/// and fields ids, writing all the documents under their internal ids into a final file. /// and fields ids, writing all the documents under their internal ids into a final file.
/// ///
/// Outputs the new `FieldsIdsMap`, the new external documents ids map, the new documents ids, /// Outputs the new `FieldsIdsMap`, the new external documents ids map, the new documents ids,
@ -74,7 +74,7 @@ impl Transform<'_, '_> {
F: Fn(UpdateIndexingStep) + Sync, F: Fn(UpdateIndexingStep) + Sync,
{ {
let mut fields_ids_map = self.index.fields_ids_map(self.rtxn)?; let mut fields_ids_map = self.index.fields_ids_map(self.rtxn)?;
let users_ids_documents_ids = self.index.users_ids_documents_ids(self.rtxn).unwrap(); let external_documents_ids = self.index.external_documents_ids(self.rtxn).unwrap();
let primary_key = self.index.primary_key(self.rtxn)?; let primary_key = self.index.primary_key(self.rtxn)?;
// Deserialize the whole batch of documents in memory. // Deserialize the whole batch of documents in memory.
@ -116,7 +116,7 @@ impl Transform<'_, '_> {
return Ok(TransformOutput { return Ok(TransformOutput {
primary_key, primary_key,
fields_ids_map, fields_ids_map,
users_ids_documents_ids: fst::Map::default(), external_documents_ids: fst::Map::default(),
new_documents_ids: RoaringBitmap::new(), new_documents_ids: RoaringBitmap::new(),
replaced_documents_ids: RoaringBitmap::new(), replaced_documents_ids: RoaringBitmap::new(),
documents_count: 0, documents_count: 0,
@ -172,7 +172,7 @@ impl Transform<'_, '_> {
// We retrieve the external id from the document based on the primary key name, // We retrieve the external id from the document based on the primary key name,
// if the document id isn't present we generate a uuid. // if the document id isn't present we generate a uuid.
let user_id = match document.get(&primary_key_name) { let external_id = match document.get(&primary_key_name) {
Some(value) => match value { Some(value) => match value {
Value::String(string) => Cow::Borrowed(string.as_str()), Value::String(string) => Cow::Borrowed(string.as_str()),
Value::Number(number) => Cow::Owned(number.to_string()), Value::Number(number) => Cow::Owned(number.to_string()),
@ -200,19 +200,19 @@ impl Transform<'_, '_> {
} }
else if field_id == primary_key { else if field_id == primary_key {
// We validate the document id [a-zA-Z0-9\-_]. // We validate the document id [a-zA-Z0-9\-_].
let user_id = match validate_document_id(&user_id) { let external_id = match validate_document_id(&external_id) {
Some(valid) => valid, Some(valid) => valid,
None => return Err(anyhow!("invalid document id: {:?}", user_id)), None => return Err(anyhow!("invalid document id: {:?}", external_id)),
}; };
// We serialize the document id. // We serialize the document id.
serde_json::to_writer(&mut json_buffer, &user_id)?; serde_json::to_writer(&mut json_buffer, &external_id)?;
writer.insert(field_id, &json_buffer)?; writer.insert(field_id, &json_buffer)?;
} }
} }
// We use the extracted/generated external id as the key for this document. // We use the extracted/generated external id as the key for this document.
sorter.insert(user_id.as_bytes(), &obkv_buffer)?; sorter.insert(external_id.as_bytes(), &obkv_buffer)?;
documents_count += 1; documents_count += 1;
} }
@ -227,7 +227,7 @@ impl Transform<'_, '_> {
primary_key, primary_key,
fields_ids_map, fields_ids_map,
documents_count, documents_count,
users_ids_documents_ids, external_documents_ids,
progress_callback, progress_callback,
) )
} }
@ -238,7 +238,7 @@ impl Transform<'_, '_> {
F: Fn(UpdateIndexingStep) + Sync, F: Fn(UpdateIndexingStep) + Sync,
{ {
let mut fields_ids_map = self.index.fields_ids_map(self.rtxn)?; let mut fields_ids_map = self.index.fields_ids_map(self.rtxn)?;
let users_ids_documents_ids = self.index.users_ids_documents_ids(self.rtxn).unwrap(); let external_documents_ids = self.index.external_documents_ids(self.rtxn).unwrap();
let mut csv = csv::Reader::from_reader(reader); let mut csv = csv::Reader::from_reader(reader);
let headers = csv.headers()?; let headers = csv.headers()?;
@ -252,7 +252,7 @@ impl Transform<'_, '_> {
} }
// Extract the position of the primary key in the current headers, None if not found. // Extract the position of the primary key in the current headers, None if not found.
let user_id_pos = match primary_key { let external_id_pos = match primary_key {
Some(primary_key) => { Some(primary_key) => {
// The primary key is known so we must find its position in the CSV headers. // The primary key is known so we must find its position in the CSV headers.
let name = fields_ids_map.name(primary_key).expect("found the primary key name"); let name = fields_ids_map.name(primary_key).expect("found the primary key name");
@ -263,7 +263,7 @@ impl Transform<'_, '_> {
// Returns the field id in the fields ids map, create an "id" field // Returns the field id in the fields ids map, create an "id" field
// in case it is not in the current headers. // in case it is not in the current headers.
let primary_key_field_id = match user_id_pos { let primary_key_field_id = match external_id_pos {
Some(pos) => fields_ids_map.id(&headers[pos]).expect("found the primary key"), Some(pos) => fields_ids_map.id(&headers[pos]).expect("found the primary key"),
None => { None => {
if !self.autogenerate_docids { if !self.autogenerate_docids {
@ -294,7 +294,7 @@ impl Transform<'_, '_> {
); );
// We write into the sorter to merge and deduplicate the documents // We write into the sorter to merge and deduplicate the documents
// based on the users ids. // based on the external ids.
let mut json_buffer = Vec::new(); let mut json_buffer = Vec::new();
let mut obkv_buffer = Vec::new(); let mut obkv_buffer = Vec::new();
let mut uuid_buffer = [0; uuid::adapter::Hyphenated::LENGTH]; let mut uuid_buffer = [0; uuid::adapter::Hyphenated::LENGTH];
@ -312,13 +312,13 @@ impl Transform<'_, '_> {
} }
// We extract the external id if we know where it is or generate an UUID V4 otherwise. // We extract the external id if we know where it is or generate an UUID V4 otherwise.
let user_id = match user_id_pos { let external_id = match external_id_pos {
Some(pos) => { Some(pos) => {
let user_id = &record[pos]; let external_id = &record[pos];
// We validate the document id [a-zA-Z0-9\-_]. // We validate the document id [a-zA-Z0-9\-_].
match validate_document_id(&user_id) { match validate_document_id(&external_id) {
Some(valid) => valid, Some(valid) => valid,
None => return Err(anyhow!("invalid document id: {:?}", user_id)), None => return Err(anyhow!("invalid document id: {:?}", external_id)),
} }
}, },
None => uuid::Uuid::new_v4().to_hyphenated().encode_lower(&mut uuid_buffer), None => uuid::Uuid::new_v4().to_hyphenated().encode_lower(&mut uuid_buffer),
@ -328,7 +328,7 @@ impl Transform<'_, '_> {
// we return the generated document id instead of the record field. // we return the generated document id instead of the record field.
let iter = fields_ids.iter() let iter = fields_ids.iter()
.map(|(fi, i)| { .map(|(fi, i)| {
let field = if *fi == primary_key_field_id { user_id } else { &record[*i] }; let field = if *fi == primary_key_field_id { external_id } else { &record[*i] };
(fi, field) (fi, field)
}); });
@ -341,7 +341,7 @@ impl Transform<'_, '_> {
} }
// We use the extracted/generated external id as the key for this document. // We use the extracted/generated external id as the key for this document.
sorter.insert(user_id, &obkv_buffer)?; sorter.insert(external_id, &obkv_buffer)?;
documents_count += 1; documents_count += 1;
} }
@ -356,7 +356,7 @@ impl Transform<'_, '_> {
primary_key_field_id, primary_key_field_id,
fields_ids_map, fields_ids_map,
documents_count, documents_count,
users_ids_documents_ids, external_documents_ids,
progress_callback, progress_callback,
) )
} }
@ -370,7 +370,7 @@ impl Transform<'_, '_> {
primary_key: u8, primary_key: u8,
fields_ids_map: FieldsIdsMap, fields_ids_map: FieldsIdsMap,
approximate_number_of_documents: usize, approximate_number_of_documents: usize,
users_ids_documents_ids: fst::Map<Cow<'_, [u8]>>, external_documents_ids: fst::Map<Cow<'_, [u8]>>,
progress_callback: F, progress_callback: F,
) -> anyhow::Result<TransformOutput> ) -> anyhow::Result<TransformOutput>
where where
@ -388,7 +388,7 @@ impl Transform<'_, '_> {
self.max_nb_chunks, self.max_nb_chunks,
self.max_memory, self.max_memory,
); );
let mut new_users_ids_documents_ids_builder = fst::MapBuilder::memory(); let mut new_external_documents_ids_builder = fst::MapBuilder::memory();
let mut replaced_documents_ids = RoaringBitmap::new(); let mut replaced_documents_ids = RoaringBitmap::new();
let mut new_documents_ids = RoaringBitmap::new(); let mut new_documents_ids = RoaringBitmap::new();
let mut obkv_buffer = Vec::new(); let mut obkv_buffer = Vec::new();
@ -396,7 +396,7 @@ impl Transform<'_, '_> {
// While we write into final file we get or generate the internal documents ids. // While we write into final file we get or generate the internal documents ids.
let mut documents_count = 0; let mut documents_count = 0;
let mut iter = sorter.into_iter()?; let mut iter = sorter.into_iter()?;
while let Some((user_id, update_obkv)) = iter.next()? { while let Some((external_id, update_obkv)) = iter.next()? {
if self.log_every_n.map_or(false, |len| documents_count % len == 0) { if self.log_every_n.map_or(false, |len| documents_count % len == 0) {
progress_callback(UpdateIndexingStep::ComputeIdsAndMergeDocuments { progress_callback(UpdateIndexingStep::ComputeIdsAndMergeDocuments {
@ -405,9 +405,9 @@ impl Transform<'_, '_> {
}); });
} }
let (docid, obkv) = match users_ids_documents_ids.get(user_id) { let (docid, obkv) = match external_documents_ids.get(external_id) {
Some(docid) => { Some(docid) => {
// If we find the user id in the current users ids documents ids map // If we find the external id in the current external documents ids map
// we use it and insert it in the list of replaced documents. // we use it and insert it in the list of replaced documents.
let docid = u32::try_from(docid).expect("valid document id"); let docid = u32::try_from(docid).expect("valid document id");
replaced_documents_ids.insert(docid); replaced_documents_ids.insert(docid);
@ -427,11 +427,11 @@ impl Transform<'_, '_> {
} }
}, },
None => { None => {
// If this user id is new we add it to the users ids documents ids map // If this external id is new we add it to the external documents ids map
// for new ids and into the list of new documents. // for new ids and into the list of new documents.
let new_docid = available_documents_ids.next() let new_docid = available_documents_ids.next()
.context("no more available documents ids")?; .context("no more available documents ids")?;
new_users_ids_documents_ids_builder.insert(user_id, new_docid as u64)?; new_external_documents_ids_builder.insert(external_id, new_docid as u64)?;
new_documents_ids.insert(new_docid); new_documents_ids.insert(new_docid);
(new_docid, update_obkv) (new_docid, update_obkv)
}, },
@ -457,28 +457,28 @@ impl Transform<'_, '_> {
let mut documents_file = writer.into_inner()?; let mut documents_file = writer.into_inner()?;
documents_file.seek(SeekFrom::Start(0))?; documents_file.seek(SeekFrom::Start(0))?;
// We create the union between the existing users ids documents ids with the new ones. // We create the union between the existing external documents ids with the new ones.
let new_users_ids_documents_ids = new_users_ids_documents_ids_builder.into_map(); let new_external_documents_ids = new_external_documents_ids_builder.into_map();
let union_op = fst::map::OpBuilder::new() let union_op = fst::map::OpBuilder::new()
.add(&users_ids_documents_ids) .add(&external_documents_ids)
.add(&new_users_ids_documents_ids) .add(&new_external_documents_ids)
.r#union(); .r#union();
// We stream and merge the new users ids documents ids map with the existing one. // We stream and merge the new external documents ids map with the existing one.
let before_docids_merging = Instant::now(); let before_docids_merging = Instant::now();
let mut users_ids_documents_ids_builder = fst::MapBuilder::memory(); let mut external_documents_ids_builder = fst::MapBuilder::memory();
let mut iter = union_op.into_stream(); let mut iter = union_op.into_stream();
while let Some((user_id, vals)) = iter.next() { while let Some((external_id, vals)) = iter.next() {
assert_eq!(vals.len(), 1, "there must be exactly one document id"); assert_eq!(vals.len(), 1, "there must be exactly one document id");
users_ids_documents_ids_builder.insert(user_id, vals[0].value)?; external_documents_ids_builder.insert(external_id, vals[0].value)?;
} }
info!("Documents users ids merging took {:.02?}", before_docids_merging.elapsed()); info!("Documents external merging took {:.02?}", before_docids_merging.elapsed());
Ok(TransformOutput { Ok(TransformOutput {
primary_key, primary_key,
fields_ids_map, fields_ids_map,
users_ids_documents_ids: users_ids_documents_ids_builder.into_map(), external_documents_ids: external_documents_ids_builder.into_map(),
new_documents_ids, new_documents_ids,
replaced_documents_ids, replaced_documents_ids,
documents_count, documents_count,
@ -496,7 +496,7 @@ impl Transform<'_, '_> {
) -> anyhow::Result<TransformOutput> ) -> anyhow::Result<TransformOutput>
{ {
let current_fields_ids_map = self.index.fields_ids_map(self.rtxn)?; let current_fields_ids_map = self.index.fields_ids_map(self.rtxn)?;
let users_ids_documents_ids = self.index.users_ids_documents_ids(self.rtxn)?; let external_documents_ids = self.index.external_documents_ids(self.rtxn)?;
let documents_ids = self.index.documents_ids(self.rtxn)?; let documents_ids = self.index.documents_ids(self.rtxn)?;
let documents_count = documents_ids.len() as usize; let documents_count = documents_ids.len() as usize;
@ -531,7 +531,7 @@ impl Transform<'_, '_> {
Ok(TransformOutput { Ok(TransformOutput {
primary_key, primary_key,
fields_ids_map, fields_ids_map,
users_ids_documents_ids: users_ids_documents_ids.map_data(Cow::into_owned)?, external_documents_ids: external_documents_ids.map_data(Cow::into_owned)?,
new_documents_ids: documents_ids, new_documents_ids: documents_ids,
replaced_documents_ids: RoaringBitmap::default(), replaced_documents_ids: RoaringBitmap::default(),
documents_count, documents_count,