From 27f3ef5f7a0017e2e40bd88bf4eed40857e91070 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sun, 22 Nov 2020 17:53:33 +0100 Subject: [PATCH] Use the new ExternalDocumentsIds struct in the engine --- src/index.rs | 20 +++++++------- src/lib.rs | 1 + src/update/clear_documents.rs | 4 +-- src/update/delete_documents.rs | 35 +++++++------------------ src/update/index_documents/transform.rs | 30 +++++++-------------- 5 files changed, 32 insertions(+), 58 deletions(-) diff --git a/src/index.rs b/src/index.rs index 76ddd4fef..ccaba4ca6 100644 --- a/src/index.rs +++ b/src/index.rs @@ -7,11 +7,10 @@ use heed::types::*; use heed::{PolyDatabase, Database, RwTxn, RoTxn}; use roaring::RoaringBitmap; -use crate::external_documents_ids::ExternalDocumentsIds; use crate::facet::FacetType; use crate::fields_ids_map::FieldsIdsMap; use crate::Search; -use crate::{BEU32, DocumentId}; +use crate::{BEU32, DocumentId, ExternalDocumentsIds}; use crate::{ RoaringBitmapCodec, BEU32StrCodec, StrStrU8Codec, ObkvCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, @@ -143,14 +142,15 @@ impl Index { pub fn external_documents_ids<'t>(&self, rtxn: &'t RoTxn) -> anyhow::Result> { let hard = self.main.get::<_, Str, ByteSlice>(rtxn, HARD_EXTERNAL_DOCUMENTS_IDS_KEY)?; let soft = self.main.get::<_, Str, ByteSlice>(rtxn, SOFT_EXTERNAL_DOCUMENTS_IDS_KEY)?; - match hard.zip(soft) { - Some((hard, soft)) => { - let hard = fst::Map::new(hard)?.map_data(Cow::Borrowed)?; - let soft = fst::Map::new(soft)?.map_data(Cow::Borrowed)?; - Ok(ExternalDocumentsIds::new(hard, soft)) - }, - None => Ok(ExternalDocumentsIds::default()), - } + let hard = match hard { + Some(hard) => fst::Map::new(hard)?.map_data(Cow::Borrowed)?, + None => fst::Map::default().map_data(Cow::Owned)?, + }; + let soft = match soft { + Some(soft) => fst::Map::new(soft)?.map_data(Cow::Borrowed)?, + None => fst::Map::default().map_data(Cow::Owned)?, + }; + Ok(ExternalDocumentsIds::new(hard, soft)) } /* fields ids map */ diff --git a/src/lib.rs b/src/lib.rs index fd0156bfa..12a24a59c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -21,6 +21,7 @@ use fxhash::{FxHasher32, FxHasher64}; use serde_json::{Map, Value}; pub use self::criterion::{Criterion, default_criteria}; +pub use self::external_documents_ids::ExternalDocumentsIds; pub use self::fields_ids_map::FieldsIdsMap; pub use self::index::Index; pub use self::search::{Search, SearchResult}; diff --git a/src/update/clear_documents.rs b/src/update/clear_documents.rs index 0e89d43b7..447dca8b4 100644 --- a/src/update/clear_documents.rs +++ b/src/update/clear_documents.rs @@ -1,5 +1,5 @@ use roaring::RoaringBitmap; -use crate::Index; +use crate::{ExternalDocumentsIds, Index}; pub struct ClearDocuments<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, @@ -27,7 +27,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { // We clean some of the main engine datastructures. self.index.put_words_fst(self.wtxn, &fst::Set::default())?; - self.index.put_external_documents_ids(self.wtxn, &fst::Map::default())?; + self.index.put_external_documents_ids(self.wtxn, &ExternalDocumentsIds::default())?; self.index.put_documents_ids(self.wtxn, &RoaringBitmap::default())?; // Clear the other databases. diff --git a/src/update/delete_documents.rs b/src/update/delete_documents.rs index 5ccce35f6..1913ac033 100644 --- a/src/update/delete_documents.rs +++ b/src/update/delete_documents.rs @@ -1,16 +1,13 @@ -use std::borrow::Cow; -use std::convert::TryFrom; - -use fst::{IntoStreamer, Streamer}; +use fst::IntoStreamer; use roaring::RoaringBitmap; -use crate::{Index, BEU32, SmallString32}; +use crate::{Index, BEU32, SmallString32, ExternalDocumentsIds}; use super::ClearDocuments; pub struct DeleteDocuments<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, - external_documents_ids: fst::Map>, + external_documents_ids: ExternalDocumentsIds<'static>, documents_ids: RoaringBitmap, } @@ -22,7 +19,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { { let external_documents_ids = index .external_documents_ids(wtxn)? - .map_data(Cow::into_owned)?; + .into_static(); Ok(DeleteDocuments { wtxn, @@ -41,7 +38,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { } pub fn delete_external_id(&mut self, external_id: &str) -> Option { - let docid = self.external_documents_ids.get(external_id).map(|id| u32::try_from(id).unwrap())?; + let docid = self.external_documents_ids.get(external_id)?; self.delete_document(docid); Some(docid) } @@ -112,26 +109,14 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { // We create the FST map of the external ids that we must delete. external_ids.sort_unstable(); let external_ids_to_delete = fst::Set::from_iter(external_ids.iter().map(AsRef::as_ref))?; - let external_ids_to_delete = fst::Map::from(external_ids_to_delete.into_fst()); - let new_external_documents_ids = { - // We acquire the current external documents ids map and create - // a difference operation between the current and to-delete external ids. - let external_documents_ids = self.index.external_documents_ids(self.wtxn)?; - let difference = external_documents_ids.op().add(&external_ids_to_delete).difference(); - - // We stream the new external ids that does no more contains the to-delete external ids. - let mut iter = difference.into_stream(); - let mut new_external_documents_ids_builder = fst::MapBuilder::memory(); - while let Some((external_id, docids)) = iter.next() { - new_external_documents_ids_builder.insert(external_id, docids[0].value)?; - } - - // We create an FST map from the above builder. - new_external_documents_ids_builder.into_map() - }; + // We acquire the current external documents ids map... + let mut new_external_documents_ids = self.index.external_documents_ids(self.wtxn)?; + // ...and remove the to-delete external ids. + new_external_documents_ids.delete_ids(external_ids_to_delete)?; // We write the new external ids into the main database. + let new_external_documents_ids = new_external_documents_ids.into_static(); self.index.put_external_documents_ids(self.wtxn, &new_external_documents_ids)?; // Maybe we can improve the get performance of the words diff --git a/src/update/index_documents/transform.rs b/src/update/index_documents/transform.rs index 3c6acd1a9..54619ed4f 100644 --- a/src/update/index_documents/transform.rs +++ b/src/update/index_documents/transform.rs @@ -6,13 +6,12 @@ use std::iter::Peekable; use std::time::Instant; use anyhow::{anyhow, Context}; -use fst::{IntoStreamer, Streamer}; use grenad::CompressionType; use log::info; use roaring::RoaringBitmap; use serde_json::{Map, Value}; -use crate::{BEU32, MergeFn, Index, FieldsIdsMap}; +use crate::{BEU32, MergeFn, Index, FieldsIdsMap, ExternalDocumentsIds}; use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; use super::merge_function::merge_two_obkvs; use super::{create_writer, create_sorter, IndexDocumentsMethod}; @@ -20,7 +19,7 @@ use super::{create_writer, create_sorter, IndexDocumentsMethod}; pub struct TransformOutput { pub primary_key: u8, pub fields_ids_map: FieldsIdsMap, - pub external_documents_ids: fst::Map>, + pub external_documents_ids: ExternalDocumentsIds<'static>, pub new_documents_ids: RoaringBitmap, pub replaced_documents_ids: RoaringBitmap, pub documents_count: usize, @@ -116,7 +115,7 @@ impl Transform<'_, '_> { return Ok(TransformOutput { primary_key, fields_ids_map, - external_documents_ids: fst::Map::default(), + external_documents_ids: ExternalDocumentsIds::default(), new_documents_ids: RoaringBitmap::new(), replaced_documents_ids: RoaringBitmap::new(), documents_count: 0, @@ -370,7 +369,7 @@ impl Transform<'_, '_> { primary_key: u8, fields_ids_map: FieldsIdsMap, approximate_number_of_documents: usize, - external_documents_ids: fst::Map>, + mut external_documents_ids: ExternalDocumentsIds<'_>, progress_callback: F, ) -> anyhow::Result where @@ -457,28 +456,17 @@ impl Transform<'_, '_> { let mut documents_file = writer.into_inner()?; documents_file.seek(SeekFrom::Start(0))?; - // We create the union between the existing external documents ids with the new ones. - let new_external_documents_ids = new_external_documents_ids_builder.into_map(); - let union_op = fst::map::OpBuilder::new() - .add(&external_documents_ids) - .add(&new_external_documents_ids) - .r#union(); - - // We stream and merge the new external documents ids map with the existing one. let before_docids_merging = Instant::now(); - let mut external_documents_ids_builder = fst::MapBuilder::memory(); - let mut iter = union_op.into_stream(); - while let Some((external_id, vals)) = iter.next() { - assert_eq!(vals.len(), 1, "there must be exactly one document id"); - external_documents_ids_builder.insert(external_id, vals[0].value)?; - } + // We merge the new external ids with existing external documents ids. + let new_external_documents_ids = new_external_documents_ids_builder.into_map(); + external_documents_ids.insert_ids(&new_external_documents_ids)?; info!("Documents external merging took {:.02?}", before_docids_merging.elapsed()); Ok(TransformOutput { primary_key, fields_ids_map, - external_documents_ids: external_documents_ids_builder.into_map(), + external_documents_ids: external_documents_ids.into_static(), new_documents_ids, replaced_documents_ids, documents_count, @@ -531,7 +519,7 @@ impl Transform<'_, '_> { Ok(TransformOutput { primary_key, fields_ids_map, - external_documents_ids: external_documents_ids.map_data(Cow::into_owned)?, + external_documents_ids: external_documents_ids.into_static(), new_documents_ids: documents_ids, replaced_documents_ids: RoaringBitmap::default(), documents_count,