improve the way we handle the fsts

This commit is contained in:
Tamo 2023-05-22 11:15:14 +02:00
parent 7f619ff0e4
commit 602ad98cb8
4 changed files with 30 additions and 24 deletions

View File

@ -56,7 +56,6 @@ itertools = "0.10.5"
log = "0.4.17"
logging_timer = "1.1.0"
csv = "1.2.1"
fastrand = "1.9.0"
[dev-dependencies]
mimalloc = { version = "0.1.29", default-features = false }
@ -65,7 +64,10 @@ insta = "1.29.0"
maplit = "1.0.2"
md5 = "0.7.0"
rand = {version = "0.8.5", features = ["small_rng"] }
# fuzzing
arbitrary = { version = "1.3.0", features = ["derive"] }
fastrand = "1.9.0"
[target.'cfg(fuzzing)'.dev-dependencies]
fuzzcheck = "0.12.1"

View File

@ -106,22 +106,30 @@ impl<'a> ExternalDocumentsIds<'a> {
map
}
/// Return an fst of the combined hard and soft deleted ID.
pub fn to_fst<'b>(&'b self) -> fst::Result<Cow<'b, fst::Map<Cow<'a, [u8]>>>> {
if self.soft.is_empty() {
return Ok(Cow::Borrowed(&self.hard));
}
let union_op = self.hard.op().add(&self.soft).r#union();
let mut iter = union_op.into_stream();
let mut new_hard_builder = fst::MapBuilder::memory();
while let Some((external_id, marked_docids)) = iter.next() {
let value = indexed_last_value(marked_docids).unwrap();
if value != DELETED_ID {
new_hard_builder.insert(external_id, value)?;
}
}
drop(iter);
Ok(Cow::Owned(new_hard_builder.into_map().map_data(Cow::Owned)?))
}
fn merge_soft_into_hard(&mut self) -> fst::Result<()> {
if self.soft.len() >= self.hard.len() / 2 {
let union_op = self.hard.op().add(&self.soft).r#union();
let mut iter = union_op.into_stream();
let mut new_hard_builder = fst::MapBuilder::memory();
while let Some((external_id, marked_docids)) = iter.next() {
let value = indexed_last_value(marked_docids).unwrap();
if value != DELETED_ID {
new_hard_builder.insert(external_id, value)?;
}
}
drop(iter);
self.hard = new_hard_builder.into_map().map_data(Cow::Owned)?;
self.hard = self.to_fst()?.into_owned();
self.soft = fst::Map::default().map_data(Cow::Owned)?;
}

View File

@ -198,7 +198,6 @@ where
let number_of_documents = self.index.number_of_documents(self.wtxn)?;
return Ok(DocumentAdditionResult { indexed_documents: 0, number_of_documents });
}
let output = self
.transform
.take()
@ -221,7 +220,6 @@ where
}
let indexed_documents = output.documents_count as u64;
let number_of_documents = self.execute_raw(output)?;
Ok(DocumentAdditionResult { indexed_documents, number_of_documents })

View File

@ -57,8 +57,8 @@ pub struct Transform<'a, 'i> {
original_sorter: grenad::Sorter<MergeFn>,
flattened_sorter: grenad::Sorter<MergeFn>,
pub replaced_documents_ids: RoaringBitmap,
pub new_documents_ids: RoaringBitmap,
replaced_documents_ids: RoaringBitmap,
new_documents_ids: RoaringBitmap,
// To increase the cache locality and decrease the heap usage we use compact smartstring.
new_external_documents_ids_builder: FxHashMap<SmartString<smartstring::Compact>, u64>,
documents_count: usize,
@ -653,9 +653,7 @@ impl<'a, 'i> Transform<'a, 'i> {
primary_key,
fields_ids_map: self.fields_ids_map,
field_distribution,
new_external_documents_ids: new_external_documents_ids
.map_data(|c| Cow::Owned(c))
.unwrap(),
new_external_documents_ids: new_external_documents_ids.map_data(Cow::Owned).unwrap(),
new_documents_ids: self.new_documents_ids,
replaced_documents_ids: self.replaced_documents_ids,
documents_count: self.documents_count,
@ -689,8 +687,8 @@ impl<'a, 'i> Transform<'a, 'i> {
let new_external_documents_ids = {
let mut external_documents_ids = self.index.external_documents_ids(wtxn)?;
external_documents_ids.delete_soft_deleted_documents_ids_from_fsts()?;
// it is safe to get the hard document IDs
external_documents_ids.into_static().hard
// This call should be free and can't fail since the previous method merged both fsts.
external_documents_ids.into_static().to_fst()?.into_owned()
};
let documents_ids = self.index.documents_ids(wtxn)?;