3819: Remove the `docid_word_positions` database r=Kerollmops a=loiclec

Remove the `docid_word_positions` database, which was only used during deletion operations. In the process, also fixes https://github.com/meilisearch/meilisearch/issues/3816




Co-authored-by: Loïc Lecrenier <loic.lecrenier@icloud.com>
This commit is contained in:
meili-bors[bot] 2023-06-07 09:53:25 +00:00 committed by GitHub
commit 9dcf1da59d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 95 additions and 177 deletions

View File

@ -21,10 +21,9 @@ use crate::heed_codec::facet::{
}; };
use crate::heed_codec::{ScriptLanguageCodec, StrBEU16Codec, StrRefCodec}; use crate::heed_codec::{ScriptLanguageCodec, StrBEU16Codec, StrRefCodec};
use crate::{ use crate::{
default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion, default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds,
DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId, FacetDistribution, FieldDistribution, FieldId, FieldIdWordCountCodec, GeoPoint, ObkvCodec,
FieldIdWordCountCodec, GeoPoint, ObkvCodec, Result, RoaringBitmapCodec, RoaringBitmapLenCodec, Result, RoaringBitmapCodec, RoaringBitmapLenCodec, Search, U8StrStrCodec, BEU16, BEU32,
Search, U8StrStrCodec, BEU16, BEU32,
}; };
pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5; pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5;
@ -111,9 +110,6 @@ pub struct Index {
/// A prefix of word and all the documents ids containing this prefix, from attributes for which typos are not allowed. /// A prefix of word and all the documents ids containing this prefix, from attributes for which typos are not allowed.
pub exact_word_prefix_docids: Database<Str, RoaringBitmapCodec>, pub exact_word_prefix_docids: Database<Str, RoaringBitmapCodec>,
/// Maps a word and a document id (u32) to all the positions where the given word appears.
pub docid_word_positions: Database<BEU32StrCodec, BoRoaringBitmapCodec>,
/// Maps the proximity between a pair of words with all the docids where this relation appears. /// Maps the proximity between a pair of words with all the docids where this relation appears.
pub word_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>, pub word_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>,
/// Maps the proximity between a pair of word and prefix with all the docids where this relation appears. /// Maps the proximity between a pair of word and prefix with all the docids where this relation appears.
@ -177,7 +173,6 @@ impl Index {
let word_prefix_docids = env.create_database(&mut wtxn, Some(WORD_PREFIX_DOCIDS))?; let word_prefix_docids = env.create_database(&mut wtxn, Some(WORD_PREFIX_DOCIDS))?;
let exact_word_prefix_docids = let exact_word_prefix_docids =
env.create_database(&mut wtxn, Some(EXACT_WORD_PREFIX_DOCIDS))?; env.create_database(&mut wtxn, Some(EXACT_WORD_PREFIX_DOCIDS))?;
let docid_word_positions = env.create_database(&mut wtxn, Some(DOCID_WORD_POSITIONS))?;
let word_pair_proximity_docids = let word_pair_proximity_docids =
env.create_database(&mut wtxn, Some(WORD_PAIR_PROXIMITY_DOCIDS))?; env.create_database(&mut wtxn, Some(WORD_PAIR_PROXIMITY_DOCIDS))?;
let script_language_docids = let script_language_docids =
@ -220,7 +215,6 @@ impl Index {
exact_word_docids, exact_word_docids,
word_prefix_docids, word_prefix_docids,
exact_word_prefix_docids, exact_word_prefix_docids,
docid_word_positions,
word_pair_proximity_docids, word_pair_proximity_docids,
script_language_docids, script_language_docids,
word_prefix_pair_proximity_docids, word_prefix_pair_proximity_docids,

View File

@ -5,52 +5,6 @@
#[global_allocator] #[global_allocator]
pub static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; pub static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;
// #[cfg(test)]
// pub mod allocator {
// use std::alloc::{GlobalAlloc, System};
// use std::sync::atomic::{self, AtomicI64};
// #[global_allocator]
// pub static ALLOC: CountingAlloc = CountingAlloc {
// max_resident: AtomicI64::new(0),
// resident: AtomicI64::new(0),
// allocated: AtomicI64::new(0),
// };
// pub struct CountingAlloc {
// pub max_resident: AtomicI64,
// pub resident: AtomicI64,
// pub allocated: AtomicI64,
// }
// unsafe impl GlobalAlloc for CountingAlloc {
// unsafe fn alloc(&self, layout: std::alloc::Layout) -> *mut u8 {
// self.allocated.fetch_add(layout.size() as i64, atomic::Ordering::SeqCst);
// let old_resident =
// self.resident.fetch_add(layout.size() as i64, atomic::Ordering::SeqCst);
// let resident = old_resident + layout.size() as i64;
// self.max_resident.fetch_max(resident, atomic::Ordering::SeqCst);
// // if layout.size() > 1_000_000 {
// // eprintln!(
// // "allocating {} with new resident size: {resident}",
// // layout.size() / 1_000_000
// // );
// // // let trace = std::backtrace::Backtrace::capture();
// // // let t = trace.to_string();
// // // eprintln!("{t}");
// // }
// System.alloc(layout)
// }
// unsafe fn dealloc(&self, ptr: *mut u8, layout: std::alloc::Layout) {
// self.resident.fetch_sub(layout.size() as i64, atomic::Ordering::Relaxed);
// System.dealloc(ptr, layout)
// }
// }
// }
#[macro_use] #[macro_use]
pub mod documents; pub mod documents;

View File

@ -89,7 +89,6 @@ Create a snapshot test of the given database.
- `exact_word_docids` - `exact_word_docids`
- `word_prefix_docids` - `word_prefix_docids`
- `exact_word_prefix_docids` - `exact_word_prefix_docids`
- `docid_word_positions`
- `word_pair_proximity_docids` - `word_pair_proximity_docids`
- `word_prefix_pair_proximity_docids` - `word_prefix_pair_proximity_docids`
- `word_position_docids` - `word_position_docids`
@ -217,11 +216,6 @@ pub fn snap_exact_word_prefix_docids(index: &Index) -> String {
&format!("{s:<16} {}", display_bitmap(&b)) &format!("{s:<16} {}", display_bitmap(&b))
}) })
} }
pub fn snap_docid_word_positions(index: &Index) -> String {
make_db_snap_from_iter!(index, docid_word_positions, |((idx, s), b)| {
&format!("{idx:<6} {s:<16} {}", display_bitmap(&b))
})
}
pub fn snap_word_pair_proximity_docids(index: &Index) -> String { pub fn snap_word_pair_proximity_docids(index: &Index) -> String {
make_db_snap_from_iter!(index, word_pair_proximity_docids, |((proximity, word1, word2), b)| { make_db_snap_from_iter!(index, word_pair_proximity_docids, |((proximity, word1, word2), b)| {
&format!("{proximity:<2} {word1:<16} {word2:<16} {}", display_bitmap(&b)) &format!("{proximity:<2} {word1:<16} {word2:<16} {}", display_bitmap(&b))
@ -477,9 +471,6 @@ macro_rules! full_snap_of_db {
($index:ident, exact_word_prefix_docids) => {{ ($index:ident, exact_word_prefix_docids) => {{
$crate::snapshot_tests::snap_exact_word_prefix_docids(&$index) $crate::snapshot_tests::snap_exact_word_prefix_docids(&$index)
}}; }};
($index:ident, docid_word_positions) => {{
$crate::snapshot_tests::snap_docid_word_positions(&$index)
}};
($index:ident, word_pair_proximity_docids) => {{ ($index:ident, word_pair_proximity_docids) => {{
$crate::snapshot_tests::snap_word_pair_proximity_docids(&$index) $crate::snapshot_tests::snap_word_pair_proximity_docids(&$index)
}}; }};

View File

@ -23,7 +23,6 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
exact_word_docids, exact_word_docids,
word_prefix_docids, word_prefix_docids,
exact_word_prefix_docids, exact_word_prefix_docids,
docid_word_positions,
word_pair_proximity_docids, word_pair_proximity_docids,
word_prefix_pair_proximity_docids, word_prefix_pair_proximity_docids,
prefix_word_pair_proximity_docids, prefix_word_pair_proximity_docids,
@ -80,7 +79,6 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
exact_word_docids.clear(self.wtxn)?; exact_word_docids.clear(self.wtxn)?;
word_prefix_docids.clear(self.wtxn)?; word_prefix_docids.clear(self.wtxn)?;
exact_word_prefix_docids.clear(self.wtxn)?; exact_word_prefix_docids.clear(self.wtxn)?;
docid_word_positions.clear(self.wtxn)?;
word_pair_proximity_docids.clear(self.wtxn)?; word_pair_proximity_docids.clear(self.wtxn)?;
word_prefix_pair_proximity_docids.clear(self.wtxn)?; word_prefix_pair_proximity_docids.clear(self.wtxn)?;
prefix_word_pair_proximity_docids.clear(self.wtxn)?; prefix_word_pair_proximity_docids.clear(self.wtxn)?;
@ -141,7 +139,6 @@ mod tests {
assert!(index.word_docids.is_empty(&rtxn).unwrap()); assert!(index.word_docids.is_empty(&rtxn).unwrap());
assert!(index.word_prefix_docids.is_empty(&rtxn).unwrap()); assert!(index.word_prefix_docids.is_empty(&rtxn).unwrap());
assert!(index.docid_word_positions.is_empty(&rtxn).unwrap());
assert!(index.word_pair_proximity_docids.is_empty(&rtxn).unwrap()); assert!(index.word_pair_proximity_docids.is_empty(&rtxn).unwrap());
assert!(index.field_id_word_count_docids.is_empty(&rtxn).unwrap()); assert!(index.field_id_word_count_docids.is_empty(&rtxn).unwrap());
assert!(index.word_prefix_pair_proximity_docids.is_empty(&rtxn).unwrap()); assert!(index.word_prefix_pair_proximity_docids.is_empty(&rtxn).unwrap());

View File

@ -1,5 +1,5 @@
use std::collections::btree_map::Entry; use std::collections::btree_map::Entry;
use std::collections::{HashMap, HashSet}; use std::collections::{BTreeSet, HashMap, HashSet};
use fst::IntoStreamer; use fst::IntoStreamer;
use heed::types::{ByteSlice, DecodeIgnore, Str, UnalignedSlice}; use heed::types::{ByteSlice, DecodeIgnore, Str, UnalignedSlice};
@ -15,8 +15,7 @@ use crate::facet::FacetType;
use crate::heed_codec::facet::FieldDocIdFacetCodec; use crate::heed_codec::facet::FieldDocIdFacetCodec;
use crate::heed_codec::CboRoaringBitmapCodec; use crate::heed_codec::CboRoaringBitmapCodec;
use crate::{ use crate::{
ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, Index, Result, RoaringBitmapCodec, ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, Index, Result, RoaringBitmapCodec, BEU32,
SmallString32, BEU32,
}; };
pub struct DeleteDocuments<'t, 'u, 'i> { pub struct DeleteDocuments<'t, 'u, 'i> {
@ -232,7 +231,6 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
exact_word_docids, exact_word_docids,
word_prefix_docids, word_prefix_docids,
exact_word_prefix_docids, exact_word_prefix_docids,
docid_word_positions,
word_pair_proximity_docids, word_pair_proximity_docids,
field_id_word_count_docids, field_id_word_count_docids,
word_prefix_pair_proximity_docids, word_prefix_pair_proximity_docids,
@ -251,23 +249,9 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
facet_id_is_empty_docids, facet_id_is_empty_docids,
documents, documents,
} = self.index; } = self.index;
// Remove from the documents database
// Retrieve the words contained in the documents.
let mut words = Vec::new();
for docid in &self.to_delete_docids { for docid in &self.to_delete_docids {
documents.delete(self.wtxn, &BEU32::new(docid))?; documents.delete(self.wtxn, &BEU32::new(docid))?;
// We iterate through the words positions of the document id, retrieve the word and delete the positions.
// We create an iterator to be able to get the content and delete the key-value itself.
// It's faster to acquire a cursor to get and delete, as we avoid traversing the LMDB B-Tree two times but only once.
let mut iter = docid_word_positions.prefix_iter_mut(self.wtxn, &(docid, ""))?;
while let Some(result) = iter.next() {
let ((_docid, word), _positions) = result?;
// This boolean will indicate if we must remove this word from the words FST.
words.push((SmallString32::from(word), false));
// safety: we don't keep references from inside the LMDB database.
unsafe { iter.del_current()? };
}
} }
// We acquire the current external documents ids map... // We acquire the current external documents ids map...
// Note that its soft-deleted document ids field will be equal to the `to_delete_docids` // Note that its soft-deleted document ids field will be equal to the `to_delete_docids`
@ -278,42 +262,27 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
let new_external_documents_ids = new_external_documents_ids.into_static(); let new_external_documents_ids = new_external_documents_ids.into_static();
self.index.put_external_documents_ids(self.wtxn, &new_external_documents_ids)?; self.index.put_external_documents_ids(self.wtxn, &new_external_documents_ids)?;
// Maybe we can improve the get performance of the words let mut words_to_keep = BTreeSet::default();
// if we sort the words first, keeping the LMDB pages in cache. let mut words_to_delete = BTreeSet::default();
words.sort_unstable();
// We iterate over the words and delete the documents ids // We iterate over the words and delete the documents ids
// from the word docids database. // from the word docids database.
for (word, must_remove) in &mut words { remove_from_word_docids(
remove_from_word_docids( self.wtxn,
self.wtxn, word_docids,
word_docids, &self.to_delete_docids,
word.as_str(), &mut words_to_keep,
must_remove, &mut words_to_delete,
&self.to_delete_docids, )?;
)?; remove_from_word_docids(
self.wtxn,
remove_from_word_docids( exact_word_docids,
self.wtxn, &self.to_delete_docids,
exact_word_docids, &mut words_to_keep,
word.as_str(), &mut words_to_delete,
must_remove, )?;
&self.to_delete_docids,
)?;
}
// We construct an FST set that contains the words to delete from the words FST. // We construct an FST set that contains the words to delete from the words FST.
let words_to_delete = let words_to_delete = fst::Set::from_iter(words_to_delete.difference(&words_to_keep))?;
words.iter().filter_map(
|(word, must_remove)| {
if *must_remove {
Some(word.as_str())
} else {
None
}
},
);
let words_to_delete = fst::Set::from_iter(words_to_delete)?;
let new_words_fst = { let new_words_fst = {
// We retrieve the current words FST from the database. // We retrieve the current words FST from the database.
@ -532,23 +501,24 @@ fn remove_from_word_prefix_docids(
fn remove_from_word_docids( fn remove_from_word_docids(
txn: &mut heed::RwTxn, txn: &mut heed::RwTxn,
db: &heed::Database<Str, RoaringBitmapCodec>, db: &heed::Database<Str, RoaringBitmapCodec>,
word: &str,
must_remove: &mut bool,
to_remove: &RoaringBitmap, to_remove: &RoaringBitmap,
words_to_keep: &mut BTreeSet<String>,
words_to_remove: &mut BTreeSet<String>,
) -> Result<()> { ) -> Result<()> {
// We create an iterator to be able to get the content and delete the word docids. // We create an iterator to be able to get the content and delete the word docids.
// It's faster to acquire a cursor to get and delete or put, as we avoid traversing // It's faster to acquire a cursor to get and delete or put, as we avoid traversing
// the LMDB B-Tree two times but only once. // the LMDB B-Tree two times but only once.
let mut iter = db.prefix_iter_mut(txn, word)?; let mut iter = db.iter_mut(txn)?;
if let Some((key, mut docids)) = iter.next().transpose()? { while let Some((key, mut docids)) = iter.next().transpose()? {
if key == word { let previous_len = docids.len();
let previous_len = docids.len(); docids -= to_remove;
docids -= to_remove; if docids.is_empty() {
if docids.is_empty() { // safety: we don't keep references from inside the LMDB database.
// safety: we don't keep references from inside the LMDB database. unsafe { iter.del_current()? };
unsafe { iter.del_current()? }; words_to_remove.insert(key.to_owned());
*must_remove = true; } else {
} else if docids.len() != previous_len { words_to_keep.insert(key.to_owned());
if docids.len() != previous_len {
let key = key.to_owned(); let key = key.to_owned();
// safety: we don't keep references from inside the LMDB database. // safety: we don't keep references from inside the LMDB database.
unsafe { iter.put_current(&key, &docids)? }; unsafe { iter.put_current(&key, &docids)? };
@ -627,7 +597,7 @@ mod tests {
use super::*; use super::*;
use crate::index::tests::TempIndex; use crate::index::tests::TempIndex;
use crate::{db_snap, Filter}; use crate::{db_snap, Filter, Search};
fn delete_documents<'t>( fn delete_documents<'t>(
wtxn: &mut RwTxn<'t, '_>, wtxn: &mut RwTxn<'t, '_>,
@ -1199,4 +1169,52 @@ mod tests {
DeletionStrategy::AlwaysSoft, DeletionStrategy::AlwaysSoft,
); );
} }
#[test]
fn delete_words_exact_attributes() {
let index = TempIndex::new();
index
.update_settings(|settings| {
settings.set_primary_key(S("id"));
settings.set_searchable_fields(vec![S("text"), S("exact")]);
settings.set_exact_attributes(vec![S("exact")].into_iter().collect());
})
.unwrap();
index
.add_documents(documents!([
{ "id": 0, "text": "hello" },
{ "id": 1, "exact": "hello"}
]))
.unwrap();
db_snap!(index, word_docids, 1, @r###"
hello [0, ]
"###);
db_snap!(index, exact_word_docids, 1, @r###"
hello [1, ]
"###);
db_snap!(index, words_fst, 1, @"300000000000000001084cfcfc2ce1000000016000000090ea47f");
let mut wtxn = index.write_txn().unwrap();
let deleted_internal_ids =
delete_documents(&mut wtxn, &index, &["1"], DeletionStrategy::AlwaysHard);
wtxn.commit().unwrap();
db_snap!(index, word_docids, 2, @r###"
hello [0, ]
"###);
db_snap!(index, exact_word_docids, 2, @"");
db_snap!(index, words_fst, 2, @"300000000000000001084cfcfc2ce1000000016000000090ea47f");
insta::assert_snapshot!(format!("{deleted_internal_ids:?}"), @"[1]");
let txn = index.read_txn().unwrap();
let words = index.words_fst(&txn).unwrap().into_stream().into_strs().unwrap();
insta::assert_snapshot!(format!("{words:?}"), @r###"["hello"]"###);
let mut s = Search::new(&txn, &index);
s.query("hello");
let crate::SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]");
}
} }

View File

@ -325,8 +325,6 @@ fn send_and_extract_flattened_documents_data(
// send docid_word_positions_chunk to DB writer // send docid_word_positions_chunk to DB writer
let docid_word_positions_chunk = let docid_word_positions_chunk =
unsafe { as_cloneable_grenad(&docid_word_positions_chunk)? }; unsafe { as_cloneable_grenad(&docid_word_positions_chunk)? };
let _ = lmdb_writer_sx
.send(Ok(TypedChunk::DocidWordPositions(docid_word_positions_chunk.clone())));
let _ = let _ =
lmdb_writer_sx.send(Ok(TypedChunk::ScriptLanguageDocids(script_language_pair))); lmdb_writer_sx.send(Ok(TypedChunk::ScriptLanguageDocids(script_language_pair)));

View File

@ -4,7 +4,6 @@ use std::result::Result as StdResult;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use super::read_u32_ne_bytes;
use crate::heed_codec::CboRoaringBitmapCodec; use crate::heed_codec::CboRoaringBitmapCodec;
use crate::update::index_documents::transform::Operation; use crate::update::index_documents::transform::Operation;
use crate::Result; use crate::Result;
@ -22,10 +21,6 @@ pub fn concat_u32s_array<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Co
} }
} }
pub fn roaring_bitmap_from_u32s_array(slice: &[u8]) -> RoaringBitmap {
read_u32_ne_bytes(slice).collect()
}
pub fn serialize_roaring_bitmap(bitmap: &RoaringBitmap, buffer: &mut Vec<u8>) -> io::Result<()> { pub fn serialize_roaring_bitmap(bitmap: &RoaringBitmap, buffer: &mut Vec<u8>) -> io::Result<()> {
buffer.clear(); buffer.clear();
buffer.reserve(bitmap.serialized_size()); buffer.reserve(bitmap.serialized_size());

View File

@ -14,8 +14,8 @@ pub use grenad_helpers::{
}; };
pub use merge_functions::{ pub use merge_functions::{
concat_u32s_array, keep_first, keep_latest_obkv, merge_cbo_roaring_bitmaps, concat_u32s_array, keep_first, keep_latest_obkv, merge_cbo_roaring_bitmaps,
merge_obkvs_and_operations, merge_roaring_bitmaps, merge_two_obkvs, merge_obkvs_and_operations, merge_roaring_bitmaps, merge_two_obkvs, serialize_roaring_bitmap,
roaring_bitmap_from_u32s_array, serialize_roaring_bitmap, MergeFn, MergeFn,
}; };
use crate::MAX_WORD_LENGTH; use crate::MAX_WORD_LENGTH;

View File

@ -2471,11 +2471,11 @@ mod tests {
{ {
"id": 3, "id": 3,
"text": "a a a a a a a a a a a a a a a a a "text": "a a a a a a a a a a a a a a a a a
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
a a a a a a a a a a a a a a a a a a a a a " a a a a a a a a a a a a a a a a a a a a a "
} }
])) ]))
@ -2513,6 +2513,5 @@ mod tests {
db_snap!(index, word_fid_docids, 3, @"4c2e2a1832e5802796edc1638136d933"); db_snap!(index, word_fid_docids, 3, @"4c2e2a1832e5802796edc1638136d933");
db_snap!(index, word_position_docids, 3, @"74f556b91d161d997a89468b4da1cb8f"); db_snap!(index, word_position_docids, 3, @"74f556b91d161d997a89468b4da1cb8f");
db_snap!(index, docid_word_positions, 3, @"5287245332627675740b28bd46e1cde1");
} }
} }

View File

@ -7,24 +7,19 @@ use std::io;
use charabia::{Language, Script}; use charabia::{Language, Script};
use grenad::MergerBuilder; use grenad::MergerBuilder;
use heed::types::ByteSlice; use heed::types::ByteSlice;
use heed::{BytesDecode, RwTxn}; use heed::RwTxn;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use super::helpers::{ use super::helpers::{
self, merge_ignore_values, roaring_bitmap_from_u32s_array, serialize_roaring_bitmap, self, merge_ignore_values, serialize_roaring_bitmap, valid_lmdb_key, CursorClonableMmap,
valid_lmdb_key, CursorClonableMmap,
}; };
use super::{ClonableMmap, MergeFn}; use super::{ClonableMmap, MergeFn};
use crate::facet::FacetType; use crate::facet::FacetType;
use crate::update::facet::FacetsUpdate; use crate::update::facet::FacetsUpdate;
use crate::update::index_documents::helpers::as_cloneable_grenad; use crate::update::index_documents::helpers::as_cloneable_grenad;
use crate::{ use crate::{lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, Result};
lat_lng_to_xyz, BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index,
Result,
};
pub(crate) enum TypedChunk { pub(crate) enum TypedChunk {
DocidWordPositions(grenad::Reader<CursorClonableMmap>),
FieldIdDocidFacetStrings(grenad::Reader<CursorClonableMmap>), FieldIdDocidFacetStrings(grenad::Reader<CursorClonableMmap>),
FieldIdDocidFacetNumbers(grenad::Reader<CursorClonableMmap>), FieldIdDocidFacetNumbers(grenad::Reader<CursorClonableMmap>),
Documents(grenad::Reader<CursorClonableMmap>), Documents(grenad::Reader<CursorClonableMmap>),
@ -56,29 +51,6 @@ pub(crate) fn write_typed_chunk_into_index(
) -> Result<(RoaringBitmap, bool)> { ) -> Result<(RoaringBitmap, bool)> {
let mut is_merged_database = false; let mut is_merged_database = false;
match typed_chunk { match typed_chunk {
TypedChunk::DocidWordPositions(docid_word_positions_iter) => {
write_entries_into_database(
docid_word_positions_iter,
&index.docid_word_positions,
wtxn,
index_is_empty,
|value, buffer| {
// ensure that values are unique and ordered
let positions = roaring_bitmap_from_u32s_array(value);
BoRoaringBitmapCodec::serialize_into(&positions, buffer);
Ok(buffer)
},
|new_values, db_values, buffer| {
let new_values = roaring_bitmap_from_u32s_array(new_values);
let positions = match BoRoaringBitmapCodec::bytes_decode(db_values) {
Some(db_values) => new_values | db_values,
None => new_values, // should not happen
};
BoRoaringBitmapCodec::serialize_into(&positions, buffer);
Ok(())
},
)?;
}
TypedChunk::Documents(obkv_documents_iter) => { TypedChunk::Documents(obkv_documents_iter) => {
let mut cursor = obkv_documents_iter.into_cursor()?; let mut cursor = obkv_documents_iter.into_cursor()?;
while let Some((key, value)) = cursor.move_on_next()? { while let Some((key, value)) = cursor.move_on_next()? {