Generalize usage of the CboRoaringBitmap codec to ease its use

ManyTheFish 2023-09-25 16:39:32 +02:00
parent 8fb96b8274
commit 9078e60024
11 changed files with 44 additions and 54 deletions
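
For context: this change switches every word-docids database in the Index from RoaringBitmapCodec to CboRoaringBitmapCodec and replaces the matching merge functions (merge_roaring_bitmaps becomes merge_cbo_roaring_bitmaps), so all docids databases share a single codec and merge path. The sketch below only illustrates the general idea of such a "conditional" codec; it is not milli's implementation, and the THRESHOLD value and exact byte layout are assumptions made for illustration.

// Illustrative sketch of a conditional bitmap codec (assumed threshold and layout).
use roaring::RoaringBitmap;

const THRESHOLD: u64 = 7; // assumed cut-off, not necessarily milli's value

// Small sets are written as raw native-endian u32 docids; larger sets fall
// back to the regular RoaringBitmap serialization.
fn cbo_encode(bitmap: &RoaringBitmap) -> Vec<u8> {
    let mut out = Vec::new();
    if bitmap.len() <= THRESHOLD {
        for docid in bitmap {
            out.extend_from_slice(&docid.to_ne_bytes());
        }
    } else {
        bitmap.serialize_into(&mut out).expect("in-memory write cannot fail");
    }
    out
}

// Decode by looking at the length: short buffers are raw u32 docids,
// anything longer is a serialized RoaringBitmap.
fn cbo_decode(bytes: &[u8]) -> RoaringBitmap {
    if bytes.len() <= THRESHOLD as usize * std::mem::size_of::<u32>() {
        bytes
            .chunks_exact(4)
            .map(|chunk| u32::from_ne_bytes(chunk.try_into().unwrap()))
            .collect()
    } else {
        RoaringBitmap::deserialize_from(bytes).expect("valid bitmap bytes")
    }
}

fn main() {
    let small: RoaringBitmap = (0u32..3).collect();
    assert_eq!(cbo_decode(&cbo_encode(&small)), small);
}

Under such an encoding a value holding a single document id is just its four bytes, which is presumably why the extractors further down can now insert document_id.to_ne_bytes() directly into the grenad sorters and let merge_cbo_roaring_bitmaps union the values.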

View File

@@ -119,16 +119,16 @@ pub struct Index {
pub(crate) main: PolyDatabase,
/// A word and all the documents ids containing the word.
pub word_docids: Database<Str, RoaringBitmapCodec>,
pub word_docids: Database<Str, CboRoaringBitmapCodec>,
/// A word and all the documents ids containing the word, from attributes for which typos are not allowed.
pub exact_word_docids: Database<Str, RoaringBitmapCodec>,
pub exact_word_docids: Database<Str, CboRoaringBitmapCodec>,
/// A prefix of word and all the documents ids containing this prefix.
pub word_prefix_docids: Database<Str, RoaringBitmapCodec>,
pub word_prefix_docids: Database<Str, CboRoaringBitmapCodec>,
/// A prefix of word and all the documents ids containing this prefix, from attributes for which typos are not allowed.
pub exact_word_prefix_docids: Database<Str, RoaringBitmapCodec>,
pub exact_word_prefix_docids: Database<Str, CboRoaringBitmapCodec>,
/// Maps the proximity between a pair of words with all the docids where this relation appears.
pub word_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>,
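
From a caller's perspective the codec swap above is transparent: lookups still decode to a RoaringBitmap. A minimal usage sketch follows, assuming milli's public Index::read_txn and the word_docids field shown above, with anyhow used here only for error handling; this snippet is not part of the commit.

use milli::Index;
use roaring::RoaringBitmap;

fn docids_for_word(index: &Index, word: &str) -> anyhow::Result<Option<RoaringBitmap>> {
    let rtxn = index.read_txn()?;
    // word_docids is now Database<Str, CboRoaringBitmapCodec>, but get()
    // still hands back a decoded RoaringBitmap.
    Ok(index.word_docids.get(&rtxn, word)?)
}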

View File

@@ -168,7 +168,7 @@ impl<'ctx> SearchContext<'ctx> {
merge_cbo_roaring_bitmaps,
)
}
None => DatabaseCache::get_value::<_, _, RoaringBitmapCodec>(
None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
self.txn,
word,
self.word_interner.get(word).as_str(),
@@ -182,7 +182,7 @@ impl<'ctx> SearchContext<'ctx> {
&mut self,
word: Interned<String>,
) -> Result<Option<RoaringBitmap>> {
DatabaseCache::get_value::<_, _, RoaringBitmapCodec>(
DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
self.txn,
word,
self.word_interner.get(word).as_str(),
@@ -230,7 +230,7 @@ impl<'ctx> SearchContext<'ctx> {
merge_cbo_roaring_bitmaps,
)
}
None => DatabaseCache::get_value::<_, _, RoaringBitmapCodec>(
None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
self.txn,
prefix,
self.word_interner.get(prefix).as_str(),
@@ -244,7 +244,7 @@ impl<'ctx> SearchContext<'ctx> {
&mut self,
prefix: Interned<String>,
) -> Result<Option<RoaringBitmap>> {
DatabaseCache::get_value::<_, _, RoaringBitmapCodec>(
DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
self.txn,
prefix,
self.word_interner.get(prefix).as_str(),

View File

@@ -495,7 +495,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
fn remove_from_word_prefix_docids(
txn: &mut heed::RwTxn,
db: &Database<Str, RoaringBitmapCodec>,
db: &Database<Str, CboRoaringBitmapCodec>,
to_remove: &RoaringBitmap,
) -> Result<fst::Set<Vec<u8>>> {
let mut prefixes_to_delete = fst::SetBuilder::memory();
@@ -523,7 +523,7 @@ fn remove_from_word_prefix_docids(
fn remove_from_word_docids(
txn: &mut heed::RwTxn,
db: &heed::Database<Str, RoaringBitmapCodec>,
db: &heed::Database<Str, CboRoaringBitmapCodec>,
to_remove: &RoaringBitmap,
words_to_keep: &mut BTreeSet<String>,
words_to_remove: &mut BTreeSet<String>,

View File

@@ -106,6 +106,12 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
if let Some(stop_words) = stop_words {
tokenizer_builder.stop_words(stop_words);
}
if let Some(dictionary) = dictionary {
tokenizer_builder.words_dict(dictionary);
}
if let Some(separators) = allowed_separators {
tokenizer_builder.separators(separators);
}
tokenizer_builder.allow_list(&script_language);
let tokenizer = tokenizer_builder.build();

View File

@@ -8,7 +8,7 @@ use obkv::KvReaderU16;
use roaring::RoaringBitmap;
use super::helpers::{
create_sorter, create_writer, merge_roaring_bitmaps, serialize_roaring_bitmap,
create_sorter, create_writer, merge_cbo_roaring_bitmaps, serialize_roaring_bitmap,
sorter_into_reader, try_split_array_at, writer_into_reader, GrenadParameters,
};
use crate::error::SerializationError;
@@ -36,15 +36,12 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
let mut word_fid_docids_sorter = create_sorter(
grenad::SortAlgorithm::Unstable,
merge_roaring_bitmaps,
merge_cbo_roaring_bitmaps,
indexer.chunk_compression_type,
indexer.chunk_compression_level,
indexer.max_nb_chunks,
max_memory.map(|x| x / 3),
);
let mut current_document_id = None;
let mut fid = 0;
let mut key_buffer = Vec::new();
let mut value_buffer = Vec::new();
let mut words = BTreeSet::new();
@@ -55,28 +52,12 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
let (fid_bytes, _) = try_split_array_at(fid_bytes)
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
let document_id = u32::from_be_bytes(document_id_bytes);
fid = u16::from_be_bytes(fid_bytes);
let fid = u16::from_be_bytes(fid_bytes);
// drain the btreemaps when we change document.
if current_document_id.map_or(false, |id| id != document_id) {
words_into_sorter(
document_id,
fid,
&mut key_buffer,
&mut value_buffer,
&mut words,
&mut word_fid_docids_sorter,
)?;
}
current_document_id = Some(document_id);
for (_pos, word) in KvReaderU16::new(&value).iter() {
words.insert(word.to_vec());
}
}
// We must make sure that we don't lose the current document field id
if let Some(document_id) = current_document_id {
words_into_sorter(
document_id,
fid,
@@ -85,11 +66,13 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
&mut words,
&mut word_fid_docids_sorter,
)?;
words.clear();
}
let mut word_docids_sorter = create_sorter(
grenad::SortAlgorithm::Unstable,
merge_roaring_bitmaps,
merge_cbo_roaring_bitmaps,
indexer.chunk_compression_type,
indexer.chunk_compression_level,
indexer.max_nb_chunks,
@@ -98,7 +81,7 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
let mut exact_word_docids_sorter = create_sorter(
grenad::SortAlgorithm::Unstable,
merge_roaring_bitmaps,
merge_cbo_roaring_bitmaps,
indexer.chunk_compression_type,
indexer.chunk_compression_level,
indexer.max_nb_chunks,
@@ -142,15 +125,13 @@ fn words_into_sorter(
word_fid_docids_sorter: &mut grenad::Sorter<MergeFn>,
) -> Result<()> {
puffin::profile_function!();
let bitmap = RoaringBitmap::from_iter(Some(document_id));
serialize_roaring_bitmap(&bitmap, value_buffer)?;
for word_bytes in words.iter() {
key_buffer.clear();
key_buffer.extend_from_slice(&word_bytes);
key_buffer.push(0);
key_buffer.extend_from_slice(&fid.to_be_bytes());
word_fid_docids_sorter.insert(&key_buffer, &value_buffer)?;
word_fid_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
}
words.clear();
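
The comments in this extractor describe a batching pattern: words are accumulated per document and flushed into the sorter when the document id changes, with one extra flush after the loop so the last document's batch is not lost. A generic, self-contained sketch of that pattern follows (illustrative names, not milli's types):

use std::collections::BTreeSet;

// Flush the words accumulated for one document, then reset the batch.
fn flush(document_id: u32, words: &mut BTreeSet<Vec<u8>>, out: &mut Vec<(Vec<u8>, u32)>) {
    for word in words.iter() {
        out.push((word.clone(), document_id));
    }
    words.clear();
}

// Entries are assumed to be sorted by document id, like the grenad cursor above.
fn batch_by_document(entries: &[(u32, Vec<u8>)]) -> Vec<(Vec<u8>, u32)> {
    let mut out = Vec::new();
    let mut words = BTreeSet::new();
    let mut current_document_id: Option<u32> = None;
    for (document_id, word) in entries {
        // Drain the batch when we move on to a new document.
        if current_document_id.map_or(false, |id| id != *document_id) {
            flush(current_document_id.unwrap(), &mut words, &mut out);
        }
        current_document_id = Some(*document_id);
        words.insert(word.clone());
    }
    // Make sure we don't lose the last document's batch.
    if let Some(document_id) = current_document_id {
        flush(document_id, &mut words, &mut out);
    }
    out
}

fn main() {
    let entries = vec![(1u32, b"hello".to_vec()), (1, b"world".to_vec()), (2, b"hello".to_vec())];
    assert_eq!(batch_by_document(&entries).len(), 3);
}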

View File

@@ -56,7 +56,7 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
}
document_word_positions_into_sorter(
document_id,
current_document_id.unwrap(),
&word_pair_proximity,
&mut word_pair_proximity_docids_sorter,
)?;
@@ -64,6 +64,8 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
word_positions.clear();
}
current_document_id = Some(document_id);
for (position, word) in KvReaderU16::new(&value).iter() {
// drain the proximity window until the head word is considered close to the word we are inserting.
while word_positions.get(0).map_or(false, |(_w, p)| {

View File

@@ -35,7 +35,7 @@ pub fn extract_word_position_docids<R: io::Read + io::Seek>(
);
let mut word_positions: HashSet<(u16, Vec<u8>)> = HashSet::new();
let mut current_document_id = None;
let mut current_document_id: Option<u32> = None;
let mut key_buffer = Vec::new();
let mut cursor = docid_word_positions.into_cursor()?;
while let Some((key, value)) = cursor.move_on_next()? {
@@ -49,7 +49,8 @@ pub fn extract_word_position_docids<R: io::Read + io::Seek>(
key_buffer.extend_from_slice(word_bytes);
key_buffer.push(0);
key_buffer.extend_from_slice(&position.to_be_bytes());
word_position_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
word_position_docids_sorter
.insert(&key_buffer, current_document_id.unwrap().to_ne_bytes())?;
}
word_positions.clear();
}

View File

@@ -181,7 +181,7 @@ pub(crate) fn data_from_obkv_documents(
indexer,
lmdb_writer_sx.clone(),
move |doc_word_pos, indexer| extract_word_docids(doc_word_pos, indexer, &exact_attributes),
merge_roaring_bitmaps,
merge_cbo_roaring_bitmaps,
|(word_docids_reader, exact_word_docids_reader, word_fid_docids_reader)| {
TypedChunk::WordDocids {
word_docids_reader,

View File

@@ -38,7 +38,7 @@ use crate::update::{
self, DeletionStrategy, IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep,
WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst,
};
use crate::{Index, Result, RoaringBitmapCodec};
use crate::{CboRoaringBitmapCodec, Index, Result, RoaringBitmapCodec};
static MERGED_DATABASE_COUNT: usize = 7;
static PREFIX_DATABASE_COUNT: usize = 5;
@@ -700,8 +700,8 @@ where
fn execute_word_prefix_docids(
txn: &mut heed::RwTxn,
reader: grenad::Reader<Cursor<ClonableMmap>>,
word_docids_db: Database<Str, RoaringBitmapCodec>,
word_prefix_docids_db: Database<Str, RoaringBitmapCodec>,
word_docids_db: Database<Str, CboRoaringBitmapCodec>,
word_prefix_docids_db: Database<Str, CboRoaringBitmapCodec>,
indexer_config: &IndexerConfig,
new_prefix_fst_words: &[String],
common_prefix_fst_words: &[&[String]],

View File

@@ -156,7 +156,7 @@ pub(crate) fn write_typed_chunk_into_index(
wtxn,
index_is_empty,
|value, _buffer| Ok(value),
merge_roaring_bitmaps,
merge_cbo_roaring_bitmaps,
)?;
let exact_word_docids_iter = unsafe { as_cloneable_grenad(&exact_word_docids_reader) }?;
@@ -166,7 +166,7 @@ pub(crate) fn write_typed_chunk_into_index(
wtxn,
index_is_empty,
|value, _buffer| Ok(value),
merge_roaring_bitmaps,
merge_cbo_roaring_bitmaps,
)?;
let word_fid_docids_iter = unsafe { as_cloneable_grenad(&word_fid_docids_reader) }?;

View File

@@ -5,15 +5,15 @@ use heed::types::{ByteSlice, Str};
use heed::Database;
use crate::update::index_documents::{
create_sorter, merge_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key,
create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key,
CursorClonableMmap, MergeFn,
};
use crate::{Result, RoaringBitmapCodec};
use crate::{CboRoaringBitmapCodec, Result, RoaringBitmapCodec};
pub struct WordPrefixDocids<'t, 'u, 'i> {
wtxn: &'t mut heed::RwTxn<'i, 'u>,
word_docids: Database<Str, RoaringBitmapCodec>,
word_prefix_docids: Database<Str, RoaringBitmapCodec>,
word_docids: Database<Str, CboRoaringBitmapCodec>,
word_prefix_docids: Database<Str, CboRoaringBitmapCodec>,
pub(crate) chunk_compression_type: CompressionType,
pub(crate) chunk_compression_level: Option<u32>,
pub(crate) max_nb_chunks: Option<usize>,
@@ -23,8 +23,8 @@ pub struct WordPrefixDocids<'t, 'u, 'i> {
impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
pub fn new(
wtxn: &'t mut heed::RwTxn<'i, 'u>,
word_docids: Database<Str, RoaringBitmapCodec>,
word_prefix_docids: Database<Str, RoaringBitmapCodec>,
word_docids: Database<Str, CboRoaringBitmapCodec>,
word_prefix_docids: Database<Str, CboRoaringBitmapCodec>,
) -> WordPrefixDocids<'t, 'u, 'i> {
WordPrefixDocids {
wtxn,
@@ -51,7 +51,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
// and write into it at the same time, therefore we write into another file.
let mut prefix_docids_sorter = create_sorter(
grenad::SortAlgorithm::Unstable,
merge_roaring_bitmaps,
merge_cbo_roaring_bitmaps,
self.chunk_compression_type,
self.chunk_compression_level,
self.max_nb_chunks,
@@ -115,7 +115,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
self.wtxn,
*self.word_prefix_docids.as_polymorph(),
prefix_docids_sorter,
merge_roaring_bitmaps,
merge_cbo_roaring_bitmaps,
)?;
Ok(())