Generalize usage of the CboRoaringBitmap codec to make it easier to use

This commit is contained in:
ManyTheFish 2023-09-25 16:39:32 +02:00
parent 8fb96b8274
commit 9078e60024
11 changed files with 44 additions and 54 deletions

View File

@ -119,16 +119,16 @@ pub struct Index {
pub(crate) main: PolyDatabase, pub(crate) main: PolyDatabase,
/// A word and all the documents ids containing the word. /// A word and all the documents ids containing the word.
pub word_docids: Database<Str, RoaringBitmapCodec>, pub word_docids: Database<Str, CboRoaringBitmapCodec>,
/// A word and all the documents ids containing the word, from attributes for which typos are not allowed. /// A word and all the documents ids containing the word, from attributes for which typos are not allowed.
pub exact_word_docids: Database<Str, RoaringBitmapCodec>, pub exact_word_docids: Database<Str, CboRoaringBitmapCodec>,
/// A prefix of word and all the documents ids containing this prefix. /// A prefix of word and all the documents ids containing this prefix.
pub word_prefix_docids: Database<Str, RoaringBitmapCodec>, pub word_prefix_docids: Database<Str, CboRoaringBitmapCodec>,
/// A prefix of word and all the documents ids containing this prefix, from attributes for which typos are not allowed. /// A prefix of word and all the documents ids containing this prefix, from attributes for which typos are not allowed.
pub exact_word_prefix_docids: Database<Str, RoaringBitmapCodec>, pub exact_word_prefix_docids: Database<Str, CboRoaringBitmapCodec>,
/// Maps the proximity between a pair of words with all the docids where this relation appears. /// Maps the proximity between a pair of words with all the docids where this relation appears.
pub word_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>, pub word_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>,

View File

@ -168,7 +168,7 @@ impl<'ctx> SearchContext<'ctx> {
merge_cbo_roaring_bitmaps, merge_cbo_roaring_bitmaps,
) )
} }
None => DatabaseCache::get_value::<_, _, RoaringBitmapCodec>( None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
self.txn, self.txn,
word, word,
self.word_interner.get(word).as_str(), self.word_interner.get(word).as_str(),
@ -182,7 +182,7 @@ impl<'ctx> SearchContext<'ctx> {
&mut self, &mut self,
word: Interned<String>, word: Interned<String>,
) -> Result<Option<RoaringBitmap>> { ) -> Result<Option<RoaringBitmap>> {
DatabaseCache::get_value::<_, _, RoaringBitmapCodec>( DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
self.txn, self.txn,
word, word,
self.word_interner.get(word).as_str(), self.word_interner.get(word).as_str(),
@ -230,7 +230,7 @@ impl<'ctx> SearchContext<'ctx> {
merge_cbo_roaring_bitmaps, merge_cbo_roaring_bitmaps,
) )
} }
None => DatabaseCache::get_value::<_, _, RoaringBitmapCodec>( None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
self.txn, self.txn,
prefix, prefix,
self.word_interner.get(prefix).as_str(), self.word_interner.get(prefix).as_str(),
@ -244,7 +244,7 @@ impl<'ctx> SearchContext<'ctx> {
&mut self, &mut self,
prefix: Interned<String>, prefix: Interned<String>,
) -> Result<Option<RoaringBitmap>> { ) -> Result<Option<RoaringBitmap>> {
DatabaseCache::get_value::<_, _, RoaringBitmapCodec>( DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
self.txn, self.txn,
prefix, prefix,
self.word_interner.get(prefix).as_str(), self.word_interner.get(prefix).as_str(),

View File

@ -495,7 +495,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
fn remove_from_word_prefix_docids( fn remove_from_word_prefix_docids(
txn: &mut heed::RwTxn, txn: &mut heed::RwTxn,
db: &Database<Str, RoaringBitmapCodec>, db: &Database<Str, CboRoaringBitmapCodec>,
to_remove: &RoaringBitmap, to_remove: &RoaringBitmap,
) -> Result<fst::Set<Vec<u8>>> { ) -> Result<fst::Set<Vec<u8>>> {
let mut prefixes_to_delete = fst::SetBuilder::memory(); let mut prefixes_to_delete = fst::SetBuilder::memory();
@ -523,7 +523,7 @@ fn remove_from_word_prefix_docids(
fn remove_from_word_docids( fn remove_from_word_docids(
txn: &mut heed::RwTxn, txn: &mut heed::RwTxn,
db: &heed::Database<Str, RoaringBitmapCodec>, db: &heed::Database<Str, CboRoaringBitmapCodec>,
to_remove: &RoaringBitmap, to_remove: &RoaringBitmap,
words_to_keep: &mut BTreeSet<String>, words_to_keep: &mut BTreeSet<String>,
words_to_remove: &mut BTreeSet<String>, words_to_remove: &mut BTreeSet<String>,

View File

@ -106,6 +106,12 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
if let Some(stop_words) = stop_words { if let Some(stop_words) = stop_words {
tokenizer_builder.stop_words(stop_words); tokenizer_builder.stop_words(stop_words);
} }
if let Some(dictionary) = dictionary {
tokenizer_builder.words_dict(dictionary);
}
if let Some(separators) = allowed_separators {
tokenizer_builder.separators(separators);
}
tokenizer_builder.allow_list(&script_language); tokenizer_builder.allow_list(&script_language);
let tokenizer = tokenizer_builder.build(); let tokenizer = tokenizer_builder.build();

View File

@ -8,7 +8,7 @@ use obkv::KvReaderU16;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use super::helpers::{ use super::helpers::{
create_sorter, create_writer, merge_roaring_bitmaps, serialize_roaring_bitmap, create_sorter, create_writer, merge_cbo_roaring_bitmaps, serialize_roaring_bitmap,
sorter_into_reader, try_split_array_at, writer_into_reader, GrenadParameters, sorter_into_reader, try_split_array_at, writer_into_reader, GrenadParameters,
}; };
use crate::error::SerializationError; use crate::error::SerializationError;
@ -36,15 +36,12 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
let mut word_fid_docids_sorter = create_sorter( let mut word_fid_docids_sorter = create_sorter(
grenad::SortAlgorithm::Unstable, grenad::SortAlgorithm::Unstable,
merge_roaring_bitmaps, merge_cbo_roaring_bitmaps,
indexer.chunk_compression_type, indexer.chunk_compression_type,
indexer.chunk_compression_level, indexer.chunk_compression_level,
indexer.max_nb_chunks, indexer.max_nb_chunks,
max_memory.map(|x| x / 3), max_memory.map(|x| x / 3),
); );
let mut current_document_id = None;
let mut fid = 0;
let mut key_buffer = Vec::new(); let mut key_buffer = Vec::new();
let mut value_buffer = Vec::new(); let mut value_buffer = Vec::new();
let mut words = BTreeSet::new(); let mut words = BTreeSet::new();
@ -55,28 +52,12 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
let (fid_bytes, _) = try_split_array_at(fid_bytes) let (fid_bytes, _) = try_split_array_at(fid_bytes)
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
let document_id = u32::from_be_bytes(document_id_bytes); let document_id = u32::from_be_bytes(document_id_bytes);
fid = u16::from_be_bytes(fid_bytes); let fid = u16::from_be_bytes(fid_bytes);
// drain the btreemaps when we change document.
if current_document_id.map_or(false, |id| id != document_id) {
words_into_sorter(
document_id,
fid,
&mut key_buffer,
&mut value_buffer,
&mut words,
&mut word_fid_docids_sorter,
)?;
}
current_document_id = Some(document_id);
for (_pos, word) in KvReaderU16::new(&value).iter() { for (_pos, word) in KvReaderU16::new(&value).iter() {
words.insert(word.to_vec()); words.insert(word.to_vec());
} }
}
// We must make sure that we don't lose the current document field id
if let Some(document_id) = current_document_id {
words_into_sorter( words_into_sorter(
document_id, document_id,
fid, fid,
@ -85,11 +66,13 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
&mut words, &mut words,
&mut word_fid_docids_sorter, &mut word_fid_docids_sorter,
)?; )?;
words.clear();
} }
let mut word_docids_sorter = create_sorter( let mut word_docids_sorter = create_sorter(
grenad::SortAlgorithm::Unstable, grenad::SortAlgorithm::Unstable,
merge_roaring_bitmaps, merge_cbo_roaring_bitmaps,
indexer.chunk_compression_type, indexer.chunk_compression_type,
indexer.chunk_compression_level, indexer.chunk_compression_level,
indexer.max_nb_chunks, indexer.max_nb_chunks,
@ -98,7 +81,7 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
let mut exact_word_docids_sorter = create_sorter( let mut exact_word_docids_sorter = create_sorter(
grenad::SortAlgorithm::Unstable, grenad::SortAlgorithm::Unstable,
merge_roaring_bitmaps, merge_cbo_roaring_bitmaps,
indexer.chunk_compression_type, indexer.chunk_compression_type,
indexer.chunk_compression_level, indexer.chunk_compression_level,
indexer.max_nb_chunks, indexer.max_nb_chunks,
@ -142,15 +125,13 @@ fn words_into_sorter(
word_fid_docids_sorter: &mut grenad::Sorter<MergeFn>, word_fid_docids_sorter: &mut grenad::Sorter<MergeFn>,
) -> Result<()> { ) -> Result<()> {
puffin::profile_function!(); puffin::profile_function!();
let bitmap = RoaringBitmap::from_iter(Some(document_id));
serialize_roaring_bitmap(&bitmap, value_buffer)?;
for word_bytes in words.iter() { for word_bytes in words.iter() {
key_buffer.clear(); key_buffer.clear();
key_buffer.extend_from_slice(&word_bytes); key_buffer.extend_from_slice(&word_bytes);
key_buffer.push(0); key_buffer.push(0);
key_buffer.extend_from_slice(&fid.to_be_bytes()); key_buffer.extend_from_slice(&fid.to_be_bytes());
word_fid_docids_sorter.insert(&key_buffer, &value_buffer)?; word_fid_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
} }
words.clear(); words.clear();

View File

@ -56,7 +56,7 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
} }
document_word_positions_into_sorter( document_word_positions_into_sorter(
document_id, current_document_id.unwrap(),
&word_pair_proximity, &word_pair_proximity,
&mut word_pair_proximity_docids_sorter, &mut word_pair_proximity_docids_sorter,
)?; )?;
@ -64,6 +64,8 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
word_positions.clear(); word_positions.clear();
} }
current_document_id = Some(document_id);
for (position, word) in KvReaderU16::new(&value).iter() { for (position, word) in KvReaderU16::new(&value).iter() {
// drain the proximity window until the head word is considered close to the word we are inserting. // drain the proximity window until the head word is considered close to the word we are inserting.
while word_positions.get(0).map_or(false, |(_w, p)| { while word_positions.get(0).map_or(false, |(_w, p)| {

View File

@ -35,7 +35,7 @@ pub fn extract_word_position_docids<R: io::Read + io::Seek>(
); );
let mut word_positions: HashSet<(u16, Vec<u8>)> = HashSet::new(); let mut word_positions: HashSet<(u16, Vec<u8>)> = HashSet::new();
let mut current_document_id = None; let mut current_document_id: Option<u32> = None;
let mut key_buffer = Vec::new(); let mut key_buffer = Vec::new();
let mut cursor = docid_word_positions.into_cursor()?; let mut cursor = docid_word_positions.into_cursor()?;
while let Some((key, value)) = cursor.move_on_next()? { while let Some((key, value)) = cursor.move_on_next()? {
@ -49,7 +49,8 @@ pub fn extract_word_position_docids<R: io::Read + io::Seek>(
key_buffer.extend_from_slice(word_bytes); key_buffer.extend_from_slice(word_bytes);
key_buffer.push(0); key_buffer.push(0);
key_buffer.extend_from_slice(&position.to_be_bytes()); key_buffer.extend_from_slice(&position.to_be_bytes());
word_position_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; word_position_docids_sorter
.insert(&key_buffer, current_document_id.unwrap().to_ne_bytes())?;
} }
word_positions.clear(); word_positions.clear();
} }

View File

@ -181,7 +181,7 @@ pub(crate) fn data_from_obkv_documents(
indexer, indexer,
lmdb_writer_sx.clone(), lmdb_writer_sx.clone(),
move |doc_word_pos, indexer| extract_word_docids(doc_word_pos, indexer, &exact_attributes), move |doc_word_pos, indexer| extract_word_docids(doc_word_pos, indexer, &exact_attributes),
merge_roaring_bitmaps, merge_cbo_roaring_bitmaps,
|(word_docids_reader, exact_word_docids_reader, word_fid_docids_reader)| { |(word_docids_reader, exact_word_docids_reader, word_fid_docids_reader)| {
TypedChunk::WordDocids { TypedChunk::WordDocids {
word_docids_reader, word_docids_reader,

View File

@ -38,7 +38,7 @@ use crate::update::{
self, DeletionStrategy, IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep, self, DeletionStrategy, IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep,
WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst, WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst,
}; };
use crate::{Index, Result, RoaringBitmapCodec}; use crate::{CboRoaringBitmapCodec, Index, Result, RoaringBitmapCodec};
static MERGED_DATABASE_COUNT: usize = 7; static MERGED_DATABASE_COUNT: usize = 7;
static PREFIX_DATABASE_COUNT: usize = 5; static PREFIX_DATABASE_COUNT: usize = 5;
@ -700,8 +700,8 @@ where
fn execute_word_prefix_docids( fn execute_word_prefix_docids(
txn: &mut heed::RwTxn, txn: &mut heed::RwTxn,
reader: grenad::Reader<Cursor<ClonableMmap>>, reader: grenad::Reader<Cursor<ClonableMmap>>,
word_docids_db: Database<Str, RoaringBitmapCodec>, word_docids_db: Database<Str, CboRoaringBitmapCodec>,
word_prefix_docids_db: Database<Str, RoaringBitmapCodec>, word_prefix_docids_db: Database<Str, CboRoaringBitmapCodec>,
indexer_config: &IndexerConfig, indexer_config: &IndexerConfig,
new_prefix_fst_words: &[String], new_prefix_fst_words: &[String],
common_prefix_fst_words: &[&[String]], common_prefix_fst_words: &[&[String]],

View File

@ -156,7 +156,7 @@ pub(crate) fn write_typed_chunk_into_index(
wtxn, wtxn,
index_is_empty, index_is_empty,
|value, _buffer| Ok(value), |value, _buffer| Ok(value),
merge_roaring_bitmaps, merge_cbo_roaring_bitmaps,
)?; )?;
let exact_word_docids_iter = unsafe { as_cloneable_grenad(&exact_word_docids_reader) }?; let exact_word_docids_iter = unsafe { as_cloneable_grenad(&exact_word_docids_reader) }?;
@ -166,7 +166,7 @@ pub(crate) fn write_typed_chunk_into_index(
wtxn, wtxn,
index_is_empty, index_is_empty,
|value, _buffer| Ok(value), |value, _buffer| Ok(value),
merge_roaring_bitmaps, merge_cbo_roaring_bitmaps,
)?; )?;
let word_fid_docids_iter = unsafe { as_cloneable_grenad(&word_fid_docids_reader) }?; let word_fid_docids_iter = unsafe { as_cloneable_grenad(&word_fid_docids_reader) }?;

View File

@ -5,15 +5,15 @@ use heed::types::{ByteSlice, Str};
use heed::Database; use heed::Database;
use crate::update::index_documents::{ use crate::update::index_documents::{
create_sorter, merge_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key, create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key,
CursorClonableMmap, MergeFn, CursorClonableMmap, MergeFn,
}; };
use crate::{Result, RoaringBitmapCodec}; use crate::{CboRoaringBitmapCodec, Result, RoaringBitmapCodec};
pub struct WordPrefixDocids<'t, 'u, 'i> { pub struct WordPrefixDocids<'t, 'u, 'i> {
wtxn: &'t mut heed::RwTxn<'i, 'u>, wtxn: &'t mut heed::RwTxn<'i, 'u>,
word_docids: Database<Str, RoaringBitmapCodec>, word_docids: Database<Str, CboRoaringBitmapCodec>,
word_prefix_docids: Database<Str, RoaringBitmapCodec>, word_prefix_docids: Database<Str, CboRoaringBitmapCodec>,
pub(crate) chunk_compression_type: CompressionType, pub(crate) chunk_compression_type: CompressionType,
pub(crate) chunk_compression_level: Option<u32>, pub(crate) chunk_compression_level: Option<u32>,
pub(crate) max_nb_chunks: Option<usize>, pub(crate) max_nb_chunks: Option<usize>,
@ -23,8 +23,8 @@ pub struct WordPrefixDocids<'t, 'u, 'i> {
impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
pub fn new( pub fn new(
wtxn: &'t mut heed::RwTxn<'i, 'u>, wtxn: &'t mut heed::RwTxn<'i, 'u>,
word_docids: Database<Str, RoaringBitmapCodec>, word_docids: Database<Str, CboRoaringBitmapCodec>,
word_prefix_docids: Database<Str, RoaringBitmapCodec>, word_prefix_docids: Database<Str, CboRoaringBitmapCodec>,
) -> WordPrefixDocids<'t, 'u, 'i> { ) -> WordPrefixDocids<'t, 'u, 'i> {
WordPrefixDocids { WordPrefixDocids {
wtxn, wtxn,
@ -51,7 +51,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
// and write into it at the same time, therefore we write into another file. // and write into it at the same time, therefore we write into another file.
let mut prefix_docids_sorter = create_sorter( let mut prefix_docids_sorter = create_sorter(
grenad::SortAlgorithm::Unstable, grenad::SortAlgorithm::Unstable,
merge_roaring_bitmaps, merge_cbo_roaring_bitmaps,
self.chunk_compression_type, self.chunk_compression_type,
self.chunk_compression_level, self.chunk_compression_level,
self.max_nb_chunks, self.max_nb_chunks,
@ -115,7 +115,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
self.wtxn, self.wtxn,
*self.word_prefix_docids.as_polymorph(), *self.word_prefix_docids.as_polymorph(),
prefix_docids_sorter, prefix_docids_sorter,
merge_roaring_bitmaps, merge_cbo_roaring_bitmaps,
)?; )?;
Ok(()) Ok(())