Add FacetsUpdate type that wraps incremental and bulk indexing methods

Authored and committed by Loïc Lecrenier, 2022-09-05 12:52:05 +02:00
parent 3d145d7f48
commit 9b55e582cd
11 changed files with 216 additions and 129 deletions
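
In short: the new `FacetsUpdate` type (added to the facet module below) wraps the two existing strategies behind one entry point, running a bulk rebuild when the facet database is empty and falling back to per-key incremental insertion otherwise. A minimal sketch of that dispatch rule, using a `BTreeMap` and plain integers as hypothetical stand-ins for the LMDB database and the docid bitmaps (none of this is milli's actual API):

use std::collections::BTreeMap;

// Hypothetical stand-ins: the real code maps facet keys to RoaringBitmaps
// of document ids inside LMDB, not strings to integers in a BTreeMap.
fn execute(new_data: Vec<(String, u64)>, db: &mut BTreeMap<String, u64>) {
    if db.is_empty() {
        // Bulk path: the database is fresh, so all entries can be written
        // in one sorted pass and the higher levels rebuilt from scratch.
        db.extend(new_data);
    } else {
        // Incremental path: insert each key, merging its docids with any
        // existing entry (the real code ORs two bitmaps together).
        for (key, docids) in new_data {
            *db.entry(key).or_default() |= docids;
        }
    }
}

fn main() {
    let mut db = BTreeMap::new();
    execute(vec![("facet:1".into(), 0b01)], &mut db); // bulk: db was empty
    execute(vec![("facet:1".into(), 0b10)], &mut db); // incremental: merge
    assert_eq!(db["facet:1"], 0b11);
}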

View File

@@ -1,10 +1,11 @@
+use std::borrow::Cow;
 use std::cmp;
 use std::fs::File;
 use std::num::NonZeroUsize;
 
 use grenad::CompressionType;
-use heed::types::ByteSlice;
-use heed::{BytesEncode, Error, RoTxn};
+use heed::types::{ByteSlice, DecodeIgnore};
+use heed::{BytesDecode, BytesEncode, Error, RoTxn, RwTxn};
 use log::debug;
 use roaring::RoaringBitmap;
 use time::OffsetDateTime;
@@ -14,21 +15,27 @@ use crate::facet::FacetType;
 use crate::heed_codec::facet::new::{
     FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice,
 };
-use crate::update::index_documents::{create_writer, write_into_lmdb_database, writer_into_reader};
-use crate::{FieldId, Index, Result};
+use crate::update::index_documents::{
+    create_writer, valid_lmdb_key, write_into_lmdb_database, writer_into_reader,
+};
+use crate::{CboRoaringBitmapCodec, FieldId, Index, Result};
 
 pub struct FacetsUpdateBulk<'i> {
     index: &'i Index,
     database: heed::Database<FacetKeyCodec<MyByteSlice>, FacetGroupValueCodec>,
-    pub(crate) chunk_compression_type: CompressionType,
-    pub(crate) chunk_compression_level: Option<u32>,
     level_group_size: usize,
     min_level_size: usize,
     facet_type: FacetType,
+    // None if level 0 does not need to be updated
+    new_data: Option<grenad::Reader<File>>,
 }
 
 impl<'i> FacetsUpdateBulk<'i> {
-    pub fn new(index: &'i Index, facet_type: FacetType) -> FacetsUpdateBulk<'i> {
+    pub fn new(
+        index: &'i Index,
+        facet_type: FacetType,
+        new_data: grenad::Reader<File>,
+    ) -> FacetsUpdateBulk<'i> {
         FacetsUpdateBulk {
             index,
             database: match facet_type {
@@ -39,11 +46,31 @@ impl<'i> FacetsUpdateBulk<'i> {
                     index.facet_id_f64_docids.remap_key_type::<FacetKeyCodec<MyByteSlice>>()
                 }
             },
-            chunk_compression_type: CompressionType::None,
-            chunk_compression_level: None,
             level_group_size: 4,
             min_level_size: 5,
             facet_type,
+            new_data: Some(new_data),
+        }
+    }
+
+    pub fn new_not_updating_level_0(
+        index: &'i Index,
+        facet_type: FacetType,
+    ) -> FacetsUpdateBulk<'i> {
+        FacetsUpdateBulk {
+            index,
+            database: match facet_type {
+                FacetType::String => {
+                    index.facet_id_string_docids.remap_key_type::<FacetKeyCodec<MyByteSlice>>()
+                }
+                FacetType::Number => {
+                    index.facet_id_f64_docids.remap_key_type::<FacetKeyCodec<MyByteSlice>>()
+                }
+            },
+            level_group_size: 4,
+            min_level_size: 5,
+            facet_type,
+            new_data: None,
         }
     }
@@ -70,39 +97,84 @@ impl<'i> FacetsUpdateBulk<'i> {
     }
 
     #[logging_timer::time("FacetsUpdateBulk::{}")]
-    pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> {
+    pub fn execute(mut self, wtxn: &mut heed::RwTxn) -> Result<()> {
         self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?;
-        debug!("Computing and writing the facet values levels docids into LMDB on disk...");
         // We get the faceted fields to be able to create the facet levels.
         let faceted_fields = self.index.faceted_fields_ids(wtxn)?.clone();
+        debug!("Computing and writing the facet values levels docids into LMDB on disk...");
         for &field_id in faceted_fields.iter() {
             self.clear_levels(wtxn, field_id)?;
         }
+        self.update_level0(wtxn)?;
 
-        let mut nested_wtxn = self.index.env.nested_write_txn(wtxn)?;
+        // let mut nested_wtxn = self.index.env.nested_write_txn(wtxn)?;
 
         for &field_id in faceted_fields.iter() {
-            let (level_readers, all_docids) =
-                self.compute_levels_for_field_id(field_id, &nested_wtxn)?;
+            let (level_readers, all_docids) = self.compute_levels_for_field_id(field_id, &wtxn)?;
 
-            let put_docids_fn = match self.facet_type {
-                FacetType::Number => Index::put_number_faceted_documents_ids,
-                FacetType::String => Index::put_string_faceted_documents_ids,
-            };
-            put_docids_fn(&self.index, &mut nested_wtxn, field_id, &all_docids)?;
+            self.index.put_faceted_documents_ids(wtxn, field_id, self.facet_type, &all_docids)?;
 
             for level_reader in level_readers {
-                // TODO: append instead of write with merge
-                write_into_lmdb_database(
-                    &mut nested_wtxn,
-                    *self.database.as_polymorph(),
-                    level_reader,
-                    |_, _| {
-                        Err(InternalError::IndexingMergingKeys { process: "facet string levels" })?
-                    },
-                )?;
+                let mut cursor = level_reader.into_cursor()?;
+                while let Some((k, v)) = cursor.move_on_next()? {
+                    let key = FacetKeyCodec::<DecodeIgnore>::bytes_decode(k).unwrap();
+                    let value = FacetGroupValueCodec::bytes_decode(v).unwrap();
+                    println!("inserting {key:?} {value:?}");
+                    self.database.remap_types::<ByteSlice, ByteSlice>().put(wtxn, k, v)?;
+                }
             }
         }
 
         Ok(())
     }
 
+    fn update_level0(&mut self, wtxn: &mut RwTxn) -> Result<()> {
+        let new_data = match self.new_data.take() {
+            Some(x) => x,
+            None => return Ok(()),
+        };
+
+        if self.database.is_empty(wtxn)? {
+            let mut buffer = Vec::new();
+            let mut database = self.database.iter_mut(wtxn)?.remap_types::<ByteSlice, ByteSlice>();
+            let mut cursor = new_data.into_cursor()?;
+            while let Some((key, value)) = cursor.move_on_next()? {
+                if valid_lmdb_key(key) {
+                    buffer.clear();
+                    // the group size for level 0
+                    buffer.push(1);
+                    // then we extend the buffer with the docids bitmap
+                    buffer.extend_from_slice(value);
+                    unsafe { database.append(key, &buffer)? };
+                }
+            }
+        } else {
+            let mut buffer = Vec::new();
+            let database = self.database.remap_types::<ByteSlice, ByteSlice>();
+            let mut cursor = new_data.into_cursor()?;
+            while let Some((key, value)) = cursor.move_on_next()? {
+                if valid_lmdb_key(key) {
+                    buffer.clear();
+                    // the group size for level 0
+                    buffer.push(1);
+                    // then we extend the buffer with the docids bitmap
+                    match database.get(wtxn, key)? {
+                        Some(prev_value) => {
+                            let old_bitmap = &prev_value[1..];
+                            CboRoaringBitmapCodec::merge_into(
+                                &[Cow::Borrowed(value), Cow::Borrowed(old_bitmap)],
+                                &mut buffer,
+                            )?;
+                        }
+                        None => {
+                            buffer.extend_from_slice(value);
+                        }
+                    };
+                    database.put(wtxn, key, &buffer)?;
+                }
+            }
+        }
+
+        Ok(())
+    }
@@ -114,16 +186,14 @@ impl<'i> FacetsUpdateBulk<'i> {
         field_id: FieldId,
         txn: &RoTxn,
     ) -> Result<(Vec<grenad::Reader<File>>, RoaringBitmap)> {
-        let algo = FacetsUpdateBulkAlgorithm {
+        // TODO: first check whether there is anything in level 0
+        let algo = ComputeHigherLevels {
            rtxn: txn,
            db: &self.database,
            field_id,
            level_group_size: self.level_group_size,
            min_level_size: self.min_level_size,
-           chunk_compression_type: self.chunk_compression_type,
-           chunk_compression_level: self.chunk_compression_level,
        };
-        // TODO: first check whether there is anything in level 0
 
         let mut all_docids = RoaringBitmap::new();
         let subwriters = algo.compute_higher_levels(32, &mut |bitmaps, _| {
@@ -138,16 +208,14 @@ impl<'i> FacetsUpdateBulk<'i> {
     }
 }
 
-pub struct FacetsUpdateBulkAlgorithm<'t> {
+struct ComputeHigherLevels<'t> {
     rtxn: &'t heed::RoTxn<'t>,
     db: &'t heed::Database<FacetKeyCodec<MyByteSlice>, FacetGroupValueCodec>,
-    chunk_compression_type: CompressionType,
-    chunk_compression_level: Option<u32>,
     field_id: u16,
     level_group_size: usize,
     min_level_size: usize,
 }
 
-impl<'t> FacetsUpdateBulkAlgorithm<'t> {
+impl<'t> ComputeHigherLevels<'t> {
     fn read_level_0(
         &self,
         handle_group: &mut dyn FnMut(&[RoaringBitmap], &'t [u8]) -> Result<()>,
@@ -215,11 +283,7 @@ impl<'t> FacetsUpdateBulkAlgorithm<'t> {
         // once we have computed `level_group_size` elements, we give the left bound
         // of those elements, and their bitmaps, to the level above
 
-        let mut cur_writer = create_writer(
-            self.chunk_compression_type,
-            self.chunk_compression_level,
-            tempfile::tempfile()?,
-        );
+        let mut cur_writer = create_writer(CompressionType::None, None, tempfile::tempfile()?);
         let mut cur_writer_len = 0;
 
         let mut group_sizes = vec![];
@@ -259,7 +323,7 @@ impl<'t> FacetsUpdateBulkAlgorithm<'t> {
             Ok(())
         })?;
         // don't forget to insert the leftover elements into the writer as well
-        if !bitmaps.is_empty() && cur_writer_len >= self.level_group_size * self.min_level_size {
+        if !bitmaps.is_empty() && cur_writer_len >= self.min_level_size {
             let left_bound = left_bounds.first().unwrap();
             handle_group(&bitmaps, left_bound)?;
             for ((bitmap, left_bound), group_size) in
@@ -274,7 +338,7 @@ impl<'t> FacetsUpdateBulkAlgorithm<'t> {
                 cur_writer_len += 1;
             }
         }
-        if cur_writer_len > self.level_group_size * self.min_level_size {
+        if cur_writer_len > self.min_level_size {
             sub_writers.push(writer_into_reader(cur_writer)?);
         }
         return Ok(sub_writers);
@@ -315,9 +379,9 @@ mod tests {
             documents.push(serde_json::json!({ "facet2": i }).as_object().unwrap().clone());
         }
         let documents = documents_batch_reader_from_objects(documents);
-
+        dbg!();
         index.add_documents(documents).unwrap();
-
+        dbg!();
         db_snap!(index, facet_id_f64_docids, name);
     };
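
A note on the encoding used by `update_level0` above: every value written to the level-0 database is one group-size byte (always `1` at level 0) followed by the serialized docids bitmap, and when a key already exists the old and new bitmaps are merged (via `CboRoaringBitmapCodec::merge_into`) before the value is rewritten. A self-contained sketch of that byte layout, with raw bytes standing in for the bitmap encoding:

// Sketch of the level-0 value layout: [group size byte | bitmap bytes].
// The byte slice here stands in for a serialized CboRoaringBitmap.
fn level0_value(bitmap_bytes: &[u8]) -> Vec<u8> {
    let mut buffer = Vec::with_capacity(1 + bitmap_bytes.len());
    buffer.push(1); // the group size for level 0 is always 1
    buffer.extend_from_slice(bitmap_bytes);
    buffer
}

fn main() {
    let value = level0_value(&[0xde, 0xad, 0xbe, 0xef]);
    assert_eq!(value[0], 1); // group size
    assert_eq!(&value[1..], &[0xde, 0xad, 0xbe, 0xef]); // bitmap payload
}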

View File

@@ -1,2 +1,90 @@
+use std::{collections::HashMap, fs::File};
+
+use grenad::CompressionType;
+use heed::BytesDecode;
+use roaring::RoaringBitmap;
+
+use crate::{
+    facet::FacetType,
+    heed_codec::facet::new::{FacetGroupValueCodec, FacetKeyCodec, MyByteSlice},
+    CboRoaringBitmapCodec, FieldId, Index, Result,
+};
+
+use super::{FacetsUpdateBulk, FacetsUpdateIncremental};
+
 pub mod bulk;
 pub mod incremental;
+
+pub struct FacetsUpdate<'i> {
+    index: &'i Index,
+    database: heed::Database<FacetKeyCodec<MyByteSlice>, FacetGroupValueCodec>,
+    level_group_size: u8,
+    max_level_group_size: u8,
+    min_level_size: u8,
+    facet_type: FacetType,
+    new_data: grenad::Reader<File>,
+}
+impl<'i> FacetsUpdate<'i> {
+    pub fn new(index: &'i Index, facet_type: FacetType, new_data: grenad::Reader<File>) -> Self {
+        let database = match facet_type {
+            FacetType::String => {
+                index.facet_id_string_docids.remap_key_type::<FacetKeyCodec<MyByteSlice>>()
+            }
+            FacetType::Number => {
+                index.facet_id_f64_docids.remap_key_type::<FacetKeyCodec<MyByteSlice>>()
+            }
+        };
+        Self {
+            index,
+            database,
+            level_group_size: 4,
+            max_level_group_size: 8,
+            min_level_size: 5,
+            facet_type,
+            new_data,
+        }
+    }
+
+    // /// The number of elements from the level below that are represented by a single element in the level above
+    // ///
+    // /// This setting is always greater than or equal to 2.
+    // pub fn level_group_size(&mut self, value: u8) -> &mut Self {
+    //     self.level_group_size = std::cmp::max(value, 2);
+    //     self
+    // }
+
+    // /// The minimum number of elements that a level is allowed to have.
+    // pub fn min_level_size(&mut self, value: u8) -> &mut Self {
+    //     self.min_level_size = std::cmp::max(value, 1);
+    //     self
+    // }
+
+    pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> {
+        if self.database.is_empty(wtxn)? {
+            let bulk_update = FacetsUpdateBulk::new(self.index, self.facet_type, self.new_data);
+            bulk_update.execute(wtxn)?;
+        } else {
+            let indexer = FacetsUpdateIncremental::new(self.database);
+
+            let mut new_faceted_docids = HashMap::<FieldId, RoaringBitmap>::default();
+
+            let mut cursor = self.new_data.into_cursor()?;
+            while let Some((key, value)) = cursor.move_on_next()? {
+                let key =
+                    FacetKeyCodec::<MyByteSlice>::bytes_decode(key).ok_or(heed::Error::Encoding)?;
+                let docids =
+                    CboRoaringBitmapCodec::bytes_decode(value).ok_or(heed::Error::Encoding)?;
+                indexer.insert(wtxn, key.field_id, key.left_bound, &docids)?;
+                *new_faceted_docids.entry(key.field_id).or_default() |= docids;
+            }
+
+            for (field_id, new_docids) in new_faceted_docids {
+                let mut docids =
+                    self.index.faceted_documents_ids(wtxn, field_id, self.facet_type)?;
+                docids |= new_docids;
+                self.index.put_faceted_documents_ids(wtxn, field_id, self.facet_type, &docids)?;
+            }
+        }
+        Ok(())
+    }
+}
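
With the wrapper in place, callers no longer choose a strategy themselves: as the typed-chunk diff below shows, writing a facet chunk reduces to constructing a `FacetsUpdate` and executing it. The new call shape, as it appears at those call sites (`index`, `wtxn`, and the grenad reader come from the surrounding indexing code):

// The two-line call site that replaces the old hand-rolled incremental loop.
let indexer = FacetsUpdate::new(index, FacetType::Number, facet_id_number_docids_iter);
indexer.execute(wtxn)?;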

View File

@@ -1,4 +1,4 @@
 ---
 source: milli/src/update/facet/bulk.rs
 ---
-947949d1a5c9c4e895c89fba46cbba68
+07718df52f8463335fb8fefcd3ae01f4

View File

@@ -1,4 +1,4 @@
 ---
 source: milli/src/update/facet/bulk.rs
 ---
-947949d1a5c9c4e895c89fba46cbba68
+07718df52f8463335fb8fefcd3ae01f4

View File

@@ -1,4 +1,4 @@
 ---
 source: milli/src/update/facet/bulk.rs
 ---
-947949d1a5c9c4e895c89fba46cbba68
+07718df52f8463335fb8fefcd3ae01f4

View File

@@ -1,4 +1,4 @@
 ---
 source: milli/src/update/facet/bulk.rs
 ---
-947949d1a5c9c4e895c89fba46cbba68
+07718df52f8463335fb8fefcd3ae01f4

View File

@@ -1,4 +1,4 @@
 ---
 source: milli/src/update/facet/bulk.rs
 ---
-947949d1a5c9c4e895c89fba46cbba68
+07718df52f8463335fb8fefcd3ae01f4

View File

@@ -1,4 +1,4 @@
 ---
 source: milli/src/update/facet/bulk.rs
 ---
-947949d1a5c9c4e895c89fba46cbba68
+07718df52f8463335fb8fefcd3ae01f4

View File

@@ -1,4 +1,4 @@
 ---
 source: milli/src/update/facet/bulk.rs
 ---
-5ce8009d3eb023e4b9c0a6e7fa4e6262
+3e6a91b3c54c614a4787224ac4278ed3

View File

@@ -1,4 +1,4 @@
 ---
 source: milli/src/update/facet/bulk.rs
 ---
-5ce8009d3eb023e4b9c0a6e7fa4e6262
+3e6a91b3c54c614a4787224ac4278ed3

View File

@@ -1,5 +1,4 @@
 use std::borrow::Cow;
-use std::collections::HashMap;
 use std::convert::TryInto;
 use std::fs::File;
 use std::io;
@@ -14,12 +13,12 @@ use super::helpers::{
     valid_lmdb_key, CursorClonableMmap,
 };
 use super::{ClonableMmap, MergeFn};
-use crate::heed_codec::facet::new::{FacetKeyCodec, MyByteSlice};
+use crate::facet::FacetType;
+use crate::update::facet::FacetsUpdate;
 use crate::update::index_documents::helpers::as_cloneable_grenad;
-use crate::update::FacetsUpdateIncremental;
 use crate::{
-    lat_lng_to_xyz, BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, FieldId, GeoPoint,
-    Index, Result,
+    lat_lng_to_xyz, BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index,
+    Result,
 };
 
 pub(crate) enum TypedChunk {
@@ -138,78 +137,14 @@ pub(crate) fn write_typed_chunk_into_index(
             )?;
             is_merged_database = true;
         }
-        TypedChunk::FieldIdFacetNumberDocids(facet_id_f64_docids_iter) => {
-            // merge cbo roaring bitmaps is not the correct merger because the data in the DB
-            // is FacetGroupValue and not RoaringBitmap
-            // so I need to create my own merging function
-
-            // facet_id_string_docids is encoded as:
-            // key: FacetKeyCodec<StrRefCodec>
-            // value: CboRoaringBitmapCodec
-            // basically
-
-            // TODO: a condition saying "if I have more than 1/50th of the DB to add,
-            // then I do it in bulk, otherwise I do it incrementally". But instead of 1/50,
-            // it is a ratio I determine empirically
-
-            // for now I only do it incrementally, to see if things work
-            let indexer = FacetsUpdateIncremental::new(
-                index.facet_id_f64_docids.remap_key_type::<FacetKeyCodec<MyByteSlice>>(),
-            );
-
-            let mut new_faceted_docids = HashMap::<FieldId, RoaringBitmap>::default();
-
-            let mut cursor = facet_id_f64_docids_iter.into_cursor()?;
-            while let Some((key, value)) = cursor.move_on_next()? {
-                let key =
-                    FacetKeyCodec::<MyByteSlice>::bytes_decode(key).ok_or(heed::Error::Encoding)?;
-                let docids =
-                    CboRoaringBitmapCodec::bytes_decode(value).ok_or(heed::Error::Encoding)?;
-                indexer.insert(wtxn, key.field_id, key.left_bound, &docids)?;
-                *new_faceted_docids.entry(key.field_id).or_default() |= docids;
-            }
-            for (field_id, new_docids) in new_faceted_docids {
-                let mut docids = index.number_faceted_documents_ids(wtxn, field_id)?;
-                docids |= new_docids;
-                index.put_number_faceted_documents_ids(wtxn, field_id, &docids)?;
-            }
+        TypedChunk::FieldIdFacetNumberDocids(facet_id_number_docids_iter) => {
+            let indexer = FacetsUpdate::new(index, FacetType::Number, facet_id_number_docids_iter);
+            indexer.execute(wtxn)?;
             is_merged_database = true;
         }
-        TypedChunk::FieldIdFacetStringDocids(facet_id_string_docids) => {
-            // merge cbo roaring bitmaps is not the correct merger because the data in the DB
-            // is FacetGroupValue and not RoaringBitmap
-            // so I need to create my own merging function
-
-            // facet_id_string_docids is encoded as:
-            // key: FacetKeyCodec<StrRefCodec>
-            // value: CboRoaringBitmapCodec
-            // basically
-
-            // TODO: a condition saying "if I have more than 1/50th of the DB to add,
-            // then I do it in bulk, otherwise I do it incrementally". But instead of 1/50,
-            // it is a ratio I determine empirically
-
-            // for now I only do it incrementally, to see if things work
-            let indexer = FacetsUpdateIncremental::new(
-                index.facet_id_string_docids.remap_key_type::<FacetKeyCodec<MyByteSlice>>(),
-            );
-
-            let mut new_faceted_docids = HashMap::<FieldId, RoaringBitmap>::default();
-
-            let mut cursor = facet_id_string_docids.into_cursor()?;
-            while let Some((key, value)) = cursor.move_on_next()? {
-                let key =
-                    FacetKeyCodec::<MyByteSlice>::bytes_decode(key).ok_or(heed::Error::Encoding)?;
-                let docids =
-                    CboRoaringBitmapCodec::bytes_decode(value).ok_or(heed::Error::Encoding)?;
-                indexer.insert(wtxn, key.field_id, key.left_bound, &docids)?;
-                *new_faceted_docids.entry(key.field_id).or_default() |= docids;
-            }
-            for (field_id, new_docids) in new_faceted_docids {
-                let mut docids = index.string_faceted_documents_ids(wtxn, field_id)?;
-                docids |= new_docids;
-                index.put_string_faceted_documents_ids(wtxn, field_id, &docids)?;
-            }
+        TypedChunk::FieldIdFacetStringDocids(facet_id_string_docids_iter) => {
+            let indexer = FacetsUpdate::new(index, FacetType::String, facet_id_string_docids_iter);
+            indexer.execute(wtxn)?;
             is_merged_database = true;
         }
         TypedChunk::FieldIdFacetExistsDocids(facet_id_exists_docids) => {