diff --git a/milli/src/update/words_level_positions.rs b/milli/src/update/words_level_positions.rs index 983f82657..0a7bc484d 100644 --- a/milli/src/update/words_level_positions.rs +++ b/milli/src/update/words_level_positions.rs @@ -3,12 +3,11 @@ use std::fs::File; use std::num::NonZeroUsize; use grenad::{CompressionType, Reader, Writer, FileFuse}; -use heed::types::{ByteSlice, DecodeIgnore}; +use heed::types::DecodeIgnore; use heed::{BytesEncode, Error}; use log::debug; use roaring::RoaringBitmap; -use crate::facet::FacetType; use crate::heed_codec::{StrLevelPositionCodec, CboRoaringBitmapCodec}; use crate::Index; use crate::update::index_documents::WriteMethod; @@ -69,12 +68,16 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { self.min_level_size, )?; + // The previously computed entries also defines the level 0 entries + // so we can clear the database and append all of these entries. + self.index.word_level_position_docids.clear(self.wtxn)?; + write_into_lmdb_database( self.wtxn, *self.index.facet_field_id_value_docids.as_polymorph(), entries, |_, _| anyhow::bail!("invalid facet level merging"), - WriteMethod::GetMergePut, + WriteMethod::Append, )?; Ok(()) @@ -107,77 +110,79 @@ fn compute_positions_levels( min_level_size: NonZeroUsize, ) -> anyhow::Result> { - // let first_level_size = db.prefix_iter(rtxn, &[field_id])? - // .remap_types::() - // .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; + // It is forbidden to keep a cursor and write in a database at the same time with LMDB + // therefore we write the facet levels entries into a grenad file before transfering them. + let mut writer = tempfile::tempfile().and_then(|file| { + create_writer(compression_type, compression_level, file) + })?; - // // It is forbidden to keep a cursor and write in a database at the same time with LMDB - // // therefore we write the facet levels entries into a grenad file before transfering them. - // let mut writer = tempfile::tempfile().and_then(|file| { - // create_writer(compression_type, compression_level, file) - // })?; + for result in db.iter(rtxn)? { + let ((word, level, left, right), docids) = result?; - // let level_0_range = { - // let left = (field_id, 0, T::min_value(), T::min_value()); - // let right = (field_id, 0, T::max_value(), T::max_value()); - // left..=right - // }; + let first_level_size = db.remap_data_type::() + .prefix_iter(rtxn, &(word, level, u32::min_value(), u32::min_value()))? + .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; - // // Groups sizes are always a power of the original level_group_size and therefore a group - // // always maps groups of the previous level and never splits previous levels groups in half. - // let group_size_iter = (1u8..) - // .map(|l| (l, level_group_size.get().pow(l as u32))) - // .take_while(|(_, s)| first_level_size / *s >= min_level_size.get()); + let level_0_range = { + let left = (word, 0, u32::min_value(), u32::min_value()); + let right = (word, 0, u32::max_value(), u32::max_value()); + left..=right + }; - // for (level, group_size) in group_size_iter { - // let mut left = T::zero(); - // let mut right = T::zero(); - // let mut group_docids = RoaringBitmap::new(); + // Groups sizes are always a power of the original level_group_size and therefore a group + // always maps groups of the previous level and never splits previous levels groups in half. + let group_size_iter = (1u8..) + .map(|l| (l, level_group_size.get().pow(l as u32))) + .take_while(|(_, s)| first_level_size / *s >= min_level_size.get()); - // let db = db.remap_key_type::(); - // for (i, result) in db.range(rtxn, &level_0_range)?.enumerate() { - // let ((_field_id, _level, value, _right), docids) = result?; + // As specified in the documentation, we also write the level 0 entries. + write_level_entry(&mut writer, word, level, left, right, &docids)?; - // if i == 0 { - // left = value; - // } else if i % group_size == 0 { - // // we found the first bound of the next group, we must store the left - // // and right bounds associated with the docids. - // write_entry::(&mut writer, field_id, level, left, right, &group_docids)?; + for (level, group_size) in group_size_iter { + let mut left = 0; + let mut right = 0; + let mut group_docids = RoaringBitmap::new(); - // // We save the left bound for the new group and also reset the docids. - // group_docids = RoaringBitmap::new(); - // left = value; - // } + for (i, result) in db.range(rtxn, &level_0_range)?.enumerate() { + let ((_field_id, _level, value, _right), docids) = result?; - // // The right bound is always the bound we run through. - // group_docids.union_with(&docids); - // right = value; - // } + if i == 0 { + left = value; + } else if i % group_size == 0 { + // we found the first bound of the next group, we must store the left + // and right bounds associated with the docids. + write_level_entry(&mut writer, word, level, left, right, &group_docids)?; - // if !group_docids.is_empty() { - // write_entry::(&mut writer, field_id, level, left, right, &group_docids)?; - // } - // } + // We save the left bound for the new group and also reset the docids. + group_docids = RoaringBitmap::new(); + left = value; + } - // writer_into_reader(writer, shrink_size) + // The right bound is always the bound we run through. + group_docids.union_with(&docids); + right = value; + } - todo!() + if !group_docids.is_empty() { + write_level_entry(&mut writer, word, level, left, right, &group_docids)?; + } + } + } + + writer_into_reader(writer, shrink_size) } -fn write_entry( +fn write_level_entry( writer: &mut Writer, - field_id: u8, + word: &str, level: u8, - left: T, - right: T, + left: u32, + right: u32, ids: &RoaringBitmap, ) -> anyhow::Result<()> -where - KC: for<'x> heed::BytesEncode<'x, EItem = (u8, u8, T, T)>, { - let key = (field_id, level, left, right); - let key = KC::bytes_encode(&key).ok_or(Error::Encoding)?; + let key = (word, level, left, right); + let key = StrLevelPositionCodec::bytes_encode(&key).ok_or(Error::Encoding)?; let data = CboRoaringBitmapCodec::bytes_encode(&ids).ok_or(Error::Encoding)?; writer.insert(&key, &data)?; Ok(())