From 89ee2cf576858398ee160a0ed54d6494aedcecfc Mon Sep 17 00:00:00 2001
From: Kerollmops
Date: Thu, 18 Mar 2021 17:20:16 +0100
Subject: [PATCH] Introduce the TreeLevel struct

---
 infos/src/main.rs                             |  9 ++--
 .../heed_codec/str_level_position_codec.rs    | 13 +++--
 milli/src/lib.rs                              |  2 +
 milli/src/tree_level.rs                       | 69 +++++++++++++++++++
 milli/src/update/words_level_positions.rs     | 11 +++--
 5 files changed, 89 insertions(+), 15 deletions(-)
 create mode 100644 milli/src/tree_level.rs

diff --git a/infos/src/main.rs b/infos/src/main.rs
index 2c11d3783..0e6403d7b 100644
--- a/infos/src/main.rs
+++ b/infos/src/main.rs
@@ -5,7 +5,7 @@ use std::{str, io, fmt};
 use anyhow::Context;
 use byte_unit::Byte;
 use heed::EnvOpenOptions;
-use milli::Index;
+use milli::{Index, TreeLevel};
 use structopt::StructOpt;
 
 use Command::*;
@@ -561,13 +561,12 @@ fn words_level_positions_docids(
 
     for word in words.iter().map(AsRef::as_ref) {
         let range = {
-            let left = (word, 0, u32::min_value(), u32::min_value());
-            let right = (word, u8::max_value(), u32::max_value(), u32::max_value());
+            let left = (word, TreeLevel::min_value(), u32::min_value(), u32::min_value());
+            let right = (word, TreeLevel::max_value(), u32::max_value(), u32::max_value());
             left..=right
         };
         for result in index.word_level_position_docids.range(rtxn, &range)? {
             let ((w, level, left, right), docids) = result?;
-            if word != w { break }
 
             let count = docids.len().to_string();
             let docids = if debug {
@@ -575,7 +574,7 @@ fn words_level_positions_docids(
             } else {
                 format!("{:?}", docids.iter().collect::<Vec<_>>())
             };
-            let position_range = if level == 0 {
+            let position_range = if level == TreeLevel::min_value() {
                 format!("{:?}", left)
             } else {
                 format!("{:?}", left..=right)
diff --git a/milli/src/heed_codec/str_level_position_codec.rs b/milli/src/heed_codec/str_level_position_codec.rs
index c421c04b5..810e91940 100644
--- a/milli/src/heed_codec/str_level_position_codec.rs
+++ b/milli/src/heed_codec/str_level_position_codec.rs
@@ -1,12 +1,14 @@
 use std::borrow::Cow;
-use std::convert::TryInto;
+use std::convert::{TryFrom, TryInto};
 use std::mem::size_of;
 use std::str;
 
+use crate::TreeLevel;
+
 pub struct StrLevelPositionCodec;
 
 impl<'a> heed::BytesDecode<'a> for StrLevelPositionCodec {
-    type DItem = (&'a str, u8, u32, u32);
+    type DItem = (&'a str, TreeLevel, u32, u32);
 
     fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
         let footer_len = size_of::<u8>() + size_of::<u32>() * 2;
@@ -19,13 +21,14 @@ impl<'a> heed::BytesDecode<'a> for StrLevelPositionCodec {
         let (level, bytes) = bytes.split_first()?;
         let left = bytes[..4].try_into().map(u32::from_be_bytes).ok()?;
         let right = bytes[4..].try_into().map(u32::from_be_bytes).ok()?;
+        let level = TreeLevel::try_from(*level).ok()?;
 
-        Some((word, *level, left, right))
+        Some((word, level, left, right))
     }
 }
 
 impl<'a> heed::BytesEncode<'a> for StrLevelPositionCodec {
-    type EItem = (&'a str, u8, u32, u32);
+    type EItem = (&'a str, TreeLevel, u32, u32);
 
     fn bytes_encode((word, level, left, right): &Self::EItem) -> Option<Cow<[u8]>> {
         let left = left.to_be_bytes();
@@ -33,7 +36,7 @@ impl<'a> heed::BytesEncode<'a> for StrLevelPositionCodec {
 
         let mut bytes = Vec::with_capacity(word.len() + 1 + left.len() + right.len());
         bytes.extend_from_slice(word.as_bytes());
-        bytes.push(*level);
+        bytes.push((*level).into());
         bytes.extend_from_slice(&left[..]);
         bytes.extend_from_slice(&right[..]);
 
diff --git a/milli/src/lib.rs b/milli/src/lib.rs
index de5c6511e..03169bce7 100644
--- a/milli/src/lib.rs
+++ b/milli/src/lib.rs
@@ -9,6 +9,7 @@ pub mod facet;
 pub mod heed_codec;
 pub mod index;
 pub mod proximity;
+pub mod tree_level;
 pub mod update;
 
 use std::borrow::Cow;
@@ -27,6 +28,7 @@ pub use self::heed_codec::{RoaringBitmapCodec, BoRoaringBitmapCodec, CboRoaringB
 pub use self::heed_codec::{RoaringBitmapLenCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec};
 pub use self::index::Index;
 pub use self::search::{Search, FacetDistribution, FacetCondition, SearchResult, MatchingWords};
+pub use self::tree_level::TreeLevel;
 pub use self::update_store::UpdateStore;
 
 pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>;
diff --git a/milli/src/tree_level.rs b/milli/src/tree_level.rs
new file mode 100644
index 000000000..7ce2904e2
--- /dev/null
+++ b/milli/src/tree_level.rs
@@ -0,0 +1,69 @@
+use std::convert::TryFrom;
+use std::fmt;
+
+/// The highest raw value a [`TreeLevel`] can hold.
+///
+/// This is just before the lowest printable character (space, sp, 32)
+const MAX_VALUE: u8 = 31;
+
+/// The error returned when a raw `u8` cannot be converted into a [`TreeLevel`].
+#[derive(Debug, Copy, Clone, PartialEq, Eq)]
+pub enum Error {
+    /// The given value is above the highest supported level.
+    LevelTooHigh(u8),
+}
+
+impl fmt::Display for Error {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        match self {
+            Error::LevelTooHigh(level) => {
+                write!(f, "tree level {} is higher than the maximum of {}", level, MAX_VALUE)
+            }
+        }
+    }
+}
+
+impl std::error::Error for Error {}
+
+/// A level number that is guaranteed to be in the `0..=31` range.
+#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
+#[repr(transparent)]
+pub struct TreeLevel(u8);
+
+impl TreeLevel {
+    /// Returns the highest valid level.
+    pub const fn max_value() -> TreeLevel {
+        TreeLevel(MAX_VALUE)
+    }
+
+    /// Returns the lowest valid level, which is level 0.
+    pub const fn min_value() -> TreeLevel {
+        TreeLevel(0)
+    }
+}
+
+// Implementing `From` rather than `Into` still gives callers `.into()` through
+// the blanket `impl<T, U: From<T>> Into<U> for T` of the standard library.
+impl From<TreeLevel> for u8 {
+    fn from(level: TreeLevel) -> u8 {
+        level.0
+    }
+}
+
+impl TryFrom<u8> for TreeLevel {
+    type Error = Error;
+
+    /// Fails with `Error::LevelTooHigh` when `value` is above the highest valid level.
+    fn try_from(value: u8) -> Result<TreeLevel, Error> {
+        match value {
+            0..=MAX_VALUE => Ok(TreeLevel(value)),
+            _ => Err(Error::LevelTooHigh(value)),
+        }
+    }
+}
+
+impl fmt::Display for TreeLevel {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "{}", self.0)
+    }
+}
diff --git a/milli/src/update/words_level_positions.rs b/milli/src/update/words_level_positions.rs
index a7be248b6..4286fc780 100644
--- a/milli/src/update/words_level_positions.rs
+++ b/milli/src/update/words_level_positions.rs
@@ -1,4 +1,5 @@
 use std::cmp;
+use std::convert::TryFrom;
 use std::fs::File;
 use std::num::NonZeroUsize;
 
@@ -9,9 +10,9 @@ use log::debug;
 use roaring::RoaringBitmap;
 
 use crate::heed_codec::{StrLevelPositionCodec, CboRoaringBitmapCodec};
-use crate::Index;
 use crate::update::index_documents::WriteMethod;
 use crate::update::index_documents::{create_writer, writer_into_reader, write_into_lmdb_database};
+use crate::{Index, TreeLevel};
 
 pub struct WordsLevelPositions<'t, 'u, 'i> {
     wtxn: &'t mut heed::RwTxn<'i, 'u>,
@@ -105,8 +106,8 @@ fn compute_positions_levels(
         let (word, ()) = result?;
 
         let level_0_range = {
-            let left = (word, 0, u32::min_value(), u32::min_value());
-            let right = (word, 0, u32::max_value(), u32::max_value());
+            let left = (word, TreeLevel::min_value(), u32::min_value(), u32::min_value());
+            let right = (word, TreeLevel::min_value(), u32::max_value(), u32::max_value());
             left..=right
         };
 
@@ -117,7 +118,7 @@ fn compute_positions_levels(
         // Groups sizes are always a power of the original level_group_size and therefore a group
         // always maps groups of the previous level and never splits previous levels groups in half.
         let group_size_iter = (1u8..)
-            .map(|l| (l, level_group_size.get().pow(l as u32)))
+            .map(|l| (TreeLevel::try_from(l).unwrap(), level_group_size.get().pow(l as u32)))
             .take_while(|(_, s)| first_level_size / *s >= min_level_size.get());
 
         // As specified in the documentation, we also write the level 0 entries.
@@ -163,7 +164,7 @@ fn compute_positions_levels(
 fn write_level_entry(
     writer: &mut Writer<File>,
     word: &str,
-    level: u8,
+    level: TreeLevel,
     left: u32,
     right: u32,
     ids: &RoaringBitmap,