From 1dbbd8694feb66c07cb2eef2144ff785fba16604 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 14 Sep 2022 14:01:53 +0200 Subject: [PATCH] Rename StrStrU8Codec to U8StrStrCodec and reorder its fields --- milli/src/heed_codec/mod.rs | 2 +- milli/src/heed_codec/str_str_u8_codec.rs | 28 +++---- milli/src/index.rs | 6 +- milli/src/lib.rs | 2 +- milli/src/search/criteria/mod.rs | 4 +- milli/src/snapshot_tests.rs | 6 +- .../word_prefix_pair_proximity_docids.rs | 74 +++++++++---------- 7 files changed, 61 insertions(+), 61 deletions(-) diff --git a/milli/src/heed_codec/mod.rs b/milli/src/heed_codec/mod.rs index f3691b7d8..e07e47c79 100644 --- a/milli/src/heed_codec/mod.rs +++ b/milli/src/heed_codec/mod.rs @@ -15,4 +15,4 @@ pub use self::roaring_bitmap_length::{ BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec, }; pub use self::str_beu32_codec::StrBEU32Codec; -pub use self::str_str_u8_codec::{StrStrU8Codec, UncheckedStrStrU8Codec}; +pub use self::str_str_u8_codec::{U8StrStrCodec, UncheckedU8StrStrCodec}; diff --git a/milli/src/heed_codec/str_str_u8_codec.rs b/milli/src/heed_codec/str_str_u8_codec.rs index 6cfff3ecf..60be8ddc7 100644 --- a/milli/src/heed_codec/str_str_u8_codec.rs +++ b/milli/src/heed_codec/str_str_u8_codec.rs @@ -1,10 +1,10 @@ use std::borrow::Cow; use std::str; -pub struct StrStrU8Codec; +pub struct U8StrStrCodec; -impl<'a> heed::BytesDecode<'a> for StrStrU8Codec { - type DItem = (&'a str, &'a str, u8); +impl<'a> heed::BytesDecode<'a> for U8StrStrCodec { + type DItem = (u8, &'a str, &'a str); fn bytes_decode(bytes: &'a [u8]) -> Option { let (n, bytes) = bytes.split_first()?; @@ -13,14 +13,14 @@ impl<'a> heed::BytesDecode<'a> for StrStrU8Codec { let s2_bytes = &rest[1..]; let s1 = str::from_utf8(s1_bytes).ok()?; let s2 = str::from_utf8(s2_bytes).ok()?; - Some((s1, s2, *n)) + Some((*n, s1, s2)) } } -impl<'a> heed::BytesEncode<'a> for StrStrU8Codec { - type EItem = (&'a str, &'a str, u8); +impl<'a> 
heed::BytesEncode<'a> for U8StrStrCodec { + type EItem = (u8, &'a str, &'a str); - fn bytes_encode((s1, s2, n): &Self::EItem) -> Option> { + fn bytes_encode((n, s1, s2): &Self::EItem) -> Option> { let mut bytes = Vec::with_capacity(s1.len() + s2.len() + 1); bytes.push(*n); bytes.extend_from_slice(s1.as_bytes()); @@ -29,24 +29,24 @@ impl<'a> heed::BytesEncode<'a> for StrStrU8Codec { Some(Cow::Owned(bytes)) } } -pub struct UncheckedStrStrU8Codec; +pub struct UncheckedU8StrStrCodec; -impl<'a> heed::BytesDecode<'a> for UncheckedStrStrU8Codec { - type DItem = (&'a [u8], &'a [u8], u8); +impl<'a> heed::BytesDecode<'a> for UncheckedU8StrStrCodec { + type DItem = (u8, &'a [u8], &'a [u8]); fn bytes_decode(bytes: &'a [u8]) -> Option { let (n, bytes) = bytes.split_first()?; let s1_end = bytes.iter().position(|b| *b == 0)?; let (s1_bytes, rest) = bytes.split_at(s1_end); let s2_bytes = &rest[1..]; - Some((s1_bytes, s2_bytes, *n)) + Some((*n, s1_bytes, s2_bytes)) } } -impl<'a> heed::BytesEncode<'a> for UncheckedStrStrU8Codec { - type EItem = (&'a [u8], &'a [u8], u8); +impl<'a> heed::BytesEncode<'a> for UncheckedU8StrStrCodec { + type EItem = (u8, &'a [u8], &'a [u8]); - fn bytes_encode((s1, s2, n): &Self::EItem) -> Option> { + fn bytes_encode((n, s1, s2): &Self::EItem) -> Option> { let mut bytes = Vec::with_capacity(s1.len() + s2.len() + 1); bytes.push(*n); bytes.extend_from_slice(s1); diff --git a/milli/src/index.rs b/milli/src/index.rs index 0dccabf03..f1bc2fa10 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -21,7 +21,7 @@ use crate::{ default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId, FieldIdWordCountCodec, GeoPoint, ObkvCodec, Result, RoaringBitmapCodec, RoaringBitmapLenCodec, - Search, StrBEU32Codec, StrStrU8Codec, BEU16, BEU32, + Search, StrBEU32Codec, U8StrStrCodec, BEU16, BEU32, }; pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5; @@ -106,9 
+106,9 @@ pub struct Index { pub docid_word_positions: Database, /// Maps the proximity between a pair of words with all the docids where this relation appears. - pub word_pair_proximity_docids: Database, + pub word_pair_proximity_docids: Database, /// Maps the proximity between a pair of word and prefix with all the docids where this relation appears. - pub word_prefix_pair_proximity_docids: Database, + pub word_prefix_pair_proximity_docids: Database, /// Maps the word and the position with the docids that corresponds to it. pub word_position_docids: Database, diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 517d28ccc..b5671b33b 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -37,7 +37,7 @@ pub use self::fields_ids_map::FieldsIdsMap; pub use self::heed_codec::{ BEU32StrCodec, BoRoaringBitmapCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, FieldIdWordCountCodec, ObkvCodec, RoaringBitmapCodec, - RoaringBitmapLenCodec, StrBEU32Codec, StrStrU8Codec, UncheckedStrStrU8Codec, + RoaringBitmapLenCodec, StrBEU32Codec, U8StrStrCodec, UncheckedU8StrStrCodec, }; pub use self::index::Index; pub use self::search::{ diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 866eaefde..86cec1ddc 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -138,7 +138,7 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> { right: &str, proximity: u8, ) -> heed::Result> { - let key = (left, right, proximity); + let key = (proximity, left, right); self.index.word_pair_proximity_docids.get(self.rtxn, &key) } @@ -148,7 +148,7 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> { right: &str, proximity: u8, ) -> heed::Result> { - let key = (left, right, proximity); + let key = (proximity, left, right); self.index.word_prefix_pair_proximity_docids.get(self.rtxn, &key) } diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index 17f490758..b4eee7dfe 100644 --- 
a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -182,16 +182,16 @@ pub fn snap_docid_word_positions(index: &Index) -> String { } pub fn snap_word_pair_proximity_docids(index: &Index) -> String { let snap = make_db_snap_from_iter!(index, word_pair_proximity_docids, |( - (word1, word2, proximity), + (proximity, word1, word2), b, )| { - &format!("{word1:<16} {word2:<16} {proximity:<2} {}", display_bitmap(&b)) + &format!("{proximity:<2} {word1:<16} {word2:<16} {}", display_bitmap(&b)) }); snap } pub fn snap_word_prefix_pair_proximity_docids(index: &Index) -> String { let snap = make_db_snap_from_iter!(index, word_prefix_pair_proximity_docids, |( - (word1, prefix, proximity), + (proximity, word1, prefix), b, )| { &format!("{proximity:<2} {word1:<16} {prefix:<4} {}", display_bitmap(&b)) diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs index f919aecc7..77294296f 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -177,7 +177,7 @@ use log::debug; use crate::update::index_documents::{ create_writer, merge_cbo_roaring_bitmaps, CursorClonableMmap, }; -use crate::{CboRoaringBitmapCodec, Index, Result, UncheckedStrStrU8Codec}; +use crate::{CboRoaringBitmapCodec, Index, Result, UncheckedU8StrStrCodec}; pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, @@ -259,9 +259,9 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { &mut cursor, |cursor| { if let Some((key, value)) = cursor.move_on_next()? 
{ - let (word1, word2, proximity) = UncheckedStrStrU8Codec::bytes_decode(key) + let (proximity, word1, word2) = UncheckedU8StrStrCodec::bytes_decode(key) .ok_or(heed::Error::Decoding)?; - Ok(Some(((word1, word2, proximity), value))) + Ok(Some(((proximity, word1, word2), value))) } else { Ok(None) } @@ -293,7 +293,7 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { let mut db_iter = self .index .word_pair_proximity_docids - .remap_key_type::() + .remap_key_type::() .remap_data_type::() .iter(self.wtxn)?; @@ -358,7 +358,7 @@ fn execute_on_word_pairs_and_prefixes( mut next_word_pair_proximity: impl for<'a> FnMut( &'a mut I, ) -> Result< - Option<((&'a [u8], &'a [u8], u8), &'a [u8])>, + Option<((u8, &'a [u8], &'a [u8]), &'a [u8])>, >, prefixes: &PrefixTrieNode, max_proximity: u8, @@ -376,14 +376,14 @@ fn execute_on_word_pairs_and_prefixes( let mut prefix_buffer = Vec::with_capacity(8); let mut merge_buffer = Vec::with_capacity(65_536); - while let Some(((word1, word2, proximity), data)) = next_word_pair_proximity(iter)? { + while let Some(((proximity, word1, word2), data)) = next_word_pair_proximity(iter)? 
{ // skip this iteration if the proximity is over the threshold if proximity > max_proximity { break; }; let word2_start_different_than_prev = word2[0] != prev_word2_start; // if there were no potential prefixes for the previous word2 based on its first letter, - // and if the current word2 starts with the same letter, then there is also no potential + // and if the current word2 starts with the same letter, then there is also no potential // prefixes for the current word2, and we can skip to the next iteration if empty_prefixes && !word2_start_different_than_prev { continue; @@ -683,7 +683,7 @@ mod tests { use super::*; use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use crate::index::tests::TempIndex; - use crate::{db_snap, CboRoaringBitmapCodec, StrStrU8Codec}; + use crate::{db_snap, CboRoaringBitmapCodec, U8StrStrCodec}; fn documents_with_enough_different_words_for_prefixes(prefixes: &[&str]) -> Vec { let mut documents = Vec::new(); @@ -858,40 +858,40 @@ mod tests { CboRoaringBitmapCodec::serialize_into(&bitmap_ranges, &mut serialised_bitmap_ranges); let word_pairs = [ - (("healthy", "arbres", 1), &serialised_bitmap123), - (("healthy", "boat", 1), &serialised_bitmap123), - (("healthy", "ca", 1), &serialised_bitmap123), - (("healthy", "cats", 1), &serialised_bitmap456), - (("healthy", "cattos", 1), &serialised_bitmap123), - (("jittery", "cat", 1), &serialised_bitmap123), - (("jittery", "cata", 1), &serialised_bitmap456), - (("jittery", "catb", 1), &serialised_bitmap789), - (("jittery", "catc", 1), &serialised_bitmap_ranges), - (("healthy", "arbre", 2), &serialised_bitmap123), - (("healthy", "arbres", 2), &serialised_bitmap456), - (("healthy", "cats", 2), &serialised_bitmap789), - (("healthy", "cattos", 2), &serialised_bitmap_ranges), - (("healthy", "arbre", 3), &serialised_bitmap456), - (("healthy", "arbres", 3), &serialised_bitmap789), + ((1, "healthy", "arbres"), &serialised_bitmap123), + ((1, "healthy", "boat"), &serialised_bitmap123), + 
((1, "healthy", "ca"), &serialised_bitmap123), + ((1, "healthy", "cats"), &serialised_bitmap456), + ((1, "healthy", "cattos"), &serialised_bitmap123), + ((1, "jittery", "cat"), &serialised_bitmap123), + ((1, "jittery", "cata"), &serialised_bitmap456), + ((1, "jittery", "catb"), &serialised_bitmap789), + ((1, "jittery", "catc"), &serialised_bitmap_ranges), + ((2, "healthy", "arbre"), &serialised_bitmap123), + ((2, "healthy", "arbres"), &serialised_bitmap456), + ((2, "healthy", "cats"), &serialised_bitmap789), + ((2, "healthy", "cattos"), &serialised_bitmap_ranges), + ((3, "healthy", "arbre"), &serialised_bitmap456), + ((3, "healthy", "arbres"), &serialised_bitmap789), ]; let expected_result = [ - (("healthy", "arb", 1), bitmap123.clone()), - (("healthy", "arbre", 1), bitmap123.clone()), - (("healthy", "cat", 1), &bitmap456 | &bitmap123), - (("healthy", "catto", 1), bitmap123.clone()), - (("jittery", "cat", 1), (&bitmap123 | &bitmap456 | &bitmap789 | &bitmap_ranges)), - (("healthy", "arb", 2), &bitmap123 | &bitmap456), - (("healthy", "arbre", 2), &bitmap123 | &bitmap456), - (("healthy", "cat", 2), &bitmap789 | &bitmap_ranges), - (("healthy", "catto", 2), bitmap_ranges.clone()), + ((1, "healthy", "arb"), bitmap123.clone()), + ((1, "healthy", "arbre"), bitmap123.clone()), + ((1, "healthy", "cat"), &bitmap456 | &bitmap123), + ((1, "healthy", "catto"), bitmap123.clone()), + ((1, "jittery", "cat"), (&bitmap123 | &bitmap456 | &bitmap789 | &bitmap_ranges)), + ((2, "healthy", "arb"), &bitmap123 | &bitmap456), + ((2, "healthy", "arbre"), &bitmap123 | &bitmap456), + ((2, "healthy", "cat"), &bitmap789 | &bitmap_ranges), + ((2, "healthy", "catto"), bitmap_ranges.clone()), ]; let mut result = vec![]; let mut iter = - IntoIterator::into_iter(word_pairs).map(|((word1, word2, proximity), data)| { - ((word1.as_bytes(), word2.as_bytes(), proximity), data.as_slice()) + IntoIterator::into_iter(word_pairs).map(|((proximity, word1, word2), data)| { + ((proximity, word1.as_bytes(), 
word2.as_bytes()), data.as_slice()) }); execute_on_word_pairs_and_prefixes( &mut iter, @@ -899,7 +899,7 @@ mod tests { &prefixes, 2, |k, v| { - let (word1, prefix, proximity) = StrStrU8Codec::bytes_decode(k).unwrap(); + let (word1, prefix, proximity) = U8StrStrCodec::bytes_decode(k).unwrap(); let bitmap = CboRoaringBitmapCodec::bytes_decode(v).unwrap(); result.push(((word1.to_owned(), prefix.to_owned(), proximity.to_owned()), bitmap)); Ok(()) @@ -908,8 +908,8 @@ mod tests { .unwrap(); for (x, y) in result.into_iter().zip(IntoIterator::into_iter(expected_result)) { - let ((actual_word1, actual_prefix, actual_proximity), actual_bitmap) = x; - let ((expected_word1, expected_prefix, expected_proximity), expected_bitmap) = y; + let ((actual_proximity, actual_word1, actual_prefix), actual_bitmap) = x; + let ((expected_proximity, expected_word1, expected_prefix), expected_bitmap) = y; assert_eq!(actual_word1, expected_word1); assert_eq!(actual_prefix, expected_prefix);