From 84d9c731f889f76079c30187c646b194b5678ef1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 24 Apr 2023 09:59:30 +0200 Subject: [PATCH] Fix bug in encoding of word_position_docids and word_fid_docids --- milli/src/heed_codec/str_beu32_codec.rs | 8 +- milli/src/index.rs | 4 +- milli/src/search/new/db_cache.rs | 32 ++++---- .../search/new/tests/attribute_position.rs | 76 +++++++++++++++++-- .../extract/extract_word_fid_docids.rs | 1 + .../extract/extract_word_position_docids.rs | 1 + 6 files changed, 96 insertions(+), 26 deletions(-) diff --git a/milli/src/heed_codec/str_beu32_codec.rs b/milli/src/heed_codec/str_beu32_codec.rs index 17f3c996f..cce849e37 100644 --- a/milli/src/heed_codec/str_beu32_codec.rs +++ b/milli/src/heed_codec/str_beu32_codec.rs @@ -45,11 +45,12 @@ impl<'a> heed::BytesDecode<'a> for StrBEU16Codec { fn bytes_decode(bytes: &'a [u8]) -> Option { let footer_len = size_of::(); - if bytes.len() < footer_len { + if bytes.len() < footer_len + 1 { return None; } - let (word, bytes) = bytes.split_at(bytes.len() - footer_len); + let (word_plus_nul_byte, bytes) = bytes.split_at(bytes.len() - footer_len); + let (_, word) = word_plus_nul_byte.split_last()?; let word = str::from_utf8(word).ok()?; let pos = bytes.try_into().map(u16::from_be_bytes).ok()?; @@ -63,8 +64,9 @@ impl<'a> heed::BytesEncode<'a> for StrBEU16Codec { fn bytes_encode((word, pos): &Self::EItem) -> Option> { let pos = pos.to_be_bytes(); - let mut bytes = Vec::with_capacity(word.len() + pos.len()); + let mut bytes = Vec::with_capacity(word.len() + 1 + pos.len()); bytes.extend_from_slice(word.as_bytes()); + bytes.push(0); bytes.extend_from_slice(&pos[..]); Some(Cow::Owned(bytes)) diff --git a/milli/src/index.rs b/milli/src/index.rs index a36868ef2..bfc75d296 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -126,9 +126,9 @@ pub struct Index { /// Maps the field id and the word count with the docids that corresponds to it. pub field_id_word_count_docids: Database, - /// Maps the position of a word prefix with all the docids where this prefix appears. + /// Maps the word prefix and a position with all the docids where the prefix appears at the position. pub word_prefix_position_docids: Database, - /// Maps the word and the field id with the docids that corresponds to it. + /// Maps the word prefix and a field id with all the docids where the prefix appears inside the field pub word_prefix_fid_docids: Database, /// Maps the script and language with all the docids that corresponds to it. diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs index cf851a313..90c604d72 100644 --- a/milli/src/search/new/db_cache.rs +++ b/milli/src/search/new/db_cache.rs @@ -261,22 +261,6 @@ impl<'ctx> SearchContext<'ctx> { .transpose() } - pub fn get_db_word_position_docids( - &mut self, - word: Interned, - position: u16, - ) -> Result> { - DatabaseCache::get_value( - self.txn, - (word, position), - &(self.word_interner.get(word).as_str(), position), - &mut self.db_cache.word_position_docids, - self.index.word_position_docids.remap_data_type::(), - )? - .map(|bytes| CboRoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into())) - .transpose() - } - pub fn get_db_word_fid_docids( &mut self, word: Interned, @@ -361,6 +345,22 @@ impl<'ctx> SearchContext<'ctx> { Ok(fids) } + pub fn get_db_word_position_docids( + &mut self, + word: Interned, + position: u16, + ) -> Result> { + DatabaseCache::get_value( + self.txn, + (word, position), + &(self.word_interner.get(word).as_str(), position), + &mut self.db_cache.word_position_docids, + self.index.word_position_docids.remap_data_type::(), + )? + .map(|bytes| CboRoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into())) + .transpose() + } + pub fn get_db_word_prefix_position_docids( &mut self, word_prefix: Interned, diff --git a/milli/src/search/new/tests/attribute_position.rs b/milli/src/search/new/tests/attribute_position.rs index 0eafedb97..e4ed8b5ff 100644 --- a/milli/src/search/new/tests/attribute_position.rs +++ b/milli/src/search/new/tests/attribute_position.rs @@ -1,4 +1,6 @@ -use crate::{index::tests::TempIndex, Criterion, Search, SearchResult, TermsMatchingStrategy}; +use crate::{ + db_snap, index::tests::TempIndex, Criterion, Search, SearchResult, TermsMatchingStrategy, +}; fn create_index() -> TempIndex { let index = TempIndex::new(); @@ -6,7 +8,7 @@ fn create_index() -> TempIndex { index .update_settings(|s| { s.set_primary_key("id".to_owned()); - s.set_searchable_fields(vec!["text".to_owned()]); + s.set_searchable_fields(vec!["text".to_owned(), "other".to_owned()]); s.set_criteria(vec![Criterion::Attribute]); }) .unwrap(); @@ -33,20 +35,84 @@ fn create_index() -> TempIndex { "id": 4, "text": "the quick brown fox", }, + { + "id": 5, + "text": "a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + the quick brown fox", + }, + { + "id": 6, + "text": "quick a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + brown", + }, + { + "id": 7, + "text": "a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + quick brown", + }, ])) .unwrap(); index } #[test] -fn test_attribute_fid_simple() { +fn test_attribute_position_simple() { + let index = create_index(); + + db_snap!(index, word_position_docids, @"fe86911166fa4c0903c512fd86ec65e4"); + + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("quick brown"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[3, 4, 2, 1, 0, 6, 7, 5]"); +} +#[test] +fn test_attribute_position_repeated() { let index = create_index(); let txn = index.read_txn().unwrap(); let mut s = Search::new(&txn, &index); s.terms_matching_strategy(TermsMatchingStrategy::All); - s.query("the quick brown fox"); + s.query("a a a a a"); let SearchResult { documents_ids, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[3, 4, 2, 1, 0]"); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[5, 7, 6]"); } diff --git a/milli/src/update/index_documents/extract/extract_word_fid_docids.rs b/milli/src/update/index_documents/extract/extract_word_fid_docids.rs index 72b30cddf..9ee33ea0d 100644 --- a/milli/src/update/index_documents/extract/extract_word_fid_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_fid_docids.rs @@ -36,6 +36,7 @@ pub fn extract_word_fid_docids( for position in read_u32_ne_bytes(value) { key_buffer.clear(); key_buffer.extend_from_slice(word_bytes); + key_buffer.push(0); let (fid, _) = relative_from_absolute_position(position); key_buffer.extend_from_slice(&fid.to_be_bytes()); word_fid_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; diff --git a/milli/src/update/index_documents/extract/extract_word_position_docids.rs b/milli/src/update/index_documents/extract/extract_word_position_docids.rs index 80a36c308..9bb43b004 100644 --- a/milli/src/update/index_documents/extract/extract_word_position_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_position_docids.rs @@ -39,6 +39,7 @@ pub fn extract_word_position_docids( for position in read_u32_ne_bytes(value) { key_buffer.clear(); key_buffer.extend_from_slice(word_bytes); + key_buffer.push(0); let (_, position) = relative_from_absolute_position(position); let position = bucketed_position(position); key_buffer.extend_from_slice(&position.to_be_bytes());