From 5440f43fd3be28981933bd55e126ee88ed3324e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 5 Apr 2023 14:55:02 +0200 Subject: [PATCH] Fix indexing of word_position_docid and fid --- milli/src/snapshot_tests.rs | 8 +++ .../extract/extract_word_fid_docids.rs | 48 ++++++++++++++++ .../extract/extract_word_position_docids.rs | 4 +- .../src/update/index_documents/extract/mod.rs | 17 +++++- milli/src/update/index_documents/mod.rs | 57 +++++++++++++++++++ .../src/update/index_documents/typed_chunk.rs | 12 ++++ 6 files changed, 141 insertions(+), 5 deletions(-) create mode 100644 milli/src/update/index_documents/extract/extract_word_fid_docids.rs diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index f7f1a97e6..eb94c4be9 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -248,6 +248,11 @@ pub fn snap_word_position_docids(index: &Index) -> String { &format!("{word:<16} {position:<6} {}", display_bitmap(&b)) }) } +pub fn snap_word_fid_docids(index: &Index) -> String { + make_db_snap_from_iter!(index, word_fid_docids, |((word, fid), b)| { + &format!("{word:<16} {fid:<3} {}", display_bitmap(&b)) + }) +} pub fn snap_field_id_word_count_docids(index: &Index) -> String { make_db_snap_from_iter!(index, field_id_word_count_docids, |((field_id, word_count), b)| { &format!("{field_id:<3} {word_count:<6} {}", display_bitmap(&b)) @@ -477,6 +482,9 @@ macro_rules! 
full_snap_of_db { ($index:ident, word_position_docids) => {{ $crate::snapshot_tests::snap_word_position_docids(&$index) }}; + ($index:ident, word_fid_docids) => {{ + $crate::snapshot_tests::snap_word_fid_docids(&$index) + }}; ($index:ident, field_id_word_count_docids) => {{ $crate::snapshot_tests::snap_field_id_word_count_docids(&$index) }}; diff --git a/milli/src/update/index_documents/extract/extract_word_fid_docids.rs b/milli/src/update/index_documents/extract/extract_word_fid_docids.rs new file mode 100644 index 000000000..72b30cddf --- /dev/null +++ b/milli/src/update/index_documents/extract/extract_word_fid_docids.rs @@ -0,0 +1,48 @@ +use std::fs::File; +use std::io; + +use super::helpers::{ + create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader, + try_split_array_at, GrenadParameters, +}; +use crate::error::SerializationError; +use crate::index::db_name::DOCID_WORD_POSITIONS; +use crate::{relative_from_absolute_position, DocumentId, Result}; + +/// Extracts the word, field id, and the documents ids where this word appears at this field id. +#[logging_timer::time] +pub fn extract_word_fid_docids( + docid_word_positions: grenad::Reader, + indexer: GrenadParameters, +) -> Result> { + let max_memory = indexer.max_memory_by_thread(); + + let mut word_fid_docids_sorter = create_sorter( + grenad::SortAlgorithm::Unstable, + merge_cbo_roaring_bitmaps, + indexer.chunk_compression_type, + indexer.chunk_compression_level, + indexer.max_nb_chunks, + max_memory, + ); + + let mut key_buffer = Vec::new(); + let mut cursor = docid_word_positions.into_cursor()?; + while let Some((key, value)) = cursor.move_on_next()? 
{ + let (document_id_bytes, word_bytes) = try_split_array_at(key) + .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; + let document_id = DocumentId::from_be_bytes(document_id_bytes); + + for position in read_u32_ne_bytes(value) { + key_buffer.clear(); + key_buffer.extend_from_slice(word_bytes); + let (fid, _) = relative_from_absolute_position(position); + key_buffer.extend_from_slice(&fid.to_be_bytes()); + word_fid_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; + } + } + + let word_fid_docids_reader = sorter_into_reader(word_fid_docids_sorter, indexer)?; + + Ok(word_fid_docids_reader) +} diff --git a/milli/src/update/index_documents/extract/extract_word_position_docids.rs b/milli/src/update/index_documents/extract/extract_word_position_docids.rs index 734cf8778..80a36c308 100644 --- a/milli/src/update/index_documents/extract/extract_word_position_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_position_docids.rs @@ -14,7 +14,7 @@ use crate::{bucketed_position, relative_from_absolute_position, DocumentId, Resu /// Returns a grenad reader with the list of extracted words at positions and /// documents ids from the given chunk of docid word positions. 
#[logging_timer::time] -pub fn extract_word_fid_and_position_docids( +pub fn extract_word_position_docids( docid_word_positions: grenad::Reader, indexer: GrenadParameters, ) -> Result> { @@ -39,7 +39,7 @@ pub fn extract_word_fid_and_position_docids( for position in read_u32_ne_bytes(value) { key_buffer.clear(); key_buffer.extend_from_slice(word_bytes); - let (_fid, position) = relative_from_absolute_position(position); + let (_, position) = relative_from_absolute_position(position); let position = bucketed_position(position); key_buffer.extend_from_slice(&position.to_be_bytes()); word_position_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 844efed36..db041de6f 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -5,6 +5,7 @@ mod extract_fid_docid_facet_values; mod extract_fid_word_count_docids; mod extract_geo_points; mod extract_word_docids; +mod extract_word_fid_docids; mod extract_word_pair_proximity_docids; mod extract_word_position_docids; @@ -22,8 +23,9 @@ use self::extract_fid_docid_facet_values::extract_fid_docid_facet_values; use self::extract_fid_word_count_docids::extract_fid_word_count_docids; use self::extract_geo_points::extract_geo_points; use self::extract_word_docids::extract_word_docids; +use self::extract_word_fid_docids::extract_word_fid_docids; use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids; -use self::extract_word_position_docids::extract_word_fid_and_position_docids; +use self::extract_word_position_docids::extract_word_position_docids; use super::helpers::{ as_cloneable_grenad, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, CursorClonableMmap, GrenadParameters, MergeFn, MergeableReader, @@ -130,14 +132,23 @@ pub(crate) fn data_from_obkv_documents( ); spawn_extraction_task::<_, _, Vec>>( - 
docid_word_positions_chunks, + docid_word_positions_chunks.clone(), indexer, lmdb_writer_sx.clone(), - extract_word_fid_and_position_docids, + extract_word_position_docids, merge_cbo_roaring_bitmaps, TypedChunk::WordPositionDocids, "word-position-docids", ); + spawn_extraction_task::<_, _, Vec>>( + docid_word_positions_chunks, + indexer, + lmdb_writer_sx.clone(), + extract_word_fid_docids, + merge_cbo_roaring_bitmaps, + TypedChunk::WordFidDocids, + "word-fid-docids", + ); spawn_extraction_task::<_, _, Vec>>( docid_fid_facet_strings_chunks, diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index ade217beb..235b35fc8 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -2255,4 +2255,61 @@ mod tests { {"id":1,"catto":"jorts"} "###); } + + #[test] + fn test_word_fid_position() { + let index = TempIndex::new(); + + index + .add_documents(documents!([ + {"id": 0, "text": "sun flowers are looking at the sun" }, + {"id": 1, "text": "sun flowers are looking at the sun" }, + {"id": 2, "text": "the sun is shining today" }, + { + "id": 3, + "text": "a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a " + } + ])) + .unwrap(); + + db_snap!(index, word_fid_docids, 1, @"bf3355e493330de036c8823ddd1dbbd9"); + db_snap!(index, word_position_docids, 1, @"896d54b29ed79c4c6f14084f326dcf6f"); + + index + .add_documents(documents!([ + {"id": 4, "text": "sun flowers are looking at the sun" }, + {"id": 5, "text2": "sun flowers are looking at the sun" }, + {"id": 6, "text": "b b b" }, + { + "id": 7, + "text2": "a a a a" + } + ])) + .unwrap(); + + db_snap!(index, word_fid_docids, 2, 
@"a48d3f88db33f94bc23110a673ea49e4"); + db_snap!(index, word_position_docids, 2, @"3c9e66c6768ae2cf42b46b2c46e46a83"); + + let mut wtxn = index.write_txn().unwrap(); + + // Delete only some of the documents, not all of them. + let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + builder.strategy(DeletionStrategy::AlwaysHard); + builder.delete_external_id("0"); + builder.delete_external_id("3"); + let result = builder.execute().unwrap(); + println!("{result:?}"); + + wtxn.commit().unwrap(); + + db_snap!(index, word_fid_docids, 3, @"4c2e2a1832e5802796edc1638136d933"); + db_snap!(index, word_position_docids, 3, @"74f556b91d161d997a89468b4da1cb8f"); + db_snap!(index, docid_word_positions, 3, @"5287245332627675740b28bd46e1cde1"); + } } diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index b9b11cfa8..14ba021bd 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -35,6 +35,7 @@ pub(crate) enum TypedChunk { exact_word_docids_reader: grenad::Reader, }, WordPositionDocids(grenad::Reader), + WordFidDocids(grenad::Reader), WordPairProximityDocids(grenad::Reader), FieldIdFacetStringDocids(grenad::Reader), FieldIdFacetNumberDocids(grenad::Reader), @@ -140,6 +141,17 @@ pub(crate) fn write_typed_chunk_into_index( )?; is_merged_database = true; } + TypedChunk::WordFidDocids(word_fid_docids_iter) => { + append_entries_into_database( + word_fid_docids_iter, + &index.word_fid_docids, + wtxn, + index_is_empty, + |value, _buffer| Ok(value), + merge_cbo_roaring_bitmaps, + )?; + is_merged_database = true; + } TypedChunk::FieldIdFacetNumberDocids(facet_id_number_docids_iter) => { let indexer = FacetsUpdate::new(index, FacetType::Number, facet_id_number_docids_iter); indexer.execute(wtxn)?;