From 73ce67862df4f71bffa752292a607488f8a9bac1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 5 Sep 2024 10:56:22 +0200 Subject: [PATCH] Use the word pair proximity and fid word count docids extractors Co-authored-by: ManyTheFish --- milli/src/update/new/channel.rs | 58 ++++-- .../new/extract/faceted/facet_document.rs | 1 + milli/src/update/new/extract/faceted/mod.rs | 1 + milli/src/update/new/extract/mod.rs | 12 +- .../extract_fid_word_count_docids.rs | 34 ++-- .../extract/searchable/extract_word_docids.rs | 8 +- .../extract_word_pair_proximity_docids.rs | 29 ++- milli/src/update/new/indexer/mod.rs | 169 ++++++++++-------- milli/src/update/new/merger.rs | 50 ++++-- 9 files changed, 205 insertions(+), 157 deletions(-) diff --git a/milli/src/update/new/channel.rs b/milli/src/update/new/channel.rs index e9a795bf5..3eafb7754 100644 --- a/milli/src/update/new/channel.rs +++ b/milli/src/update/new/channel.rs @@ -112,23 +112,27 @@ pub struct WriterOperation { } pub enum Database { - WordDocids, - ExactWordDocids, - WordFidDocids, - WordPositionDocids, Documents, + ExactWordDocids, + FidWordCountDocids, Main, + WordDocids, + WordFidDocids, + WordPairProximityDocids, + WordPositionDocids, } impl WriterOperation { pub fn database(&self, index: &Index) -> heed::Database { match self.database { - Database::Main => index.main.remap_types(), Database::Documents => index.documents.remap_types(), - Database::WordDocids => index.word_docids.remap_types(), Database::ExactWordDocids => index.exact_word_docids.remap_types(), + Database::Main => index.main.remap_types(), + Database::WordDocids => index.word_docids.remap_types(), Database::WordFidDocids => index.word_fid_docids.remap_types(), Database::WordPositionDocids => index.word_position_docids.remap_types(), + Database::FidWordCountDocids => index.field_id_word_count_docids.remap_types(), + Database::WordPairProximityDocids => index.word_pair_proximity_docids.remap_types(), } } @@ -198,9 +202,11 @@ impl MainSender<'_> { } } -pub enum WordDocids {} pub enum ExactWordDocids {} +pub enum FidWordCountDocids {} +pub enum WordDocids {} pub enum WordFidDocids {} +pub enum WordPairProximityDocids {} pub enum WordPositionDocids {} pub trait DatabaseType { @@ -209,14 +215,6 @@ pub trait DatabaseType { fn new_merger_operation(merger: Merger) -> MergerOperation; } -impl DatabaseType for WordDocids { - const DATABASE: Database = Database::WordDocids; - - fn new_merger_operation(merger: Merger) -> MergerOperation { - MergerOperation::WordDocidsMerger(merger) - } -} - impl DatabaseType for ExactWordDocids { const DATABASE: Database = Database::ExactWordDocids; @@ -225,6 +223,22 @@ impl DatabaseType for ExactWordDocids { } } +impl DatabaseType for FidWordCountDocids { + const DATABASE: Database = Database::FidWordCountDocids; + + fn new_merger_operation(merger: Merger) -> MergerOperation { + MergerOperation::FidWordCountDocidsMerger(merger) + } +} + +impl DatabaseType for WordDocids { + const DATABASE: Database = Database::WordDocids; + + fn new_merger_operation(merger: Merger) -> MergerOperation { + MergerOperation::WordDocidsMerger(merger) + } +} + impl DatabaseType for WordFidDocids { const DATABASE: Database = Database::WordFidDocids; @@ -233,6 +247,14 @@ impl DatabaseType for WordFidDocids { } } +impl DatabaseType for WordPairProximityDocids { + const DATABASE: Database = Database::WordPairProximityDocids; + + fn new_merger_operation(merger: Merger) -> MergerOperation { + MergerOperation::WordPairProximityDocidsMerger(merger) + } +} + impl DatabaseType for WordPositionDocids { const DATABASE: Database = Database::WordPositionDocids; @@ -293,12 +315,14 @@ impl DocumentsSender<'_> { } pub enum MergerOperation { - WordDocidsMerger(Merger), ExactWordDocidsMerger(Merger), + FidWordCountDocidsMerger(Merger), + WordDocidsMerger(Merger), WordFidDocidsMerger(Merger), + WordPairProximityDocidsMerger(Merger), WordPositionDocidsMerger(Merger), - InsertDocument { docid: DocumentId, document: Box }, DeleteDocument { docid: DocumentId }, + InsertDocument { docid: DocumentId, document: Box }, } pub struct MergerReceiver(Receiver); diff --git a/milli/src/update/new/extract/faceted/facet_document.rs b/milli/src/update/new/extract/faceted/facet_document.rs index 849fa8f29..4525e866f 100644 --- a/milli/src/update/new/extract/faceted/facet_document.rs +++ b/milli/src/update/new/extract/faceted/facet_document.rs @@ -1,5 +1,6 @@ use serde_json::Value; +use crate::update::new::extract::perm_json_p; use crate::update::new::KvReaderFieldId; use crate::{FieldId, GlobalFieldsIdsMap, InternalError, Result, UserError}; diff --git a/milli/src/update/new/extract/faceted/mod.rs b/milli/src/update/new/extract/faceted/mod.rs index 19aeb031c..bd58c21b4 100644 --- a/milli/src/update/new/extract/faceted/mod.rs +++ b/milli/src/update/new/extract/faceted/mod.rs @@ -2,6 +2,7 @@ use std::collections::HashSet; use std::fmt::Debug; use std::fs::File; +pub use extract_facets::*; use grenad::{MergeFunction, Merger}; use heed::RoTxn; use rayon::iter::{IntoParallelIterator, ParallelIterator}; diff --git a/milli/src/update/new/extract/mod.rs b/milli/src/update/new/extract/mod.rs index e50e70c47..d6d5a3005 100644 --- a/milli/src/update/new/extract/mod.rs +++ b/milli/src/update/new/extract/mod.rs @@ -2,16 +2,8 @@ mod cache; mod faceted; mod searchable; -pub use faceted::modname::{ - FieldIdFacetExistsDocidsExtractor, FieldIdFacetIsEmptyDocidsExtractor, - FieldIdFacetIsNullDocidsExtractor, FieldIdFacetNumberDocidsExtractor, - FieldIdFacetStringDocidsExtractor, -}; -pub use faceted::FacetedExtractor; -pub use searchable::{ - ExactWordDocidsExtractor, SearchableExtractor, WordDocidsExtractor, WordFidDocidsExtractor, - WordPositionDocidsExtractor, -}; +pub use faceted::*; +pub use searchable::*; /// TODO move in permissive json pointer pub mod perm_json_p { diff --git a/milli/src/update/new/extract/searchable/extract_fid_word_count_docids.rs b/milli/src/update/new/extract/searchable/extract_fid_word_count_docids.rs index 08160155e..7cb11c11b 100644 --- a/milli/src/update/new/extract/searchable/extract_fid_word_count_docids.rs +++ b/milli/src/update/new/extract/searchable/extract_fid_word_count_docids.rs @@ -1,15 +1,14 @@ -use std::{borrow::Cow, collections::HashMap}; +use std::borrow::Cow; +use std::collections::HashMap; use heed::RoTxn; -use super::{tokenize_document::DocumentTokenizer, SearchableExtractor}; -use crate::{ - update::{ - new::{extract::cache::CboCachedSorter, DocumentChange}, - MergeDeladdCboRoaringBitmaps, - }, - FieldId, GlobalFieldsIdsMap, Index, Result, -}; +use super::tokenize_document::DocumentTokenizer; +use super::SearchableExtractor; +use crate::update::new::extract::cache::CboCachedSorter; +use crate::update::new::DocumentChange; +use crate::update::MergeDeladdCboRoaringBitmaps; +use crate::{FieldId, GlobalFieldsIdsMap, Index, Result}; const MAX_COUNTED_WORDS: usize = 30; @@ -22,12 +21,13 @@ impl SearchableExtractor for FidWordCountDocidsExtractor { index.user_defined_searchable_fields(rtxn).map_err(Into::into) } - fn attributes_to_skip<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result> { + fn attributes_to_skip<'a>(_rtxn: &'a RoTxn, _index: &'a Index) -> Result> { Ok(vec![]) } /// This case is unreachable because extract_document_change has been reimplemented to not call this function. - fn build_key<'a>(_field_id: FieldId, _position: u16, _word: &'a str) -> Cow<'a, [u8]> { + fn build_key(_field_id: FieldId, _position: u16, _word: &str) -> Cow<[u8]> { + /// TODO remove this unreachable!() } @@ -45,7 +45,7 @@ impl SearchableExtractor for FidWordCountDocidsExtractor { match document_change { DocumentChange::Deletion(inner) => { let mut fid_word_count = HashMap::new(); - let mut token_fn = |fid: FieldId, pos: u16, word: &str| { + let mut token_fn = |fid: FieldId, _pos: u16, _word: &str| { fid_word_count.entry(fid).and_modify(|count| *count += 1).or_insert(1); Ok(()) }; @@ -66,10 +66,10 @@ impl SearchableExtractor for FidWordCountDocidsExtractor { } DocumentChange::Update(inner) => { let mut fid_word_count = HashMap::new(); - let mut token_fn = |fid: FieldId, pos: u16, word: &str| { + let mut token_fn = |fid: FieldId, _pos: u16, _word: &str| { fid_word_count .entry(fid) - .and_modify(|(current_count, new_count)| *current_count += 1) + .and_modify(|(current_count, _new_count)| *current_count += 1) .or_insert((1, 0)); Ok(()) }; @@ -79,10 +79,10 @@ impl SearchableExtractor for FidWordCountDocidsExtractor { &mut token_fn, )?; - let mut token_fn = |fid: FieldId, pos: u16, word: &str| { + let mut token_fn = |fid: FieldId, _pos: u16, _word: &str| { fid_word_count .entry(fid) - .and_modify(|(current_count, new_count)| *new_count += 1) + .and_modify(|(_current_count, new_count)| *new_count += 1) .or_insert((0, 1)); Ok(()) }; @@ -106,7 +106,7 @@ impl SearchableExtractor for FidWordCountDocidsExtractor { } DocumentChange::Insertion(inner) => { let mut fid_word_count = HashMap::new(); - let mut token_fn = |fid: FieldId, pos: u16, word: &str| { + let mut token_fn = |fid: FieldId, _pos: u16, _word: &str| { fid_word_count.entry(fid).and_modify(|count| *count += 1).or_insert(1); Ok(()) }; diff --git a/milli/src/update/new/extract/searchable/extract_word_docids.rs b/milli/src/update/new/extract/searchable/extract_word_docids.rs index 70f9c4e47..db8bb7993 100644 --- a/milli/src/update/new/extract/searchable/extract_word_docids.rs +++ b/milli/src/update/new/extract/searchable/extract_word_docids.rs @@ -20,7 +20,7 @@ impl SearchableExtractor for WordDocidsExtractor { } /// TODO write in an external Vec buffer - fn build_key<'a>(_field_id: FieldId, _position: u16, word: &'a str) -> Cow<'a, [u8]> { + fn build_key(_field_id: FieldId, _position: u16, word: &str) -> Cow<[u8]> { Cow::Borrowed(word.as_bytes()) } } @@ -49,7 +49,7 @@ impl SearchableExtractor for ExactWordDocidsExtractor { Ok(vec![]) } - fn build_key<'a>(_field_id: FieldId, _position: u16, word: &'a str) -> Cow<'a, [u8]> { + fn build_key(_field_id: FieldId, _position: u16, word: &str) -> Cow<[u8]> { Cow::Borrowed(word.as_bytes()) } } @@ -67,7 +67,7 @@ impl SearchableExtractor for WordFidDocidsExtractor { Ok(vec![]) } - fn build_key<'a>(field_id: FieldId, _position: u16, word: &'a str) -> Cow<'a, [u8]> { + fn build_key(field_id: FieldId, _position: u16, word: &str) -> Cow<[u8]> { let mut key = Vec::new(); key.extend_from_slice(word.as_bytes()); key.push(0); @@ -89,7 +89,7 @@ impl SearchableExtractor for WordPositionDocidsExtractor { Ok(vec![]) } - fn build_key<'a>(_field_id: FieldId, position: u16, word: &'a str) -> Cow<'a, [u8]> { + fn build_key(_field_id: FieldId, position: u16, word: &str) -> Cow<[u8]> { // position must be bucketed to reduce the number of keys in the DB. let position = bucketed_position(position); let mut key = Vec::new(); diff --git a/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs b/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs index e170a6486..e9de6e9f2 100644 --- a/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs +++ b/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs @@ -1,21 +1,17 @@ -use std::{ - borrow::Cow, - collections::{BTreeMap, VecDeque}, -}; +use std::borrow::Cow; +use std::collections::{BTreeMap, VecDeque}; use heed::RoTxn; use itertools::merge_join_by; use obkv::KvReader; -use super::{tokenize_document::DocumentTokenizer, SearchableExtractor}; -use crate::{ - proximity::{index_proximity, MAX_DISTANCE}, - update::{ - new::{extract::cache::CboCachedSorter, DocumentChange}, - MergeDeladdCboRoaringBitmaps, - }, - FieldId, GlobalFieldsIdsMap, Index, Result, -}; +use super::tokenize_document::DocumentTokenizer; +use super::SearchableExtractor; +use crate::proximity::{index_proximity, MAX_DISTANCE}; +use crate::update::new::extract::cache::CboCachedSorter; +use crate::update::new::DocumentChange; +use crate::update::MergeDeladdCboRoaringBitmaps; +use crate::{FieldId, GlobalFieldsIdsMap, Index, Result}; pub struct WordPairProximityDocidsExtractor; impl SearchableExtractor for WordPairProximityDocidsExtractor { @@ -26,12 +22,13 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { index.user_defined_searchable_fields(rtxn).map_err(Into::into) } - fn attributes_to_skip<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result> { + fn attributes_to_skip<'a>(_rtxn: &'a RoTxn, _index: &'a Index) -> Result> { Ok(vec![]) } /// This case is unreachable because extract_document_change has been reimplemented to not call this function. - fn build_key<'a>(_field_id: FieldId, _position: u16, _word: &'a str) -> Cow<'a, [u8]> { + fn build_key(_field_id: FieldId, _position: u16, _word: &str) -> Cow<[u8]> { + /// TODO remove this unreachable!() } @@ -159,7 +156,7 @@ fn process_document_tokens( word_positions: &mut VecDeque<(String, u16)>, word_pair_proximity: &mut BTreeMap<(String, String), u8>, ) -> Result<()> { - let mut token_fn = |fid: FieldId, pos: u16, word: &str| { + let mut token_fn = |_fid: FieldId, pos: u16, word: &str| { // drain the proximity window until the head word is considered close to the word we are inserting. while word_positions .front() diff --git a/milli/src/update/new/indexer/mod.rs b/milli/src/update/new/indexer/mod.rs index ed42c03b1..d721a5511 100644 --- a/milli/src/update/new/indexer/mod.rs +++ b/milli/src/update/new/indexer/mod.rs @@ -11,15 +11,9 @@ use rayon::iter::{IntoParallelIterator, ParallelIterator}; use rayon::ThreadPool; pub use update_by_function::UpdateByFunction; -use super::channel::{ - extractors_merger_channels, merger_writer_channel, EntryOperation, ExactWordDocids, WordDocids, - WordFidDocids, WordPositionDocids, -}; +use super::channel::*; use super::document_change::DocumentChange; -use super::extract::{ - ExactWordDocidsExtractor, SearchableExtractor, WordDocidsExtractor, WordFidDocidsExtractor, - WordPositionDocidsExtractor, -}; +use super::extract::*; use super::merger::merge_grenad_entries; use super::StdResult; use crate::documents::{ @@ -71,79 +65,98 @@ where // TODO manage the errors correctly let handle = Builder::new().name(S("indexer-extractors")).spawn_scoped(s, move || { pool.in_place_scope(|_s| { - let document_changes = document_changes.into_par_iter(); + let document_changes = document_changes.into_par_iter(); - // document but we need to create a function that collects and compresses documents. - document_changes.clone().into_par_iter().try_for_each(|result| { - match result? { - DocumentChange::Deletion(deletion) => { - let docid = deletion.docid(); - extractor_sender.document_delete(docid).unwrap(); + // document but we need to create a function that collects and compresses documents. + document_changes.clone().into_par_iter().try_for_each(|result| { + match result? { + DocumentChange::Deletion(deletion) => { + let docid = deletion.docid(); + extractor_sender.document_delete(docid).unwrap(); + } + DocumentChange::Update(update) => { + let docid = update.docid(); + let content = update.new(); + extractor_sender.document_insert(docid, content.boxed()).unwrap(); + } + DocumentChange::Insertion(insertion) => { + let docid = insertion.docid(); + let content = insertion.new(); + extractor_sender.document_insert(docid, content.boxed()).unwrap(); + // extracted_dictionary_sender.send(self, dictionary: &[u8]); + } } - DocumentChange::Update(update) => { - let docid = update.docid(); - let content = update.new(); - extractor_sender.document_insert(docid, content.boxed()).unwrap(); - } - DocumentChange::Insertion(insertion) => { - let docid = insertion.docid(); - let content = insertion.new(); - extractor_sender.document_insert(docid, content.boxed()).unwrap(); - // extracted_dictionary_sender.send(self, dictionary: &[u8]); - } - } + Ok(()) as Result<_> + })?; + + extract_and_send_docids::( + index, + &global_fields_ids_map, + GrenadParameters::default(), + document_changes.clone(), + &extractor_sender, + )?; + + extract_and_send_docids::( + index, + &global_fields_ids_map, + GrenadParameters::default(), + document_changes.clone(), + &extractor_sender, + )?; + + extract_and_send_docids::( + index, + &global_fields_ids_map, + GrenadParameters::default(), + document_changes.clone(), + &extractor_sender, + )?; + + extract_and_send_docids::( + index, + &global_fields_ids_map, + GrenadParameters::default(), + document_changes.clone(), + &extractor_sender, + )?; + + extract_and_send_docids::( + index, + &global_fields_ids_map, + GrenadParameters::default(), + document_changes.clone(), + &extractor_sender, + )?; + + extract_and_send_docids::< + WordPairProximityDocidsExtractor, + WordPairProximityDocids, + >( + index, + &global_fields_ids_map, + GrenadParameters::default(), + document_changes.clone(), + &extractor_sender, + )?; + + // TODO THIS IS TOO MUCH + // Extract fieldid docid facet number + // Extract fieldid docid facet string + // Extract facetid string fst + // Extract facetid normalized string strings + + // TODO Inverted Indexes again + // Extract fieldid facet isempty docids + // Extract fieldid facet isnull docids + // Extract fieldid facet exists docids + + // TODO This is the normal system + // Extract fieldid facet number docids + // Extract fieldid facet string docids + Ok(()) as Result<_> - })?; - - extract_and_send_docids::( - index, - &global_fields_ids_map, - GrenadParameters::default(), - document_changes.clone(), - &extractor_sender, - )?; - - extract_and_send_docids::( - index, - &global_fields_ids_map, - GrenadParameters::default(), - document_changes.clone(), - &extractor_sender, - )?; - - extract_and_send_docids::( - index, - &global_fields_ids_map, - GrenadParameters::default(), - document_changes.clone(), - &extractor_sender, - )?; - - extract_and_send_docids::( - index, - &global_fields_ids_map, - GrenadParameters::default(), - document_changes.clone(), - &extractor_sender, - )?; - - // TODO THIS IS TOO MUCH - // Extract fieldid docid facet number - // Extract fieldid docid facet string - // Extract facetid string fst - // Extract facetid normalized string strings - - // TODO Inverted Indexes again - // Extract fieldid facet isempty docids - // Extract fieldid facet isnull docids - // Extract fieldid facet exists docids - - // TODO This is the normal system - // Extract fieldid facet number docids - // Extract fieldid facet string docids - - Ok(()) as Result<_> - }) + }) })?; // TODO manage the errors correctly diff --git a/milli/src/update/new/merger.rs b/milli/src/update/new/merger.rs index 25f09441c..19f56a301 100644 --- a/milli/src/update/new/merger.rs +++ b/milli/src/update/new/merger.rs @@ -8,10 +8,7 @@ use memmap2::Mmap; use roaring::RoaringBitmap; use tempfile::tempfile; -use super::channel::{ - DatabaseType, DocidsSender, ExactWordDocids, MergerReceiver, MergerSender, WordDocids, - WordFidDocids, WordPositionDocids, -}; +use super::channel::*; use super::KvReaderDelAdd; use crate::update::del_add::DelAdd; use crate::update::new::channel::MergerOperation; @@ -30,6 +27,29 @@ pub fn merge_grenad_entries( for merger_operation in receiver { match merger_operation { + MergerOperation::ExactWordDocidsMerger(merger) => { + merge_and_send_docids( + merger, + /// TODO do a MergerOperation::database(&Index) -> Database. + index.exact_word_docids.remap_types(), + rtxn, + &mut buffer, + sender.docids::(), + |_key| Ok(()), + |_key| Ok(()), + )?; + } + MergerOperation::FidWordCountDocidsMerger(merger) => { + merge_and_send_docids( + merger, + index.field_id_word_count_docids.remap_types(), + rtxn, + &mut buffer, + sender.docids::(), + |_key| Ok(()), + |_key| Ok(()), + )?; + } MergerOperation::WordDocidsMerger(merger) => { let mut add_words_fst = SetBuilder::new(tempfile()?)?; let mut del_words_fst = SetBuilder::new(tempfile()?)?; @@ -49,17 +69,6 @@ pub fn merge_grenad_entries( let mmap = compute_new_words_fst(add_words_fst, del_words_fst, words_fst)?; sender.main().write_words_fst(mmap).unwrap(); } - MergerOperation::ExactWordDocidsMerger(merger) => { - merge_and_send_docids( - merger, - index.exact_word_docids.remap_types(), - rtxn, - &mut buffer, - sender.docids::(), - |_key| Ok(()), - |_key| Ok(()), - )?; - } MergerOperation::WordFidDocidsMerger(merger) => { merge_and_send_docids( merger, @@ -71,6 +80,17 @@ pub fn merge_grenad_entries( |_key| Ok(()), )?; } + MergerOperation::WordPairProximityDocidsMerger(merger) => { + merge_and_send_docids( + merger, + index.word_pair_proximity_docids.remap_types(), + rtxn, + &mut buffer, + sender.docids::(), + |_key| Ok(()), + |_key| Ok(()), + )?; + } MergerOperation::WordPositionDocidsMerger(merger) => { merge_and_send_docids( merger,