diff --git a/milli/src/update/new/extract/searchable/extract_word_docids.rs b/milli/src/update/new/extract/searchable/extract_word_docids.rs index fb79de7b9..20a69d4bc 100644 --- a/milli/src/update/new/extract/searchable/extract_word_docids.rs +++ b/milli/src/update/new/extract/searchable/extract_word_docids.rs @@ -1,4 +1,3 @@ -use std::borrow::Cow; use std::collections::HashMap; use std::fs::File; use std::num::NonZero; @@ -8,7 +7,6 @@ use heed::RoTxn; use rayon::iter::{IntoParallelIterator, ParallelIterator}; use super::tokenize_document::{tokenizer_builder, DocumentTokenizer}; -use super::SearchableExtractor; use crate::update::new::extract::cache::CboCachedSorter; use crate::update::new::extract::perm_json_p::contained_in; use crate::update::new::{DocumentChange, ItemsPool}; @@ -20,178 +18,6 @@ use crate::{ const MAX_COUNTED_WORDS: usize = 30; -trait ProtoWordDocidsExtractor { - fn build_key(field_id: FieldId, position: u16, word: &str) -> Cow<'_, [u8]>; - fn attributes_to_extract<'a>( - _rtxn: &'a RoTxn, - _index: &'a Index, - ) -> Result>>; - - fn attributes_to_skip<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result>; -} - -impl SearchableExtractor for T -where - T: ProtoWordDocidsExtractor, -{ - fn extract_document_change( - rtxn: &RoTxn, - index: &Index, - document_tokenizer: &DocumentTokenizer, - fields_ids_map: &mut GlobalFieldsIdsMap, - cached_sorter: &mut CboCachedSorter, - document_change: DocumentChange, - ) -> Result<()> { - match document_change { - DocumentChange::Deletion(inner) => { - let mut token_fn = |_fname: &str, fid, pos, word: &str| { - let key = Self::build_key(fid, pos, word); - cached_sorter.insert_del_u32(&key, inner.docid()).map_err(crate::Error::from) - }; - document_tokenizer.tokenize_document( - inner.current(rtxn, index)?.unwrap(), - fields_ids_map, - &mut token_fn, - )?; - } - DocumentChange::Update(inner) => { - let mut token_fn = |_fname: &str, fid, pos, word: &str| { - let key = Self::build_key(fid, pos, word); - cached_sorter.insert_del_u32(&key, inner.docid()).map_err(crate::Error::from) - }; - document_tokenizer.tokenize_document( - inner.current(rtxn, index)?.unwrap(), - fields_ids_map, - &mut token_fn, - )?; - - let mut token_fn = |_fname: &str, fid, pos, word: &str| { - let key = Self::build_key(fid, pos, word); - cached_sorter.insert_add_u32(&key, inner.docid()).map_err(crate::Error::from) - }; - document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?; - } - DocumentChange::Insertion(inner) => { - let mut token_fn = |_fname: &str, fid, pos, word: &str| { - let key = Self::build_key(fid, pos, word); - cached_sorter.insert_add_u32(&key, inner.docid()).map_err(crate::Error::from) - }; - document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?; - } - } - - Ok(()) - } - - fn attributes_to_extract<'a>( - rtxn: &'a RoTxn, - index: &'a Index, - ) -> Result>> { - Self::attributes_to_extract(rtxn, index) - } - - fn attributes_to_skip<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result> { - Self::attributes_to_skip(rtxn, index) - } -} - -pub struct WordDocidsExtractor; -impl ProtoWordDocidsExtractor for WordDocidsExtractor { - fn attributes_to_extract<'a>( - rtxn: &'a RoTxn, - index: &'a Index, - ) -> Result>> { - index.user_defined_searchable_fields(rtxn).map_err(Into::into) - } - - fn attributes_to_skip<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result> { - // exact attributes must be skipped and stored in a separate DB, see `ExactWordDocidsExtractor`. - index.exact_attributes(rtxn).map_err(Into::into) - } - - /// TODO write in an external Vec buffer - fn build_key(_field_id: FieldId, _position: u16, word: &str) -> Cow<[u8]> { - Cow::Borrowed(word.as_bytes()) - } -} - -pub struct ExactWordDocidsExtractor; -impl ProtoWordDocidsExtractor for ExactWordDocidsExtractor { - fn attributes_to_extract<'a>( - rtxn: &'a RoTxn, - index: &'a Index, - ) -> Result>> { - let exact_attributes = index.exact_attributes(rtxn)?; - // If there are no user-defined searchable fields, we return all exact attributes. - // Otherwise, we return the intersection of exact attributes and user-defined searchable fields. - if let Some(searchable_attributes) = index.user_defined_searchable_fields(rtxn)? { - let attributes = exact_attributes - .into_iter() - .filter(|attr| searchable_attributes.contains(attr)) - .collect(); - Ok(Some(attributes)) - } else { - Ok(Some(exact_attributes)) - } - } - - fn attributes_to_skip<'a>(_rtxn: &'a RoTxn, _index: &'a Index) -> Result> { - Ok(vec![]) - } - - fn build_key(_field_id: FieldId, _position: u16, word: &str) -> Cow<[u8]> { - Cow::Borrowed(word.as_bytes()) - } -} - -pub struct WordFidDocidsExtractor; -impl ProtoWordDocidsExtractor for WordFidDocidsExtractor { - fn attributes_to_extract<'a>( - rtxn: &'a RoTxn, - index: &'a Index, - ) -> Result>> { - index.user_defined_searchable_fields(rtxn).map_err(Into::into) - } - - fn attributes_to_skip<'a>(_rtxn: &'a RoTxn, _index: &'a Index) -> Result> { - Ok(vec![]) - } - - fn build_key(field_id: FieldId, _position: u16, word: &str) -> Cow<[u8]> { - let mut key = Vec::new(); - key.extend_from_slice(word.as_bytes()); - key.push(0); - key.extend_from_slice(&field_id.to_be_bytes()); - Cow::Owned(key) - } -} - -pub struct WordPositionDocidsExtractor; -impl ProtoWordDocidsExtractor for WordPositionDocidsExtractor { - fn attributes_to_extract<'a>( - rtxn: &'a RoTxn, - index: &'a Index, - ) -> Result>> { - index.user_defined_searchable_fields(rtxn).map_err(Into::into) - } - - fn attributes_to_skip<'a>(_rtxn: &'a RoTxn, _index: &'a Index) -> Result> { - Ok(vec![]) - } - - fn build_key(_field_id: FieldId, position: u16, word: &str) -> Cow<[u8]> { - // position must be bucketed to reduce the number of keys in the DB. - let position = bucketed_position(position); - let mut key = Vec::new(); - key.extend_from_slice(word.as_bytes()); - key.push(0); - key.extend_from_slice(&position.to_be_bytes()); - Cow::Owned(key) - } -} - -// V2 - struct WordDocidsCachedSorters { word_fid_docids: CboCachedSorter, word_docids: CboCachedSorter, @@ -340,6 +166,7 @@ impl WordDocidsCachedSorters { buffer.extend_from_slice(&field_id.to_be_bytes()); self.word_fid_docids.insert_del_u32(buffer, docid)?; + let position = bucketed_position(position); buffer.clear(); buffer.extend_from_slice(word.as_bytes()); buffer.push(0); diff --git a/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs b/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs index 5736fc1d4..7d3655be8 100644 --- a/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs +++ b/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs @@ -2,7 +2,6 @@ use std::collections::VecDeque; use std::rc::Rc; use heed::RoTxn; -use itertools::merge_join_by; use obkv::KvReader; use super::tokenize_document::DocumentTokenizer; diff --git a/milli/src/update/new/indexer/mod.rs b/milli/src/update/new/indexer/mod.rs index 19132b5db..5187e4f4c 100644 --- a/milli/src/update/new/indexer/mod.rs +++ b/milli/src/update/new/indexer/mod.rs @@ -133,42 +133,6 @@ where extractor_sender.send_searchable::(fid_word_count_docids).unwrap(); } - // { - // let span = tracing::trace_span!(target: "indexing::documents::extract", "exact_word_docids"); - // let _entered = span.enter(); - // extract_and_send_docids::( - // index, - // &global_fields_ids_map, - // grenad_parameters, - // document_changes.clone(), - // &extractor_sender, - // )?; - // } - - // { - // let span = tracing::trace_span!(target: "indexing::documents::extract", "word_position_docids"); - // let _entered = span.enter(); - // extract_and_send_docids::( - // index, - // &global_fields_ids_map, - // grenad_parameters, - // document_changes.clone(), - // &extractor_sender, - // )?; - // } - - // { - // let span = tracing::trace_span!(target: "indexing::documents::extract", "fid_word_count_docids"); - // let _entered = span.enter(); - // extract_and_send_docids::( - // index, - // &global_fields_ids_map, - // GrenadParameters::default(), - // document_changes.clone(), - // &extractor_sender, - // )?; - // } - { let span = tracing::trace_span!(target: "indexing::documents::extract", "word_pair_proximity_docids"); let _entered = span.enter();