diff --git a/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs b/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs
index 49259cd64..444c3f7d5 100644
--- a/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs
+++ b/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs
@@ -5,8 +5,8 @@ use std::ops::DerefMut as _;

 use bumpalo::collections::vec::Vec as BumpVec;
 use bumpalo::Bump;
-use heed::RoTxn;

+use super::match_searchable_field;
 use super::tokenize_document::{tokenizer_builder, DocumentTokenizer};
 use crate::update::new::extract::cache::BalancedCaches;
 use crate::update::new::extract::perm_json_p::contained_in;
@@ -17,8 +17,7 @@ use crate::update::new::ref_cell_ext::RefCellExt as _;
 use crate::update::new::steps::IndexingStep;
 use crate::update::new::thread_local::{FullySend, MostlySend, ThreadLocal};
 use crate::update::new::DocumentChange;
-use crate::update::GrenadParameters;
-use crate::{bucketed_position, DocumentId, FieldId, Index, Result, MAX_POSITION_PER_ATTRIBUTE};
+use crate::{bucketed_position, DocumentId, FieldId, Result, MAX_POSITION_PER_ATTRIBUTE};

 const MAX_COUNTED_WORDS: usize = 30;

@@ -207,9 +206,10 @@ impl<'extractor> WordDocidsCaches<'extractor> {
 }

 pub struct WordDocidsExtractorData<'a> {
-    tokenizer: &'a DocumentTokenizer<'a>,
-    grenad_parameters: &'a GrenadParameters,
+    tokenizer: DocumentTokenizer<'a>,
+    max_memory_by_thread: Option<usize>,
     buckets: usize,
+    searchable_attributes: Option<Vec<&'a str>>,
 }

 impl<'a, 'extractor> Extractor<'extractor> for WordDocidsExtractorData<'a> {
@@ -218,7 +218,7 @@ impl<'a, 'extractor> Extractor<'extractor> for WordDocidsExtractorData<'a> {
     fn init_data(&self, extractor_alloc: &'extractor Bump) -> Result<Self::Data> {
         Ok(RefCell::new(Some(WordDocidsBalancedCaches::new_in(
             self.buckets,
-            self.grenad_parameters.max_memory_by_thread(),
+            self.max_memory_by_thread,
             extractor_alloc,
         ))))
     }
@@ -230,7 +230,12 @@ impl<'a, 'extractor> Extractor<'extractor> for WordDocidsExtractorData<'a> {
     ) -> Result<()> {
         for change in changes {
             let change = change?;
-            WordDocidsExtractors::extract_document_change(context, self.tokenizer, change)?;
+            WordDocidsExtractors::extract_document_change(
+                context,
+                &self.tokenizer,
+                self.searchable_attributes.as_deref(),
+                change,
+            )?;
         }
         Ok(())
     }
@@ -248,52 +253,42 @@ impl WordDocidsExtractors {
     where
         MSP: Fn() -> bool + Sync,
     {
-        let index = indexing_context.index;
-        let rtxn = index.read_txn()?;
-
-        let stop_words = index.stop_words(&rtxn)?;
-        let allowed_separators = index.allowed_separators(&rtxn)?;
+        // Warning: this is duplicated code from extract_word_pair_proximity_docids.rs
+        let rtxn = indexing_context.index.read_txn()?;
+        let stop_words = indexing_context.index.stop_words(&rtxn)?;
+        let allowed_separators = indexing_context.index.allowed_separators(&rtxn)?;
         let allowed_separators: Option<Vec<_>> =
             allowed_separators.as_ref().map(|s| s.iter().map(String::as_str).collect());
-        let dictionary = index.dictionary(&rtxn)?;
+        let dictionary = indexing_context.index.dictionary(&rtxn)?;
         let dictionary: Option<Vec<_>> =
             dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
-        let builder = tokenizer_builder(
+        let mut builder = tokenizer_builder(
             stop_words.as_ref(),
             allowed_separators.as_deref(),
             dictionary.as_deref(),
         );
-        let tokenizer = builder.into_tokenizer();
-
-        let attributes_to_extract = Self::attributes_to_extract(&rtxn, index)?;
-        let attributes_to_skip = Self::attributes_to_skip(&rtxn, index)?;
+        let tokenizer = builder.build();
         let localized_attributes_rules =
-            index.localized_attributes_rules(&rtxn)?.unwrap_or_default();
-
+            indexing_context.index.localized_attributes_rules(&rtxn)?.unwrap_or_default();
         let document_tokenizer = DocumentTokenizer {
             tokenizer: &tokenizer,
-            attribute_to_extract: attributes_to_extract.as_deref(),
-            attribute_to_skip: attributes_to_skip.as_slice(),
             localized_attributes_rules: &localized_attributes_rules,
             max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE,
         };
-
+        let extractor_data = WordDocidsExtractorData {
+            tokenizer: document_tokenizer,
+            max_memory_by_thread: indexing_context.grenad_parameters.max_memory_by_thread(),
+            buckets: rayon::current_num_threads(),
+            searchable_attributes: indexing_context.index.user_defined_searchable_fields(&rtxn)?,
+        };
         let datastore = ThreadLocal::new();
-
         {
             let span =
                 tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction");
             let _entered = span.enter();
-
-            let extractor = WordDocidsExtractorData {
-                tokenizer: &document_tokenizer,
-                grenad_parameters: indexing_context.grenad_parameters,
-                buckets: rayon::current_num_threads(),
-            };
-
             extract(
                 document_changes,
-                &extractor,
+                &extractor_data,
                 indexing_context,
                 extractor_allocs,
                 &datastore,
@@ -312,6 +307,7 @@ impl WordDocidsExtractors {
     fn extract_document_change(
         context: &DocumentChangeContext<RefCell<Option<WordDocidsBalancedCaches>>>,
         document_tokenizer: &DocumentTokenizer,
+        searchable_attributes: Option<&[&str]>,
         document_change: DocumentChange,
     ) -> Result<()> {
         let index = &context.index;
@@ -345,7 +341,9 @@ impl WordDocidsExtractors {
             }
             DocumentChange::Update(inner) => {
                 if !inner.has_changed_for_fields(
-                    document_tokenizer.attribute_to_extract,
+                    &mut |field_name: &str| {
+                        match_searchable_field(field_name, searchable_attributes)
+                    },
                     &context.rtxn,
                     context.index,
                     context.db_fields_ids_map,
@@ -408,15 +406,4 @@ impl WordDocidsExtractors {
         let mut buffer = BumpVec::with_capacity_in(buffer_size, &context.doc_alloc);
         cached_sorter.flush_fid_word_count(&mut buffer)
     }
-
-    fn attributes_to_extract<'a>(
-        rtxn: &'a RoTxn,
-        index: &'a Index,
-    ) -> Result<Option<Vec<&'a str>>> {
-        index.user_defined_searchable_fields(rtxn).map_err(Into::into)
-    }
-
-    fn attributes_to_skip<'a>(_rtxn: &'a RoTxn, _index: &'a Index) -> Result<Vec<&'a str>> {
-        Ok(Vec::new())
-    }
 }
diff --git a/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs b/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs
index e58c0efd2..0724b0513 100644
--- a/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs
+++ b/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs
@@ -2,30 +2,114 @@ use std::cell::RefCell;
 use std::collections::VecDeque;
 use std::rc::Rc;

-use heed::RoTxn;
+use bumpalo::Bump;

-use super::tokenize_document::DocumentTokenizer;
-use super::SearchableExtractor;
+use super::match_searchable_field;
+use super::tokenize_document::{tokenizer_builder, DocumentTokenizer};
 use crate::proximity::{index_proximity, MAX_DISTANCE};
 use crate::update::new::document::Document;
 use crate::update::new::extract::cache::BalancedCaches;
-use crate::update::new::indexer::document_changes::DocumentChangeContext;
+use crate::update::new::indexer::document_changes::{
+    extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext,
+};
 use crate::update::new::ref_cell_ext::RefCellExt as _;
+use crate::update::new::steps::IndexingStep;
+use crate::update::new::thread_local::{FullySend, ThreadLocal};
 use crate::update::new::DocumentChange;
-use crate::{FieldId, GlobalFieldsIdsMap, Index, Result};
+use crate::{FieldId, GlobalFieldsIdsMap, Result, MAX_POSITION_PER_ATTRIBUTE};
+
+pub struct WordPairProximityDocidsExtractorData<'a> {
+    tokenizer: DocumentTokenizer<'a>,
+    searchable_attributes: Option<Vec<&'a str>>,
+    max_memory_by_thread: Option<usize>,
+    buckets: usize,
+}
+
+impl<'a, 'extractor> Extractor<'extractor> for WordPairProximityDocidsExtractorData<'a> {
+    type Data = RefCell<BalancedCaches<'extractor>>;
+
+    fn init_data(&self, extractor_alloc: &'extractor Bump) -> Result<Self::Data> {
+        Ok(RefCell::new(BalancedCaches::new_in(
+            self.buckets,
+            self.max_memory_by_thread,
+            extractor_alloc,
+        )))
+    }
+
+    fn process<'doc>(
+        &self,
+        changes: impl Iterator<Item = Result<Option<DocumentChange<'doc>>>>,
+        context: &DocumentChangeContext<Self::Data>,
+    ) -> Result<()> {
+        for change in changes {
+            let change = change?;
+            WordPairProximityDocidsExtractor::extract_document_change(
+                context,
+                &self.tokenizer,
+                self.searchable_attributes.as_deref(),
+                change,
+            )?;
+        }
+        Ok(())
+    }
+}

 pub struct WordPairProximityDocidsExtractor;

-impl SearchableExtractor for WordPairProximityDocidsExtractor {
-    fn attributes_to_extract<'a>(
-        rtxn: &'a RoTxn,
-        index: &'a Index,
-    ) -> Result<Option<Vec<&'a str>>> {
-        index.user_defined_searchable_fields(rtxn).map_err(Into::into)
-    }
+impl WordPairProximityDocidsExtractor {
+    pub fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>(
+        document_changes: &DC,
+        indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>,
+        extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
+        step: IndexingStep,
+    ) -> Result<Vec<BalancedCaches<'extractor>>>
+    where
+        MSP: Fn() -> bool + Sync,
+    {
+        // Warning: this is duplicated code from extract_word_docids.rs
+        let rtxn = indexing_context.index.read_txn()?;
+        let stop_words = indexing_context.index.stop_words(&rtxn)?;
+        let allowed_separators = indexing_context.index.allowed_separators(&rtxn)?;
+        let allowed_separators: Option<Vec<_>> =
+            allowed_separators.as_ref().map(|s| s.iter().map(String::as_str).collect());
+        let dictionary = indexing_context.index.dictionary(&rtxn)?;
+        let dictionary: Option<Vec<_>> =
+            dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
+        let mut builder = tokenizer_builder(
+            stop_words.as_ref(),
+            allowed_separators.as_deref(),
+            dictionary.as_deref(),
+        );
+        let tokenizer = builder.build();
+        let localized_attributes_rules =
+            indexing_context.index.localized_attributes_rules(&rtxn)?.unwrap_or_default();
+        let document_tokenizer = DocumentTokenizer {
+            tokenizer: &tokenizer,
+            localized_attributes_rules: &localized_attributes_rules,
+            max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE,
+        };
+        let extractor_data = WordPairProximityDocidsExtractorData {
+            tokenizer: document_tokenizer,
+            searchable_attributes: indexing_context.index.user_defined_searchable_fields(&rtxn)?,
+            max_memory_by_thread: indexing_context.grenad_parameters.max_memory_by_thread(),
+            buckets: rayon::current_num_threads(),
+        };
+        let datastore = ThreadLocal::new();
+        {
+            let span =
+                tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction");
+            let _entered = span.enter();
+            extract(
+                document_changes,
+                &extractor_data,
+                indexing_context,
+                extractor_allocs,
+                &datastore,
+                step,
+            )?;
+        }

-    fn attributes_to_skip<'a>(_rtxn: &'a RoTxn, _index: &'a Index) -> Result<Vec<&'a str>> {
-        Ok(Vec::new())
+        Ok(datastore.into_iter().map(RefCell::into_inner).collect())
     }

     // This method is reimplemented to count the number of words in the document in each field
@@ -34,6 +118,7 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor {
     fn extract_document_change(
         context: &DocumentChangeContext<RefCell<BalancedCaches>>,
         document_tokenizer: &DocumentTokenizer,
+        searchable_attributes: Option<&[&str]>,
         document_change: DocumentChange,
     ) -> Result<()> {
         let doc_alloc = &context.doc_alloc;
@@ -71,7 +156,9 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor {
             }
             DocumentChange::Update(inner) => {
                 if !inner.has_changed_for_fields(
-                    document_tokenizer.attribute_to_extract,
+                    &mut |field_name: &str| {
+                        match_searchable_field(field_name, searchable_attributes)
+                    },
                     rtxn,
                     index,
                     context.db_fields_ids_map,
diff --git a/crates/milli/src/update/new/extract/searchable/mod.rs b/crates/milli/src/update/new/extract/searchable/mod.rs
index 7c949a3ce..79a6fae87 100644
--- a/crates/milli/src/update/new/extract/searchable/mod.rs
+++ b/crates/milli/src/update/new/extract/searchable/mod.rs
@@ -2,145 +2,28 @@ mod extract_word_docids;
 mod extract_word_pair_proximity_docids;
 mod tokenize_document;

-use std::cell::RefCell;
-use std::marker::PhantomData;
-
-use bumpalo::Bump;
 pub use extract_word_docids::{WordDocidsCaches, WordDocidsExtractors};
 pub use extract_word_pair_proximity_docids::WordPairProximityDocidsExtractor;
-use heed::RoTxn;
-use tokenize_document::{tokenizer_builder, DocumentTokenizer};

-use super::cache::BalancedCaches;
-use super::DocidsExtractor;
-use crate::update::new::indexer::document_changes::{
-    extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext,
-};
-use crate::update::new::steps::IndexingStep;
-use crate::update::new::thread_local::{FullySend, ThreadLocal};
-use crate::update::new::DocumentChange;
-use crate::update::GrenadParameters;
-use crate::{Index, Result, MAX_POSITION_PER_ATTRIBUTE};
+use crate::attribute_patterns::{match_field_legacy, PatternMatch};

-pub struct SearchableExtractorData<'a, EX: SearchableExtractor> {
-    tokenizer: &'a DocumentTokenizer<'a>,
-    grenad_parameters: &'a GrenadParameters,
-    buckets: usize,
-    _ex: PhantomData<EX>,
-}
+pub fn match_searchable_field(
+    field_name: &str,
+    searchable_fields: Option<&[&str]>,
+) -> PatternMatch {
+    let Some(searchable_fields) = searchable_fields else {
+        // If no searchable fields are provided, consider all fields as searchable
+        return PatternMatch::Match;
+    };

-impl<'a, 'extractor, EX: SearchableExtractor + Sync> Extractor<'extractor>
-    for SearchableExtractorData<'a, EX>
-{
-    type Data = RefCell<BalancedCaches<'extractor>>;
-
-    fn init_data(&self, extractor_alloc: &'extractor Bump) -> Result<Self::Data> {
-        Ok(RefCell::new(BalancedCaches::new_in(
-            self.buckets,
-            self.grenad_parameters.max_memory_by_thread(),
-            extractor_alloc,
-        )))
-    }
-
-    fn process<'doc>(
-        &self,
-        changes: impl Iterator<Item = Result<Option<DocumentChange<'doc>>>>,
-        context: &DocumentChangeContext<Self::Data>,
-    ) -> Result<()> {
-        for change in changes {
-            let change = change?;
-            EX::extract_document_change(context, self.tokenizer, change)?;
+    let mut selection = PatternMatch::NoMatch;
+    for pattern in searchable_fields {
+        match match_field_legacy(pattern, field_name) {
+            PatternMatch::Match => return PatternMatch::Match,
+            PatternMatch::Parent => selection = PatternMatch::Parent,
+            PatternMatch::NoMatch => (),
         }
-        Ok(())
-    }
-}
-
-pub trait SearchableExtractor: Sized + Sync {
-    fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>(
-        document_changes: &DC,
-        indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>,
-        extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
-        step: IndexingStep,
-    ) -> Result<Vec<BalancedCaches<'extractor>>>
-    where
-        MSP: Fn() -> bool + Sync,
-    {
-        let rtxn = indexing_context.index.read_txn()?;
-        let stop_words = indexing_context.index.stop_words(&rtxn)?;
-        let allowed_separators = indexing_context.index.allowed_separators(&rtxn)?;
-        let allowed_separators: Option<Vec<_>> =
-            allowed_separators.as_ref().map(|s| s.iter().map(String::as_str).collect());
-        let dictionary = indexing_context.index.dictionary(&rtxn)?;
-        let dictionary: Option<Vec<_>> =
-            dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
-        let mut builder = tokenizer_builder(
-            stop_words.as_ref(),
-            allowed_separators.as_deref(),
-            dictionary.as_deref(),
-        );
-        let tokenizer = builder.build();
-
-        let attributes_to_extract = Self::attributes_to_extract(&rtxn, indexing_context.index)?;
-        let attributes_to_skip = Self::attributes_to_skip(&rtxn, indexing_context.index)?;
-        let localized_attributes_rules =
-            indexing_context.index.localized_attributes_rules(&rtxn)?.unwrap_or_default();
-
-        let document_tokenizer = DocumentTokenizer {
-            tokenizer: &tokenizer,
-            attribute_to_extract: attributes_to_extract.as_deref(),
-            attribute_to_skip: attributes_to_skip.as_slice(),
-            localized_attributes_rules: &localized_attributes_rules,
-            max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE,
-        };
-
-        let extractor_data: SearchableExtractorData<Self> = SearchableExtractorData {
-            tokenizer: &document_tokenizer,
-            grenad_parameters: indexing_context.grenad_parameters,
-            buckets: rayon::current_num_threads(),
-            _ex: PhantomData,
-        };
-
-        let datastore = ThreadLocal::new();
-
-        {
-            let span =
-                tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction");
-            let _entered = span.enter();
-            extract(
-                document_changes,
-                &extractor_data,
-                indexing_context,
-                extractor_allocs,
-                &datastore,
-                step,
-            )?;
-        }
-
-        Ok(datastore.into_iter().map(RefCell::into_inner).collect())
     }

-    fn extract_document_change(
-        context: &DocumentChangeContext<RefCell<BalancedCaches>>,
-        document_tokenizer: &DocumentTokenizer,
-        document_change: DocumentChange,
-    ) -> Result<()>;
-
-    fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index)
-        -> Result<Option<Vec<&'a str>>>;
-
-    fn attributes_to_skip<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result<Vec<&'a str>>;
-}
-
-impl<T: SearchableExtractor> DocidsExtractor for T {
-    fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>(
-        document_changes: &DC,
-        indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>,
-        extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
-        step: IndexingStep,
-    ) -> Result<Vec<BalancedCaches<'extractor>>>
-    where
-        MSP: Fn() -> bool + Sync,
-    {
-        Self::run_extraction(document_changes, indexing_context, extractor_allocs, step)
-    }
+    selection
 }
diff --git a/crates/milli/src/update/new/extract/searchable/tokenize_document.rs b/crates/milli/src/update/new/extract/searchable/tokenize_document.rs
index 1c1605b66..dda46f24c 100644
--- a/crates/milli/src/update/new/extract/searchable/tokenize_document.rs
+++ b/crates/milli/src/update/new/extract/searchable/tokenize_document.rs
@@ -3,9 +3,10 @@ use std::collections::HashMap;
 use charabia::{SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder};
 use serde_json::Value;

+use crate::attribute_patterns::PatternMatch;
 use crate::update::new::document::Document;
 use crate::update::new::extract::perm_json_p::{
-    seek_leaf_values_in_array, seek_leaf_values_in_object, select_field, Depth, Selection,
+    seek_leaf_values_in_array, seek_leaf_values_in_object, Depth,
 };
 use crate::{
     FieldId, GlobalFieldsIdsMap, InternalError, LocalizedAttributesRule, Result, UserError,
@@ -17,8 +18,6 @@ const MAX_DISTANCE: u32 = 8;

 pub struct DocumentTokenizer<'a> {
     pub tokenizer: &'a Tokenizer<'a>,
-    pub attribute_to_extract: Option<&'a [&'a str]>,
-    pub attribute_to_skip: &'a [&'a str],
     pub localized_attributes_rules: &'a [LocalizedAttributesRule],
     pub max_positions_per_attributes: u32,
 }
@@ -31,87 +30,94 @@ impl<'a> DocumentTokenizer<'a> {
         token_fn: &mut impl FnMut(&str, FieldId, u16, &str) -> Result<()>,
     ) -> Result<()> {
         let mut field_position = HashMap::new();
+        let mut tokenize_field = |field_name: &str, _depth, value: &Value| {
+            let Some((field_id, meta)) = field_id_map.id_with_metadata_or_insert(field_name) else {
+                return Err(UserError::AttributeLimitReached.into());
+            };
+
+            if meta.is_searchable() {
+                self.tokenize_field(field_id, field_name, value, token_fn, &mut field_position)?;
+            }
+
+            // todo: should be a match on the field_name using `match_field_legacy` function,
+            // but for legacy reasons we iterate over all the fields to fill the field_id_map.
+            Ok(PatternMatch::Match)
+        };

         for entry in document.iter_top_level_fields() {
             let (field_name, value) = entry?;

-            let mut tokenize_field = |field_name: &str, _depth, value: &Value| {
-                let Some(field_id) = field_id_map.id_or_insert(field_name) else {
-                    return Err(UserError::AttributeLimitReached.into());
-                };
-
-                if select_field(field_name, self.attribute_to_extract, self.attribute_to_skip)
-                    != Selection::Select
-                {
-                    return Ok(());
-                }
-
-                let position = field_position
-                    .entry(field_id)
-                    .and_modify(|counter| *counter += MAX_DISTANCE)
-                    .or_insert(0);
-                if *position >= self.max_positions_per_attributes {
-                    return Ok(());
-                }
-
-                let text;
-                let tokens = match value {
-                    Value::Number(n) => {
-                        text = n.to_string();
-                        self.tokenizer.tokenize(text.as_str())
-                    }
-                    Value::Bool(b) => {
-                        text = b.to_string();
-                        self.tokenizer.tokenize(text.as_str())
-                    }
-                    Value::String(text) => {
-                        let locales = self
-                            .localized_attributes_rules
-                            .iter()
-                            .find(|rule| rule.match_str(field_name))
-                            .map(|rule| rule.locales());
-                        self.tokenizer.tokenize_with_allow_list(text.as_str(), locales)
-                    }
-                    _ => return Ok(()),
-                };
-
-                // create an iterator of token with their positions.
-                let tokens = process_tokens(*position, tokens)
-                    .take_while(|(p, _)| *p < self.max_positions_per_attributes);
-
-                for (index, token) in tokens {
-                    // keep a word only if it is not empty and fit in a LMDB key.
-                    let token = token.lemma().trim();
-                    if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
-                        *position = index;
-                        if let Ok(position) = (*position).try_into() {
-                            token_fn(field_name, field_id, position, token)?;
-                        }
-                    }
-                }
-
-                Ok(())
-            };
-
             // parse json.
             match serde_json::to_value(value).map_err(InternalError::SerdeJson)? {
                 Value::Object(object) => seek_leaf_values_in_object(
                     &object,
-                    None,
-                    &[],
                     field_name,
                     Depth::OnBaseKey,
                     &mut tokenize_field,
                 )?,
                 Value::Array(array) => seek_leaf_values_in_array(
                     &array,
-                    None,
-                    &[],
                     field_name,
                     Depth::OnBaseKey,
                     &mut tokenize_field,
                 )?,
-                value => tokenize_field(field_name, Depth::OnBaseKey, &value)?,
+                value => {
+                    tokenize_field(field_name, Depth::OnBaseKey, &value)?;
+                }
+            }
+        }
+
+        Ok(())
+    }
+
+    fn tokenize_field(
+        &self,
+        field_id: FieldId,
+        field_name: &str,
+        value: &Value,
+        token_fn: &mut impl FnMut(&str, u16, u16, &str) -> std::result::Result<(), crate::Error>,
+        field_position: &mut HashMap<FieldId, u32>,
+    ) -> Result<()> {
+        let position = field_position
+            .entry(field_id)
+            .and_modify(|counter| *counter += MAX_DISTANCE)
+            .or_insert(0);
+        if *position >= self.max_positions_per_attributes {
+            return Ok(());
+        }
+
+        let text;
+        let tokens = match value {
+            Value::Number(n) => {
+                text = n.to_string();
+                self.tokenizer.tokenize(text.as_str())
+            }
+            Value::Bool(b) => {
+                text = b.to_string();
+                self.tokenizer.tokenize(text.as_str())
+            }
+            Value::String(text) => {
+                let locales = self
+                    .localized_attributes_rules
+                    .iter()
+                    .find(|rule| rule.match_str(field_name) == PatternMatch::Match)
+                    .map(|rule| rule.locales());
+                self.tokenizer.tokenize_with_allow_list(text.as_str(), locales)
+            }
+            _ => return Ok(()),
+        };
+
+        // create an iterator of token with their positions.
+        let tokens = process_tokens(*position, tokens)
+            .take_while(|(p, _)| *p < self.max_positions_per_attributes);
+
+        for (index, token) in tokens {
+            // keep a word only if it is not empty and fit in a LMDB key.
+            let token = token.lemma().trim();
+            if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
+                *position = index;
+                if let Ok(position) = (*position).try_into() {
+                    token_fn(field_name, field_id, position, token)?;
+                }
             }
         }

@@ -215,15 +221,20 @@ mod test {
         let mut tb = TokenizerBuilder::default();
         let document_tokenizer = DocumentTokenizer {
             tokenizer: &tb.build(),
-            attribute_to_extract: None,
-            attribute_to_skip: &["not-me", "me-nether.nope"],
             localized_attributes_rules: &[],
             max_positions_per_attributes: 1000,
         };

         let fields_ids_map = FieldIdMapWithMetadata::new(
             fields_ids_map,
-            MetadataBuilder::new(Default::default(), Default::default(), Default::default(), None),
+            MetadataBuilder::new(
+                Default::default(),
+                Default::default(),
+                Default::default(),
+                None,
+                None,
+                Default::default(),
+            ),
         );

         let fields_ids_map_lock = std::sync::RwLock::new(fields_ids_map);
@@ -265,6 +276,10 @@ mod test {
                 2,
                 16,
             ]: "catto",
+            [
+                3,
+                0,
+            ]: "unsearchable",
             [
                 5,
                 0,
@@ -277,6 +292,10 @@ mod test {
                 8,
                 0,
             ]: "23",
+            [
+                9,
+                0,
+            ]: "unsearchable",
         }
         "###);
     }
diff --git a/crates/milli/src/update/new/indexer/extract.rs b/crates/milli/src/update/new/indexer/extract.rs
index f49cd834d..907a4d1df 100644
--- a/crates/milli/src/update/new/indexer/extract.rs
+++ b/crates/milli/src/update/new/indexer/extract.rs
@@ -199,7 +199,7 @@ where
             let span = tracing::trace_span!(target: "indexing::documents::extract", "word_pair_proximity_docids");
             let _entered = span.enter();

-            <WordPairProximityDocidsExtractor as DocidsExtractor>::run_extraction(
+            WordPairProximityDocidsExtractor::run_extraction(
                 document_changes,
                 indexing_context,
                 extractor_allocs,
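
A note on the selection logic introduced in mod.rs: `match_searchable_field` scans every user-defined pattern and keeps the strongest result, returning `Match` as soon as any pattern matches the field, remembering `Parent` when a pattern only relates to an ancestor of the field (so nested objects are still traversed), and falling back to `NoMatch`. The standalone sketch below reproduces that precedence; the local `match_field_legacy` is only a stand-in whose dotted-path prefix rules are an assumption, the real one lives in `crate::attribute_patterns` and may differ.

    #[derive(Debug, PartialEq, Clone, Copy)]
    enum PatternMatch {
        Match,
        Parent,
        NoMatch,
    }

    // Stand-in for crate::attribute_patterns::match_field_legacy (assumed behaviour):
    // a pattern matches the field itself or any of its sub-fields, and "parent-matches"
    // a field that is an ancestor of the pattern (e.g. pattern "doggo.name", field "doggo").
    fn match_field_legacy(pattern: &str, field: &str) -> PatternMatch {
        if pattern == field || (field.starts_with(pattern) && field[pattern.len()..].starts_with('.')) {
            PatternMatch::Match
        } else if pattern.starts_with(field) && pattern[field.len()..].starts_with('.') {
            PatternMatch::Parent
        } else {
            PatternMatch::NoMatch
        }
    }

    // Same precedence as the helper added in mod.rs: Match wins immediately,
    // Parent is only kept as a fallback, NoMatch means the field is ignored.
    fn match_searchable_field(field_name: &str, searchable_fields: Option<&[&str]>) -> PatternMatch {
        let Some(searchable_fields) = searchable_fields else {
            return PatternMatch::Match; // no explicit setting: everything is searchable
        };
        let mut selection = PatternMatch::NoMatch;
        for pattern in searchable_fields {
            match match_field_legacy(pattern, field_name) {
                PatternMatch::Match => return PatternMatch::Match,
                PatternMatch::Parent => selection = PatternMatch::Parent,
                PatternMatch::NoMatch => (),
            }
        }
        selection
    }

    fn main() {
        let fields = ["title", "doggo.name"];
        assert_eq!(match_searchable_field("title", Some(&fields[..])), PatternMatch::Match);
        assert_eq!(match_searchable_field("doggo", Some(&fields[..])), PatternMatch::Parent);
        assert_eq!(match_searchable_field("doggo.name.first", Some(&fields[..])), PatternMatch::Match);
        assert_eq!(match_searchable_field("price", Some(&fields[..])), PatternMatch::NoMatch);
        assert_eq!(match_searchable_field("anything", None), PatternMatch::Match);
    }

This is also why both extractors now pass a closure to `has_changed_for_fields`: an update is only re-tokenized when one of its changed fields resolves to `Match` or `Parent` under these rules.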