Mirror of https://github.com/meilisearch/meilisearch.git (synced 2025-03-06 22:02:34 +08:00)
Refactor Document indexing process (searchables)
**Changes:** The searchable database extraction now relies on AttributePatterns and FieldIdMapWithMetadata to decide which fields to extract. The SearchableExtractor trait is removed to make the code less complex.

**Impact:**
- Document addition/modification searchable indexing
- Document deletion searchable indexing
parent 95bccaf5f5
commit ae8d453868
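
The heart of the refactor is that searchable-field selection no longer goes through per-extractor `attributes_to_extract`/`attributes_to_skip` lists but through a shared `match_searchable_field` helper returning a `PatternMatch` (introduced in the module-file hunk further down). The following self-contained sketch illustrates that selection logic; `match_field_simplified` is a hypothetical stand-in for milli's `match_field_legacy`, and its exact wildcard semantics are an assumption, not the crate's implementation.

```rust
// Minimal, self-contained sketch of the field-matching idea this commit moves to.
// `PatternMatch` mirrors the enum used in the diff; `match_field_simplified` is a
// stand-in (assumption) for milli's `match_field_legacy`: exact name, trailing ".*"
// wildcard, and "parent of a pattern" detection.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum PatternMatch {
    Match,
    Parent,
    NoMatch,
}

fn match_field_simplified(pattern: &str, field_name: &str) -> PatternMatch {
    if pattern == field_name || pattern == "*" {
        PatternMatch::Match
    } else if let Some(prefix) = pattern.strip_suffix(".*") {
        if field_name == prefix || field_name.starts_with(&format!("{prefix}.")) {
            PatternMatch::Match
        } else {
            PatternMatch::NoMatch
        }
    } else if pattern.starts_with(&format!("{field_name}.")) {
        // The field is an ancestor of the pattern: one of its children may still match.
        PatternMatch::Parent
    } else {
        PatternMatch::NoMatch
    }
}

fn match_searchable_field(field_name: &str, searchable: Option<&[&str]>) -> PatternMatch {
    // No user-defined searchable fields means every field is searchable.
    let Some(patterns) = searchable else { return PatternMatch::Match };
    let mut selection = PatternMatch::NoMatch;
    for pattern in patterns {
        match match_field_simplified(pattern, field_name) {
            PatternMatch::Match => return PatternMatch::Match,
            PatternMatch::Parent => selection = PatternMatch::Parent,
            PatternMatch::NoMatch => (),
        }
    }
    selection
}

fn main() {
    let searchable = ["title", "author.name"];
    assert_eq!(match_searchable_field("title", Some(&searchable)), PatternMatch::Match);
    assert_eq!(match_searchable_field("author", Some(&searchable)), PatternMatch::Parent);
    assert_eq!(match_searchable_field("price", Some(&searchable)), PatternMatch::NoMatch);
    assert_eq!(match_searchable_field("anything", None), PatternMatch::Match);
}
```
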
@@ -5,8 +5,8 @@ use std::ops::DerefMut as _;
 
 use bumpalo::collections::vec::Vec as BumpVec;
 use bumpalo::Bump;
-use heed::RoTxn;
 
+use super::match_searchable_field;
 use super::tokenize_document::{tokenizer_builder, DocumentTokenizer};
 use crate::update::new::extract::cache::BalancedCaches;
 use crate::update::new::extract::perm_json_p::contained_in;
@@ -17,8 +17,7 @@ use crate::update::new::ref_cell_ext::RefCellExt as _;
 use crate::update::new::steps::IndexingStep;
 use crate::update::new::thread_local::{FullySend, MostlySend, ThreadLocal};
 use crate::update::new::DocumentChange;
-use crate::update::GrenadParameters;
-use crate::{bucketed_position, DocumentId, FieldId, Index, Result, MAX_POSITION_PER_ATTRIBUTE};
+use crate::{bucketed_position, DocumentId, FieldId, Result, MAX_POSITION_PER_ATTRIBUTE};
 
 const MAX_COUNTED_WORDS: usize = 30;
 
@@ -207,9 +206,10 @@ impl<'extractor> WordDocidsCaches<'extractor> {
 }
 
 pub struct WordDocidsExtractorData<'a> {
-    tokenizer: &'a DocumentTokenizer<'a>,
-    grenad_parameters: &'a GrenadParameters,
+    tokenizer: DocumentTokenizer<'a>,
+    max_memory_by_thread: Option<usize>,
     buckets: usize,
+    searchable_attributes: Option<Vec<&'a str>>,
 }
 
 impl<'a, 'extractor> Extractor<'extractor> for WordDocidsExtractorData<'a> {
@@ -218,7 +218,7 @@ impl<'a, 'extractor> Extractor<'extractor> for WordDocidsExtractorData<'a> {
     fn init_data(&self, extractor_alloc: &'extractor Bump) -> Result<Self::Data> {
         Ok(RefCell::new(Some(WordDocidsBalancedCaches::new_in(
             self.buckets,
-            self.grenad_parameters.max_memory_by_thread(),
+            self.max_memory_by_thread,
             extractor_alloc,
         ))))
     }
@@ -230,7 +230,12 @@ impl<'a, 'extractor> Extractor<'extractor> for WordDocidsExtractorData<'a> {
     ) -> Result<()> {
         for change in changes {
             let change = change?;
-            WordDocidsExtractors::extract_document_change(context, self.tokenizer, change)?;
+            WordDocidsExtractors::extract_document_change(
+                context,
+                &self.tokenizer,
+                self.searchable_attributes.as_deref(),
+                change,
+            )?;
         }
         Ok(())
     }
@@ -248,52 +253,42 @@ impl WordDocidsExtractors {
     where
         MSP: Fn() -> bool + Sync,
     {
-        let index = indexing_context.index;
-        let rtxn = index.read_txn()?;
-
-        let stop_words = index.stop_words(&rtxn)?;
-        let allowed_separators = index.allowed_separators(&rtxn)?;
+        // Warning: this is duplicated code from extract_word_pair_proximity_docids.rs
+        let rtxn = indexing_context.index.read_txn()?;
+        let stop_words = indexing_context.index.stop_words(&rtxn)?;
+        let allowed_separators = indexing_context.index.allowed_separators(&rtxn)?;
         let allowed_separators: Option<Vec<_>> =
             allowed_separators.as_ref().map(|s| s.iter().map(String::as_str).collect());
-        let dictionary = index.dictionary(&rtxn)?;
+        let dictionary = indexing_context.index.dictionary(&rtxn)?;
         let dictionary: Option<Vec<_>> =
             dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
-        let builder = tokenizer_builder(
+        let mut builder = tokenizer_builder(
             stop_words.as_ref(),
             allowed_separators.as_deref(),
             dictionary.as_deref(),
         );
-        let tokenizer = builder.into_tokenizer();
-
-        let attributes_to_extract = Self::attributes_to_extract(&rtxn, index)?;
-        let attributes_to_skip = Self::attributes_to_skip(&rtxn, index)?;
+        let tokenizer = builder.build();
         let localized_attributes_rules =
-            index.localized_attributes_rules(&rtxn)?.unwrap_or_default();
-
+            indexing_context.index.localized_attributes_rules(&rtxn)?.unwrap_or_default();
         let document_tokenizer = DocumentTokenizer {
             tokenizer: &tokenizer,
-            attribute_to_extract: attributes_to_extract.as_deref(),
-            attribute_to_skip: attributes_to_skip.as_slice(),
             localized_attributes_rules: &localized_attributes_rules,
             max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE,
         };
+        let extractor_data = WordDocidsExtractorData {
+            tokenizer: document_tokenizer,
+            max_memory_by_thread: indexing_context.grenad_parameters.max_memory_by_thread(),
+            buckets: rayon::current_num_threads(),
+            searchable_attributes: indexing_context.index.user_defined_searchable_fields(&rtxn)?,
+        };
         let datastore = ThreadLocal::new();
 
         {
             let span =
                 tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction");
             let _entered = span.enter();
-
-            let extractor = WordDocidsExtractorData {
-                tokenizer: &document_tokenizer,
-                grenad_parameters: indexing_context.grenad_parameters,
-                buckets: rayon::current_num_threads(),
-            };
-
             extract(
                 document_changes,
-                &extractor,
+                &extractor_data,
                 indexing_context,
                 extractor_allocs,
                 &datastore,
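
Note that the extractor data now stores a precomputed `max_memory_by_thread` value instead of borrowing the whole `GrenadParameters`. A rough sketch of the idea, assuming the per-thread budget is simply the global memory limit divided by the number of indexing threads (an assumption about what `GrenadParameters::max_memory_by_thread` computes, not its actual code):

```rust
// Sketch (not milli's actual code): split the indexer's global memory budget evenly
// across the rayon threads, and let each extractor data struct keep only the
// resulting Option<usize> instead of a reference to the whole parameters struct.
fn max_memory_by_thread(max_memory: Option<usize>, threads: usize) -> Option<usize> {
    max_memory.map(|total| total / threads.max(1))
}

fn main() {
    // e.g. a 2 GiB budget split across 8 indexing threads
    let per_thread = max_memory_by_thread(Some(2 * 1024 * 1024 * 1024), 8);
    assert_eq!(per_thread, Some(256 * 1024 * 1024));
    // no configured limit means no per-thread limit either
    assert_eq!(max_memory_by_thread(None, 8), None);
}
```
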
@@ -312,6 +307,7 @@ impl WordDocidsExtractors {
     fn extract_document_change(
         context: &DocumentChangeContext<RefCell<Option<WordDocidsBalancedCaches>>>,
         document_tokenizer: &DocumentTokenizer,
+        searchable_attributes: Option<&[&str]>,
         document_change: DocumentChange,
     ) -> Result<()> {
         let index = &context.index;
@@ -345,7 +341,9 @@ impl WordDocidsExtractors {
             }
             DocumentChange::Update(inner) => {
                 if !inner.has_changed_for_fields(
-                    document_tokenizer.attribute_to_extract,
+                    &mut |field_name: &str| {
+                        match_searchable_field(field_name, searchable_attributes)
+                    },
                     &context.rtxn,
                     context.index,
                     context.db_fields_ids_map,
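
`has_changed_for_fields` now takes a callback instead of an attribute list, so each extractor can reuse `match_searchable_field` to decide whether an update touches a searchable field before re-tokenizing the document. A minimal, self-contained sketch of that callback shape (the names and signature below are illustrative assumptions, not milli's API):

```rust
#[derive(Clone, Copy, PartialEq)]
enum PatternMatch {
    Match,
    Parent,
    NoMatch,
}

/// Reports a change as soon as one modified field is selected by the callback.
fn has_changed_for_fields(
    modified_fields: &[&str],
    selector: &mut impl FnMut(&str) -> PatternMatch,
) -> bool {
    modified_fields.iter().any(|&field| selector(field) != PatternMatch::NoMatch)
}

fn main() {
    let searchable = ["title", "overview"];
    let mut is_searchable = |field: &str| {
        if searchable.contains(&field) { PatternMatch::Match } else { PatternMatch::NoMatch }
    };
    // an update touching only `release_date` can skip searchable re-indexing
    assert!(has_changed_for_fields(&["title"], &mut is_searchable));
    assert!(!has_changed_for_fields(&["release_date"], &mut is_searchable));
    println!("change detection ok");
}
```
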
@@ -408,15 +406,4 @@ impl WordDocidsExtractors {
         let mut buffer = BumpVec::with_capacity_in(buffer_size, &context.doc_alloc);
         cached_sorter.flush_fid_word_count(&mut buffer)
     }
-
-    fn attributes_to_extract<'a>(
-        rtxn: &'a RoTxn,
-        index: &'a Index,
-    ) -> Result<Option<Vec<&'a str>>> {
-        index.user_defined_searchable_fields(rtxn).map_err(Into::into)
-    }
-
-    fn attributes_to_skip<'a>(_rtxn: &'a RoTxn, _index: &'a Index) -> Result<Vec<&'a str>> {
-        Ok(Vec::new())
-    }
 }
@@ -2,30 +2,114 @@ use std::cell::RefCell;
 use std::collections::VecDeque;
 use std::rc::Rc;
 
-use heed::RoTxn;
+use bumpalo::Bump;
 
-use super::tokenize_document::DocumentTokenizer;
-use super::SearchableExtractor;
+use super::match_searchable_field;
+use super::tokenize_document::{tokenizer_builder, DocumentTokenizer};
 use crate::proximity::{index_proximity, MAX_DISTANCE};
 use crate::update::new::document::Document;
 use crate::update::new::extract::cache::BalancedCaches;
-use crate::update::new::indexer::document_changes::DocumentChangeContext;
+use crate::update::new::indexer::document_changes::{
+    extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext,
+};
 use crate::update::new::ref_cell_ext::RefCellExt as _;
+use crate::update::new::steps::IndexingStep;
+use crate::update::new::thread_local::{FullySend, ThreadLocal};
 use crate::update::new::DocumentChange;
-use crate::{FieldId, GlobalFieldsIdsMap, Index, Result};
+use crate::{FieldId, GlobalFieldsIdsMap, Result, MAX_POSITION_PER_ATTRIBUTE};
 
+pub struct WordPairProximityDocidsExtractorData<'a> {
+    tokenizer: DocumentTokenizer<'a>,
+    searchable_attributes: Option<Vec<&'a str>>,
+    max_memory_by_thread: Option<usize>,
+    buckets: usize,
+}
+
+impl<'a, 'extractor> Extractor<'extractor> for WordPairProximityDocidsExtractorData<'a> {
+    type Data = RefCell<BalancedCaches<'extractor>>;
+
+    fn init_data(&self, extractor_alloc: &'extractor Bump) -> Result<Self::Data> {
+        Ok(RefCell::new(BalancedCaches::new_in(
+            self.buckets,
+            self.max_memory_by_thread,
+            extractor_alloc,
+        )))
+    }
+
+    fn process<'doc>(
+        &self,
+        changes: impl Iterator<Item = Result<DocumentChange<'doc>>>,
+        context: &DocumentChangeContext<Self::Data>,
+    ) -> Result<()> {
+        for change in changes {
+            let change = change?;
+            WordPairProximityDocidsExtractor::extract_document_change(
+                context,
+                &self.tokenizer,
+                self.searchable_attributes.as_deref(),
+                change,
+            )?;
+        }
+        Ok(())
+    }
+}
+
 pub struct WordPairProximityDocidsExtractor;
 
-impl SearchableExtractor for WordPairProximityDocidsExtractor {
-    fn attributes_to_extract<'a>(
-        rtxn: &'a RoTxn,
-        index: &'a Index,
-    ) -> Result<Option<Vec<&'a str>>> {
-        index.user_defined_searchable_fields(rtxn).map_err(Into::into)
-    }
+impl WordPairProximityDocidsExtractor {
+    pub fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>(
+        document_changes: &DC,
+        indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>,
+        extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
+        step: IndexingStep,
+    ) -> Result<Vec<BalancedCaches<'extractor>>>
+    where
+        MSP: Fn() -> bool + Sync,
+    {
+        // Warning: this is duplicated code from extract_word_docids.rs
+        let rtxn = indexing_context.index.read_txn()?;
+        let stop_words = indexing_context.index.stop_words(&rtxn)?;
+        let allowed_separators = indexing_context.index.allowed_separators(&rtxn)?;
+        let allowed_separators: Option<Vec<_>> =
+            allowed_separators.as_ref().map(|s| s.iter().map(String::as_str).collect());
+        let dictionary = indexing_context.index.dictionary(&rtxn)?;
+        let dictionary: Option<Vec<_>> =
+            dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
+        let mut builder = tokenizer_builder(
+            stop_words.as_ref(),
+            allowed_separators.as_deref(),
+            dictionary.as_deref(),
+        );
+        let tokenizer = builder.build();
+        let localized_attributes_rules =
+            indexing_context.index.localized_attributes_rules(&rtxn)?.unwrap_or_default();
+        let document_tokenizer = DocumentTokenizer {
+            tokenizer: &tokenizer,
+            localized_attributes_rules: &localized_attributes_rules,
+            max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE,
+        };
+        let extractor_data = WordPairProximityDocidsExtractorData {
+            tokenizer: document_tokenizer,
+            searchable_attributes: indexing_context.index.user_defined_searchable_fields(&rtxn)?,
+            max_memory_by_thread: indexing_context.grenad_parameters.max_memory_by_thread(),
+            buckets: rayon::current_num_threads(),
+        };
+        let datastore = ThreadLocal::new();
+        {
+            let span =
+                tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction");
+            let _entered = span.enter();
+            extract(
+                document_changes,
+                &extractor_data,
+                indexing_context,
+                extractor_allocs,
+                &datastore,
+                step,
+            )?;
+        }
 
-    fn attributes_to_skip<'a>(_rtxn: &'a RoTxn, _index: &'a Index) -> Result<Vec<&'a str>> {
-        Ok(Vec::new())
+        Ok(datastore.into_iter().map(RefCell::into_inner).collect())
     }
 
     // This method is reimplemented to count the number of words in the document in each field
@@ -34,6 +118,7 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor {
     fn extract_document_change(
         context: &DocumentChangeContext<RefCell<BalancedCaches>>,
         document_tokenizer: &DocumentTokenizer,
+        searchable_attributes: Option<&[&str]>,
         document_change: DocumentChange,
     ) -> Result<()> {
         let doc_alloc = &context.doc_alloc;
@@ -71,7 +156,9 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor {
             }
             DocumentChange::Update(inner) => {
                 if !inner.has_changed_for_fields(
-                    document_tokenizer.attribute_to_extract,
+                    &mut |field_name: &str| {
+                        match_searchable_field(field_name, searchable_attributes)
+                    },
                     rtxn,
                     index,
                     context.db_fields_ids_map,
@@ -2,145 +2,28 @@ mod extract_word_docids;
 mod extract_word_pair_proximity_docids;
 mod tokenize_document;
 
-use std::cell::RefCell;
-use std::marker::PhantomData;
-
-use bumpalo::Bump;
 pub use extract_word_docids::{WordDocidsCaches, WordDocidsExtractors};
 pub use extract_word_pair_proximity_docids::WordPairProximityDocidsExtractor;
-use heed::RoTxn;
-use tokenize_document::{tokenizer_builder, DocumentTokenizer};
 
-use super::cache::BalancedCaches;
-use super::DocidsExtractor;
-use crate::update::new::indexer::document_changes::{
-    extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext,
-};
-use crate::update::new::steps::IndexingStep;
-use crate::update::new::thread_local::{FullySend, ThreadLocal};
-use crate::update::new::DocumentChange;
-use crate::update::GrenadParameters;
-use crate::{Index, Result, MAX_POSITION_PER_ATTRIBUTE};
+use crate::attribute_patterns::{match_field_legacy, PatternMatch};
 
-pub struct SearchableExtractorData<'a, EX: SearchableExtractor> {
-    tokenizer: &'a DocumentTokenizer<'a>,
-    grenad_parameters: &'a GrenadParameters,
-    buckets: usize,
-    _ex: PhantomData<EX>,
-}
-
-impl<'a, 'extractor, EX: SearchableExtractor + Sync> Extractor<'extractor>
-    for SearchableExtractorData<'a, EX>
-{
-    type Data = RefCell<BalancedCaches<'extractor>>;
-
-    fn init_data(&self, extractor_alloc: &'extractor Bump) -> Result<Self::Data> {
-        Ok(RefCell::new(BalancedCaches::new_in(
-            self.buckets,
-            self.grenad_parameters.max_memory_by_thread(),
-            extractor_alloc,
-        )))
-    }
-
-    fn process<'doc>(
-        &self,
-        changes: impl Iterator<Item = Result<DocumentChange<'doc>>>,
-        context: &DocumentChangeContext<Self::Data>,
-    ) -> Result<()> {
-        for change in changes {
-            let change = change?;
-            EX::extract_document_change(context, self.tokenizer, change)?;
-        }
-        Ok(())
-    }
-}
-
-pub trait SearchableExtractor: Sized + Sync {
-    fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>(
-        document_changes: &DC,
-        indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>,
-        extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
-        step: IndexingStep,
-    ) -> Result<Vec<BalancedCaches<'extractor>>>
-    where
-        MSP: Fn() -> bool + Sync,
-    {
-        let rtxn = indexing_context.index.read_txn()?;
-        let stop_words = indexing_context.index.stop_words(&rtxn)?;
-        let allowed_separators = indexing_context.index.allowed_separators(&rtxn)?;
-        let allowed_separators: Option<Vec<_>> =
-            allowed_separators.as_ref().map(|s| s.iter().map(String::as_str).collect());
-        let dictionary = indexing_context.index.dictionary(&rtxn)?;
-        let dictionary: Option<Vec<_>> =
-            dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
-        let mut builder = tokenizer_builder(
-            stop_words.as_ref(),
-            allowed_separators.as_deref(),
-            dictionary.as_deref(),
-        );
-        let tokenizer = builder.build();
-
-        let attributes_to_extract = Self::attributes_to_extract(&rtxn, indexing_context.index)?;
-        let attributes_to_skip = Self::attributes_to_skip(&rtxn, indexing_context.index)?;
-        let localized_attributes_rules =
-            indexing_context.index.localized_attributes_rules(&rtxn)?.unwrap_or_default();
-
-        let document_tokenizer = DocumentTokenizer {
-            tokenizer: &tokenizer,
-            attribute_to_extract: attributes_to_extract.as_deref(),
-            attribute_to_skip: attributes_to_skip.as_slice(),
-            localized_attributes_rules: &localized_attributes_rules,
-            max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE,
-        };
-
-        let extractor_data: SearchableExtractorData<Self> = SearchableExtractorData {
-            tokenizer: &document_tokenizer,
-            grenad_parameters: indexing_context.grenad_parameters,
-            buckets: rayon::current_num_threads(),
-            _ex: PhantomData,
-        };
-
-        let datastore = ThreadLocal::new();
-
-        {
-            let span =
-                tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction");
-            let _entered = span.enter();
-            extract(
-                document_changes,
-                &extractor_data,
-                indexing_context,
-                extractor_allocs,
-                &datastore,
-                step,
-            )?;
-        }
-
-        Ok(datastore.into_iter().map(RefCell::into_inner).collect())
-    }
-
-    fn extract_document_change(
-        context: &DocumentChangeContext<RefCell<BalancedCaches>>,
-        document_tokenizer: &DocumentTokenizer,
-        document_change: DocumentChange,
-    ) -> Result<()>;
-
-    fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index)
-        -> Result<Option<Vec<&'a str>>>;
-
-    fn attributes_to_skip<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result<Vec<&'a str>>;
-}
-
-impl<T: SearchableExtractor> DocidsExtractor for T {
-    fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>(
-        document_changes: &DC,
-        indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>,
-        extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
-        step: IndexingStep,
-    ) -> Result<Vec<BalancedCaches<'extractor>>>
-    where
-        MSP: Fn() -> bool + Sync,
-    {
-        Self::run_extraction(document_changes, indexing_context, extractor_allocs, step)
-    }
+pub fn match_searchable_field(
+    field_name: &str,
+    searchable_fields: Option<&[&str]>,
+) -> PatternMatch {
+    let Some(searchable_fields) = searchable_fields else {
+        // If no searchable fields are provided, consider all fields as searchable
+        return PatternMatch::Match;
+    };
+
+    let mut selection = PatternMatch::NoMatch;
+    for pattern in searchable_fields {
+        match match_field_legacy(pattern, field_name) {
+            PatternMatch::Match => return PatternMatch::Match,
+            PatternMatch::Parent => selection = PatternMatch::Parent,
+            PatternMatch::NoMatch => (),
+        }
+    }
+
+    selection
 }
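
In the new code, `PatternMatch::Parent` matters for nested fields: a pattern like `author.name` should not select the `author` object itself, yet traversal must still descend into it. Below is a hedged sketch of how a selector with these three outcomes can drive recursion over nested JSON; the names and logic are illustrative only, not milli's `perm_json_p` implementation, and it assumes the `serde_json` crate is available.

```rust
// Illustrative sketch: `Match` extracts the value, `Parent` keeps descending because
// a child may still match, `NoMatch` prunes the subtree.
use serde_json::{json, Value};

enum PatternMatch {
    Match,
    Parent,
    NoMatch,
}

fn walk(prefix: &str, value: &Value, select: &impl Fn(&str) -> PatternMatch, out: &mut Vec<String>) {
    match select(prefix) {
        PatternMatch::NoMatch => {}
        PatternMatch::Match => out.push(format!("{prefix} => {value}")),
        PatternMatch::Parent => {
            if let Value::Object(map) = value {
                for (key, child) in map {
                    let child_name = format!("{prefix}.{key}");
                    walk(&child_name, child, select, out);
                }
            }
        }
    }
}

fn main() {
    let doc = json!({ "author": { "name": "melville", "age": 204 }, "title": "moby dick" });
    let select = |field: &str| match field {
        "title" | "author.name" => PatternMatch::Match,
        "author" => PatternMatch::Parent,
        _ => PatternMatch::NoMatch,
    };
    let mut out = Vec::new();
    if let Value::Object(map) = &doc {
        for (key, child) in map {
            walk(key, child, &select, &mut out);
        }
    }
    // prints the selected leaves: author.name and title
    println!("{out:?}");
}
```
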
@@ -3,9 +3,10 @@ use std::collections::HashMap;
 use charabia::{SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder};
 use serde_json::Value;
 
+use crate::attribute_patterns::PatternMatch;
 use crate::update::new::document::Document;
 use crate::update::new::extract::perm_json_p::{
-    seek_leaf_values_in_array, seek_leaf_values_in_object, select_field, Depth, Selection,
+    seek_leaf_values_in_array, seek_leaf_values_in_object, Depth,
 };
 use crate::{
     FieldId, GlobalFieldsIdsMap, InternalError, LocalizedAttributesRule, Result, UserError,
@@ -17,8 +18,6 @@ const MAX_DISTANCE: u32 = 8;
 
 pub struct DocumentTokenizer<'a> {
     pub tokenizer: &'a Tokenizer<'a>,
-    pub attribute_to_extract: Option<&'a [&'a str]>,
-    pub attribute_to_skip: &'a [&'a str],
     pub localized_attributes_rules: &'a [LocalizedAttributesRule],
     pub max_positions_per_attributes: u32,
 }
@@ -31,87 +30,94 @@ impl<'a> DocumentTokenizer<'a> {
         token_fn: &mut impl FnMut(&str, FieldId, u16, &str) -> Result<()>,
     ) -> Result<()> {
         let mut field_position = HashMap::new();
+        let mut tokenize_field = |field_name: &str, _depth, value: &Value| {
+            let Some((field_id, meta)) = field_id_map.id_with_metadata_or_insert(field_name) else {
+                return Err(UserError::AttributeLimitReached.into());
+            };
+
+            if meta.is_searchable() {
+                self.tokenize_field(field_id, field_name, value, token_fn, &mut field_position)?;
+            }
+
+            // todo: should be a match on the field_name using `match_field_legacy` function,
+            // but for legacy reasons we iterate over all the fields to fill the field_id_map.
+            Ok(PatternMatch::Match)
+        };
 
         for entry in document.iter_top_level_fields() {
             let (field_name, value) = entry?;
 
-            let mut tokenize_field = |field_name: &str, _depth, value: &Value| {
-                let Some(field_id) = field_id_map.id_or_insert(field_name) else {
-                    return Err(UserError::AttributeLimitReached.into());
-                };
-
-                if select_field(field_name, self.attribute_to_extract, self.attribute_to_skip)
-                    != Selection::Select
-                {
-                    return Ok(());
-                }
-
-                let position = field_position
-                    .entry(field_id)
-                    .and_modify(|counter| *counter += MAX_DISTANCE)
-                    .or_insert(0);
-                if *position >= self.max_positions_per_attributes {
-                    return Ok(());
-                }
-
-                let text;
-                let tokens = match value {
-                    Value::Number(n) => {
-                        text = n.to_string();
-                        self.tokenizer.tokenize(text.as_str())
-                    }
-                    Value::Bool(b) => {
-                        text = b.to_string();
-                        self.tokenizer.tokenize(text.as_str())
-                    }
-                    Value::String(text) => {
-                        let locales = self
-                            .localized_attributes_rules
-                            .iter()
-                            .find(|rule| rule.match_str(field_name))
-                            .map(|rule| rule.locales());
-                        self.tokenizer.tokenize_with_allow_list(text.as_str(), locales)
-                    }
-                    _ => return Ok(()),
-                };
-
-                // create an iterator of token with their positions.
-                let tokens = process_tokens(*position, tokens)
-                    .take_while(|(p, _)| *p < self.max_positions_per_attributes);
-
-                for (index, token) in tokens {
-                    // keep a word only if it is not empty and fit in a LMDB key.
-                    let token = token.lemma().trim();
-                    if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
-                        *position = index;
-                        if let Ok(position) = (*position).try_into() {
-                            token_fn(field_name, field_id, position, token)?;
-                        }
-                    }
-                }
-
-                Ok(())
-            };
-
             // parse json.
             match serde_json::to_value(value).map_err(InternalError::SerdeJson)? {
                 Value::Object(object) => seek_leaf_values_in_object(
                     &object,
-                    None,
-                    &[],
                     field_name,
                     Depth::OnBaseKey,
                     &mut tokenize_field,
                 )?,
                 Value::Array(array) => seek_leaf_values_in_array(
                     &array,
-                    None,
-                    &[],
                     field_name,
                     Depth::OnBaseKey,
                     &mut tokenize_field,
                 )?,
-                value => tokenize_field(field_name, Depth::OnBaseKey, &value)?,
+                value => {
+                    tokenize_field(field_name, Depth::OnBaseKey, &value)?;
+                }
             }
         }
 
         Ok(())
     }
+
+    fn tokenize_field(
+        &self,
+        field_id: FieldId,
+        field_name: &str,
+        value: &Value,
+        token_fn: &mut impl FnMut(&str, u16, u16, &str) -> std::result::Result<(), crate::Error>,
+        field_position: &mut HashMap<u16, u32>,
+    ) -> Result<()> {
+        let position = field_position
+            .entry(field_id)
+            .and_modify(|counter| *counter += MAX_DISTANCE)
+            .or_insert(0);
+        if *position >= self.max_positions_per_attributes {
+            return Ok(());
+        }
+
+        let text;
+        let tokens = match value {
+            Value::Number(n) => {
+                text = n.to_string();
+                self.tokenizer.tokenize(text.as_str())
+            }
+            Value::Bool(b) => {
+                text = b.to_string();
+                self.tokenizer.tokenize(text.as_str())
+            }
+            Value::String(text) => {
+                let locales = self
+                    .localized_attributes_rules
+                    .iter()
+                    .find(|rule| rule.match_str(field_name) == PatternMatch::Match)
+                    .map(|rule| rule.locales());
+                self.tokenizer.tokenize_with_allow_list(text.as_str(), locales)
+            }
+            _ => return Ok(()),
+        };
+
+        // create an iterator of token with their positions.
+        let tokens = process_tokens(*position, tokens)
+            .take_while(|(p, _)| *p < self.max_positions_per_attributes);
+
+        for (index, token) in tokens {
+            // keep a word only if it is not empty and fit in a LMDB key.
+            let token = token.lemma().trim();
+            if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
+                *position = index;
+                if let Ok(position) = (*position).try_into() {
+                    token_fn(field_name, field_id, position, token)?;
+                }
+            }
+        }
+
+        Ok(())
+    }
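
`DocumentTokenizer` no longer carries `attribute_to_extract`/`attribute_to_skip`; the decision now comes from the metadata attached to each field id by `FieldIdMapWithMetadata`. A simplified, self-contained sketch of that gating, where `FieldIdMap` and `FieldMetadata` are hypothetical stand-ins rather than milli's types:

```rust
// Self-contained sketch (assumed types): the field-id map also carries per-field
// metadata, and a field is only tokenized when its metadata says it is searchable.
use std::collections::HashMap;

#[derive(Clone, Copy)]
struct FieldMetadata {
    searchable: bool,
}

#[derive(Default)]
struct FieldIdMap {
    ids: HashMap<String, (u16, FieldMetadata)>,
    next_id: u16,
}

impl FieldIdMap {
    /// Returns the field id and metadata, inserting the field on first sight.
    fn id_with_metadata_or_insert(&mut self, name: &str, searchable: bool) -> (u16, FieldMetadata) {
        if let Some(entry) = self.ids.get(name) {
            return *entry;
        }
        let id = self.next_id;
        self.next_id += 1;
        let entry = (id, FieldMetadata { searchable });
        self.ids.insert(name.to_string(), entry);
        entry
    }
}

fn main() {
    let mut map = FieldIdMap::default();
    for (field, value, searchable) in [("title", "moby dick", true), ("internal_id", "42", false)] {
        let (field_id, meta) = map.id_with_metadata_or_insert(field, searchable);
        if meta.searchable {
            // in milli this is where DocumentTokenizer::tokenize_field would run
            println!("tokenize field {field_id} ({field}): {value}");
        } else {
            println!("skip field {field_id} ({field})");
        }
    }
}
```
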
@@ -215,15 +221,20 @@ mod test {
         let mut tb = TokenizerBuilder::default();
         let document_tokenizer = DocumentTokenizer {
             tokenizer: &tb.build(),
-            attribute_to_extract: None,
-            attribute_to_skip: &["not-me", "me-nether.nope"],
             localized_attributes_rules: &[],
             max_positions_per_attributes: 1000,
         };
 
         let fields_ids_map = FieldIdMapWithMetadata::new(
             fields_ids_map,
-            MetadataBuilder::new(Default::default(), Default::default(), Default::default(), None),
+            MetadataBuilder::new(
+                Default::default(),
+                Default::default(),
+                Default::default(),
+                None,
+                None,
+                Default::default(),
+            ),
         );
 
         let fields_ids_map_lock = std::sync::RwLock::new(fields_ids_map);
@@ -265,6 +276,10 @@ mod test {
                 2,
                 16,
             ]: "catto",
+            [
+                3,
+                0,
+            ]: "unsearchable",
             [
                 5,
                 0,
@@ -277,6 +292,10 @@ mod test {
                 8,
                 0,
             ]: "23",
+            [
+                9,
+                0,
+            ]: "unsearchable",
         }
         "###);
     }
@@ -199,7 +199,7 @@ where
             let span = tracing::trace_span!(target: "indexing::documents::extract", "word_pair_proximity_docids");
             let _entered = span.enter();
 
-            <WordPairProximityDocidsExtractor as DocidsExtractor>::run_extraction(
+            WordPairProximityDocidsExtractor::run_extraction(
                 document_changes,
                 indexing_context,
                 extractor_allocs,