From ba7f091db39049cf29a0afa33476ab731a9b7931 Mon Sep 17 00:00:00 2001
From: ManyTheFish
Date: Wed, 20 Nov 2024 08:28:24 +0100
Subject: [PATCH] Use tokenizer on numbers and booleans

---
 .../word_docids.snap                          |  2 +-
 .../initial/word_docids.snap                  |  9 ++--
 .../updated/word_docids.snap                  |  9 ++--
 .../extract/searchable/tokenize_document.rs   | 54 +++++++++----------
 4 files changed, 40 insertions(+), 34 deletions(-)

diff --git a/crates/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/word_docids.snap b/crates/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/word_docids.snap
index 97c8e0729..4f7afa040 100644
--- a/crates/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/word_docids.snap
+++ b/crates/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/word_docids.snap
@@ -2,7 +2,7 @@
 source: crates/milli/src/update/index_documents/mod.rs
 ---
 1 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, ]
-2.2 [21, ]
+2 [21, ]
 36 [3, ]
 37 [4, ]
 38 [5, ]
diff --git a/crates/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/initial/word_docids.snap b/crates/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/initial/word_docids.snap
index 89076a652..cf50a86bb 100644
--- a/crates/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/initial/word_docids.snap
+++ b/crates/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/initial/word_docids.snap
@@ -1,15 +1,18 @@
 ---
 source: crates/milli/src/update/index_documents/mod.rs
 ---
+0 [1, ]
 1 [2, ]
-10.0 [1, ]
+10 [1, ]
 1344 [3, ]
 2 [0, ]
-25.99 [2, ]
-3.5 [0, ]
+25 [2, ]
+3 [0, ]
 4 [4, ]
 42 [5, ]
 456 [1, ]
+5 [0, ]
+99 [2, ]
 adams [5, ]
 adventure [1, ]
 alice [2, ]
diff --git a/crates/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/updated/word_docids.snap b/crates/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/updated/word_docids.snap
index b92b96a36..07bc86984 100644
--- a/crates/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/updated/word_docids.snap
+++ b/crates/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/updated/word_docids.snap
@@ -1,16 +1,19 @@
 ---
 source: crates/milli/src/update/index_documents/mod.rs
 ---
+0 [1, ]
 1 [2, ]
-10.0 [1, ]
+10 [1, ]
 1344 [3, ]
 1813 [0, ]
 2 [0, ]
-25.99 [2, ]
-3.5 [0, ]
+25 [2, ]
+3 [0, ]
 4 [4, ]
 42 [5, ]
 456 [1, ]
+5 [0, ]
+99 [2, ]
 adams [5, ]
 adventure [1, ]
 alice [2, ]
diff --git a/crates/milli/src/update/new/extract/searchable/tokenize_document.rs b/crates/milli/src/update/new/extract/searchable/tokenize_document.rs
index 06f5479e4..4bfcfbf16 100644
--- a/crates/milli/src/update/new/extract/searchable/tokenize_document.rs
+++ b/crates/milli/src/update/new/extract/searchable/tokenize_document.rs
@@ -48,43 +48,43 @@ impl<'a> DocumentTokenizer<'a> {
                 return Ok(());
             }

-            match value {
+            let text;
+            let tokens = match value {
                 Value::Number(n) => {
-                    let token = n.to_string();
-                    if let Ok(position) = (*position).try_into() {
-                        token_fn(field_name, field_id, position, token.as_str())?;
-                    }
-
-                    Ok(())
+                    text = n.to_string();
+                    self.tokenizer.tokenize(text.as_str())
+                }
+                Value::Bool(b) => {
+                    text = b.to_string();
+                    self.tokenizer.tokenize(text.as_str())
                 }
                 Value::String(text) => {
-                    // create an iterator of token with their positions.
                     let locales = self
                         .localized_attributes_rules
                         .iter()
                         .find(|rule| rule.match_str(field_name))
                         .map(|rule| rule.locales());
-                    let tokens = process_tokens(
-                        *position,
-                        self.tokenizer.tokenize_with_allow_list(text.as_str(), locales),
-                    )
-                    .take_while(|(p, _)| *p < self.max_positions_per_attributes);
-
-                    for (index, token) in tokens {
-                        // keep a word only if it is not empty and fit in a LMDB key.
-                        let token = token.lemma().trim();
-                        if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
-                            *position = index;
-                            if let Ok(position) = (*position).try_into() {
-                                token_fn(field_name, field_id, position, token)?;
-                            }
-                        }
-                    }
-
-                    Ok(())
+                    self.tokenizer.tokenize_with_allow_list(text.as_str(), locales)
+                }
+                _ => return Ok(()),
+            };
+
+            // create an iterator of tokens with their positions.
+            let tokens = process_tokens(*position, tokens)
+                .take_while(|(p, _)| *p < self.max_positions_per_attributes);
+
+            for (index, token) in tokens {
+                // keep a word only if it is not empty and fits in an LMDB key.
+                let token = token.lemma().trim();
+                if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
+                    *position = index;
+                    if let Ok(position) = (*position).try_into() {
+                        token_fn(field_name, field_id, position, token)?;
+                    }
                 }
-                _ => Ok(()),
             }
+
+            Ok(())
         };

         // if the current field is searchable or contains a searchable attribute
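
A quick illustration of the behavior change: scalar values are now stringified
and run through the tokenizer instead of being emitted as a single raw token.
`25.99` is therefore indexed as the two words `25` and `99` (the `.` acts as a
separator) and booleans become the words `true`/`false`, which matches the
snapshot updates above. Below is a minimal, standalone sketch of that
behavior. It is not code from this patch; it assumes the `charabia` crate's
`Tokenize` trait and `Token::is_word()` (the tokenizer crate milli builds on),
and the `main` harness is purely illustrative.

    // Standalone sketch (assumed charabia API; not part of this patch).
    use charabia::Tokenize;
    use serde_json::Value;

    fn main() {
        for value in [Value::from(25.99), Value::from(true)] {
            // Mirror the new code path: stringify the scalar, then tokenize it.
            let text = match &value {
                Value::Number(n) => n.to_string(),
                Value::Bool(b) => b.to_string(),
                _ => continue,
            };
            // Keep only word tokens, as milli's process_tokens does;
            // separators such as "." are dropped.
            let words: Vec<String> = text
                .as_str()
                .tokenize()
                .filter(|token| token.is_word())
                .map(|token| token.lemma().to_string())
                .collect();
            println!("{value} -> {words:?}");
            // 25.99 -> ["25", "99"]
            // true  -> ["true"]
        }
    }

Note the `let text;` binding declared before the `match` in the patch: it
keeps the stringified number or boolean alive after the `match` ends, since
the token iterator returned by `tokenize` borrows from that string.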