From ba7f091db39049cf29a0afa33476ab731a9b7931 Mon Sep 17 00:00:00 2001
From: ManyTheFish
Date: Wed, 20 Nov 2024 08:28:24 +0100
Subject: [PATCH] Use tokenizer on numbers and booleans

---
 .../word_docids.snap                          |  2 +-
 .../initial/word_docids.snap                  |  9 ++--
 .../updated/word_docids.snap                  |  9 ++--
 .../extract/searchable/tokenize_document.rs   | 54 +++++++++----------
 4 files changed, 40 insertions(+), 34 deletions(-)

diff --git a/crates/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/word_docids.snap b/crates/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/word_docids.snap
index 97c8e0729..4f7afa040 100644
--- a/crates/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/word_docids.snap
+++ b/crates/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/word_docids.snap
@@ -2,7 +2,7 @@
 source: crates/milli/src/update/index_documents/mod.rs
 ---
 1 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, ]
-2.2 [21, ]
+2 [21, ]
 36 [3, ]
 37 [4, ]
 38 [5, ]
diff --git a/crates/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/initial/word_docids.snap b/crates/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/initial/word_docids.snap
index 89076a652..cf50a86bb 100644
--- a/crates/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/initial/word_docids.snap
+++ b/crates/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/initial/word_docids.snap
@@ -1,15 +1,18 @@
 ---
 source: crates/milli/src/update/index_documents/mod.rs
 ---
+0 [1, ]
 1 [2, ]
-10.0 [1, ]
+10 [1, ]
 1344 [3, ]
 2 [0, ]
-25.99 [2, ]
-3.5 [0, ]
+25 [2, ]
+3 [0, ]
 4 [4, ]
 42 [5, ]
 456 [1, ]
+5 [0, ]
+99 [2, ]
 adams [5, ]
 adventure [1, ]
 alice [2, ]
diff --git a/crates/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/updated/word_docids.snap b/crates/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/updated/word_docids.snap
index b92b96a36..07bc86984 100644
--- a/crates/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/updated/word_docids.snap
+++ b/crates/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/updated/word_docids.snap
@@ -1,16 +1,19 @@
 ---
 source: crates/milli/src/update/index_documents/mod.rs
 ---
+0 [1, ]
 1 [2, ]
-10.0 [1, ]
+10 [1, ]
 1344 [3, ]
 1813 [0, ]
 2 [0, ]
-25.99 [2, ]
-3.5 [0, ]
+25 [2, ]
+3 [0, ]
 4 [4, ]
 42 [5, ]
 456 [1, ]
+5 [0, ]
+99 [2, ]
 adams [5, ]
 adventure [1, ]
 alice [2, ]
diff --git a/crates/milli/src/update/new/extract/searchable/tokenize_document.rs b/crates/milli/src/update/new/extract/searchable/tokenize_document.rs
index 06f5479e4..4bfcfbf16 100644
--- a/crates/milli/src/update/new/extract/searchable/tokenize_document.rs
+++ b/crates/milli/src/update/new/extract/searchable/tokenize_document.rs
@@ -48,43 +48,43 @@ impl<'a> DocumentTokenizer<'a> {
                 return Ok(());
             }

-            match value {
+            let text;
+            let tokens = match value {
                 Value::Number(n) => {
-                    let token = n.to_string();
-                    if let Ok(position) = (*position).try_into() {
-                        token_fn(field_name, field_id, position, token.as_str())?;
-                    }
-
-                    Ok(())
+                    text = n.to_string();
+                    self.tokenizer.tokenize(text.as_str())
+                }
+                Value::Bool(b) => {
+                    text = b.to_string();
+                    self.tokenizer.tokenize(text.as_str())
                 }
                 Value::String(text) => {
-                    // create an iterator of token with their positions.
                     let locales = self
                         .localized_attributes_rules
                         .iter()
                         .find(|rule| rule.match_str(field_name))
                         .map(|rule| rule.locales());
-                    let tokens = process_tokens(
-                        *position,
-                        self.tokenizer.tokenize_with_allow_list(text.as_str(), locales),
-                    )
-                    .take_while(|(p, _)| *p < self.max_positions_per_attributes);
-
-                    for (index, token) in tokens {
-                        // keep a word only if it is not empty and fit in a LMDB key.
-                        let token = token.lemma().trim();
-                        if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
-                            *position = index;
-                            if let Ok(position) = (*position).try_into() {
-                                token_fn(field_name, field_id, position, token)?;
-                            }
-                        }
-                    }
-
-                    Ok(())
+                    self.tokenizer.tokenize_with_allow_list(text.as_str(), locales)
+                }
+                _ => return Ok(()),
+            };
+
+            // create an iterator of tokens with their positions.
+            let tokens = process_tokens(*position, tokens)
+                .take_while(|(p, _)| *p < self.max_positions_per_attributes);
+
+            for (index, token) in tokens {
+                // keep a word only if it is not empty and fits in an LMDB key.
+                let token = token.lemma().trim();
+                if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
+                    *position = index;
+                    if let Ok(position) = (*position).try_into() {
+                        token_fn(field_name, field_id, position, token)?;
+                    }
                 }
-                _ => Ok(()),
             }
+
+            Ok(())
         };

         // if the current field is searchable or contains a searchable attribute
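
A quick illustration of the behavior change: scalar values are now stringified
and run through the tokenizer instead of being emitted as a single raw token.
`25.99` is therefore indexed as the two words `25` and `99` (the `.` acts as a
separator) and booleans become the words `true`/`false`, which matches the
snapshot updates above. Below is a minimal, standalone sketch of that
behavior. It is not code from this patch; it assumes the `charabia` crate's
`Tokenize` trait and `Token::is_word()` (the tokenizer crate milli builds on),
and the `main` harness is purely illustrative.

    // Standalone sketch (assumed charabia API; not part of this patch).
    use charabia::Tokenize;
    use serde_json::Value;

    fn main() {
        for value in [Value::from(25.99), Value::from(true)] {
            // Mirror the new code path: stringify the scalar, then tokenize it.
            let text = match &value {
                Value::Number(n) => n.to_string(),
                Value::Bool(b) => b.to_string(),
                _ => continue,
            };
            // Keep only word tokens, as milli's process_tokens does;
            // separators such as "." are dropped.
            let words: Vec<String> = text
                .as_str()
                .tokenize()
                .filter(|token| token.is_word())
                .map(|token| token.lemma().to_string())
                .collect();
            println!("{value} -> {words:?}");
            // 25.99 -> ["25", "99"]
            // true  -> ["true"]
        }
    }

Note the `let text;` binding declared before the `match` in the patch: it
keeps the stringified number or boolean alive after the `match` ends, since
the token iterator returned by `tokenize` borrows from that string.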