Use tokenizer on numbers and booleans

2024-11-22 10:07:40 +08:00 · 2024-11-20 08:28:24 +01:00 · 2024-11-20 08:28:24 +01:00 · cb226079fa
commit cb226079fa
parent 6f00c1dfb8
4 changed files with 40 additions and 34 deletions
--- a/crates/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/word_docids.snap
+++ b/crates/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/word_docids.snap
@@ -2,7 +2,7 @@
 source: crates/milli/src/update/index_documents/mod.rs
 ---
 1                [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, ]
-2.2              [21, ]
+2                [21, ]
 36               [3, ]
 37               [4, ]
 38               [5, ]
--- a/crates/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/initial/word_docids.snap
+++ b/crates/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/initial/word_docids.snap
@@ -1,15 +1,18 @@
 ---
 source: crates/milli/src/update/index_documents/mod.rs
 ---
+0                [1, ]
 1                [2, ]
-10.0             [1, ]
+10               [1, ]
 1344             [3, ]
 2                [0, ]
-25.99            [2, ]
-3.5              [0, ]
+25               [2, ]
+3                [0, ]
 4                [4, ]
 42               [5, ]
 456              [1, ]
+5                [0, ]
+99               [2, ]
 adams            [5, ]
 adventure        [1, ]
 alice            [2, ]
--- a/crates/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/updated/word_docids.snap
+++ b/crates/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/updated/word_docids.snap
@@ -1,16 +1,19 @@
 ---
 source: crates/milli/src/update/index_documents/mod.rs
 ---
+0                [1, ]
 1                [2, ]
-10.0             [1, ]
+10               [1, ]
 1344             [3, ]
 1813             [0, ]
 2                [0, ]
-25.99            [2, ]
-3.5              [0, ]
+25               [2, ]
+3                [0, ]
 4                [4, ]
 42               [5, ]
 456              [1, ]
+5                [0, ]
+99               [2, ]
 adams            [5, ]
 adventure        [1, ]
 alice            [2, ]
--- a/crates/milli/src/update/new/extract/searchable/tokenize_document.rs
+++ b/crates/milli/src/update/new/extract/searchable/tokenize_document.rs
@@ -48,43 +48,43 @@ impl<'a> DocumentTokenizer<'a> {
                    return Ok(());
                }

-                match value {
+                let text;
+                let tokens = match value {
                    Value::Number(n) => {
-                        let token = n.to_string();
-                        if let Ok(position) = (*position).try_into() {
-                            token_fn(field_name, field_id, position, token.as_str())?;
-                        }
-
-                        Ok(())
+                        text = n.to_string();
+                        self.tokenizer.tokenize(text.as_str())
+                    }
+                    Value::Bool(b) => {
+                        text = b.to_string();
+                        self.tokenizer.tokenize(text.as_str())
                    }
                    Value::String(text) => {
-                        // create an iterator of token with their positions.
                        let locales = self
                            .localized_attributes_rules
                            .iter()
                            .find(|rule| rule.match_str(field_name))
                            .map(|rule| rule.locales());
-                        let tokens = process_tokens(
-                            *position,
-                            self.tokenizer.tokenize_with_allow_list(text.as_str(), locales),
-                        )
-                        .take_while(|(p, _)| *p < self.max_positions_per_attributes);
-
-                        for (index, token) in tokens {
-                            // keep a word only if it is not empty and fit in a LMDB key.
-                            let token = token.lemma().trim();
-                            if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
-                                *position = index;
-                                if let Ok(position) = (*position).try_into() {
-                                    token_fn(field_name, field_id, position, token)?;
-                                }
-                            }
-                        }
-
-                        Ok(())
+                        self.tokenizer.tokenize_with_allow_list(text.as_str(), locales)
+                    }
+                    _ => return Ok(()),
+                };
+
+                // create an iterator of token with their positions.
+                let tokens = process_tokens(*position, tokens)
+                    .take_while(|(p, _)| *p < self.max_positions_per_attributes);
+
+                for (index, token) in tokens {
+                    // keep a word only if it is not empty and fit in a LMDB key.
+                    let token = token.lemma().trim();
+                    if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
+                        *position = index;
+                        if let Ok(position) = (*position).try_into() {
+                            token_fn(field_name, field_id, position, token)?;
+                        }
                    }
-                    _ => Ok(()),
                }
+
+                Ok(())
            };

            // if the current field is searchable or contains a searchable attribute