Use tokenizer on numbers and booleans

This commit is contained in:
ManyTheFish 2024-11-20 08:28:24 +01:00
parent 6f00c1dfb8
commit cb226079fa
4 changed files with 40 additions and 34 deletions

View File

@ -2,7 +2,7 @@
source: crates/milli/src/update/index_documents/mod.rs source: crates/milli/src/update/index_documents/mod.rs
--- ---
1 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, ] 1 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, ]
2.2 [21, ] 2 [21, ]
36 [3, ] 36 [3, ]
37 [4, ] 37 [4, ]
38 [5, ] 38 [5, ]

View File

@ -1,15 +1,18 @@
--- ---
source: crates/milli/src/update/index_documents/mod.rs source: crates/milli/src/update/index_documents/mod.rs
--- ---
0 [1, ]
1 [2, ] 1 [2, ]
10.0 [1, ] 10 [1, ]
1344 [3, ] 1344 [3, ]
2 [0, ] 2 [0, ]
25.99 [2, ] 25 [2, ]
3.5 [0, ] 3 [0, ]
4 [4, ] 4 [4, ]
42 [5, ] 42 [5, ]
456 [1, ] 456 [1, ]
5 [0, ]
99 [2, ]
adams [5, ] adams [5, ]
adventure [1, ] adventure [1, ]
alice [2, ] alice [2, ]

View File

@ -1,16 +1,19 @@
--- ---
source: crates/milli/src/update/index_documents/mod.rs source: crates/milli/src/update/index_documents/mod.rs
--- ---
0 [1, ]
1 [2, ] 1 [2, ]
10.0 [1, ] 10 [1, ]
1344 [3, ] 1344 [3, ]
1813 [0, ] 1813 [0, ]
2 [0, ] 2 [0, ]
25.99 [2, ] 25 [2, ]
3.5 [0, ] 3 [0, ]
4 [4, ] 4 [4, ]
42 [5, ] 42 [5, ]
456 [1, ] 456 [1, ]
5 [0, ]
99 [2, ]
adams [5, ] adams [5, ]
adventure [1, ] adventure [1, ]
alice [2, ] alice [2, ]

View File

@ -48,43 +48,43 @@ impl<'a> DocumentTokenizer<'a> {
return Ok(()); return Ok(());
} }
match value { let text;
let tokens = match value {
Value::Number(n) => { Value::Number(n) => {
let token = n.to_string(); text = n.to_string();
if let Ok(position) = (*position).try_into() { self.tokenizer.tokenize(text.as_str())
token_fn(field_name, field_id, position, token.as_str())?; }
} Value::Bool(b) => {
text = b.to_string();
Ok(()) self.tokenizer.tokenize(text.as_str())
} }
Value::String(text) => { Value::String(text) => {
// create an iterator of token with their positions.
let locales = self let locales = self
.localized_attributes_rules .localized_attributes_rules
.iter() .iter()
.find(|rule| rule.match_str(field_name)) .find(|rule| rule.match_str(field_name))
.map(|rule| rule.locales()); .map(|rule| rule.locales());
let tokens = process_tokens( self.tokenizer.tokenize_with_allow_list(text.as_str(), locales)
*position, }
self.tokenizer.tokenize_with_allow_list(text.as_str(), locales), _ => return Ok(()),
) };
.take_while(|(p, _)| *p < self.max_positions_per_attributes);
// create an iterator of token with their positions.
for (index, token) in tokens { let tokens = process_tokens(*position, tokens)
// keep a word only if it is not empty and fit in a LMDB key. .take_while(|(p, _)| *p < self.max_positions_per_attributes);
let token = token.lemma().trim();
if !token.is_empty() && token.len() <= MAX_WORD_LENGTH { for (index, token) in tokens {
*position = index; // keep a word only if it is not empty and fit in a LMDB key.
if let Ok(position) = (*position).try_into() { let token = token.lemma().trim();
token_fn(field_name, field_id, position, token)?; if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
} *position = index;
} if let Ok(position) = (*position).try_into() {
} token_fn(field_name, field_id, position, token)?;
}
Ok(())
} }
_ => Ok(()),
} }
Ok(())
}; };
// if the current field is searchable or contains a searchable attribute // if the current field is searchable or contains a searchable attribute