Use tokenizer on numbers and booleans

This commit is contained in:
ManyTheFish 2024-11-20 08:28:24 +01:00 committed by Clément Renault
parent 8049df125b
commit ba7f091db3
No known key found for this signature in database
GPG Key ID: F250A4C4E3AE5F5F
4 changed files with 40 additions and 34 deletions

View File

@ -2,7 +2,7 @@
source: crates/milli/src/update/index_documents/mod.rs
---
1 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, ]
2.2 [21, ]
2 [21, ]
36 [3, ]
37 [4, ]
38 [5, ]

View File

@ -1,15 +1,18 @@
---
source: crates/milli/src/update/index_documents/mod.rs
---
0 [1, ]
1 [2, ]
10.0 [1, ]
10 [1, ]
1344 [3, ]
2 [0, ]
25.99 [2, ]
3.5 [0, ]
25 [2, ]
3 [0, ]
4 [4, ]
42 [5, ]
456 [1, ]
5 [0, ]
99 [2, ]
adams [5, ]
adventure [1, ]
alice [2, ]

View File

@ -1,16 +1,19 @@
---
source: crates/milli/src/update/index_documents/mod.rs
---
0 [1, ]
1 [2, ]
10.0 [1, ]
10 [1, ]
1344 [3, ]
1813 [0, ]
2 [0, ]
25.99 [2, ]
3.5 [0, ]
25 [2, ]
3 [0, ]
4 [4, ]
42 [5, ]
456 [1, ]
5 [0, ]
99 [2, ]
adams [5, ]
adventure [1, ]
alice [2, ]

View File

@ -48,43 +48,43 @@ impl<'a> DocumentTokenizer<'a> {
return Ok(());
}
match value {
let text;
let tokens = match value {
Value::Number(n) => {
let token = n.to_string();
if let Ok(position) = (*position).try_into() {
token_fn(field_name, field_id, position, token.as_str())?;
}
Ok(())
text = n.to_string();
self.tokenizer.tokenize(text.as_str())
}
Value::Bool(b) => {
text = b.to_string();
self.tokenizer.tokenize(text.as_str())
}
Value::String(text) => {
// create an iterator of token with their positions.
let locales = self
.localized_attributes_rules
.iter()
.find(|rule| rule.match_str(field_name))
.map(|rule| rule.locales());
let tokens = process_tokens(
*position,
self.tokenizer.tokenize_with_allow_list(text.as_str(), locales),
)
.take_while(|(p, _)| *p < self.max_positions_per_attributes);
for (index, token) in tokens {
// keep a word only if it is not empty and fit in a LMDB key.
let token = token.lemma().trim();
if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
*position = index;
if let Ok(position) = (*position).try_into() {
token_fn(field_name, field_id, position, token)?;
}
}
}
Ok(())
self.tokenizer.tokenize_with_allow_list(text.as_str(), locales)
}
_ => return Ok(()),
};
// create an iterator of token with their positions.
let tokens = process_tokens(*position, tokens)
.take_while(|(p, _)| *p < self.max_positions_per_attributes);
for (index, token) in tokens {
// keep a word only if it is not empty and fit in a LMDB key.
let token = token.lemma().trim();
if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
*position = index;
if let Ok(position) = (*position).try_into() {
token_fn(field_name, field_id, position, token)?;
}
}
_ => Ok(()),
}
Ok(())
};
// if the current field is searchable or contains a searchable attribute