Compare commits

...

2 Commits

Author SHA1 Message Date
ManyTheFish
ff4b3578bf Fix index_documents_check_exists_database 2024-11-20 09:47:02 +01:00
ManyTheFish
cb226079fa Use tokenizer on numbers and booleans 2024-11-20 08:28:24 +01:00
6 changed files with 143 additions and 78 deletions

View File

@ -2,7 +2,7 @@
source: crates/milli/src/update/index_documents/mod.rs
---
1 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, ]
2.2 [21, ]
2 [21, ]
36 [3, ]
37 [4, ]
38 [5, ]

View File

@ -1,15 +1,18 @@
---
source: crates/milli/src/update/index_documents/mod.rs
---
0 [1, ]
1 [2, ]
10.0 [1, ]
10 [1, ]
1344 [3, ]
2 [0, ]
25.99 [2, ]
3.5 [0, ]
25 [2, ]
3 [0, ]
4 [4, ]
42 [5, ]
456 [1, ]
5 [0, ]
99 [2, ]
adams [5, ]
adventure [1, ]
alice [2, ]

View File

@ -1,16 +1,19 @@
---
source: crates/milli/src/update/index_documents/mod.rs
---
0 [1, ]
1 [2, ]
10.0 [1, ]
10 [1, ]
1344 [3, ]
1813 [0, ]
2 [0, ]
25.99 [2, ]
3.5 [0, ]
25 [2, ]
3 [0, ]
4 [4, ]
42 [5, ]
456 [1, ]
5 [0, ]
99 [2, ]
adams [5, ]
adventure [1, ]
alice [2, ]

View File

@ -24,25 +24,46 @@ pub fn extract_document_facets<'doc>(
};
// if the current field is searchable or contains a searchable attribute
if perm_json_p::select_field(field_name, Some(attributes_to_extract), &[]) {
let selection = perm_json_p::select_field(field_name, Some(attributes_to_extract), &[]);
if selection != perm_json_p::Selection::Skip {
// parse json.
match serde_json::value::to_value(value).map_err(InternalError::SerdeJson)? {
Value::Object(object) => perm_json_p::seek_leaf_values_in_object(
&object,
Some(attributes_to_extract),
&[], // skip no attributes
field_name,
perm_json_p::Depth::OnBaseKey,
&mut tokenize_field,
)?,
Value::Array(array) => perm_json_p::seek_leaf_values_in_array(
&array,
Some(attributes_to_extract),
&[], // skip no attributes
field_name,
perm_json_p::Depth::OnBaseKey,
&mut tokenize_field,
)?,
Value::Object(object) => {
perm_json_p::seek_leaf_values_in_object(
&object,
Some(attributes_to_extract),
&[], // skip no attributes
field_name,
perm_json_p::Depth::OnBaseKey,
&mut tokenize_field,
)?;
if selection == perm_json_p::Selection::Select {
tokenize_field(
field_name,
perm_json_p::Depth::OnBaseKey,
&Value::Object(object),
)?;
}
}
Value::Array(array) => {
perm_json_p::seek_leaf_values_in_array(
&array,
Some(attributes_to_extract),
&[], // skip no attributes
field_name,
perm_json_p::Depth::OnBaseKey,
&mut tokenize_field,
)?;
if selection == perm_json_p::Selection::Select {
tokenize_field(
field_name,
perm_json_p::Depth::OnBaseKey,
&Value::Array(array),
)?;
}
}
value => tokenize_field(field_name, perm_json_p::Depth::OnBaseKey, &value)?,
}
}

View File

@ -88,25 +88,37 @@ pub mod perm_json_p {
// here if the user only specified `doggo` we need to iterate in all the fields of `doggo`
// so we check the contained_in on both side
let should_continue = select_field(&base_key, selectors, skip_selectors);
if should_continue {
let selection = select_field(&base_key, selectors, skip_selectors);
if selection != Selection::Skip {
match value {
Value::Object(object) => seek_leaf_values_in_object(
object,
selectors,
skip_selectors,
&base_key,
Depth::OnBaseKey,
seeker,
),
Value::Array(array) => seek_leaf_values_in_array(
array,
selectors,
skip_selectors,
&base_key,
Depth::OnBaseKey,
seeker,
),
Value::Object(object) => {
if selection == Selection::Select {
seeker(&base_key, Depth::OnBaseKey, value)?;
}
seek_leaf_values_in_object(
object,
selectors,
skip_selectors,
&base_key,
Depth::OnBaseKey,
seeker,
)
}
Value::Array(array) => {
if selection == Selection::Select {
seeker(&base_key, Depth::OnBaseKey, value)?;
}
seek_leaf_values_in_array(
array,
selectors,
skip_selectors,
&base_key,
Depth::OnBaseKey,
seeker,
)
}
value => seeker(&base_key, Depth::OnBaseKey, value),
}?;
}
@ -156,13 +168,37 @@ pub mod perm_json_p {
field_name: &str,
selectors: Option<&[&str]>,
skip_selectors: &[&str],
) -> bool {
selectors.map_or(true, |selectors| {
selectors.iter().any(|selector| {
contained_in(selector, field_name) || contained_in(field_name, selector)
})
}) && !skip_selectors.iter().any(|skip_selector| {
) -> Selection {
if skip_selectors.iter().any(|skip_selector| {
contained_in(skip_selector, field_name) || contained_in(field_name, skip_selector)
})
}) {
Selection::Skip
} else if let Some(selectors) = selectors {
selectors
.iter()
.filter_map(|selector| {
if contained_in(field_name, selector) {
Some(Selection::Select)
} else if contained_in(selector, field_name) {
Some(Selection::Parent)
} else {
None
}
})
.next()
.unwrap_or(Selection::Skip)
} else {
Selection::Select
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Selection {
/// The field is a parent of the of a nested field that must be selected
Parent,
/// The field must be selected
Select,
/// The field must be skipped
Skip,
}
}

View File

@ -5,7 +5,7 @@ use serde_json::Value;
use crate::update::new::document::Document;
use crate::update::new::extract::perm_json_p::{
seek_leaf_values_in_array, seek_leaf_values_in_object, select_field, Depth,
seek_leaf_values_in_array, seek_leaf_values_in_object, select_field, Depth, Selection,
};
use crate::{
FieldId, GlobalFieldsIdsMap, InternalError, LocalizedAttributesRule, Result, UserError,
@ -48,47 +48,49 @@ impl<'a> DocumentTokenizer<'a> {
return Ok(());
}
match value {
let text;
let tokens = match value {
Value::Number(n) => {
let token = n.to_string();
if let Ok(position) = (*position).try_into() {
token_fn(field_name, field_id, position, token.as_str())?;
}
Ok(())
text = n.to_string();
self.tokenizer.tokenize(text.as_str())
}
Value::Bool(b) => {
text = b.to_string();
self.tokenizer.tokenize(text.as_str())
}
Value::String(text) => {
// create an iterator of token with their positions.
let locales = self
.localized_attributes_rules
.iter()
.find(|rule| rule.match_str(field_name))
.map(|rule| rule.locales());
let tokens = process_tokens(
*position,
self.tokenizer.tokenize_with_allow_list(text.as_str(), locales),
)
.take_while(|(p, _)| *p < self.max_positions_per_attributes);
for (index, token) in tokens {
// keep a word only if it is not empty and fit in a LMDB key.
let token = token.lemma().trim();
if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
*position = index;
if let Ok(position) = (*position).try_into() {
token_fn(field_name, field_id, position, token)?;
}
}
}
Ok(())
self.tokenizer.tokenize_with_allow_list(text.as_str(), locales)
}
_ => return Ok(()),
};
// create an iterator of token with their positions.
let tokens = process_tokens(*position, tokens)
.take_while(|(p, _)| *p < self.max_positions_per_attributes);
for (index, token) in tokens {
// keep a word only if it is not empty and fit in a LMDB key.
let token = token.lemma().trim();
if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
*position = index;
if let Ok(position) = (*position).try_into() {
token_fn(field_name, field_id, position, token)?;
}
}
_ => Ok(()),
}
Ok(())
};
// if the current field is searchable or contains a searchable attribute
if select_field(field_name, self.attribute_to_extract, self.attribute_to_skip) {
if select_field(field_name, self.attribute_to_extract, self.attribute_to_skip)
!= Selection::Skip
{
// parse json.
match serde_json::to_value(value).map_err(InternalError::SerdeJson)? {
Value::Object(object) => seek_leaf_values_in_object(