Compare commits

...

2 Commits

Author SHA1 Message Date
ManyTheFish
ff4b3578bf Fix index_documents_check_exists_database 2024-11-20 09:47:02 +01:00
ManyTheFish
cb226079fa Use tokenizer on numbers and booleans 2024-11-20 08:28:24 +01:00
6 changed files with 143 additions and 78 deletions

View File

@ -2,7 +2,7 @@
source: crates/milli/src/update/index_documents/mod.rs source: crates/milli/src/update/index_documents/mod.rs
--- ---
1 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, ] 1 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, ]
2.2 [21, ] 2 [21, ]
36 [3, ] 36 [3, ]
37 [4, ] 37 [4, ]
38 [5, ] 38 [5, ]

View File

@ -1,15 +1,18 @@
--- ---
source: crates/milli/src/update/index_documents/mod.rs source: crates/milli/src/update/index_documents/mod.rs
--- ---
0 [1, ]
1 [2, ] 1 [2, ]
10.0 [1, ] 10 [1, ]
1344 [3, ] 1344 [3, ]
2 [0, ] 2 [0, ]
25.99 [2, ] 25 [2, ]
3.5 [0, ] 3 [0, ]
4 [4, ] 4 [4, ]
42 [5, ] 42 [5, ]
456 [1, ] 456 [1, ]
5 [0, ]
99 [2, ]
adams [5, ] adams [5, ]
adventure [1, ] adventure [1, ]
alice [2, ] alice [2, ]

View File

@ -1,16 +1,19 @@
--- ---
source: crates/milli/src/update/index_documents/mod.rs source: crates/milli/src/update/index_documents/mod.rs
--- ---
0 [1, ]
1 [2, ] 1 [2, ]
10.0 [1, ] 10 [1, ]
1344 [3, ] 1344 [3, ]
1813 [0, ] 1813 [0, ]
2 [0, ] 2 [0, ]
25.99 [2, ] 25 [2, ]
3.5 [0, ] 3 [0, ]
4 [4, ] 4 [4, ]
42 [5, ] 42 [5, ]
456 [1, ] 456 [1, ]
5 [0, ]
99 [2, ]
adams [5, ] adams [5, ]
adventure [1, ] adventure [1, ]
alice [2, ] alice [2, ]

View File

@ -24,25 +24,46 @@ pub fn extract_document_facets<'doc>(
}; };
// if the current field is searchable or contains a searchable attribute // if the current field is searchable or contains a searchable attribute
if perm_json_p::select_field(field_name, Some(attributes_to_extract), &[]) { let selection = perm_json_p::select_field(field_name, Some(attributes_to_extract), &[]);
if selection != perm_json_p::Selection::Skip {
// parse json. // parse json.
match serde_json::value::to_value(value).map_err(InternalError::SerdeJson)? { match serde_json::value::to_value(value).map_err(InternalError::SerdeJson)? {
Value::Object(object) => perm_json_p::seek_leaf_values_in_object( Value::Object(object) => {
&object, perm_json_p::seek_leaf_values_in_object(
Some(attributes_to_extract), &object,
&[], // skip no attributes Some(attributes_to_extract),
field_name, &[], // skip no attributes
perm_json_p::Depth::OnBaseKey, field_name,
&mut tokenize_field, perm_json_p::Depth::OnBaseKey,
)?, &mut tokenize_field,
Value::Array(array) => perm_json_p::seek_leaf_values_in_array( )?;
&array,
Some(attributes_to_extract), if selection == perm_json_p::Selection::Select {
&[], // skip no attributes tokenize_field(
field_name, field_name,
perm_json_p::Depth::OnBaseKey, perm_json_p::Depth::OnBaseKey,
&mut tokenize_field, &Value::Object(object),
)?, )?;
}
}
Value::Array(array) => {
perm_json_p::seek_leaf_values_in_array(
&array,
Some(attributes_to_extract),
&[], // skip no attributes
field_name,
perm_json_p::Depth::OnBaseKey,
&mut tokenize_field,
)?;
if selection == perm_json_p::Selection::Select {
tokenize_field(
field_name,
perm_json_p::Depth::OnBaseKey,
&Value::Array(array),
)?;
}
}
value => tokenize_field(field_name, perm_json_p::Depth::OnBaseKey, &value)?, value => tokenize_field(field_name, perm_json_p::Depth::OnBaseKey, &value)?,
} }
} }

View File

@ -88,25 +88,37 @@ pub mod perm_json_p {
// here if the user only specified `doggo` we need to iterate in all the fields of `doggo` // here if the user only specified `doggo` we need to iterate in all the fields of `doggo`
// so we check the contained_in on both sides // so we check the contained_in on both sides
let should_continue = select_field(&base_key, selectors, skip_selectors); let selection = select_field(&base_key, selectors, skip_selectors);
if should_continue { if selection != Selection::Skip {
match value { match value {
Value::Object(object) => seek_leaf_values_in_object( Value::Object(object) => {
object, if selection == Selection::Select {
selectors, seeker(&base_key, Depth::OnBaseKey, value)?;
skip_selectors, }
&base_key,
Depth::OnBaseKey, seek_leaf_values_in_object(
seeker, object,
), selectors,
Value::Array(array) => seek_leaf_values_in_array( skip_selectors,
array, &base_key,
selectors, Depth::OnBaseKey,
skip_selectors, seeker,
&base_key, )
Depth::OnBaseKey, }
seeker, Value::Array(array) => {
), if selection == Selection::Select {
seeker(&base_key, Depth::OnBaseKey, value)?;
}
seek_leaf_values_in_array(
array,
selectors,
skip_selectors,
&base_key,
Depth::OnBaseKey,
seeker,
)
}
value => seeker(&base_key, Depth::OnBaseKey, value), value => seeker(&base_key, Depth::OnBaseKey, value),
}?; }?;
} }
@ -156,13 +168,37 @@ pub mod perm_json_p {
field_name: &str, field_name: &str,
selectors: Option<&[&str]>, selectors: Option<&[&str]>,
skip_selectors: &[&str], skip_selectors: &[&str],
) -> bool { ) -> Selection {
selectors.map_or(true, |selectors| { if skip_selectors.iter().any(|skip_selector| {
selectors.iter().any(|selector| {
contained_in(selector, field_name) || contained_in(field_name, selector)
})
}) && !skip_selectors.iter().any(|skip_selector| {
contained_in(skip_selector, field_name) || contained_in(field_name, skip_selector) contained_in(skip_selector, field_name) || contained_in(field_name, skip_selector)
}) }) {
Selection::Skip
} else if let Some(selectors) = selectors {
selectors
.iter()
.filter_map(|selector| {
if contained_in(field_name, selector) {
Some(Selection::Select)
} else if contained_in(selector, field_name) {
Some(Selection::Parent)
} else {
None
}
})
.next()
.unwrap_or(Selection::Skip)
} else {
Selection::Select
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Selection {
/// The field is a parent of a nested field that must be selected
Parent,
/// The field must be selected
Select,
/// The field must be skipped
Skip,
} }
} }

View File

@ -5,7 +5,7 @@ use serde_json::Value;
use crate::update::new::document::Document; use crate::update::new::document::Document;
use crate::update::new::extract::perm_json_p::{ use crate::update::new::extract::perm_json_p::{
seek_leaf_values_in_array, seek_leaf_values_in_object, select_field, Depth, seek_leaf_values_in_array, seek_leaf_values_in_object, select_field, Depth, Selection,
}; };
use crate::{ use crate::{
FieldId, GlobalFieldsIdsMap, InternalError, LocalizedAttributesRule, Result, UserError, FieldId, GlobalFieldsIdsMap, InternalError, LocalizedAttributesRule, Result, UserError,
@ -48,47 +48,49 @@ impl<'a> DocumentTokenizer<'a> {
return Ok(()); return Ok(());
} }
match value { let text;
let tokens = match value {
Value::Number(n) => { Value::Number(n) => {
let token = n.to_string(); text = n.to_string();
if let Ok(position) = (*position).try_into() { self.tokenizer.tokenize(text.as_str())
token_fn(field_name, field_id, position, token.as_str())?; }
} Value::Bool(b) => {
text = b.to_string();
Ok(()) self.tokenizer.tokenize(text.as_str())
} }
Value::String(text) => { Value::String(text) => {
// create an iterator of token with their positions.
let locales = self let locales = self
.localized_attributes_rules .localized_attributes_rules
.iter() .iter()
.find(|rule| rule.match_str(field_name)) .find(|rule| rule.match_str(field_name))
.map(|rule| rule.locales()); .map(|rule| rule.locales());
let tokens = process_tokens( self.tokenizer.tokenize_with_allow_list(text.as_str(), locales)
*position, }
self.tokenizer.tokenize_with_allow_list(text.as_str(), locales), _ => return Ok(()),
) };
.take_while(|(p, _)| *p < self.max_positions_per_attributes);
// create an iterator of token with their positions.
for (index, token) in tokens { let tokens = process_tokens(*position, tokens)
// keep a word only if it is not empty and fit in a LMDB key. .take_while(|(p, _)| *p < self.max_positions_per_attributes);
let token = token.lemma().trim();
if !token.is_empty() && token.len() <= MAX_WORD_LENGTH { for (index, token) in tokens {
*position = index; // keep a word only if it is not empty and fit in a LMDB key.
if let Ok(position) = (*position).try_into() { let token = token.lemma().trim();
token_fn(field_name, field_id, position, token)?; if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
} *position = index;
} if let Ok(position) = (*position).try_into() {
} token_fn(field_name, field_id, position, token)?;
}
Ok(())
} }
_ => Ok(()),
} }
Ok(())
}; };
// if the current field is searchable or contains a searchable attribute // if the current field is searchable or contains a searchable attribute
if select_field(field_name, self.attribute_to_extract, self.attribute_to_skip) { if select_field(field_name, self.attribute_to_extract, self.attribute_to_skip)
!= Selection::Skip
{
// parse json. // parse json.
match serde_json::to_value(value).map_err(InternalError::SerdeJson)? { match serde_json::to_value(value).map_err(InternalError::SerdeJson)? {
Value::Object(object) => seek_leaf_values_in_object( Value::Object(object) => seek_leaf_values_in_object(