Introduce the facet extractors

Clément Renault 2024-09-04 17:03:09 +02:00
parent 1d59c19cd2
commit 19d937ab21
9 changed files with 443 additions and 139 deletions

View File

@@ -52,6 +52,7 @@ impl Deletion {
         self.docid
     }
 
+    // TODO shouldn't we use the one in self?
     pub fn current<'a>(
         &self,
         rtxn: &'a RoTxn,

View File

@@ -1,5 +1,5 @@
-use std::num::NonZeroUsize;
 use std::mem;
+use std::num::NonZeroUsize;
 
 use grenad::{MergeFunction, Sorter};
 use lru::LruCache;
@@ -10,16 +10,16 @@ use crate::update::del_add::{DelAdd, KvWriterDelAdd};
 use crate::CboRoaringBitmapCodec;
 
 #[derive(Debug)]
-pub struct CachedSorter<MF> {
+pub struct CboCachedSorter<MF> {
     cache: lru::LruCache<SmallVec<[u8; 20]>, DelAddRoaringBitmap>,
     sorter: Sorter<MF>,
     deladd_buffer: Vec<u8>,
     cbo_buffer: Vec<u8>,
 }
 
-impl<MF> CachedSorter<MF> {
+impl<MF> CboCachedSorter<MF> {
     pub fn new(cap: NonZeroUsize, sorter: Sorter<MF>) -> Self {
-        CachedSorter {
+        CboCachedSorter {
             cache: lru::LruCache::new(cap),
             sorter,
             deladd_buffer: Vec::new(),
@@ -28,7 +28,7 @@ impl<MF> CachedSorter<MF> {
         }
     }
 }
 
-impl<MF: MergeFunction> CachedSorter<MF> {
+impl<MF: MergeFunction> CboCachedSorter<MF> {
     pub fn insert_del_u32(&mut self, key: &[u8], n: u32) -> grenad::Result<(), MF::Error> {
         match self.cache.get_mut(key) {
             Some(DelAddRoaringBitmap { del, add: _ }) => {
@@ -194,4 +194,4 @@ impl DelAddRoaringBitmap {
     fn new_add_u32(n: u32) -> Self {
         DelAddRoaringBitmap { del: None, add: Some(RoaringBitmap::from([n])) }
     }
 }
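As a rough, self-contained illustration of the pattern behind `CboCachedSorter` (an in-memory cache of per-key deleted/added document ids that is eventually converted into a grenad sorter via `into_sorter`, as used by the extractors below), here is a simplified sketch using only `std` collections. The `CachedDelAdd` and `DelAdd` names are made up for this sketch; the real type bounds the cache with an `lru::LruCache`, stores `RoaringBitmap`s, and writes through a `Sorter`.

```rust
use std::collections::{BTreeSet, HashMap};

/// Simplified stand-in for `DelAddRoaringBitmap`: the deleted and added
/// document ids recorded for one key.
#[derive(Default)]
struct DelAdd {
    del: BTreeSet<u32>,
    add: BTreeSet<u32>,
}

/// Simplified stand-in for `CboCachedSorter`: entries accumulate in memory;
/// the real implementation spills them into a grenad `Sorter`.
#[derive(Default)]
struct CachedDelAdd {
    cache: HashMap<Vec<u8>, DelAdd>,
}

impl CachedDelAdd {
    fn insert_del_u32(&mut self, key: &[u8], docid: u32) {
        self.cache.entry(key.to_vec()).or_default().del.insert(docid);
    }

    fn insert_add_u32(&mut self, key: &[u8], docid: u32) {
        self.cache.entry(key.to_vec()).or_default().add.insert(docid);
    }
}

fn main() {
    let mut cache = CachedDelAdd::default();
    cache.insert_del_u32(b"color:blue", 42);
    cache.insert_add_u32(b"color:blue", 43);
    let entry = cache.cache.get(b"color:blue".as_slice()).unwrap();
    assert!(entry.del.contains(&42) && entry.add.contains(&43));
}
```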

View File

@@ -0,0 +1,271 @@
use std::collections::HashSet;
use std::fs::File;
use grenad::Merger;
use heed::RoTxn;
use rayon::iter::{IntoParallelIterator, ParallelIterator};
use serde_json::Value;
use super::cache::CboCachedSorter;
use super::perm_json_p;
use crate::facet::value_encoding::f64_into_bytes;
use crate::update::new::{DocumentChange, ItemsPool, KvReaderFieldId};
use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps};
use crate::{
normalize_facet, FieldId, GlobalFieldsIdsMap, Index, InternalError, Result, UserError,
MAX_FACET_VALUE_LENGTH,
};
pub trait FacetedExtractor {
fn run_extraction(
index: &Index,
fields_ids_map: &GlobalFieldsIdsMap,
indexer: GrenadParameters,
document_changes: impl IntoParallelIterator<Item = Result<DocumentChange>>,
) -> Result<Merger<File, MergeDeladdCboRoaringBitmaps>> {
let max_memory = indexer.max_memory_by_thread();
let rtxn = index.read_txn()?;
let attributes_to_extract = Self::attributes_to_extract(&rtxn, index)?;
let attributes_to_extract: Vec<_> =
attributes_to_extract.iter().map(|s| s.as_ref()).collect();
let context_pool = ItemsPool::new(|| {
Ok((
index.read_txn()?,
fields_ids_map.clone(),
Vec::new(),
CboCachedSorter::new(
// TODO use a better value
100.try_into().unwrap(),
create_sorter(
grenad::SortAlgorithm::Stable,
MergeDeladdCboRoaringBitmaps,
indexer.chunk_compression_type,
indexer.chunk_compression_level,
indexer.max_nb_chunks,
max_memory,
),
),
))
});
document_changes.into_par_iter().try_for_each(|document_change| {
context_pool.with(|(rtxn, fields_ids_map, buffer, cached_sorter)| {
Self::extract_document_change(
&*rtxn,
index,
buffer,
fields_ids_map,
&attributes_to_extract,
cached_sorter,
document_change?,
)
})
})?;
let mut builder = grenad::MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
for (_rtxn, _fields_ids_map, _buffer, cache) in context_pool.into_items() {
let sorter = cache.into_sorter()?;
let readers = sorter.into_reader_cursors()?;
builder.extend(readers);
}
Ok(builder.build())
}
fn extract_document_change(
rtxn: &RoTxn,
index: &Index,
buffer: &mut Vec<u8>,
fields_ids_map: &mut GlobalFieldsIdsMap,
attributes_to_extract: &[&str],
cached_sorter: &mut CboCachedSorter<MergeDeladdCboRoaringBitmaps>,
document_change: DocumentChange,
) -> Result<()> {
match document_change {
DocumentChange::Deletion(inner) => {
let mut facet_del_fn = |fid, value: &Value| -> Result<()> {
buffer.clear();
match Self::build_key(fid, value, buffer) {
// TODO manage errors
Some(key) => Ok(cached_sorter.insert_del_u32(&key, inner.docid()).unwrap()),
None => Ok(()),
}
};
extract_document_facets(
attributes_to_extract,
inner.current(rtxn, index)?.unwrap(),
fields_ids_map,
&mut facet_del_fn,
)
}
DocumentChange::Update(inner) => {
let mut facet_del_fn = |fid, value: &Value| -> Result<()> {
buffer.clear();
match Self::build_key(fid, value, buffer) {
// TODO manage errors
Some(key) => Ok(cached_sorter.insert_del_u32(&key, inner.docid()).unwrap()),
None => Ok(()),
}
};
extract_document_facets(
attributes_to_extract,
inner.current(rtxn, index)?.unwrap(),
fields_ids_map,
&mut facet_del_fn,
)?;
let mut facet_add_fn = |fid, value: &Value| -> Result<()> {
buffer.clear();
match Self::build_key(fid, value, buffer) {
// TODO manage errors
Some(key) => Ok(cached_sorter.insert_add_u32(&key, inner.docid()).unwrap()),
None => Ok(()),
}
};
extract_document_facets(
attributes_to_extract,
inner.new(),
fields_ids_map,
&mut facet_add_fn,
)
}
DocumentChange::Insertion(inner) => {
let mut facet_add_fn = |fid, value: &Value| -> Result<()> {
buffer.clear();
match Self::build_key(fid, value, buffer) {
// TODO manage errors
Some(key) => Ok(cached_sorter.insert_add_u32(&key, inner.docid()).unwrap()),
None => Ok(()),
}
};
extract_document_facets(
attributes_to_extract,
inner.new(),
fields_ids_map,
&mut facet_add_fn,
)
}
}
}
// TODO avoid owning the strings here.
fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result<HashSet<String>>;
fn build_key<'b>(field_id: FieldId, value: &Value, output: &'b mut Vec<u8>)
-> Option<&'b [u8]>;
}
pub struct FieldIdFacetNumberDocidsExtractor;
impl FacetedExtractor for FieldIdFacetNumberDocidsExtractor {
fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result<HashSet<String>> {
index.user_defined_faceted_fields(rtxn)
}
fn build_key<'b>(
field_id: FieldId,
value: &Value,
output: &'b mut Vec<u8>,
) -> Option<&'b [u8]> {
let number = value.as_number()?;
let n = number.as_f64()?;
let ordered = f64_into_bytes(n)?;
// fid - level - orderedf64 - orignalf64
output.extend_from_slice(&field_id.to_be_bytes());
output.push(1); // level 0
output.extend_from_slice(&ordered);
output.extend_from_slice(&n.to_be_bytes());
Some(&*output)
}
}
/// TODO It doesn't keep the original string in the value
pub struct FieldIdFacetStringDocidsExtractor;
impl FacetedExtractor for FieldIdFacetStringDocidsExtractor {
fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result<HashSet<String>> {
index.user_defined_faceted_fields(rtxn)
}
fn build_key<'b>(
field_id: FieldId,
value: &Value,
output: &'b mut Vec<u8>,
) -> Option<&'b [u8]> {
let string = value.as_str()?;
let normalize = normalize_facet(string);
let truncated = truncate_str(&normalize);
// fid - level - normalized string
output.extend_from_slice(&field_id.to_be_bytes());
output.push(1); // level 0
output.extend_from_slice(truncated.as_bytes());
Some(&*output)
}
}
pub fn extract_document_facets(
attributes_to_extract: &[&str],
obkv: &KvReaderFieldId,
field_id_map: &mut GlobalFieldsIdsMap,
facet_fn: &mut impl FnMut(FieldId, &Value) -> Result<()>,
) -> Result<()> {
let mut field_name = String::new();
for (field_id, field_bytes) in obkv {
let Some(field_name) = field_id_map.name(field_id).map(|s| {
field_name.clear();
field_name.push_str(s);
&field_name
}) else {
unreachable!("field id not found in field id map");
};
let mut tokenize_field = |name: &str, value: &Value| match field_id_map.id_or_insert(name) {
Some(field_id) => facet_fn(field_id, value),
None => Err(UserError::AttributeLimitReached.into()),
};
// if the current field is searchable or contains a searchable attribute
if perm_json_p::select_field(field_name, Some(attributes_to_extract), &[]) {
// parse json.
match serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)? {
Value::Object(object) => perm_json_p::seek_leaf_values_in_object(
&object,
Some(attributes_to_extract),
&[], // skip no attributes
field_name,
&mut tokenize_field,
)?,
Value::Array(array) => perm_json_p::seek_leaf_values_in_array(
&array,
Some(attributes_to_extract),
&[], // skip no attributes
field_name,
&mut tokenize_field,
)?,
value => tokenize_field(field_name, &value)?,
}
}
}
Ok(())
}
/// Truncates a string to the biggest valid LMDB key size.
fn truncate_str(s: &str) -> &str {
let index = s
.char_indices()
.map(|(idx, _)| idx)
.chain(std::iter::once(s.len()))
.take_while(|idx| idx <= &MAX_FACET_VALUE_LENGTH)
.last();
&s[..index.unwrap_or(0)]
}
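To make the number-key layout above concrete, here is a small, self-contained sketch of the `fid | level | ordered f64 | original f64` bytes that `FieldIdFacetNumberDocidsExtractor::build_key` assembles. The `ordered_f64_bytes` helper is a hypothetical stand-in for the crate's `f64_into_bytes` (written here as a common order-preserving encoding), not the actual implementation.

```rust
/// Hypothetical stand-in for `f64_into_bytes`: a common order-preserving
/// encoding (flip all bits for negatives, flip only the sign bit otherwise).
fn ordered_f64_bytes(n: f64) -> [u8; 8] {
    let bits = n.to_bits();
    let ordered = if n.is_sign_negative() { !bits } else { bits ^ (1 << 63) };
    ordered.to_be_bytes()
}

/// Mirrors the layout documented in `build_key` above:
/// field id (big endian) | level byte | ordered f64 | original f64.
fn facet_number_key(field_id: u16, n: f64, output: &mut Vec<u8>) -> &[u8] {
    output.clear();
    output.extend_from_slice(&field_id.to_be_bytes());
    output.push(1); // level byte, as written in the diff above
    output.extend_from_slice(&ordered_f64_bytes(n));
    output.extend_from_slice(&n.to_be_bytes());
    &*output
}

fn main() {
    let mut buffer = Vec::new();
    let key = facet_number_key(3, 42.5, &mut buffer);
    // 2 bytes of field id + 1 level byte + 8 ordered bytes + 8 raw bytes.
    assert_eq!(key.len(), 2 + 1 + 8 + 8);
    println!("{key:?}");
}
```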

View File

@@ -1,7 +1,114 @@
 mod cache;
+mod faceted;
 mod searchable;
 
+pub use faceted::FacetedExtractor;
 pub use searchable::{
     ExactWordDocidsExtractor, SearchableExtractor, WordDocidsExtractor, WordFidDocidsExtractor,
     WordPositionDocidsExtractor,
 };
+
+/// TODO move in permissive json pointer
+pub mod perm_json_p {
+    use serde_json::{Map, Value};
+
+    use crate::Result;
+
+    const SPLIT_SYMBOL: char = '.';
+
+    /// Returns `true` if the `selector` match the `key`.
+    ///
+    /// ```text
+    /// Example:
+    /// `animaux` match `animaux`
+    /// `animaux.chien` match `animaux`
+    /// `animaux.chien` match `animaux`
+    /// `animaux.chien.nom` match `animaux`
+    /// `animaux.chien.nom` match `animaux.chien`
+    /// -----------------------------------------
+    /// `animaux` doesn't match `animaux.chien`
+    /// `animaux.` doesn't match `animaux`
+    /// `animaux.ch` doesn't match `animaux.chien`
+    /// `animau` doesn't match `animaux`
+    /// ```
+    pub fn contained_in(selector: &str, key: &str) -> bool {
+        selector.starts_with(key)
+            && selector[key.len()..].chars().next().map(|c| c == SPLIT_SYMBOL).unwrap_or(true)
+    }
+
+    pub fn seek_leaf_values_in_object(
+        value: &Map<String, Value>,
+        selectors: Option<&[&str]>,
+        skip_selectors: &[&str],
+        base_key: &str,
+        seeker: &mut impl FnMut(&str, &Value) -> Result<()>,
+    ) -> Result<()> {
+        for (key, value) in value.iter() {
+            let base_key = if base_key.is_empty() {
+                key.to_string()
+            } else {
+                format!("{}{}{}", base_key, SPLIT_SYMBOL, key)
+            };
+
+            // here if the user only specified `doggo` we need to iterate in all the fields of `doggo`
+            // so we check the contained_in on both side
+            let should_continue = select_field(&base_key, selectors, skip_selectors);
+            if should_continue {
+                match value {
+                    Value::Object(object) => seek_leaf_values_in_object(
+                        object,
+                        selectors,
+                        skip_selectors,
+                        &base_key,
+                        seeker,
+                    ),
+                    Value::Array(array) => seek_leaf_values_in_array(
+                        array,
+                        selectors,
+                        skip_selectors,
+                        &base_key,
+                        seeker,
+                    ),
+                    value => seeker(&base_key, value),
+                }?;
+            }
+        }
+
+        Ok(())
+    }
+
+    pub fn seek_leaf_values_in_array(
+        values: &[Value],
+        selectors: Option<&[&str]>,
+        skip_selectors: &[&str],
+        base_key: &str,
+        seeker: &mut impl FnMut(&str, &Value) -> Result<()>,
+    ) -> Result<()> {
+        for value in values {
+            match value {
+                Value::Object(object) => {
+                    seek_leaf_values_in_object(object, selectors, skip_selectors, base_key, seeker)
+                }
+                Value::Array(array) => {
+                    seek_leaf_values_in_array(array, selectors, skip_selectors, base_key, seeker)
+                }
+                value => seeker(base_key, value),
+            }?;
+        }
+
+        Ok(())
+    }
+
+    pub fn select_field(
+        field_name: &str,
+        selectors: Option<&[&str]>,
+        skip_selectors: &[&str],
+    ) -> bool {
+        selectors.map_or(true, |selectors| {
+            selectors.iter().any(|selector| {
+                contained_in(selector, &field_name) || contained_in(&field_name, selector)
+            })
+        }) && !skip_selectors.iter().any(|skip_selector| {
+            contained_in(skip_selector, &field_name) || contained_in(&field_name, skip_selector)
+        })
+    }
+}
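The matching rules documented on `contained_in` can be checked in isolation; the snippet below copies the function as written above and asserts a few of the documented cases.

```rust
const SPLIT_SYMBOL: char = '.';

// Same implementation as `perm_json_p::contained_in` above.
fn contained_in(selector: &str, key: &str) -> bool {
    selector.starts_with(key)
        && selector[key.len()..].chars().next().map(|c| c == SPLIT_SYMBOL).unwrap_or(true)
}

fn main() {
    // Positive cases from the doc comment: a selector matches itself and any prefix
    // that ends on a `.` boundary.
    assert!(contained_in("animaux", "animaux"));
    assert!(contained_in("animaux.chien", "animaux"));
    assert!(contained_in("animaux.chien.nom", "animaux.chien"));
    // Negative cases: a shorter or truncated selector never matches.
    assert!(!contained_in("animaux", "animaux.chien"));
    assert!(!contained_in("animaux.ch", "animaux.chien"));
    assert!(!contained_in("animau", "animaux"));
}
```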

View File

@@ -19,6 +19,7 @@ impl SearchableExtractor for WordDocidsExtractor {
         index.exact_attributes(rtxn).map_err(Into::into)
     }
 
+    /// TODO write in an external Vec buffer
     fn build_key<'a>(_field_id: FieldId, _position: u16, word: &'a str) -> Cow<'a, [u8]> {
         Cow::Borrowed(word.as_bytes())
     }

View File

@@ -1,22 +1,22 @@
 mod extract_word_docids;
 mod tokenize_document;
 
+use std::borrow::Cow;
+use std::fs::File;
+
 pub use extract_word_docids::{
     ExactWordDocidsExtractor, WordDocidsExtractor, WordFidDocidsExtractor,
     WordPositionDocidsExtractor,
 };
-
-use std::borrow::Cow;
-use std::fs::File;
 use grenad::Merger;
 use heed::RoTxn;
 use rayon::iter::{IntoParallelIterator, ParallelIterator};
+use tokenize_document::{tokenizer_builder, DocumentTokenizer};
 
-use super::cache::CachedSorter;
+use super::cache::CboCachedSorter;
 use crate::update::new::{DocumentChange, ItemsPool};
 use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps};
 use crate::{FieldId, GlobalFieldsIdsMap, Index, Result, MAX_POSITION_PER_ATTRIBUTE};
-use tokenize_document::{tokenizer_builder, DocumentTokenizer};
 
 pub trait SearchableExtractor {
     fn run_extraction(
@@ -60,7 +60,7 @@ pub trait SearchableExtractor {
                 index.read_txn()?,
                 &document_tokenizer,
                 fields_ids_map.clone(),
-                CachedSorter::new(
+                CboCachedSorter::new(
                     // TODO use a better value
                     100.try_into().unwrap(),
                     create_sorter(
@@ -103,14 +103,16 @@ pub trait SearchableExtractor {
         index: &Index,
         document_tokenizer: &DocumentTokenizer,
         fields_ids_map: &mut GlobalFieldsIdsMap,
-        cached_sorter: &mut CachedSorter<MergeDeladdCboRoaringBitmaps>,
+        cached_sorter: &mut CboCachedSorter<MergeDeladdCboRoaringBitmaps>,
         document_change: DocumentChange,
     ) -> Result<()> {
         match document_change {
             DocumentChange::Deletion(inner) => {
                 let mut token_fn = |fid, pos: u16, word: &str| {
                     let key = Self::build_key(fid, pos, word);
+                    /// TODO manage the error
                     cached_sorter.insert_del_u32(&key, inner.docid()).unwrap();
+                    Ok(())
                 };
                 document_tokenizer.tokenize_document(
                     inner.current(rtxn, index)?.unwrap(),
@@ -121,7 +123,9 @@ pub trait SearchableExtractor {
             DocumentChange::Update(inner) => {
                 let mut token_fn = |fid, pos, word: &str| {
                     let key = Self::build_key(fid, pos, word);
+                    /// TODO manage the error
                     cached_sorter.insert_del_u32(&key, inner.docid()).unwrap();
+                    Ok(())
                 };
                 document_tokenizer.tokenize_document(
                     inner.current(rtxn, index)?.unwrap(),
@@ -131,14 +135,18 @@ pub trait SearchableExtractor {
                 let mut token_fn = |fid, pos, word: &str| {
                     let key = Self::build_key(fid, pos, word);
+                    /// TODO manage the error
                     cached_sorter.insert_add_u32(&key, inner.docid()).unwrap();
+                    Ok(())
                 };
                 document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?;
             }
             DocumentChange::Insertion(inner) => {
                 let mut token_fn = |fid, pos, word: &str| {
                     let key = Self::build_key(fid, pos, word);
+                    /// TODO manage the error
                     cached_sorter.insert_add_u32(&key, inner.docid()).unwrap();
+                    Ok(())
                 };
                 document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?;
             }
@@ -152,5 +160,5 @@ pub trait SearchableExtractor {
     fn attributes_to_skip<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result<Vec<&'a str>>;
 
-    fn build_key<'a>(field_id: FieldId, position: u16, word: &'a str) -> Cow<'a, [u8]>;
+    fn build_key(field_id: FieldId, position: u16, word: &str) -> Cow<'_, [u8]>;
 }

View File

@@ -1,13 +1,15 @@
 use std::collections::HashMap;
 
 use charabia::{SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder};
-use heed::RoTxn;
 use serde_json::Value;
 
+use crate::update::new::extract::perm_json_p::{
+    seek_leaf_values_in_array, seek_leaf_values_in_object, select_field,
+};
 use crate::update::new::KvReaderFieldId;
 use crate::{
-    FieldId, FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, LocalizedAttributesRule,
-    Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH,
+    FieldId, GlobalFieldsIdsMap, InternalError, LocalizedAttributesRule, Result, UserError,
+    MAX_WORD_LENGTH,
 };
 
 pub struct DocumentTokenizer<'a> {
@@ -23,7 +25,7 @@ impl<'a> DocumentTokenizer<'a> {
         &self,
         obkv: &KvReaderFieldId,
         field_id_map: &mut GlobalFieldsIdsMap,
-        token_fn: &mut impl FnMut(FieldId, u16, &str),
+        token_fn: &mut impl FnMut(FieldId, u16, &str) -> Result<()>,
     ) -> Result<()> {
         let mut field_position = HashMap::new();
         let mut field_name = String::new();
@@ -38,22 +40,23 @@ impl<'a> DocumentTokenizer<'a> {
             let mut tokenize_field = |name: &str, value: &Value| {
                 let Some(field_id) = field_id_map.id_or_insert(name) else {
-                    /// TODO: better error
-                    panic!("it's over 9000");
+                    return Err(UserError::AttributeLimitReached.into());
                 };
 
                 let position =
                     field_position.entry(field_id).and_modify(|counter| *counter += 8).or_insert(0);
                 if *position as u32 >= self.max_positions_per_attributes {
-                    return;
+                    return Ok(());
                 }
 
                 match value {
                     Value::Number(n) => {
                         let token = n.to_string();
                         if let Ok(position) = (*position).try_into() {
-                            token_fn(field_id, position, token.as_str());
+                            token_fn(field_id, position, token.as_str())?;
                         }
+
+                        Ok(())
                     }
                     Value::String(text) => {
                         // create an iterator of token with their positions.
@@ -74,41 +77,40 @@ impl<'a> DocumentTokenizer<'a> {
                             if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
                                 *position = index;
                                 if let Ok(position) = (*position).try_into() {
-                                    token_fn(field_id, position, token);
+                                    token_fn(field_id, position, token)?;
                                 }
                             }
                         }
+
+                        Ok(())
                     }
-                    _ => (),
+                    _ => Ok(()),
                 }
             };
 
             // if the current field is searchable or contains a searchable attribute
-            if perm_json_p::select_field(
-                &field_name,
-                self.attribute_to_extract.as_deref(),
-                self.attribute_to_skip,
-            ) {
+            if select_field(&field_name, self.attribute_to_extract, self.attribute_to_skip) {
                 // parse json.
                 match serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)? {
-                    Value::Object(object) => perm_json_p::seek_leaf_values_in_object(
+                    Value::Object(object) => seek_leaf_values_in_object(
                         &object,
-                        self.attribute_to_extract.as_deref(),
+                        self.attribute_to_extract,
                         self.attribute_to_skip,
                         &field_name,
                         &mut tokenize_field,
-                    ),
+                    )?,
-                    Value::Array(array) => perm_json_p::seek_leaf_values_in_array(
+                    Value::Array(array) => seek_leaf_values_in_array(
                         &array,
-                        self.attribute_to_extract.as_deref(),
+                        self.attribute_to_extract,
                         self.attribute_to_skip,
                         &field_name,
                         &mut tokenize_field,
-                    ),
+                    )?,
-                    value => tokenize_field(&field_name, &value),
+                    value => tokenize_field(&field_name, &value)?,
                 }
             }
         }
 
         Ok(())
     }
 }
@@ -167,105 +169,6 @@ pub fn tokenizer_builder<'a>(
     tokenizer_builder
 }
 
-/// TODO move in permissive json pointer
-mod perm_json_p {
-    use serde_json::{Map, Value};
-
-    const SPLIT_SYMBOL: char = '.';
-
-    /// Returns `true` if the `selector` match the `key`.
-    ///
-    /// ```text
-    /// Example:
-    /// `animaux` match `animaux`
-    /// `animaux.chien` match `animaux`
-    /// `animaux.chien` match `animaux`
-    /// `animaux.chien.nom` match `animaux`
-    /// `animaux.chien.nom` match `animaux.chien`
-    /// -----------------------------------------
-    /// `animaux` doesn't match `animaux.chien`
-    /// `animaux.` doesn't match `animaux`
-    /// `animaux.ch` doesn't match `animaux.chien`
-    /// `animau` doesn't match `animaux`
-    /// ```
-    pub fn contained_in(selector: &str, key: &str) -> bool {
-        selector.starts_with(key)
-            && selector[key.len()..].chars().next().map(|c| c == SPLIT_SYMBOL).unwrap_or(true)
-    }
-
-    pub fn seek_leaf_values_in_object(
-        value: &Map<String, Value>,
-        selectors: Option<&[&str]>,
-        skip_selectors: &[&str],
-        base_key: &str,
-        seeker: &mut impl FnMut(&str, &Value),
-    ) {
-        for (key, value) in value.iter() {
-            let base_key = if base_key.is_empty() {
-                key.to_string()
-            } else {
-                format!("{}{}{}", base_key, SPLIT_SYMBOL, key)
-            };
-
-            // here if the user only specified `doggo` we need to iterate in all the fields of `doggo`
-            // so we check the contained_in on both side
-            let should_continue = select_field(&base_key, selectors, skip_selectors);
-            if should_continue {
-                match value {
-                    Value::Object(object) => seek_leaf_values_in_object(
-                        object,
-                        selectors,
-                        skip_selectors,
-                        &base_key,
-                        seeker,
-                    ),
-                    Value::Array(array) => seek_leaf_values_in_array(
-                        array,
-                        selectors,
-                        skip_selectors,
-                        &base_key,
-                        seeker,
-                    ),
-                    value => seeker(&base_key, value),
-                }
-            }
-        }
-    }
-
-    pub fn seek_leaf_values_in_array(
-        values: &[Value],
-        selectors: Option<&[&str]>,
-        skip_selectors: &[&str],
-        base_key: &str,
-        seeker: &mut impl FnMut(&str, &Value),
-    ) {
-        for value in values {
-            match value {
-                Value::Object(object) => {
-                    seek_leaf_values_in_object(object, selectors, skip_selectors, base_key, seeker)
-                }
-                Value::Array(array) => {
-                    seek_leaf_values_in_array(array, selectors, skip_selectors, base_key, seeker)
-                }
-                value => seeker(base_key, value),
-            }
-        }
-    }
-
-    pub fn select_field(
-        field_name: &str,
-        selectors: Option<&[&str]>,
-        skip_selectors: &[&str],
-    ) -> bool {
-        selectors.map_or(true, |selectors| {
-            selectors.iter().any(|selector| {
-                contained_in(selector, &field_name) || contained_in(&field_name, selector)
-            })
-        }) && !skip_selectors.iter().any(|skip_selector| {
-            contained_in(skip_selector, &field_name) || contained_in(&field_name, skip_selector)
-        })
-    }
-}
-
 #[cfg(test)]
 mod test {
     use charabia::TokenizerBuilder;
@@ -274,6 +177,8 @@ mod test {
     use serde_json::json;
 
     use super::*;
+    use crate::FieldsIdsMap;
+
     #[test]
     fn test_tokenize_document() {
         let mut fields_ids_map = FieldsIdsMap::new();
@@ -329,6 +234,7 @@ mod test {
         document_tokenizer
             .tokenize_document(obkv, &mut global_fields_ids_map, &mut |fid, pos, word| {
                 words.insert([fid, pos], word.to_string());
+                Ok(())
             })
             .unwrap();

View File

@@ -127,6 +127,19 @@ where
                 &extractor_sender,
             )?;
 
+            // TODO THIS IS TOO MUCH
+            // Extract fieldid docid facet number
+            // Extract fieldid docid facet string
+            // Extract facetid string fst
+            // Extract fieldid facet isempty docids
+            // Extract fieldid facet isnull docids
+            // Extract fieldid facet exists docids
+
+            // TODO This is the normal system
+            // Extract fieldid facet number docids
+            // Extract fieldid facet string docids
+
             Ok(()) as Result<_>
         })
     })?;

View File

@@ -1,7 +1,5 @@
 use std::fs::File;
-use std::io;
 
-use fst::set::OpBuilder;
 use fst::{Set, SetBuilder};
 use grenad::Merger;
 use heed::types::Bytes;
@@ -15,7 +13,6 @@ use super::channel::{
     WordFidDocids, WordPositionDocids,
 };
 use super::KvReaderDelAdd;
-use crate::index::main_key::WORDS_FST_KEY;
 use crate::update::del_add::DelAdd;
 use crate::update::new::channel::MergerOperation;
 use crate::update::MergeDeladdCboRoaringBitmaps;
@@ -210,7 +207,7 @@ fn cbo_bitmap_serialize_into_vec<'b>(bitmap: &RoaringBitmap, buffer: &'b mut Vec
 }
 
 /// TODO Return the slice directly from the serialize_into method
-fn serialize_bitmap_into_vec<'b>(bitmap: &RoaringBitmap, buffer: &'b mut Vec<u8>) {
+fn serialize_bitmap_into_vec(bitmap: &RoaringBitmap, buffer: &mut Vec<u8>) {
     buffer.clear();
    bitmap.serialize_into(buffer).unwrap();
    // buffer.as_slice()
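For reference, a minimal sketch of the shape the TODO above points at: returning the filled slice from the helper, mirroring the sibling `cbo_bitmap_serialize_into_vec`. This is only an illustration built on the `roaring` crate's `serialize_into` and `serialized_size` APIs, not the crate's actual helper.

```rust
use roaring::RoaringBitmap;

/// Clears the shared buffer, serializes the bitmap into it, and returns the
/// filled slice so callers can use it directly.
fn serialize_bitmap_into_vec<'b>(bitmap: &RoaringBitmap, buffer: &'b mut Vec<u8>) -> &'b [u8] {
    buffer.clear();
    bitmap.serialize_into(&mut *buffer).expect("writing into a Vec cannot fail");
    buffer.as_slice()
}

fn main() {
    let bitmap: RoaringBitmap = (0..10).collect();
    let mut buffer = Vec::new();
    let bytes = serialize_bitmap_into_vec(&bitmap, &mut buffer);
    // The serialized length matches what the bitmap reports in advance.
    assert_eq!(bytes.len(), bitmap.serialized_size());
}
```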