mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-11-25 19:45:05 +08:00
Introduce the facet extractors
This commit is contained in:
parent
1d59c19cd2
commit
19d937ab21
@ -52,6 +52,7 @@ impl Deletion {
|
|||||||
self.docid
|
self.docid
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TODO shouldn't we use the one in self?
|
||||||
pub fn current<'a>(
|
pub fn current<'a>(
|
||||||
&self,
|
&self,
|
||||||
rtxn: &'a RoTxn,
|
rtxn: &'a RoTxn,
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
use std::num::NonZeroUsize;
|
|
||||||
use std::mem;
|
use std::mem;
|
||||||
|
use std::num::NonZeroUsize;
|
||||||
|
|
||||||
use grenad::{MergeFunction, Sorter};
|
use grenad::{MergeFunction, Sorter};
|
||||||
use lru::LruCache;
|
use lru::LruCache;
|
||||||
@ -10,16 +10,16 @@ use crate::update::del_add::{DelAdd, KvWriterDelAdd};
|
|||||||
use crate::CboRoaringBitmapCodec;
|
use crate::CboRoaringBitmapCodec;
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub struct CachedSorter<MF> {
|
pub struct CboCachedSorter<MF> {
|
||||||
cache: lru::LruCache<SmallVec<[u8; 20]>, DelAddRoaringBitmap>,
|
cache: lru::LruCache<SmallVec<[u8; 20]>, DelAddRoaringBitmap>,
|
||||||
sorter: Sorter<MF>,
|
sorter: Sorter<MF>,
|
||||||
deladd_buffer: Vec<u8>,
|
deladd_buffer: Vec<u8>,
|
||||||
cbo_buffer: Vec<u8>,
|
cbo_buffer: Vec<u8>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<MF> CachedSorter<MF> {
|
impl<MF> CboCachedSorter<MF> {
|
||||||
pub fn new(cap: NonZeroUsize, sorter: Sorter<MF>) -> Self {
|
pub fn new(cap: NonZeroUsize, sorter: Sorter<MF>) -> Self {
|
||||||
CachedSorter {
|
CboCachedSorter {
|
||||||
cache: lru::LruCache::new(cap),
|
cache: lru::LruCache::new(cap),
|
||||||
sorter,
|
sorter,
|
||||||
deladd_buffer: Vec::new(),
|
deladd_buffer: Vec::new(),
|
||||||
@ -28,7 +28,7 @@ impl<MF> CachedSorter<MF> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<MF: MergeFunction> CachedSorter<MF> {
|
impl<MF: MergeFunction> CboCachedSorter<MF> {
|
||||||
pub fn insert_del_u32(&mut self, key: &[u8], n: u32) -> grenad::Result<(), MF::Error> {
|
pub fn insert_del_u32(&mut self, key: &[u8], n: u32) -> grenad::Result<(), MF::Error> {
|
||||||
match self.cache.get_mut(key) {
|
match self.cache.get_mut(key) {
|
||||||
Some(DelAddRoaringBitmap { del, add: _ }) => {
|
Some(DelAddRoaringBitmap { del, add: _ }) => {
|
||||||
|
271
milli/src/update/new/extract/faceted/mod.rs
Normal file
271
milli/src/update/new/extract/faceted/mod.rs
Normal file
@ -0,0 +1,271 @@
|
|||||||
|
use std::collections::HashSet;
|
||||||
|
use std::fs::File;
|
||||||
|
|
||||||
|
use grenad::Merger;
|
||||||
|
use heed::RoTxn;
|
||||||
|
use rayon::iter::{IntoParallelIterator, ParallelIterator};
|
||||||
|
use serde_json::Value;
|
||||||
|
|
||||||
|
use super::cache::CboCachedSorter;
|
||||||
|
use super::perm_json_p;
|
||||||
|
use crate::facet::value_encoding::f64_into_bytes;
|
||||||
|
use crate::update::new::{DocumentChange, ItemsPool, KvReaderFieldId};
|
||||||
|
use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps};
|
||||||
|
use crate::{
|
||||||
|
normalize_facet, FieldId, GlobalFieldsIdsMap, Index, InternalError, Result, UserError,
|
||||||
|
MAX_FACET_VALUE_LENGTH,
|
||||||
|
};
|
||||||
|
|
||||||
|
pub trait FacetedExtractor {
|
||||||
|
fn run_extraction(
|
||||||
|
index: &Index,
|
||||||
|
fields_ids_map: &GlobalFieldsIdsMap,
|
||||||
|
indexer: GrenadParameters,
|
||||||
|
document_changes: impl IntoParallelIterator<Item = Result<DocumentChange>>,
|
||||||
|
) -> Result<Merger<File, MergeDeladdCboRoaringBitmaps>> {
|
||||||
|
let max_memory = indexer.max_memory_by_thread();
|
||||||
|
|
||||||
|
let rtxn = index.read_txn()?;
|
||||||
|
let attributes_to_extract = Self::attributes_to_extract(&rtxn, index)?;
|
||||||
|
let attributes_to_extract: Vec<_> =
|
||||||
|
attributes_to_extract.iter().map(|s| s.as_ref()).collect();
|
||||||
|
|
||||||
|
let context_pool = ItemsPool::new(|| {
|
||||||
|
Ok((
|
||||||
|
index.read_txn()?,
|
||||||
|
fields_ids_map.clone(),
|
||||||
|
Vec::new(),
|
||||||
|
CboCachedSorter::new(
|
||||||
|
// TODO use a better value
|
||||||
|
100.try_into().unwrap(),
|
||||||
|
create_sorter(
|
||||||
|
grenad::SortAlgorithm::Stable,
|
||||||
|
MergeDeladdCboRoaringBitmaps,
|
||||||
|
indexer.chunk_compression_type,
|
||||||
|
indexer.chunk_compression_level,
|
||||||
|
indexer.max_nb_chunks,
|
||||||
|
max_memory,
|
||||||
|
),
|
||||||
|
),
|
||||||
|
))
|
||||||
|
});
|
||||||
|
|
||||||
|
document_changes.into_par_iter().try_for_each(|document_change| {
|
||||||
|
context_pool.with(|(rtxn, fields_ids_map, buffer, cached_sorter)| {
|
||||||
|
Self::extract_document_change(
|
||||||
|
&*rtxn,
|
||||||
|
index,
|
||||||
|
buffer,
|
||||||
|
fields_ids_map,
|
||||||
|
&attributes_to_extract,
|
||||||
|
cached_sorter,
|
||||||
|
document_change?,
|
||||||
|
)
|
||||||
|
})
|
||||||
|
})?;
|
||||||
|
|
||||||
|
let mut builder = grenad::MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
|
||||||
|
for (_rtxn, _fields_ids_map, _buffer, cache) in context_pool.into_items() {
|
||||||
|
let sorter = cache.into_sorter()?;
|
||||||
|
let readers = sorter.into_reader_cursors()?;
|
||||||
|
builder.extend(readers);
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(builder.build())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn extract_document_change(
|
||||||
|
rtxn: &RoTxn,
|
||||||
|
index: &Index,
|
||||||
|
buffer: &mut Vec<u8>,
|
||||||
|
fields_ids_map: &mut GlobalFieldsIdsMap,
|
||||||
|
attributes_to_extract: &[&str],
|
||||||
|
cached_sorter: &mut CboCachedSorter<MergeDeladdCboRoaringBitmaps>,
|
||||||
|
document_change: DocumentChange,
|
||||||
|
) -> Result<()> {
|
||||||
|
match document_change {
|
||||||
|
DocumentChange::Deletion(inner) => {
|
||||||
|
let mut facet_del_fn = |fid, value: &Value| -> Result<()> {
|
||||||
|
buffer.clear();
|
||||||
|
match Self::build_key(fid, value, buffer) {
|
||||||
|
// TODO manage errors
|
||||||
|
Some(key) => Ok(cached_sorter.insert_del_u32(&key, inner.docid()).unwrap()),
|
||||||
|
None => Ok(()),
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
extract_document_facets(
|
||||||
|
attributes_to_extract,
|
||||||
|
inner.current(rtxn, index)?.unwrap(),
|
||||||
|
fields_ids_map,
|
||||||
|
&mut facet_del_fn,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
DocumentChange::Update(inner) => {
|
||||||
|
let mut facet_del_fn = |fid, value: &Value| -> Result<()> {
|
||||||
|
buffer.clear();
|
||||||
|
match Self::build_key(fid, value, buffer) {
|
||||||
|
// TODO manage errors
|
||||||
|
Some(key) => Ok(cached_sorter.insert_del_u32(&key, inner.docid()).unwrap()),
|
||||||
|
None => Ok(()),
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
extract_document_facets(
|
||||||
|
attributes_to_extract,
|
||||||
|
inner.current(rtxn, index)?.unwrap(),
|
||||||
|
fields_ids_map,
|
||||||
|
&mut facet_del_fn,
|
||||||
|
)?;
|
||||||
|
|
||||||
|
let mut facet_add_fn = |fid, value: &Value| -> Result<()> {
|
||||||
|
buffer.clear();
|
||||||
|
match Self::build_key(fid, value, buffer) {
|
||||||
|
// TODO manage errors
|
||||||
|
Some(key) => Ok(cached_sorter.insert_add_u32(&key, inner.docid()).unwrap()),
|
||||||
|
None => Ok(()),
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
extract_document_facets(
|
||||||
|
attributes_to_extract,
|
||||||
|
inner.new(),
|
||||||
|
fields_ids_map,
|
||||||
|
&mut facet_add_fn,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
DocumentChange::Insertion(inner) => {
|
||||||
|
let mut facet_add_fn = |fid, value: &Value| -> Result<()> {
|
||||||
|
buffer.clear();
|
||||||
|
match Self::build_key(fid, value, buffer) {
|
||||||
|
// TODO manage errors
|
||||||
|
Some(key) => Ok(cached_sorter.insert_add_u32(&key, inner.docid()).unwrap()),
|
||||||
|
None => Ok(()),
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
extract_document_facets(
|
||||||
|
attributes_to_extract,
|
||||||
|
inner.new(),
|
||||||
|
fields_ids_map,
|
||||||
|
&mut facet_add_fn,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO avoid owning the strings here.
|
||||||
|
fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result<HashSet<String>>;
|
||||||
|
|
||||||
|
fn build_key<'b>(field_id: FieldId, value: &Value, output: &'b mut Vec<u8>)
|
||||||
|
-> Option<&'b [u8]>;
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct FieldIdFacetNumberDocidsExtractor;
|
||||||
|
impl FacetedExtractor for FieldIdFacetNumberDocidsExtractor {
|
||||||
|
fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result<HashSet<String>> {
|
||||||
|
index.user_defined_faceted_fields(rtxn)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn build_key<'b>(
|
||||||
|
field_id: FieldId,
|
||||||
|
value: &Value,
|
||||||
|
output: &'b mut Vec<u8>,
|
||||||
|
) -> Option<&'b [u8]> {
|
||||||
|
let number = value.as_number()?;
|
||||||
|
let n = number.as_f64()?;
|
||||||
|
let ordered = f64_into_bytes(n)?;
|
||||||
|
|
||||||
|
// fid - level - orderedf64 - orignalf64
|
||||||
|
output.extend_from_slice(&field_id.to_be_bytes());
|
||||||
|
output.push(1); // level 0
|
||||||
|
output.extend_from_slice(&ordered);
|
||||||
|
output.extend_from_slice(&n.to_be_bytes());
|
||||||
|
|
||||||
|
Some(&*output)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// TODO It doesn't keep the original string in the value
|
||||||
|
pub struct FieldIdFacetStringDocidsExtractor;
|
||||||
|
impl FacetedExtractor for FieldIdFacetStringDocidsExtractor {
|
||||||
|
fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result<HashSet<String>> {
|
||||||
|
index.user_defined_faceted_fields(rtxn)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn build_key<'b>(
|
||||||
|
field_id: FieldId,
|
||||||
|
value: &Value,
|
||||||
|
output: &'b mut Vec<u8>,
|
||||||
|
) -> Option<&'b [u8]> {
|
||||||
|
let string = value.as_str()?;
|
||||||
|
let normalize = normalize_facet(string);
|
||||||
|
let truncated = truncate_str(&normalize);
|
||||||
|
|
||||||
|
// fid - level - normalized string
|
||||||
|
output.extend_from_slice(&field_id.to_be_bytes());
|
||||||
|
output.push(1); // level 0
|
||||||
|
output.extend_from_slice(truncated.as_bytes());
|
||||||
|
|
||||||
|
Some(&*output)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn extract_document_facets(
|
||||||
|
attributes_to_extract: &[&str],
|
||||||
|
obkv: &KvReaderFieldId,
|
||||||
|
field_id_map: &mut GlobalFieldsIdsMap,
|
||||||
|
facet_fn: &mut impl FnMut(FieldId, &Value) -> Result<()>,
|
||||||
|
) -> Result<()> {
|
||||||
|
let mut field_name = String::new();
|
||||||
|
for (field_id, field_bytes) in obkv {
|
||||||
|
let Some(field_name) = field_id_map.name(field_id).map(|s| {
|
||||||
|
field_name.clear();
|
||||||
|
field_name.push_str(s);
|
||||||
|
&field_name
|
||||||
|
}) else {
|
||||||
|
unreachable!("field id not found in field id map");
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut tokenize_field = |name: &str, value: &Value| match field_id_map.id_or_insert(name) {
|
||||||
|
Some(field_id) => facet_fn(field_id, value),
|
||||||
|
None => Err(UserError::AttributeLimitReached.into()),
|
||||||
|
};
|
||||||
|
|
||||||
|
// if the current field is searchable or contains a searchable attribute
|
||||||
|
if perm_json_p::select_field(field_name, Some(attributes_to_extract), &[]) {
|
||||||
|
// parse json.
|
||||||
|
match serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)? {
|
||||||
|
Value::Object(object) => perm_json_p::seek_leaf_values_in_object(
|
||||||
|
&object,
|
||||||
|
Some(attributes_to_extract),
|
||||||
|
&[], // skip no attributes
|
||||||
|
field_name,
|
||||||
|
&mut tokenize_field,
|
||||||
|
)?,
|
||||||
|
Value::Array(array) => perm_json_p::seek_leaf_values_in_array(
|
||||||
|
&array,
|
||||||
|
Some(attributes_to_extract),
|
||||||
|
&[], // skip no attributes
|
||||||
|
field_name,
|
||||||
|
&mut tokenize_field,
|
||||||
|
)?,
|
||||||
|
value => tokenize_field(field_name, &value)?,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Truncates a string to the biggest valid LMDB key size.
|
||||||
|
fn truncate_str(s: &str) -> &str {
|
||||||
|
let index = s
|
||||||
|
.char_indices()
|
||||||
|
.map(|(idx, _)| idx)
|
||||||
|
.chain(std::iter::once(s.len()))
|
||||||
|
.take_while(|idx| idx <= &MAX_FACET_VALUE_LENGTH)
|
||||||
|
.last();
|
||||||
|
|
||||||
|
&s[..index.unwrap_or(0)]
|
||||||
|
}
|
@ -1,7 +1,114 @@
|
|||||||
mod cache;
|
mod cache;
|
||||||
|
mod faceted;
|
||||||
mod searchable;
|
mod searchable;
|
||||||
|
|
||||||
|
pub use faceted::FacetedExtractor;
|
||||||
pub use searchable::{
|
pub use searchable::{
|
||||||
ExactWordDocidsExtractor, SearchableExtractor, WordDocidsExtractor, WordFidDocidsExtractor,
|
ExactWordDocidsExtractor, SearchableExtractor, WordDocidsExtractor, WordFidDocidsExtractor,
|
||||||
WordPositionDocidsExtractor,
|
WordPositionDocidsExtractor,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/// TODO move in permissive json pointer
|
||||||
|
pub mod perm_json_p {
|
||||||
|
use serde_json::{Map, Value};
|
||||||
|
|
||||||
|
use crate::Result;
|
||||||
|
const SPLIT_SYMBOL: char = '.';
|
||||||
|
|
||||||
|
/// Returns `true` if the `selector` match the `key`.
|
||||||
|
///
|
||||||
|
/// ```text
|
||||||
|
/// Example:
|
||||||
|
/// `animaux` match `animaux`
|
||||||
|
/// `animaux.chien` match `animaux`
|
||||||
|
/// `animaux.chien` match `animaux`
|
||||||
|
/// `animaux.chien.nom` match `animaux`
|
||||||
|
/// `animaux.chien.nom` match `animaux.chien`
|
||||||
|
/// -----------------------------------------
|
||||||
|
/// `animaux` doesn't match `animaux.chien`
|
||||||
|
/// `animaux.` doesn't match `animaux`
|
||||||
|
/// `animaux.ch` doesn't match `animaux.chien`
|
||||||
|
/// `animau` doesn't match `animaux`
|
||||||
|
/// ```
|
||||||
|
pub fn contained_in(selector: &str, key: &str) -> bool {
|
||||||
|
selector.starts_with(key)
|
||||||
|
&& selector[key.len()..].chars().next().map(|c| c == SPLIT_SYMBOL).unwrap_or(true)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn seek_leaf_values_in_object(
|
||||||
|
value: &Map<String, Value>,
|
||||||
|
selectors: Option<&[&str]>,
|
||||||
|
skip_selectors: &[&str],
|
||||||
|
base_key: &str,
|
||||||
|
seeker: &mut impl FnMut(&str, &Value) -> Result<()>,
|
||||||
|
) -> Result<()> {
|
||||||
|
for (key, value) in value.iter() {
|
||||||
|
let base_key = if base_key.is_empty() {
|
||||||
|
key.to_string()
|
||||||
|
} else {
|
||||||
|
format!("{}{}{}", base_key, SPLIT_SYMBOL, key)
|
||||||
|
};
|
||||||
|
|
||||||
|
// here if the user only specified `doggo` we need to iterate in all the fields of `doggo`
|
||||||
|
// so we check the contained_in on both side
|
||||||
|
let should_continue = select_field(&base_key, selectors, skip_selectors);
|
||||||
|
if should_continue {
|
||||||
|
match value {
|
||||||
|
Value::Object(object) => seek_leaf_values_in_object(
|
||||||
|
object,
|
||||||
|
selectors,
|
||||||
|
skip_selectors,
|
||||||
|
&base_key,
|
||||||
|
seeker,
|
||||||
|
),
|
||||||
|
Value::Array(array) => seek_leaf_values_in_array(
|
||||||
|
array,
|
||||||
|
selectors,
|
||||||
|
skip_selectors,
|
||||||
|
&base_key,
|
||||||
|
seeker,
|
||||||
|
),
|
||||||
|
value => seeker(&base_key, value),
|
||||||
|
}?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn seek_leaf_values_in_array(
|
||||||
|
values: &[Value],
|
||||||
|
selectors: Option<&[&str]>,
|
||||||
|
skip_selectors: &[&str],
|
||||||
|
base_key: &str,
|
||||||
|
seeker: &mut impl FnMut(&str, &Value) -> Result<()>,
|
||||||
|
) -> Result<()> {
|
||||||
|
for value in values {
|
||||||
|
match value {
|
||||||
|
Value::Object(object) => {
|
||||||
|
seek_leaf_values_in_object(object, selectors, skip_selectors, base_key, seeker)
|
||||||
|
}
|
||||||
|
Value::Array(array) => {
|
||||||
|
seek_leaf_values_in_array(array, selectors, skip_selectors, base_key, seeker)
|
||||||
|
}
|
||||||
|
value => seeker(base_key, value),
|
||||||
|
}?;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn select_field(
|
||||||
|
field_name: &str,
|
||||||
|
selectors: Option<&[&str]>,
|
||||||
|
skip_selectors: &[&str],
|
||||||
|
) -> bool {
|
||||||
|
selectors.map_or(true, |selectors| {
|
||||||
|
selectors.iter().any(|selector| {
|
||||||
|
contained_in(selector, &field_name) || contained_in(&field_name, selector)
|
||||||
|
})
|
||||||
|
}) && !skip_selectors.iter().any(|skip_selector| {
|
||||||
|
contained_in(skip_selector, &field_name) || contained_in(&field_name, skip_selector)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
@ -19,6 +19,7 @@ impl SearchableExtractor for WordDocidsExtractor {
|
|||||||
index.exact_attributes(rtxn).map_err(Into::into)
|
index.exact_attributes(rtxn).map_err(Into::into)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// TODO write in an external Vec buffer
|
||||||
fn build_key<'a>(_field_id: FieldId, _position: u16, word: &'a str) -> Cow<'a, [u8]> {
|
fn build_key<'a>(_field_id: FieldId, _position: u16, word: &'a str) -> Cow<'a, [u8]> {
|
||||||
Cow::Borrowed(word.as_bytes())
|
Cow::Borrowed(word.as_bytes())
|
||||||
}
|
}
|
||||||
|
@ -1,22 +1,22 @@
|
|||||||
mod extract_word_docids;
|
mod extract_word_docids;
|
||||||
mod tokenize_document;
|
mod tokenize_document;
|
||||||
|
|
||||||
|
use std::borrow::Cow;
|
||||||
|
use std::fs::File;
|
||||||
|
|
||||||
pub use extract_word_docids::{
|
pub use extract_word_docids::{
|
||||||
ExactWordDocidsExtractor, WordDocidsExtractor, WordFidDocidsExtractor,
|
ExactWordDocidsExtractor, WordDocidsExtractor, WordFidDocidsExtractor,
|
||||||
WordPositionDocidsExtractor,
|
WordPositionDocidsExtractor,
|
||||||
};
|
};
|
||||||
use std::borrow::Cow;
|
|
||||||
use std::fs::File;
|
|
||||||
|
|
||||||
use grenad::Merger;
|
use grenad::Merger;
|
||||||
use heed::RoTxn;
|
use heed::RoTxn;
|
||||||
use rayon::iter::{IntoParallelIterator, ParallelIterator};
|
use rayon::iter::{IntoParallelIterator, ParallelIterator};
|
||||||
|
use tokenize_document::{tokenizer_builder, DocumentTokenizer};
|
||||||
|
|
||||||
use super::cache::CachedSorter;
|
use super::cache::CboCachedSorter;
|
||||||
use crate::update::new::{DocumentChange, ItemsPool};
|
use crate::update::new::{DocumentChange, ItemsPool};
|
||||||
use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps};
|
use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps};
|
||||||
use crate::{FieldId, GlobalFieldsIdsMap, Index, Result, MAX_POSITION_PER_ATTRIBUTE};
|
use crate::{FieldId, GlobalFieldsIdsMap, Index, Result, MAX_POSITION_PER_ATTRIBUTE};
|
||||||
use tokenize_document::{tokenizer_builder, DocumentTokenizer};
|
|
||||||
|
|
||||||
pub trait SearchableExtractor {
|
pub trait SearchableExtractor {
|
||||||
fn run_extraction(
|
fn run_extraction(
|
||||||
@ -60,7 +60,7 @@ pub trait SearchableExtractor {
|
|||||||
index.read_txn()?,
|
index.read_txn()?,
|
||||||
&document_tokenizer,
|
&document_tokenizer,
|
||||||
fields_ids_map.clone(),
|
fields_ids_map.clone(),
|
||||||
CachedSorter::new(
|
CboCachedSorter::new(
|
||||||
// TODO use a better value
|
// TODO use a better value
|
||||||
100.try_into().unwrap(),
|
100.try_into().unwrap(),
|
||||||
create_sorter(
|
create_sorter(
|
||||||
@ -103,14 +103,16 @@ pub trait SearchableExtractor {
|
|||||||
index: &Index,
|
index: &Index,
|
||||||
document_tokenizer: &DocumentTokenizer,
|
document_tokenizer: &DocumentTokenizer,
|
||||||
fields_ids_map: &mut GlobalFieldsIdsMap,
|
fields_ids_map: &mut GlobalFieldsIdsMap,
|
||||||
cached_sorter: &mut CachedSorter<MergeDeladdCboRoaringBitmaps>,
|
cached_sorter: &mut CboCachedSorter<MergeDeladdCboRoaringBitmaps>,
|
||||||
document_change: DocumentChange,
|
document_change: DocumentChange,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
match document_change {
|
match document_change {
|
||||||
DocumentChange::Deletion(inner) => {
|
DocumentChange::Deletion(inner) => {
|
||||||
let mut token_fn = |fid, pos: u16, word: &str| {
|
let mut token_fn = |fid, pos: u16, word: &str| {
|
||||||
let key = Self::build_key(fid, pos, word);
|
let key = Self::build_key(fid, pos, word);
|
||||||
|
/// TODO manage the error
|
||||||
cached_sorter.insert_del_u32(&key, inner.docid()).unwrap();
|
cached_sorter.insert_del_u32(&key, inner.docid()).unwrap();
|
||||||
|
Ok(())
|
||||||
};
|
};
|
||||||
document_tokenizer.tokenize_document(
|
document_tokenizer.tokenize_document(
|
||||||
inner.current(rtxn, index)?.unwrap(),
|
inner.current(rtxn, index)?.unwrap(),
|
||||||
@ -121,7 +123,9 @@ pub trait SearchableExtractor {
|
|||||||
DocumentChange::Update(inner) => {
|
DocumentChange::Update(inner) => {
|
||||||
let mut token_fn = |fid, pos, word: &str| {
|
let mut token_fn = |fid, pos, word: &str| {
|
||||||
let key = Self::build_key(fid, pos, word);
|
let key = Self::build_key(fid, pos, word);
|
||||||
|
/// TODO manage the error
|
||||||
cached_sorter.insert_del_u32(&key, inner.docid()).unwrap();
|
cached_sorter.insert_del_u32(&key, inner.docid()).unwrap();
|
||||||
|
Ok(())
|
||||||
};
|
};
|
||||||
document_tokenizer.tokenize_document(
|
document_tokenizer.tokenize_document(
|
||||||
inner.current(rtxn, index)?.unwrap(),
|
inner.current(rtxn, index)?.unwrap(),
|
||||||
@ -131,14 +135,18 @@ pub trait SearchableExtractor {
|
|||||||
|
|
||||||
let mut token_fn = |fid, pos, word: &str| {
|
let mut token_fn = |fid, pos, word: &str| {
|
||||||
let key = Self::build_key(fid, pos, word);
|
let key = Self::build_key(fid, pos, word);
|
||||||
|
/// TODO manage the error
|
||||||
cached_sorter.insert_add_u32(&key, inner.docid()).unwrap();
|
cached_sorter.insert_add_u32(&key, inner.docid()).unwrap();
|
||||||
|
Ok(())
|
||||||
};
|
};
|
||||||
document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?;
|
document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?;
|
||||||
}
|
}
|
||||||
DocumentChange::Insertion(inner) => {
|
DocumentChange::Insertion(inner) => {
|
||||||
let mut token_fn = |fid, pos, word: &str| {
|
let mut token_fn = |fid, pos, word: &str| {
|
||||||
let key = Self::build_key(fid, pos, word);
|
let key = Self::build_key(fid, pos, word);
|
||||||
|
/// TODO manage the error
|
||||||
cached_sorter.insert_add_u32(&key, inner.docid()).unwrap();
|
cached_sorter.insert_add_u32(&key, inner.docid()).unwrap();
|
||||||
|
Ok(())
|
||||||
};
|
};
|
||||||
document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?;
|
document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?;
|
||||||
}
|
}
|
||||||
@ -152,5 +160,5 @@ pub trait SearchableExtractor {
|
|||||||
|
|
||||||
fn attributes_to_skip<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result<Vec<&'a str>>;
|
fn attributes_to_skip<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result<Vec<&'a str>>;
|
||||||
|
|
||||||
fn build_key<'a>(field_id: FieldId, position: u16, word: &'a str) -> Cow<'a, [u8]>;
|
fn build_key(field_id: FieldId, position: u16, word: &str) -> Cow<'_, [u8]>;
|
||||||
}
|
}
|
||||||
|
@ -1,13 +1,15 @@
|
|||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
|
|
||||||
use charabia::{SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder};
|
use charabia::{SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder};
|
||||||
use heed::RoTxn;
|
|
||||||
use serde_json::Value;
|
use serde_json::Value;
|
||||||
|
|
||||||
|
use crate::update::new::extract::perm_json_p::{
|
||||||
|
seek_leaf_values_in_array, seek_leaf_values_in_object, select_field,
|
||||||
|
};
|
||||||
use crate::update::new::KvReaderFieldId;
|
use crate::update::new::KvReaderFieldId;
|
||||||
use crate::{
|
use crate::{
|
||||||
FieldId, FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, LocalizedAttributesRule,
|
FieldId, GlobalFieldsIdsMap, InternalError, LocalizedAttributesRule, Result, UserError,
|
||||||
Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH,
|
MAX_WORD_LENGTH,
|
||||||
};
|
};
|
||||||
|
|
||||||
pub struct DocumentTokenizer<'a> {
|
pub struct DocumentTokenizer<'a> {
|
||||||
@ -23,7 +25,7 @@ impl<'a> DocumentTokenizer<'a> {
|
|||||||
&self,
|
&self,
|
||||||
obkv: &KvReaderFieldId,
|
obkv: &KvReaderFieldId,
|
||||||
field_id_map: &mut GlobalFieldsIdsMap,
|
field_id_map: &mut GlobalFieldsIdsMap,
|
||||||
token_fn: &mut impl FnMut(FieldId, u16, &str),
|
token_fn: &mut impl FnMut(FieldId, u16, &str) -> Result<()>,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
let mut field_position = HashMap::new();
|
let mut field_position = HashMap::new();
|
||||||
let mut field_name = String::new();
|
let mut field_name = String::new();
|
||||||
@ -38,22 +40,23 @@ impl<'a> DocumentTokenizer<'a> {
|
|||||||
|
|
||||||
let mut tokenize_field = |name: &str, value: &Value| {
|
let mut tokenize_field = |name: &str, value: &Value| {
|
||||||
let Some(field_id) = field_id_map.id_or_insert(name) else {
|
let Some(field_id) = field_id_map.id_or_insert(name) else {
|
||||||
/// TODO: better error
|
return Err(UserError::AttributeLimitReached.into());
|
||||||
panic!("it's over 9000");
|
|
||||||
};
|
};
|
||||||
|
|
||||||
let position =
|
let position =
|
||||||
field_position.entry(field_id).and_modify(|counter| *counter += 8).or_insert(0);
|
field_position.entry(field_id).and_modify(|counter| *counter += 8).or_insert(0);
|
||||||
if *position as u32 >= self.max_positions_per_attributes {
|
if *position as u32 >= self.max_positions_per_attributes {
|
||||||
return;
|
return Ok(());
|
||||||
}
|
}
|
||||||
|
|
||||||
match value {
|
match value {
|
||||||
Value::Number(n) => {
|
Value::Number(n) => {
|
||||||
let token = n.to_string();
|
let token = n.to_string();
|
||||||
if let Ok(position) = (*position).try_into() {
|
if let Ok(position) = (*position).try_into() {
|
||||||
token_fn(field_id, position, token.as_str());
|
token_fn(field_id, position, token.as_str())?;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
}
|
}
|
||||||
Value::String(text) => {
|
Value::String(text) => {
|
||||||
// create an iterator of token with their positions.
|
// create an iterator of token with their positions.
|
||||||
@ -74,41 +77,40 @@ impl<'a> DocumentTokenizer<'a> {
|
|||||||
if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
|
if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
|
||||||
*position = index;
|
*position = index;
|
||||||
if let Ok(position) = (*position).try_into() {
|
if let Ok(position) = (*position).try_into() {
|
||||||
token_fn(field_id, position, token);
|
token_fn(field_id, position, token)?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
}
|
}
|
||||||
_ => (),
|
_ => Ok(()),
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
// if the current field is searchable or contains a searchable attribute
|
// if the current field is searchable or contains a searchable attribute
|
||||||
if perm_json_p::select_field(
|
if select_field(&field_name, self.attribute_to_extract, self.attribute_to_skip) {
|
||||||
&field_name,
|
|
||||||
self.attribute_to_extract.as_deref(),
|
|
||||||
self.attribute_to_skip,
|
|
||||||
) {
|
|
||||||
// parse json.
|
// parse json.
|
||||||
match serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)? {
|
match serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)? {
|
||||||
Value::Object(object) => perm_json_p::seek_leaf_values_in_object(
|
Value::Object(object) => seek_leaf_values_in_object(
|
||||||
&object,
|
&object,
|
||||||
self.attribute_to_extract.as_deref(),
|
self.attribute_to_extract,
|
||||||
self.attribute_to_skip,
|
self.attribute_to_skip,
|
||||||
&field_name,
|
&field_name,
|
||||||
&mut tokenize_field,
|
&mut tokenize_field,
|
||||||
),
|
)?,
|
||||||
Value::Array(array) => perm_json_p::seek_leaf_values_in_array(
|
Value::Array(array) => seek_leaf_values_in_array(
|
||||||
&array,
|
&array,
|
||||||
self.attribute_to_extract.as_deref(),
|
self.attribute_to_extract,
|
||||||
self.attribute_to_skip,
|
self.attribute_to_skip,
|
||||||
&field_name,
|
&field_name,
|
||||||
&mut tokenize_field,
|
&mut tokenize_field,
|
||||||
),
|
)?,
|
||||||
value => tokenize_field(&field_name, &value),
|
value => tokenize_field(&field_name, &value)?,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -167,105 +169,6 @@ pub fn tokenizer_builder<'a>(
|
|||||||
tokenizer_builder
|
tokenizer_builder
|
||||||
}
|
}
|
||||||
|
|
||||||
/// TODO move in permissive json pointer
|
|
||||||
mod perm_json_p {
|
|
||||||
use serde_json::{Map, Value};
|
|
||||||
const SPLIT_SYMBOL: char = '.';
|
|
||||||
|
|
||||||
/// Returns `true` if the `selector` match the `key`.
|
|
||||||
///
|
|
||||||
/// ```text
|
|
||||||
/// Example:
|
|
||||||
/// `animaux` match `animaux`
|
|
||||||
/// `animaux.chien` match `animaux`
|
|
||||||
/// `animaux.chien` match `animaux`
|
|
||||||
/// `animaux.chien.nom` match `animaux`
|
|
||||||
/// `animaux.chien.nom` match `animaux.chien`
|
|
||||||
/// -----------------------------------------
|
|
||||||
/// `animaux` doesn't match `animaux.chien`
|
|
||||||
/// `animaux.` doesn't match `animaux`
|
|
||||||
/// `animaux.ch` doesn't match `animaux.chien`
|
|
||||||
/// `animau` doesn't match `animaux`
|
|
||||||
/// ```
|
|
||||||
pub fn contained_in(selector: &str, key: &str) -> bool {
|
|
||||||
selector.starts_with(key)
|
|
||||||
&& selector[key.len()..].chars().next().map(|c| c == SPLIT_SYMBOL).unwrap_or(true)
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn seek_leaf_values_in_object(
|
|
||||||
value: &Map<String, Value>,
|
|
||||||
selectors: Option<&[&str]>,
|
|
||||||
skip_selectors: &[&str],
|
|
||||||
base_key: &str,
|
|
||||||
seeker: &mut impl FnMut(&str, &Value),
|
|
||||||
) {
|
|
||||||
for (key, value) in value.iter() {
|
|
||||||
let base_key = if base_key.is_empty() {
|
|
||||||
key.to_string()
|
|
||||||
} else {
|
|
||||||
format!("{}{}{}", base_key, SPLIT_SYMBOL, key)
|
|
||||||
};
|
|
||||||
|
|
||||||
// here if the user only specified `doggo` we need to iterate in all the fields of `doggo`
|
|
||||||
// so we check the contained_in on both side
|
|
||||||
let should_continue = select_field(&base_key, selectors, skip_selectors);
|
|
||||||
if should_continue {
|
|
||||||
match value {
|
|
||||||
Value::Object(object) => seek_leaf_values_in_object(
|
|
||||||
object,
|
|
||||||
selectors,
|
|
||||||
skip_selectors,
|
|
||||||
&base_key,
|
|
||||||
seeker,
|
|
||||||
),
|
|
||||||
Value::Array(array) => seek_leaf_values_in_array(
|
|
||||||
array,
|
|
||||||
selectors,
|
|
||||||
skip_selectors,
|
|
||||||
&base_key,
|
|
||||||
seeker,
|
|
||||||
),
|
|
||||||
value => seeker(&base_key, value),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn seek_leaf_values_in_array(
|
|
||||||
values: &[Value],
|
|
||||||
selectors: Option<&[&str]>,
|
|
||||||
skip_selectors: &[&str],
|
|
||||||
base_key: &str,
|
|
||||||
seeker: &mut impl FnMut(&str, &Value),
|
|
||||||
) {
|
|
||||||
for value in values {
|
|
||||||
match value {
|
|
||||||
Value::Object(object) => {
|
|
||||||
seek_leaf_values_in_object(object, selectors, skip_selectors, base_key, seeker)
|
|
||||||
}
|
|
||||||
Value::Array(array) => {
|
|
||||||
seek_leaf_values_in_array(array, selectors, skip_selectors, base_key, seeker)
|
|
||||||
}
|
|
||||||
value => seeker(base_key, value),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn select_field(
|
|
||||||
field_name: &str,
|
|
||||||
selectors: Option<&[&str]>,
|
|
||||||
skip_selectors: &[&str],
|
|
||||||
) -> bool {
|
|
||||||
selectors.map_or(true, |selectors| {
|
|
||||||
selectors.iter().any(|selector| {
|
|
||||||
contained_in(selector, &field_name) || contained_in(&field_name, selector)
|
|
||||||
})
|
|
||||||
}) && !skip_selectors.iter().any(|skip_selector| {
|
|
||||||
contained_in(skip_selector, &field_name) || contained_in(&field_name, skip_selector)
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod test {
|
mod test {
|
||||||
use charabia::TokenizerBuilder;
|
use charabia::TokenizerBuilder;
|
||||||
@ -274,6 +177,8 @@ mod test {
|
|||||||
use serde_json::json;
|
use serde_json::json;
|
||||||
|
|
||||||
use super::*;
|
use super::*;
|
||||||
|
use crate::FieldsIdsMap;
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_tokenize_document() {
|
fn test_tokenize_document() {
|
||||||
let mut fields_ids_map = FieldsIdsMap::new();
|
let mut fields_ids_map = FieldsIdsMap::new();
|
||||||
@ -329,6 +234,7 @@ mod test {
|
|||||||
document_tokenizer
|
document_tokenizer
|
||||||
.tokenize_document(obkv, &mut global_fields_ids_map, &mut |fid, pos, word| {
|
.tokenize_document(obkv, &mut global_fields_ids_map, &mut |fid, pos, word| {
|
||||||
words.insert([fid, pos], word.to_string());
|
words.insert([fid, pos], word.to_string());
|
||||||
|
Ok(())
|
||||||
})
|
})
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
|
@ -127,6 +127,19 @@ where
|
|||||||
&extractor_sender,
|
&extractor_sender,
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
|
// TODO THIS IS TOO MUCH
|
||||||
|
// Extract fieldid docid facet number
|
||||||
|
// Extract fieldid docid facet string
|
||||||
|
// Extract facetid string fst
|
||||||
|
|
||||||
|
// Extract fieldid facet isempty docids
|
||||||
|
// Extract fieldid facet isnull docids
|
||||||
|
// Extract fieldid facet exists docids
|
||||||
|
|
||||||
|
// TODO This is the normal system
|
||||||
|
// Extract fieldid facet number docids
|
||||||
|
// Extract fieldid facet string docids
|
||||||
|
|
||||||
Ok(()) as Result<_>
|
Ok(()) as Result<_>
|
||||||
})
|
})
|
||||||
})?;
|
})?;
|
||||||
|
@ -1,7 +1,5 @@
|
|||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io;
|
|
||||||
|
|
||||||
use fst::set::OpBuilder;
|
|
||||||
use fst::{Set, SetBuilder};
|
use fst::{Set, SetBuilder};
|
||||||
use grenad::Merger;
|
use grenad::Merger;
|
||||||
use heed::types::Bytes;
|
use heed::types::Bytes;
|
||||||
@ -15,7 +13,6 @@ use super::channel::{
|
|||||||
WordFidDocids, WordPositionDocids,
|
WordFidDocids, WordPositionDocids,
|
||||||
};
|
};
|
||||||
use super::KvReaderDelAdd;
|
use super::KvReaderDelAdd;
|
||||||
use crate::index::main_key::WORDS_FST_KEY;
|
|
||||||
use crate::update::del_add::DelAdd;
|
use crate::update::del_add::DelAdd;
|
||||||
use crate::update::new::channel::MergerOperation;
|
use crate::update::new::channel::MergerOperation;
|
||||||
use crate::update::MergeDeladdCboRoaringBitmaps;
|
use crate::update::MergeDeladdCboRoaringBitmaps;
|
||||||
@ -210,7 +207,7 @@ fn cbo_bitmap_serialize_into_vec<'b>(bitmap: &RoaringBitmap, buffer: &'b mut Vec
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// TODO Return the slice directly from the serialize_into method
|
/// TODO Return the slice directly from the serialize_into method
|
||||||
fn serialize_bitmap_into_vec<'b>(bitmap: &RoaringBitmap, buffer: &'b mut Vec<u8>) {
|
fn serialize_bitmap_into_vec(bitmap: &RoaringBitmap, buffer: &mut Vec<u8>) {
|
||||||
buffer.clear();
|
buffer.clear();
|
||||||
bitmap.serialize_into(buffer).unwrap();
|
bitmap.serialize_into(buffer).unwrap();
|
||||||
// buffer.as_slice()
|
// buffer.as_slice()
|
||||||
|
Loading…
Reference in New Issue
Block a user