Integrate first searchable extractor

ManyTheFish 2024-09-03 11:02:39 +02:00
parent 5369bf4a62
commit c50d3edc4a
9 changed files with 419 additions and 181 deletions

View File

@@ -1,10 +1,12 @@
use std::fs::File; use std::fs::File;
use crossbeam_channel::{IntoIter, Receiver, SendError, Sender}; use crossbeam_channel::{IntoIter, Receiver, SendError, Sender};
use grenad::Merger;
use heed::types::Bytes; use heed::types::Bytes;
use super::StdResult; use super::StdResult;
use crate::update::new::KvReaderFieldId; use crate::update::new::KvReaderFieldId;
use crate::update::MergeDeladdCboRoaringBitmaps;
use crate::{DocumentId, Index}; use crate::{DocumentId, Index};
/// The capacity of the channel is currently in number of messages. /// The capacity of the channel is currently in number of messages.
@@ -159,7 +161,7 @@ impl DocumentSender {
} }
pub enum MergerOperation { pub enum MergerOperation {
WordDocidsCursors(Vec<grenad::ReaderCursor<File>>), WordDocidsMerger(Merger<File, MergeDeladdCboRoaringBitmaps>),
} }
pub struct MergerReceiver(Receiver<MergerOperation>); pub struct MergerReceiver(Receiver<MergerOperation>);
@@ -175,3 +177,16 @@ impl IntoIterator for MergerReceiver {
#[derive(Clone)] #[derive(Clone)]
pub struct DeladdCboRoaringBitmapSender(Sender<MergerOperation>); pub struct DeladdCboRoaringBitmapSender(Sender<MergerOperation>);
impl DeladdCboRoaringBitmapSender {
pub fn word_docids(
&self,
merger: Merger<File, MergeDeladdCboRoaringBitmaps>,
) -> StdResult<(), SendError<()>> {
let operation = MergerOperation::WordDocidsMerger(merger);
match self.0.send(operation) {
Ok(()) => Ok(()),
Err(SendError(_)) => Err(SendError(())),
}
}
}
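
The hunk above introduces `MergerOperation::WordDocidsMerger` and a `word_docids` sender method that erases the payload from the send error. Below is a minimal standalone sketch of that pattern, assuming only the crossbeam-channel crate; the enum and wrapper are simplified stand-ins (a `Vec<u8>` replaces the grenad `Merger`):

```rust
use crossbeam_channel::{unbounded, SendError, Sender};

// Simplified stand-in for the operation enum carried by the extractor-to-merger channel.
enum MergerOperation {
    WordDocidsMerger(Vec<u8>),
}

// Sketch of the typed sender wrapper added in this hunk.
struct DeladdCboRoaringBitmapSender(Sender<MergerOperation>);

impl DeladdCboRoaringBitmapSender {
    // Send the merger and erase the payload from the error, so callers only see SendError<()>.
    fn word_docids(&self, merger: Vec<u8>) -> Result<(), SendError<()>> {
        self.0
            .send(MergerOperation::WordDocidsMerger(merger))
            .map_err(|SendError(_)| SendError(()))
    }
}

fn main() {
    let (sender, receiver) = unbounded();
    let sender = DeladdCboRoaringBitmapSender(sender);
    sender.word_docids(vec![1, 2, 3]).unwrap();
    drop(sender); // disconnect so the receiving iterator terminates
    assert_eq!(receiver.into_iter().count(), 1);
}
```

Erasing the payload keeps the channel's message type out of the caller-facing signature, which is why the real method returns `StdResult<(), SendError<()>>`.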

View File

@@ -2,7 +2,7 @@ use heed::RoTxn;
use obkv::KvReader; use obkv::KvReader;
use crate::update::new::KvReaderFieldId; use crate::update::new::KvReaderFieldId;
use crate::{DocumentId, FieldId}; use crate::{DocumentId, FieldId, Index};
pub enum DocumentChange { pub enum DocumentChange {
Deletion(Deletion), Deletion(Deletion),
@@ -13,13 +13,13 @@ pub enum DocumentChange {
pub struct Deletion { pub struct Deletion {
docid: DocumentId, docid: DocumentId,
external_docid: String, // ? external_docid: String, // ?
current: Box<KvReaderFieldId>, current: Box<KvReaderFieldId>, // ?
} }
pub struct Update { pub struct Update {
docid: DocumentId, docid: DocumentId,
external_docid: String, // ? external_docid: String, // ?
current: Box<KvReaderFieldId>, current: Box<KvReaderFieldId>, // ?
new: Box<KvReaderFieldId>, new: Box<KvReaderFieldId>,
} }
@@ -30,7 +30,7 @@ pub struct Insertion {
} }
impl DocumentChange { impl DocumentChange {
fn docid(&self) -> DocumentId { pub fn docid(&self) -> DocumentId {
match &self { match &self {
Self::Deletion(inner) => inner.docid(), Self::Deletion(inner) => inner.docid(),
Self::Update(inner) => inner.docid(), Self::Update(inner) => inner.docid(),
@@ -48,11 +48,11 @@ impl Deletion {
Self { docid, external_docid, current } Self { docid, external_docid, current }
} }
fn docid(&self) -> DocumentId { pub fn docid(&self) -> DocumentId {
self.docid self.docid
} }
fn current(&self, rtxn: &RoTxn) -> &KvReader<FieldId> { pub fn current(&self, rtxn: &RoTxn, index: &Index) -> &KvReader<FieldId> {
unimplemented!() unimplemented!()
} }
} }
@@ -62,11 +62,11 @@ impl Insertion {
Insertion { docid, external_docid, new } Insertion { docid, external_docid, new }
} }
fn docid(&self) -> DocumentId { pub fn docid(&self) -> DocumentId {
self.docid self.docid
} }
fn new(&self) -> &KvReader<FieldId> { pub fn new(&self) -> &KvReader<FieldId> {
unimplemented!() unimplemented!()
} }
} }
@@ -81,15 +81,15 @@ impl Update {
Update { docid, external_docid, current, new } Update { docid, external_docid, current, new }
} }
fn docid(&self) -> DocumentId { pub fn docid(&self) -> DocumentId {
self.docid self.docid
} }
fn current(&self, rtxn: &RoTxn) -> &KvReader<FieldId> { pub fn current(&self, rtxn: &RoTxn, index: &Index) -> &KvReader<FieldId> {
unimplemented!() unimplemented!()
} }
fn new(&self) -> &KvReader<FieldId> { pub fn new(&self) -> &KvReader<FieldId> {
unimplemented!() unimplemented!()
} }
} }

View File

@@ -2,12 +2,12 @@ use std::borrow::Cow;
use std::num::NonZeroUsize; use std::num::NonZeroUsize;
use std::{io, mem}; use std::{io, mem};
use grenad2::{MergeFunction, Sorter}; use grenad::{MergeFunction, Sorter};
use lru::LruCache; use lru::LruCache;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use smallvec::SmallVec; use smallvec::SmallVec;
use crate::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
#[derive(Debug)] #[derive(Debug)]
pub struct CachedSorter<MF> { pub struct CachedSorter<MF> {

View File

@@ -1,84 +1,180 @@
pub fn extract_word_docids( use std::fs::File;
use charabia::TokenizerBuilder;
use grenad::Merger;
use grenad::ReaderCursor;
use heed::RoTxn;
use rayon::iter::IntoParallelIterator;
use rayon::iter::ParallelBridge;
use rayon::iter::ParallelIterator;
use crate::update::MergeDeladdCboRoaringBitmaps;
use crate::{
update::{
create_sorter,
new::{DocumentChange, ItemsPool},
GrenadParameters,
},
FieldsIdsMap, Index, Result, MAX_POSITION_PER_ATTRIBUTE,
};
use super::{
cache::{CachedSorter, DelAddRoaringBitmapMerger},
tokenize_document::DocumentTokenizer,
};
pub trait SearchableExtractor {
fn run_extraction(
index: &Index,
fields_ids_map: &FieldsIdsMap,
indexer: GrenadParameters,
document_changes: impl IntoParallelIterator<Item = Result<DocumentChange>>,
) -> Result<Merger<File, MergeDeladdCboRoaringBitmaps>> {
let max_memory = indexer.max_memory_by_thread();
let rtxn = index.read_txn()?;
let stop_words = index.stop_words(&rtxn)?;
let allowed_separators = index.allowed_separators(&rtxn)?;
let allowed_separators: Option<Vec<_>> =
allowed_separators.as_ref().map(|s| s.iter().map(String::as_str).collect());
let dictionary = index.dictionary(&rtxn)?;
let dictionary: Option<Vec<_>> =
dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
let builder = tokenizer_builder(
stop_words.as_ref(),
allowed_separators.as_deref(),
dictionary.as_deref(),
);
let tokenizer = builder.into_tokenizer();
let user_defined_searchable_fields = index.user_defined_searchable_fields(&rtxn)?;
let localized_attributes_rules =
index.localized_attributes_rules(&rtxn)?.unwrap_or_default();
let document_tokenizer = DocumentTokenizer {
tokenizer: &tokenizer,
searchable_attributes: user_defined_searchable_fields.as_deref(),
localized_attributes_rules: &localized_attributes_rules,
max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE,
};
let context_pool = ItemsPool::new(|| {
Ok((
index.read_txn()?,
&document_tokenizer,
CachedSorter::new(
// TODO use a better value
100.try_into().unwrap(),
create_sorter(
grenad::SortAlgorithm::Stable,
DelAddRoaringBitmapMerger,
indexer.chunk_compression_type,
indexer.chunk_compression_level,
indexer.max_nb_chunks,
max_memory,
),
),
))
});
document_changes.into_par_iter().try_for_each(|document_change| {
context_pool.with(|(rtxn, document_tokenizer, cached_sorter)| {
Self::extract_document_change(
&*rtxn,
index,
document_tokenizer,
&fields_ids_map,
cached_sorter,
document_change?,
)
})
})?;
let mut builder = grenad::MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
for (_rtxn, _tokenizer, cache) in context_pool.into_items() {
let sorter = cache.into_sorter()?;
let readers = sorter.into_reader_cursors()?;
builder.extend(readers);
}
Ok(builder.build())
}
fn extract_document_change(
rtxn: &RoTxn,
index: &Index,
document_tokenizer: &DocumentTokenizer,
fields_ids_map: &FieldsIdsMap,
cached_sorter: &mut CachedSorter<DelAddRoaringBitmapMerger>,
document_change: DocumentChange, document_change: DocumentChange,
_tokenizer: &Tokenizer, ) -> Result<()>;
output: &mut CachedSorter<DelAddRoaringBitmapMerger>, }
) -> grenad::Result<(), io::Error> {
pub struct WordDocidsExtractor;
impl SearchableExtractor for WordDocidsExtractor {
fn extract_document_change(
rtxn: &RoTxn,
index: &Index,
document_tokenizer: &DocumentTokenizer,
fields_ids_map: &FieldsIdsMap,
// TODO: DelAddRoaringBitmapMerger should be CBO
cached_sorter: &mut CachedSorter<DelAddRoaringBitmapMerger>,
document_change: DocumentChange,
) -> crate::Result<()> {
match document_change { match document_change {
DocumentChange::Deletion(inner) => { DocumentChange::Deletion(inner) => {
unimplemented!() let mut token_fn = |_fid, _pos: u16, word: &str| {
cached_sorter.insert_del_u32(word.as_bytes(), inner.docid()).unwrap();
};
document_tokenizer.tokenize_document(
inner.current(rtxn, index),
fields_ids_map,
&mut token_fn,
)?;
} }
DocumentChange::Update(inner) => { DocumentChange::Update(inner) => {
unimplemented!() let mut token_fn = |_fid, _pos, word: &str| {
cached_sorter.insert_del_u32(word.as_bytes(), inner.docid()).unwrap();
};
document_tokenizer.tokenize_document(
inner.current(rtxn, index),
fields_ids_map,
&mut token_fn,
)?;
let mut token_fn = |_fid, _pos, word: &str| {
cached_sorter.insert_add_u32(word.as_bytes(), inner.docid()).unwrap();
};
document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?;
} }
DocumentChange::Insertion(inner) => { DocumentChange::Insertion(inner) => {
unimplemented!() let mut token_fn = |_fid, _pos, word: &str| {
} cached_sorter.insert_add_u32(word.as_bytes(), inner.docid()).unwrap();
} };
document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?;
let normalizer_options = NormalizerOption::default();
if let Some(previous_doc) = previous_doc {
for (_, v) in previous_doc.iter() {
// Only manage the direct JSON strings
// TODO manage the JSON strings correctly (escaped chars)
if v.first().zip(v.last()) == Some((&b'"', &b'"')) {
let s = std::str::from_utf8(&v[1..v.len() - 1]).unwrap();
// for token in tokenizer.tokenize(s).filter(|t| t.is_word()) {
// let key = token.lemma().normalize(&normalizer_options);
for token in s.split_whitespace() {
let key = token.normalize(&normalizer_options);
output.insert_del_u32(key.as_bytes(), docid)?;
}
}
}
}
for (_, v) in new_doc.iter() {
// Only manage the direct JSON strings
// TODO manage the JSON strings correctly (escaped chars)
if v.first().zip(v.last()) == Some((&b'"', &b'"')) {
let s = std::str::from_utf8(&v[1..v.len() - 1]).unwrap();
// for token in tokenizer.tokenize(s).filter(|t| t.is_word()) {
// let key = token.lemma().normalize(&normalizer_options);
for token in s.split_whitespace() {
let key = token.normalize(&normalizer_options);
output.insert_add_u32(key.as_bytes(), docid)?;
}
} }
} }
Ok(()) Ok(())
}
} }
/// take an iterator on tokens and compute their relative position depending on separator kinds /// Factorize tokenizer building.
/// if it's an `Hard` separator we add an additional relative proximity of 8 between words, fn tokenizer_builder<'a>(
/// else we keep the standard proximity of 1 between words. stop_words: Option<&'a fst::Set<&'a [u8]>>,
fn process_tokens<'a>( allowed_separators: Option<&'a [&str]>,
tokens: impl Iterator<Item = Token<'a>>, dictionary: Option<&'a [&str]>,
) -> impl Iterator<Item = (usize, Token<'a>)> { ) -> TokenizerBuilder<'a, &'a [u8]> {
tokens let mut tokenizer_builder = TokenizerBuilder::new();
.skip_while(|token| token.is_separator()) if let Some(stop_words) = stop_words {
.scan((0, None), |(offset, prev_kind), mut token| { tokenizer_builder.stop_words(stop_words);
match token.kind {
TokenKind::Word | TokenKind::StopWord if !token.lemma().is_empty() => {
*offset += match *prev_kind {
Some(TokenKind::Separator(SeparatorKind::Hard)) => 8,
Some(_) => 1,
None => 0,
};
*prev_kind = Some(token.kind)
} }
TokenKind::Separator(SeparatorKind::Hard) => { if let Some(dictionary) = dictionary {
*prev_kind = Some(token.kind); tokenizer_builder.words_dict(dictionary);
} }
TokenKind::Separator(SeparatorKind::Soft) if let Some(separators) = allowed_separators {
if *prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) => tokenizer_builder.separators(separators);
{
*prev_kind = Some(token.kind);
} }
_ => token.kind = TokenKind::Unknown,
} tokenizer_builder
Some((*offset, token))
})
.filter(|(_, t)| t.is_word())
} }
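
The rewritten file turns word-docids extraction into a `SearchableExtractor` trait: `run_extraction` is a provided method that owns the shared pipeline (tokenizer setup, per-thread cached sorters, parallel iteration over document changes, final merge), and each extractor only implements `extract_document_change`. Below is a minimal sketch of that trait shape, with plain `String` entries standing in for the grenad sorters, rayon parallelism, and LMDB transactions used by the real code; all names and types here are simplified stand-ins:

```rust
// Hypothetical, simplified error type; the real code uses milli's `Result`.
type Result<T> = std::result::Result<T, String>;

// Simplified stand-in for the real `DocumentChange` enum.
enum DocumentChange {
    Deletion(u32),
    Insertion(u32),
}

trait SearchableExtractor {
    // Provided driver: iterate over the changes and delegate each one to the extractor hook.
    fn run_extraction(
        changes: impl IntoIterator<Item = Result<DocumentChange>>,
    ) -> Result<Vec<String>> {
        let mut entries = Vec::new();
        for change in changes {
            Self::extract_document_change(change?, &mut entries)?;
        }
        Ok(entries)
    }

    // The only method a concrete extractor has to provide.
    fn extract_document_change(change: DocumentChange, entries: &mut Vec<String>) -> Result<()>;
}

struct WordDocidsExtractor;

impl SearchableExtractor for WordDocidsExtractor {
    fn extract_document_change(change: DocumentChange, entries: &mut Vec<String>) -> Result<()> {
        match change {
            DocumentChange::Deletion(docid) => entries.push(format!("del {docid}")),
            DocumentChange::Insertion(docid) => entries.push(format!("add {docid}")),
        }
        Ok(())
    }
}

fn main() -> Result<()> {
    let changes = vec![Ok(DocumentChange::Insertion(0)), Ok(DocumentChange::Deletion(1))];
    let entries = WordDocidsExtractor::run_extraction(changes)?;
    assert_eq!(entries, vec!["add 0", "del 1"]);
    Ok(())
}
```

Keeping the driver in the trait means additional searchable extractors only have to describe what to do with a single `DocumentChange`.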

View File

@@ -1,2 +1,6 @@
mod cache; mod cache;
mod extract_word_docids; mod extract_word_docids;
mod tokenize_document;
pub use extract_word_docids::SearchableExtractor;
pub use extract_word_docids::WordDocidsExtractor;

View File

@@ -1,56 +1,71 @@
pub struct DocumentTokenizer { use crate::{
tokenizer: &Tokenizer, update::new::KvReaderFieldId, FieldId, FieldsIdsMap, Index, InternalError,
searchable_attributes: Option<&[String]>, LocalizedAttributesRule, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH,
localized_attributes_rules: &[LocalizedAttributesRule], };
max_positions_per_attributes: u32, use charabia::{SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder};
use heed::RoTxn;
use serde_json::Value;
use std::collections::HashMap;
pub struct DocumentTokenizer<'a> {
pub tokenizer: &'a Tokenizer<'a>,
pub searchable_attributes: Option<&'a [&'a str]>,
pub localized_attributes_rules: &'a [LocalizedAttributesRule],
pub max_positions_per_attributes: u32,
} }
impl DocumentTokenizer { impl<'a> DocumentTokenizer<'a> {
// pub fn new(tokenizer: &Tokenizer, settings: &InnerIndexSettings) -> Self { pub fn tokenize_document(
// Self { tokenizer, settings } &self,
// } obkv: &KvReaderFieldId,
pub fn tokenize_document<'a>(
obkv: &KvReader<'a, FieldId>,
field_id_map: &FieldsIdsMap, field_id_map: &FieldsIdsMap,
token_fn: impl Fn(FieldId, u16, &str), token_fn: &mut impl FnMut(FieldId, u16, &str),
) { ) -> Result<()> {
let mut field_position = Hashmap::new(); let mut field_position = HashMap::new();
for (field_id, field_bytes) in obkv { for (field_id, field_bytes) in obkv {
let field_name = field_id_map.name(field_id); let Some(field_name) = field_id_map.name(field_id) else {
unreachable!("field id not found in field id map");
};
let mut tokenize_field = |name: &str, value: &Value| {
let Some(field_id) = field_id_map.id(name) else {
unreachable!("field name not found in field id map");
};
let position =
field_position.entry(field_id).and_modify(|counter| *counter += 8).or_insert(0);
if *position as u32 >= self.max_positions_per_attributes {
return;
}
let tokenize_field = |name, value| {
let field_id = field_id_map.id(name);
match value { match value {
Number(n) => { Value::Number(n) => {
let token = n.to_string(); let token = n.to_string();
let position = field_position if let Ok(position) = (*position).try_into() {
.entry(field_id)
.and_modify(|counter| *counter += 8)
.or_insert(0);
token_fn(field_id, position, token.as_str()); token_fn(field_id, position, token.as_str());
} }
String(text) => { }
Value::String(text) => {
// create an iterator of token with their positions. // create an iterator of token with their positions.
let locales = self let locales = self
.localized_attributes_rules .localized_attributes_rules
.iter() .iter()
.first(|rule| rule.match_str(field_name)) .find(|rule| rule.match_str(field_name))
.map(|rule| rule.locales(field_id)); .map(|rule| rule.locales());
let tokens = let tokens = process_tokens(
process_tokens(tokenizer.tokenize_with_allow_list(field, locales)) *position,
.take_while(|(p, _)| { self.tokenizer.tokenize_with_allow_list(text.as_str(), locales),
(*p as u32) < self.max_positions_per_attributes )
}); .take_while(|(p, _)| (*p as u32) < self.max_positions_per_attributes);
for (index, token) in tokens { for (index, token) in tokens {
// keep a word only if it is not empty and fit in a LMDB key. // keep a word only if it is not empty and fit in a LMDB key.
let token = token.lemma().trim(); let token = token.lemma().trim();
if !token.is_empty() && token.len() <= MAX_WORD_LENGTH { if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
let position: u16 = index *position = index;
.try_into() if let Ok(position) = (*position).try_into() {
.map_err(|_| SerializationError::InvalidNumberSerialization)?; token_fn(field_id, position, token);
writer.insert(position, token.as_bytes())?; }
} }
} }
} }
@@ -59,21 +74,28 @@ impl DocumentTokenizer {
}; };
// if the current field is searchable or contains a searchable attribute // if the current field is searchable or contains a searchable attribute
if searchable_attributes.map_or(true, |attributes| { if self.searchable_attributes.map_or(true, |attributes| {
attributes.iter().any(|name| contained_in(name, field_name)) attributes.iter().any(|name| perm_json_p::contained_in(name, field_name))
}) { }) {
// parse json. // parse json.
match serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)? { match serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)? {
Value::Object(object) => { Value::Object(object) => perm_json_p::seek_leaf_values_in_object(
seek_leaf_values_in_object(object, selectors, &field_name, tokenize_field) &object,
} self.searchable_attributes.as_deref(),
Value::Array(array) => { &field_name,
seek_leaf_values_in_array(array, selectors, &field_name, tokenize_field) &mut tokenize_field,
} ),
value => tokenize_field(&base_key, value), Value::Array(array) => perm_json_p::seek_leaf_values_in_array(
&array,
self.searchable_attributes.as_deref(),
&field_name,
&mut tokenize_field,
),
value => tokenize_field(&field_name, &value),
} }
} }
} }
Ok(())
} }
} }
@@ -81,11 +103,12 @@ impl DocumentTokenizer {
/// if it's an `Hard` separator we add an additional relative proximity of 8 between words, /// if it's an `Hard` separator we add an additional relative proximity of 8 between words,
/// else we keep the standard proximity of 1 between words. /// else we keep the standard proximity of 1 between words.
fn process_tokens<'a>( fn process_tokens<'a>(
start_offset: usize,
tokens: impl Iterator<Item = Token<'a>>, tokens: impl Iterator<Item = Token<'a>>,
) -> impl Iterator<Item = (usize, Token<'a>)> { ) -> impl Iterator<Item = (usize, Token<'a>)> {
tokens tokens
.skip_while(|token| token.is_separator()) .skip_while(|token| token.is_separator())
.scan((0, None), |(offset, prev_kind), mut token| { .scan((start_offset, None), |(offset, prev_kind), mut token| {
match token.kind { match token.kind {
TokenKind::Word | TokenKind::StopWord if !token.lemma().is_empty() => { TokenKind::Word | TokenKind::StopWord if !token.lemma().is_empty() => {
*offset += match *prev_kind { *offset += match *prev_kind {
@@ -110,42 +133,45 @@ fn process_tokens<'a>(
.filter(|(_, t)| t.is_word()) .filter(|(_, t)| t.is_word())
} }
/// Returns `true` if the `selector` match the `key`.
///
/// ```text
/// Example:
/// `animaux` match `animaux`
/// `animaux.chien` match `animaux`
/// `animaux.chien` match `animaux`
/// `animaux.chien.nom` match `animaux`
/// `animaux.chien.nom` match `animaux.chien`
/// -----------------------------------------
/// `animaux` doesn't match `animaux.chien`
/// `animaux.` doesn't match `animaux`
/// `animaux.ch` doesn't match `animaux.chien`
/// `animau` doesn't match `animaux`
/// ```
fn contained_in(selector: &str, key: &str) -> bool {
selector.starts_with(key)
&& selector[key.len()..].chars().next().map(|c| c == SPLIT_SYMBOL).unwrap_or(true)
}
/// TODO move in permissive json pointer /// TODO move in permissive json pointer
mod perm_json_p { mod perm_json_p {
use serde_json::{Map, Value};
const SPLIT_SYMBOL: char = '.';
/// Returns `true` if the `selector` match the `key`.
///
/// ```text
/// Example:
/// `animaux` match `animaux`
/// `animaux.chien` match `animaux`
/// `animaux.chien` match `animaux`
/// `animaux.chien.nom` match `animaux`
/// `animaux.chien.nom` match `animaux.chien`
/// -----------------------------------------
/// `animaux` doesn't match `animaux.chien`
/// `animaux.` doesn't match `animaux`
/// `animaux.ch` doesn't match `animaux.chien`
/// `animau` doesn't match `animaux`
/// ```
pub fn contained_in(selector: &str, key: &str) -> bool {
selector.starts_with(key)
&& selector[key.len()..].chars().next().map(|c| c == SPLIT_SYMBOL).unwrap_or(true)
}
pub fn seek_leaf_values<'a>( pub fn seek_leaf_values<'a>(
value: &Map<String, Value>, value: &Map<String, Value>,
selectors: impl IntoIterator<Item = &'a str>, selectors: impl IntoIterator<Item = &'a str>,
seeker: impl Fn(&str, &Value), seeker: &mut impl FnMut(&str, &Value),
) { ) {
let selectors: Vec<_> = selectors.into_iter().collect(); let selectors: Vec<_> = selectors.into_iter().collect();
seek_leaf_values_in_object(value, &selectors, "", &seeker); seek_leaf_values_in_object(value, Some(&selectors), "", seeker);
} }
pub fn seek_leaf_values_in_object( pub fn seek_leaf_values_in_object(
value: &Map<String, Value>, value: &Map<String, Value>,
selectors: &[&str], selectors: Option<&[&str]>,
base_key: &str, base_key: &str,
seeker: &impl Fn(&str, &Value), seeker: &mut impl FnMut(&str, &Value),
) { ) {
for (key, value) in value.iter() { for (key, value) in value.iter() {
let base_key = if base_key.is_empty() { let base_key = if base_key.is_empty() {
@@ -156,8 +182,10 @@ mod perm_json_p {
// here if the user only specified `doggo` we need to iterate in all the fields of `doggo` // here if the user only specified `doggo` we need to iterate in all the fields of `doggo`
// so we check the contained_in on both side // so we check the contained_in on both side
let should_continue = selectors.iter().any(|selector| { let should_continue = selectors.map_or(true, |selectors| {
selectors.iter().any(|selector| {
contained_in(selector, &base_key) || contained_in(&base_key, selector) contained_in(selector, &base_key) || contained_in(&base_key, selector)
})
}); });
if should_continue { if should_continue {
@@ -175,12 +203,12 @@ mod perm_json_p {
} }
pub fn seek_leaf_values_in_array( pub fn seek_leaf_values_in_array(
values: &mut [Value], values: &[Value],
selectors: &[&str], selectors: Option<&[&str]>,
base_key: &str, base_key: &str,
seeker: &impl Fn(&str, &Value), seeker: &mut impl FnMut(&str, &Value),
) { ) {
for value in values.iter_mut() { for value in values {
match value { match value {
Value::Object(object) => { Value::Object(object) => {
seek_leaf_values_in_object(object, selectors, base_key, seeker) seek_leaf_values_in_object(object, selectors, base_key, seeker)
@@ -193,3 +221,91 @@ mod perm_json_p {
} }
} }
} }
#[cfg(test)]
mod test {
use super::*;
use charabia::TokenizerBuilder;
use meili_snap::snapshot;
use obkv::KvReader;
use serde_json::json;
#[test]
fn test_tokenize_document() {
let mut fields_ids_map = FieldsIdsMap::new();
let field_1 = json!({
"name": "doggo",
"age": 10,
});
let field_2 = json!({
"catto": {
"name": "pesti",
"age": 23,
}
});
let field_3 = json!(["doggo", "catto"]);
let mut obkv = obkv::KvWriter::memory();
let field_1_id = fields_ids_map.insert("doggo").unwrap();
let field_1 = serde_json::to_string(&field_1).unwrap();
obkv.insert(field_1_id, field_1.as_bytes()).unwrap();
let field_2_id = fields_ids_map.insert("catto").unwrap();
let field_2 = serde_json::to_string(&field_2).unwrap();
obkv.insert(field_2_id, field_2.as_bytes()).unwrap();
let field_3_id = fields_ids_map.insert("doggo.name").unwrap();
let field_3 = serde_json::to_string(&field_3).unwrap();
obkv.insert(field_3_id, field_3.as_bytes()).unwrap();
let value = obkv.into_inner().unwrap();
let obkv = KvReader::from_slice(value.as_slice());
fields_ids_map.insert("doggo.age");
fields_ids_map.insert("catto.catto.name");
fields_ids_map.insert("catto.catto.age");
let mut tb = TokenizerBuilder::default();
let document_tokenizer = DocumentTokenizer {
tokenizer: &tb.build(),
searchable_attributes: None,
localized_attributes_rules: &[],
max_positions_per_attributes: 1000,
};
let mut words = std::collections::BTreeMap::new();
document_tokenizer
.tokenize_document(obkv, &fields_ids_map, &mut |fid, pos, word| {
words.insert([fid, pos], word.to_string());
})
.unwrap();
snapshot!(format!("{:#?}", words), @r###"
{
[
2,
0,
]: "doggo",
[
2,
8,
]: "doggo",
[
2,
16,
]: "catto",
[
3,
0,
]: "10",
[
4,
0,
]: "pesti",
[
5,
0,
]: "23",
}
"###);
}
}
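
The `contained_in` helper moved into `perm_json_p` is what decides whether a dot-separated selector targets a given key, both for the searchable-attributes check and when walking nested JSON. The sketch below copies the function body shown in the diff; the checks in `main` are illustrative and follow the behaviour of the code itself:

```rust
const SPLIT_SYMBOL: char = '.';

// Same body as `perm_json_p::contained_in` above: a selector matches a key when the key
// is a complete dot-separated prefix of the selector.
fn contained_in(selector: &str, key: &str) -> bool {
    selector.starts_with(key)
        && selector[key.len()..].chars().next().map(|c| c == SPLIT_SYMBOL).unwrap_or(true)
}

fn main() {
    assert!(contained_in("animaux", "animaux"));
    assert!(contained_in("animaux.chien", "animaux"));
    assert!(contained_in("animaux.chien.nom", "animaux"));
    assert!(contained_in("animaux.chien.nom", "animaux.chien"));
    assert!(!contained_in("animaux", "animaux.chien"));
    assert!(!contained_in("animaux.ch", "animaux.chien"));
    assert!(!contained_in("animau", "animaux"));
    // Note: unlike the doc comment's `animaux.` example, the implementation accepts a
    // selector with a trailing dot, so that case is not asserted here.
}
```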

View File

@@ -15,11 +15,13 @@ use super::channel::{
WriterOperation, WriterOperation,
}; };
use super::document_change::DocumentChange; use super::document_change::DocumentChange;
use super::extract::{SearchableExtractor, WordDocidsExtractor};
use super::merger::merge_grenad_entries; use super::merger::merge_grenad_entries;
use super::StdResult; use super::StdResult;
use crate::documents::{ use crate::documents::{
obkv_to_object, DocumentsBatchCursor, DocumentsBatchIndex, PrimaryKey, DEFAULT_PRIMARY_KEY, obkv_to_object, DocumentsBatchCursor, DocumentsBatchIndex, PrimaryKey, DEFAULT_PRIMARY_KEY,
}; };
use crate::update::GrenadParameters;
use crate::{Index, Result, UserError}; use crate::{Index, Result, UserError};
mod document_deletion; mod document_deletion;
@@ -45,7 +47,7 @@ pub fn index<PI>(
wtxn: &mut RwTxn, wtxn: &mut RwTxn,
index: &Index, index: &Index,
pool: &ThreadPool, pool: &ThreadPool,
_document_changes: PI, document_changes: PI,
) -> Result<()> ) -> Result<()>
where where
PI: IntoParallelIterator<Item = Result<DocumentChange>> + Send, PI: IntoParallelIterator<Item = Result<DocumentChange>> + Send,
@@ -59,10 +61,18 @@ where
// TODO manage the errors correctly // TODO manage the errors correctly
let handle = Builder::new().name(S("indexer-extractors")).spawn_scoped(s, || { let handle = Builder::new().name(S("indexer-extractors")).spawn_scoped(s, || {
pool.in_place_scope(|_s| { pool.in_place_scope(|_s| {
let document_changes = document_changes.into_par_iter();
// word docids // word docids
// document_changes.into_par_iter().try_for_each(|_dc| Ok(()) as Result<_>) let merger = WordDocidsExtractor::run_extraction(
// let grenads = extractor_function(document_changes)?; index,
// deladd_cbo_roaring_bitmap_sender.word_docids(grenads)?; todo!(),
/// TODO: GrenadParameters::default() should be removed in favor a passed parameter
GrenadParameters::default(),
document_changes.clone(),
)?;
/// TODO: manage the errors correctly
deladd_cbo_roaring_bitmap_sender.word_docids(merger).unwrap();
Ok(()) as Result<_> Ok(()) as Result<_>
}) })
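
With the extractor in place, the indexer spawns an `indexer-extractors` thread that runs `WordDocidsExtractor::run_extraction` and pushes the resulting merger through `deladd_cbo_roaring_bitmap_sender`, while `merge_grenad_entries` (next file) drains the `MergerOperation`s and writes into `word_docids`. Below is a simplified, standard-library-only sketch of that producer/consumer wiring; `std::sync::mpsc`, `thread::scope`, and a `Vec` of entries stand in for the real channel, thread builder, and grenad merger:

```rust
use std::sync::mpsc::channel;
use std::thread;

// Simplified stand-in for the operation enum shared between the two sides.
enum MergerOperation {
    WordDocidsMerger(Vec<(String, u32)>),
}

fn main() {
    let (sender, receiver) = channel();

    thread::scope(|s| {
        // Extraction side: build the merged word -> docid entries, then hand them over.
        s.spawn(move || {
            let merger = vec![("doggo".to_string(), 0u32), ("catto".to_string(), 1)];
            sender.send(MergerOperation::WordDocidsMerger(merger)).unwrap();
            // `sender` is dropped here, which lets the consumer loop below terminate.
        });

        // Merging side: drain operations and "write" each entry to the index.
        for operation in receiver.iter() {
            match operation {
                MergerOperation::WordDocidsMerger(entries) => {
                    for (word, docid) in entries {
                        println!("word_docids[{word}] += {docid}");
                    }
                }
            }
        }
    });
}
```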

View File

@@ -20,14 +20,12 @@ pub fn merge_grenad_entries(
for merger_operation in receiver { for merger_operation in receiver {
match merger_operation { match merger_operation {
MergerOperation::WordDocidsCursors(cursors) => { MergerOperation::WordDocidsMerger(merger) => {
let sender = sender.word_docids(); let sender = sender.word_docids();
let database = index.word_docids.remap_types::<Bytes, Bytes>(); let database = index.word_docids.remap_types::<Bytes, Bytes>();
let mut builder = grenad::MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
builder.extend(cursors);
/// TODO manage the error correctly /// TODO manage the error correctly
let mut merger_iter = builder.build().into_stream_merger_iter().unwrap(); let mut merger_iter = merger.into_stream_merger_iter().unwrap();
// TODO manage the error correctly // TODO manage the error correctly
while let Some((key, deladd)) = merger_iter.next().unwrap() { while let Some((key, deladd)) = merger_iter.next().unwrap() {

View File

@@ -4,13 +4,12 @@ pub use items_pool::ItemsPool;
use super::del_add::DelAdd; use super::del_add::DelAdd;
use crate::FieldId; use crate::FieldId;
mod document_change;
mod merger;
// mod extract;
mod channel; mod channel;
//mod global_fields_ids_map; mod document_change;
mod extract;
pub mod indexer; pub mod indexer;
mod items_pool; mod items_pool;
mod merger;
/// TODO move them elsewhere /// TODO move them elsewhere
pub type StdResult<T, E> = std::result::Result<T, E>; pub type StdResult<T, E> = std::result::Result<T, E>;