147 lines
4.8 KiB
Rust
Raw Normal View History

mod cache;
mod documents;
2024-09-04 17:03:09 +02:00
mod faceted;
2024-09-04 10:20:18 +02:00
mod searchable;
2024-10-21 10:39:40 +02:00
mod vectors;
2024-09-03 11:02:39 +02:00
2024-10-03 18:08:09 +02:00
use bumpalo::Bump;
pub use cache::{merge_caches, transpose_and_freeze_caches, BalancedCaches, DelAddRoaringBitmap};
pub use documents::*;
pub use faceted::*;
pub use searchable::*;
2024-10-29 17:43:36 +01:00
pub use vectors::EmbeddingExtractor;
2024-09-04 17:03:09 +02:00
2024-11-04 15:10:40 +01:00
use super::indexer::document_changes::{
DocumentChanges, FullySend, IndexingContext, Progress, ThreadLocal,
};
2024-09-25 14:54:56 +02:00
use crate::update::{GrenadParameters, MergeDeladdCboRoaringBitmaps};
2024-10-03 18:08:09 +02:00
use crate::Result;
2024-09-16 09:34:10 +02:00
pub trait DocidsExtractor {
fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP, SP>(
2024-10-03 18:08:09 +02:00
grenad_parameters: GrenadParameters,
document_changes: &DC,
2024-11-04 15:10:40 +01:00
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>,
extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
2024-11-04 15:10:40 +01:00
finished_steps: u16,
total_steps: u16,
step_name: &'static str,
) -> Result<Merger<File, MergeDeladdCboRoaringBitmaps>>
where
MSP: Fn() -> bool + Sync,
SP: Fn(Progress) + Sync;
2024-09-16 09:34:10 +02:00
}
2024-09-04 17:03:09 +02:00
/// TODO move in permissive json pointer
pub mod perm_json_p {
use serde_json::{Map, Value};
use crate::Result;
const SPLIT_SYMBOL: char = '.';
/// Returns `true` if the `selector` match the `key`.
///
/// ```text
/// Example:
/// `animaux` match `animaux`
/// `animaux.chien` match `animaux`
/// `animaux.chien` match `animaux`
/// `animaux.chien.nom` match `animaux`
/// `animaux.chien.nom` match `animaux.chien`
/// -----------------------------------------
/// `animaux` doesn't match `animaux.chien`
/// `animaux.` doesn't match `animaux`
/// `animaux.ch` doesn't match `animaux.chien`
/// `animau` doesn't match `animaux`
/// ```
pub fn contained_in(selector: &str, key: &str) -> bool {
selector.starts_with(key)
&& selector[key.len()..].chars().next().map(|c| c == SPLIT_SYMBOL).unwrap_or(true)
}
pub fn seek_leaf_values_in_object(
value: &Map<String, Value>,
selectors: Option<&[&str]>,
skip_selectors: &[&str],
base_key: &str,
seeker: &mut impl FnMut(&str, &Value) -> Result<()>,
) -> Result<()> {
2024-09-16 09:34:10 +02:00
if value.is_empty() {
2024-09-30 16:08:29 +02:00
seeker(base_key, &Value::Object(Map::with_capacity(0)))?;
2024-09-16 09:34:10 +02:00
}
2024-09-04 17:03:09 +02:00
for (key, value) in value.iter() {
let base_key = if base_key.is_empty() {
key.to_string()
} else {
format!("{}{}{}", base_key, SPLIT_SYMBOL, key)
};
// here if the user only specified `doggo` we need to iterate in all the fields of `doggo`
// so we check the contained_in on both side
let should_continue = select_field(&base_key, selectors, skip_selectors);
if should_continue {
match value {
Value::Object(object) => seek_leaf_values_in_object(
object,
selectors,
skip_selectors,
&base_key,
seeker,
),
Value::Array(array) => seek_leaf_values_in_array(
array,
selectors,
skip_selectors,
&base_key,
seeker,
),
value => seeker(&base_key, value),
}?;
}
}
Ok(())
}
pub fn seek_leaf_values_in_array(
values: &[Value],
selectors: Option<&[&str]>,
skip_selectors: &[&str],
base_key: &str,
seeker: &mut impl FnMut(&str, &Value) -> Result<()>,
) -> Result<()> {
2024-09-16 09:34:10 +02:00
if values.is_empty() {
2024-09-30 16:08:29 +02:00
seeker(base_key, &Value::Array(vec![]))?;
2024-09-16 09:34:10 +02:00
}
2024-09-04 17:03:09 +02:00
for value in values {
match value {
Value::Object(object) => {
seek_leaf_values_in_object(object, selectors, skip_selectors, base_key, seeker)
}
Value::Array(array) => {
seek_leaf_values_in_array(array, selectors, skip_selectors, base_key, seeker)
}
value => seeker(base_key, value),
}?;
}
Ok(())
}
pub fn select_field(
field_name: &str,
selectors: Option<&[&str]>,
skip_selectors: &[&str],
) -> bool {
selectors.map_or(true, |selectors| {
selectors.iter().any(|selector| {
2024-09-30 16:08:29 +02:00
contained_in(selector, field_name) || contained_in(field_name, selector)
2024-09-04 17:03:09 +02:00
})
}) && !skip_selectors.iter().any(|skip_selector| {
2024-09-30 16:08:29 +02:00
contained_in(skip_selector, field_name) || contained_in(field_name, skip_selector)
2024-09-04 17:03:09 +02:00
})
}
}