2024-09-03 12:01:01 +02:00
|
|
|
use std::sync::RwLock;
|
2024-09-02 15:21:00 +02:00
|
|
|
use std::thread::{self, Builder};
|
2024-09-02 10:42:19 +02:00
|
|
|
|
|
|
|
use big_s::S;
|
2024-09-02 14:42:27 +02:00
|
|
|
pub use document_deletion::DocumentDeletion;
|
|
|
|
pub use document_operation::DocumentOperation;
|
2024-09-02 19:39:48 +02:00
|
|
|
use heed::{RoTxn, RwTxn};
|
2024-09-02 14:42:27 +02:00
|
|
|
pub use partial_dump::PartialDump;
|
2024-09-02 10:42:19 +02:00
|
|
|
use rayon::iter::{IntoParallelIterator, ParallelIterator};
|
|
|
|
use rayon::ThreadPool;
|
2024-09-10 21:10:28 +01:00
|
|
|
pub use top_level_map::{CowStr, TopLevelMap};
|
2024-09-02 14:42:27 +02:00
|
|
|
pub use update_by_function::UpdateByFunction;
|
2024-09-02 10:42:19 +02:00
|
|
|
|
2024-09-05 10:56:22 +02:00
|
|
|
use super::channel::*;
|
2024-09-02 10:42:19 +02:00
|
|
|
use super::document_change::DocumentChange;
|
2024-09-05 10:56:22 +02:00
|
|
|
use super::extract::*;
|
2024-09-02 10:42:19 +02:00
|
|
|
use super::merger::merge_grenad_entries;
|
2024-09-02 19:39:48 +02:00
|
|
|
use super::StdResult;
|
2024-09-12 15:38:31 +02:00
|
|
|
use crate::documents::{PrimaryKey, DEFAULT_PRIMARY_KEY};
|
2024-09-04 12:17:13 +02:00
|
|
|
use crate::update::new::channel::{DatabaseType, ExtractorSender};
|
2024-09-03 11:02:39 +02:00
|
|
|
use crate::update::GrenadParameters;
|
2024-09-12 15:38:31 +02:00
|
|
|
use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, Result, UserError};
|
2024-09-02 10:42:19 +02:00
|
|
|
|
|
|
|
mod document_deletion;
|
|
|
|
mod document_operation;
|
|
|
|
mod partial_dump;
|
2024-09-10 18:01:17 +01:00
|
|
|
mod top_level_map;
|
2024-09-02 10:42:19 +02:00
|
|
|
mod update_by_function;
|
|
|
|
|
2024-09-02 15:10:21 +02:00
|
|
|
pub trait DocumentChanges<'p> {
|
2024-09-02 10:42:19 +02:00
|
|
|
type Parameter: 'p;
|
|
|
|
|
|
|
|
fn document_changes(
|
|
|
|
self,
|
2024-09-03 12:01:01 +02:00
|
|
|
fields_ids_map: &mut FieldsIdsMap,
|
2024-09-02 10:42:19 +02:00
|
|
|
param: Self::Parameter,
|
2024-09-02 19:39:48 +02:00
|
|
|
) -> Result<impl ParallelIterator<Item = Result<DocumentChange>> + Clone + 'p>;
|
2024-09-02 10:42:19 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/// This is the main function of this crate.
|
|
|
|
///
|
|
|
|
/// Give it the output of the [`Indexer::document_changes`] method and it will execute it in the [`rayon::ThreadPool`].
|
|
|
|
///
|
|
|
|
/// TODO return stats
|
|
|
|
pub fn index<PI>(
|
|
|
|
wtxn: &mut RwTxn,
|
|
|
|
index: &Index,
|
2024-09-03 12:01:01 +02:00
|
|
|
fields_ids_map: FieldsIdsMap,
|
2024-09-02 10:42:19 +02:00
|
|
|
pool: &ThreadPool,
|
2024-09-03 11:02:39 +02:00
|
|
|
document_changes: PI,
|
2024-09-02 10:42:19 +02:00
|
|
|
) -> Result<()>
|
|
|
|
where
|
2024-09-02 15:21:00 +02:00
|
|
|
PI: IntoParallelIterator<Item = Result<DocumentChange>> + Send,
|
2024-09-02 10:42:19 +02:00
|
|
|
PI::Iter: Clone,
|
|
|
|
{
|
2024-09-05 15:12:07 +02:00
|
|
|
let (merger_sender, writer_receiver) = merger_writer_channel(10_000);
|
2024-09-04 11:39:53 +02:00
|
|
|
// This channel acts as a rendezvous point to ensure that we are one task ahead
|
2024-09-11 10:20:23 +02:00
|
|
|
let (extractor_sender, merger_receiver) = extractors_merger_channels(4);
|
2024-09-02 10:42:19 +02:00
|
|
|
|
2024-09-03 12:01:01 +02:00
|
|
|
let fields_ids_map_lock = RwLock::new(fields_ids_map);
|
|
|
|
let global_fields_ids_map = GlobalFieldsIdsMap::new(&fields_ids_map_lock);
|
2024-09-12 18:01:02 +02:00
|
|
|
let global_fields_ids_map_clone = global_fields_ids_map.clone();
|
2024-09-03 12:01:01 +02:00
|
|
|
|
2024-09-02 10:42:19 +02:00
|
|
|
thread::scope(|s| {
|
|
|
|
// TODO manage the errors correctly
|
2024-09-05 17:36:19 +02:00
|
|
|
let current_span = tracing::Span::current();
|
2024-09-03 16:08:33 +02:00
|
|
|
let handle = Builder::new().name(S("indexer-extractors")).spawn_scoped(s, move || {
|
2024-09-02 15:21:00 +02:00
|
|
|
pool.in_place_scope(|_s| {
|
2024-09-05 17:36:19 +02:00
|
|
|
let span = tracing::trace_span!(target: "indexing::documents", parent: ¤t_span, "extract");
|
|
|
|
let _entered = span.enter();
|
2024-09-05 10:56:22 +02:00
|
|
|
let document_changes = document_changes.into_par_iter();
|
|
|
|
|
|
|
|
// document but we need to create a function that collects and compresses documents.
|
2024-09-12 18:01:02 +02:00
|
|
|
let document_sender = extractor_sender.document_sender();
|
2024-09-05 10:56:22 +02:00
|
|
|
document_changes.clone().into_par_iter().try_for_each(|result| {
|
|
|
|
match result? {
|
|
|
|
DocumentChange::Deletion(deletion) => {
|
|
|
|
let docid = deletion.docid();
|
2024-09-12 18:01:02 +02:00
|
|
|
document_sender.delete(docid).unwrap();
|
2024-09-05 10:56:22 +02:00
|
|
|
}
|
|
|
|
DocumentChange::Update(update) => {
|
|
|
|
let docid = update.docid();
|
|
|
|
let content = update.new();
|
2024-09-12 18:01:02 +02:00
|
|
|
document_sender.insert(docid, content.boxed()).unwrap();
|
2024-09-05 10:56:22 +02:00
|
|
|
}
|
|
|
|
DocumentChange::Insertion(insertion) => {
|
|
|
|
let docid = insertion.docid();
|
|
|
|
let content = insertion.new();
|
2024-09-12 18:01:02 +02:00
|
|
|
document_sender.insert(docid, content.boxed()).unwrap();
|
2024-09-05 10:56:22 +02:00
|
|
|
// extracted_dictionary_sender.send(self, dictionary: &[u8]);
|
|
|
|
}
|
2024-09-04 09:59:19 +02:00
|
|
|
}
|
2024-09-05 10:56:22 +02:00
|
|
|
Ok(()) as Result<_>
|
|
|
|
})?;
|
|
|
|
|
2024-09-12 18:01:02 +02:00
|
|
|
document_sender.finish().unwrap();
|
|
|
|
|
2024-09-05 15:12:07 +02:00
|
|
|
const TEN_GIB: usize = 10 * 1024 * 1024 * 1024;
|
|
|
|
let max_memory = TEN_GIB / dbg!(rayon::current_num_threads());
|
|
|
|
let grenad_parameters = GrenadParameters {
|
|
|
|
max_memory: Some(max_memory),
|
|
|
|
..GrenadParameters::default()
|
|
|
|
};
|
2024-09-05 17:36:19 +02:00
|
|
|
{
|
|
|
|
let span = tracing::trace_span!(target: "indexing::documents::extract", "word_docids");
|
|
|
|
let _entered = span.enter();
|
|
|
|
|
2024-09-11 10:20:23 +02:00
|
|
|
let WordDocidsMergers {
|
|
|
|
word_fid_docids,
|
|
|
|
word_docids,
|
|
|
|
exact_word_docids,
|
|
|
|
word_position_docids,
|
|
|
|
fid_word_count_docids,
|
|
|
|
} = WordDocidsExtractors::run_extraction(index, &global_fields_ids_map, grenad_parameters, document_changes.clone())?;
|
|
|
|
extractor_sender.send_searchable::<WordDocids>(word_docids).unwrap();
|
|
|
|
extractor_sender.send_searchable::<WordFidDocids>(word_fid_docids).unwrap();
|
|
|
|
extractor_sender.send_searchable::<ExactWordDocids>(exact_word_docids).unwrap();
|
|
|
|
extractor_sender.send_searchable::<WordPositionDocids>(word_position_docids).unwrap();
|
|
|
|
extractor_sender.send_searchable::<FidWordCountDocids>(fid_word_count_docids).unwrap();
|
2024-09-05 17:36:19 +02:00
|
|
|
}
|
2024-09-10 18:01:17 +01:00
|
|
|
|
2024-09-11 10:20:23 +02:00
|
|
|
// {
|
|
|
|
// let span = tracing::trace_span!(target: "indexing::documents::extract", "exact_word_docids");
|
|
|
|
// let _entered = span.enter();
|
|
|
|
// extract_and_send_docids::<ExactWordDocidsExtractor, ExactWordDocids>(
|
|
|
|
// index,
|
|
|
|
// &global_fields_ids_map,
|
|
|
|
// grenad_parameters,
|
|
|
|
// document_changes.clone(),
|
|
|
|
// &extractor_sender,
|
|
|
|
// )?;
|
|
|
|
// }
|
|
|
|
|
|
|
|
// {
|
|
|
|
// let span = tracing::trace_span!(target: "indexing::documents::extract", "word_position_docids");
|
|
|
|
// let _entered = span.enter();
|
|
|
|
// extract_and_send_docids::<WordPositionDocidsExtractor, WordPositionDocids>(
|
|
|
|
// index,
|
|
|
|
// &global_fields_ids_map,
|
|
|
|
// grenad_parameters,
|
|
|
|
// document_changes.clone(),
|
|
|
|
// &extractor_sender,
|
|
|
|
// )?;
|
|
|
|
// }
|
|
|
|
|
|
|
|
// {
|
|
|
|
// let span = tracing::trace_span!(target: "indexing::documents::extract", "fid_word_count_docids");
|
|
|
|
// let _entered = span.enter();
|
|
|
|
// extract_and_send_docids::<FidWordCountDocidsExtractor, FidWordCountDocids>(
|
|
|
|
// index,
|
|
|
|
// &global_fields_ids_map,
|
|
|
|
// GrenadParameters::default(),
|
|
|
|
// document_changes.clone(),
|
|
|
|
// &extractor_sender,
|
|
|
|
// )?;
|
|
|
|
// }
|
2024-09-05 17:36:19 +02:00
|
|
|
|
|
|
|
{
|
|
|
|
let span = tracing::trace_span!(target: "indexing::documents::extract", "word_pair_proximity_docids");
|
|
|
|
let _entered = span.enter();
|
|
|
|
extract_and_send_docids::<
|
|
|
|
WordPairProximityDocidsExtractor,
|
|
|
|
WordPairProximityDocids,
|
|
|
|
>(
|
|
|
|
index,
|
|
|
|
&global_fields_ids_map,
|
|
|
|
grenad_parameters,
|
|
|
|
document_changes.clone(),
|
|
|
|
&extractor_sender,
|
|
|
|
)?;
|
|
|
|
}
|
|
|
|
|
|
|
|
{
|
|
|
|
let span = tracing::trace_span!(target: "indexing::documents::extract", "FINISH");
|
|
|
|
let _entered = span.enter();
|
|
|
|
}
|
2024-09-05 10:56:22 +02:00
|
|
|
|
|
|
|
// TODO THIS IS TOO MUCH
|
|
|
|
// Extract fieldid docid facet number
|
|
|
|
// Extract fieldid docid facet string
|
|
|
|
// Extract facetid string fst
|
|
|
|
// Extract facetid normalized string strings
|
|
|
|
|
|
|
|
// TODO Inverted Indexes again
|
|
|
|
// Extract fieldid facet isempty docids
|
|
|
|
// Extract fieldid facet isnull docids
|
|
|
|
// Extract fieldid facet exists docids
|
|
|
|
|
|
|
|
// TODO This is the normal system
|
|
|
|
// Extract fieldid facet number docids
|
|
|
|
// Extract fieldid facet string docids
|
|
|
|
|
2024-09-04 09:59:19 +02:00
|
|
|
Ok(()) as Result<_>
|
2024-09-05 10:56:22 +02:00
|
|
|
})
|
2024-09-02 15:21:00 +02:00
|
|
|
})?;
|
2024-09-02 10:42:19 +02:00
|
|
|
|
|
|
|
// TODO manage the errors correctly
|
2024-09-05 17:36:19 +02:00
|
|
|
let current_span = tracing::Span::current();
|
2024-09-03 16:08:33 +02:00
|
|
|
let handle2 = Builder::new().name(S("indexer-merger")).spawn_scoped(s, move || {
|
2024-09-05 17:36:19 +02:00
|
|
|
let span =
|
|
|
|
tracing::trace_span!(target: "indexing::documents", parent: ¤t_span, "merge");
|
|
|
|
let _entered = span.enter();
|
2024-09-02 10:42:19 +02:00
|
|
|
let rtxn = index.read_txn().unwrap();
|
2024-09-12 18:01:02 +02:00
|
|
|
merge_grenad_entries(
|
|
|
|
merger_receiver,
|
|
|
|
merger_sender,
|
|
|
|
&rtxn,
|
|
|
|
index,
|
|
|
|
global_fields_ids_map_clone,
|
|
|
|
)
|
2024-09-02 10:42:19 +02:00
|
|
|
})?;
|
|
|
|
|
|
|
|
for operation in writer_receiver {
|
|
|
|
let database = operation.database(index);
|
2024-09-04 09:59:19 +02:00
|
|
|
match operation.entry() {
|
|
|
|
EntryOperation::Delete(e) => {
|
|
|
|
if !database.delete(wtxn, e.entry())? {
|
|
|
|
unreachable!("We tried to delete an unknown key")
|
|
|
|
}
|
|
|
|
}
|
|
|
|
EntryOperation::Write(e) => database.put(wtxn, e.key(), e.value())?,
|
2024-09-02 10:42:19 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-09-02 15:10:21 +02:00
|
|
|
/// TODO handle the panicking threads
|
|
|
|
handle.join().unwrap()?;
|
|
|
|
handle2.join().unwrap()?;
|
|
|
|
|
2024-09-03 12:01:01 +02:00
|
|
|
Ok(()) as Result<_>
|
|
|
|
})?;
|
|
|
|
|
|
|
|
let fields_ids_map = fields_ids_map_lock.into_inner().unwrap();
|
|
|
|
index.put_fields_ids_map(wtxn, &fields_ids_map)?;
|
|
|
|
|
|
|
|
Ok(())
|
2024-09-02 10:42:19 +02:00
|
|
|
}
|
2024-09-02 19:39:48 +02:00
|
|
|
|
2024-09-04 12:17:13 +02:00
|
|
|
/// TODO: GrenadParameters::default() should be removed in favor a passed parameter
|
|
|
|
/// TODO: manage the errors correctly
|
|
|
|
/// TODO: we must have a single trait that also gives the extractor type
|
|
|
|
fn extract_and_send_docids<E: SearchableExtractor, D: DatabaseType>(
|
|
|
|
index: &Index,
|
|
|
|
fields_ids_map: &GlobalFieldsIdsMap,
|
|
|
|
indexer: GrenadParameters,
|
|
|
|
document_changes: impl IntoParallelIterator<Item = Result<DocumentChange>>,
|
|
|
|
sender: &ExtractorSender,
|
|
|
|
) -> Result<()> {
|
|
|
|
let merger = E::run_extraction(index, fields_ids_map, indexer, document_changes)?;
|
|
|
|
Ok(sender.send_searchable::<D>(merger).unwrap())
|
|
|
|
}
|
|
|
|
|
2024-09-11 15:59:30 +02:00
|
|
|
/// Returns the primary key *field id* that has already been set for this index or the
|
|
|
|
/// one we will guess by searching for the first key that contains "id" as a substring.
|
2024-09-02 19:39:48 +02:00
|
|
|
/// TODO move this elsewhere
|
2024-09-11 15:59:30 +02:00
|
|
|
pub fn retrieve_or_guess_primary_key<'a>(
|
2024-09-02 19:39:48 +02:00
|
|
|
rtxn: &'a RoTxn<'a>,
|
|
|
|
index: &Index,
|
2024-09-11 15:59:30 +02:00
|
|
|
fields_ids_map: &mut FieldsIdsMap,
|
|
|
|
first_document: Option<&'a TopLevelMap<'_>>,
|
2024-09-02 19:39:48 +02:00
|
|
|
) -> Result<StdResult<PrimaryKey<'a>, UserError>> {
|
|
|
|
match index.primary_key(rtxn)? {
|
2024-09-11 15:59:30 +02:00
|
|
|
Some(primary_key) => match PrimaryKey::new(primary_key, fields_ids_map) {
|
2024-09-02 19:39:48 +02:00
|
|
|
Some(primary_key) => Ok(Ok(primary_key)),
|
2024-09-11 15:59:30 +02:00
|
|
|
None => unreachable!("Why is the primary key not in the fidmap?"),
|
2024-09-02 19:39:48 +02:00
|
|
|
},
|
|
|
|
None => {
|
2024-09-11 15:59:30 +02:00
|
|
|
let first_document = match first_document {
|
|
|
|
Some(document) => document,
|
|
|
|
None => return Ok(Err(UserError::NoPrimaryKeyCandidateFound)),
|
|
|
|
};
|
|
|
|
|
|
|
|
let mut guesses: Vec<&str> = first_document
|
|
|
|
.keys()
|
|
|
|
.map(AsRef::as_ref)
|
|
|
|
.filter(|name| name.to_lowercase().ends_with(DEFAULT_PRIMARY_KEY))
|
2024-09-02 19:39:48 +02:00
|
|
|
.collect();
|
|
|
|
|
2024-09-11 15:59:30 +02:00
|
|
|
// sort the keys in lexicographical order, so that fields are always in the same order.
|
|
|
|
guesses.sort_unstable();
|
2024-09-02 19:39:48 +02:00
|
|
|
|
|
|
|
match guesses.as_slice() {
|
|
|
|
[] => Ok(Err(UserError::NoPrimaryKeyCandidateFound)),
|
2024-09-11 15:59:30 +02:00
|
|
|
[name] => {
|
2024-09-02 19:39:48 +02:00
|
|
|
tracing::info!("Primary key was not specified in index. Inferred to '{name}'");
|
2024-09-11 15:59:30 +02:00
|
|
|
match fields_ids_map.insert(name) {
|
|
|
|
Some(field_id) => Ok(Ok(PrimaryKey::Flat { name, field_id })),
|
|
|
|
None => Ok(Err(UserError::AttributeLimitReached)),
|
|
|
|
}
|
2024-09-02 19:39:48 +02:00
|
|
|
}
|
|
|
|
multiple => Ok(Err(UserError::MultiplePrimaryKeyCandidatesFound {
|
2024-09-11 15:59:30 +02:00
|
|
|
candidates: multiple.iter().map(|candidate| candidate.to_string()).collect(),
|
2024-09-02 19:39:48 +02:00
|
|
|
})),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|