301 lines
13 KiB
Rust
Raw Normal View History

use std::fs::File;
use std::sync::RwLock;
use std::thread::{self, Builder};
use big_s::S;
2024-09-02 14:42:27 +02:00
pub use document_deletion::DocumentDeletion;
pub use document_operation::DocumentOperation;
use heed::{RoTxn, RwTxn};
2024-09-02 14:42:27 +02:00
pub use partial_dump::PartialDump;
use rayon::iter::{IntoParallelIterator, ParallelIterator};
use rayon::ThreadPool;
2024-09-02 14:42:27 +02:00
pub use update_by_function::UpdateByFunction;
use super::channel::*;
use super::document_change::DocumentChange;
use super::extract::*;
use super::merger::merge_grenad_entries;
use super::StdResult;
use crate::documents::{
obkv_to_object, DocumentsBatchCursor, DocumentsBatchIndex, PrimaryKey, DEFAULT_PRIMARY_KEY,
};
2024-09-04 12:17:13 +02:00
use crate::update::new::channel::{DatabaseType, ExtractorSender};
2024-09-03 11:02:39 +02:00
use crate::update::GrenadParameters;
use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, Result, UserError};
mod document_deletion;
mod document_operation;
mod partial_dump;
mod update_by_function;
2024-09-02 15:10:21 +02:00
pub trait DocumentChanges<'p> {
type Parameter: 'p;
fn document_changes(
self,
fields_ids_map: &mut FieldsIdsMap,
param: Self::Parameter,
) -> Result<impl ParallelIterator<Item = Result<DocumentChange>> + Clone + 'p>;
}
/// This is the main function of this crate.
///
/// Give it the output of the [`Indexer::document_changes`] method and it will execute it in the [`rayon::ThreadPool`].
///
/// TODO return stats
pub fn index<PI>(
wtxn: &mut RwTxn,
index: &Index,
fields_ids_map: FieldsIdsMap,
pool: &ThreadPool,
2024-09-03 11:02:39 +02:00
document_changes: PI,
) -> Result<()>
where
PI: IntoParallelIterator<Item = Result<DocumentChange>> + Send,
PI::Iter: Clone,
{
2024-09-05 15:12:07 +02:00
let (merger_sender, writer_receiver) = merger_writer_channel(10_000);
// This channel acts as a rendezvous point to ensure that we are one task ahead
let (extractor_sender, merger_receiver) = extractors_merger_channels(0);
let fields_ids_map_lock = RwLock::new(fields_ids_map);
let global_fields_ids_map = GlobalFieldsIdsMap::new(&fields_ids_map_lock);
thread::scope(|s| {
// TODO manage the errors correctly
2024-09-05 17:36:19 +02:00
let current_span = tracing::Span::current();
let handle = Builder::new().name(S("indexer-extractors")).spawn_scoped(s, move || {
pool.in_place_scope(|_s| {
2024-09-05 17:36:19 +02:00
let span = tracing::trace_span!(target: "indexing::documents", parent: &current_span, "extract");
let _entered = span.enter();
let document_changes = document_changes.into_par_iter();
// document but we need to create a function that collects and compresses documents.
document_changes.clone().into_par_iter().try_for_each(|result| {
match result? {
DocumentChange::Deletion(deletion) => {
let docid = deletion.docid();
extractor_sender.document_delete(docid).unwrap();
}
DocumentChange::Update(update) => {
let docid = update.docid();
let content = update.new();
extractor_sender.document_insert(docid, content.boxed()).unwrap();
}
DocumentChange::Insertion(insertion) => {
let docid = insertion.docid();
let content = insertion.new();
extractor_sender.document_insert(docid, content.boxed()).unwrap();
// extracted_dictionary_sender.send(self, dictionary: &[u8]);
}
}
Ok(()) as Result<_>
})?;
2024-09-05 15:12:07 +02:00
const TEN_GIB: usize = 10 * 1024 * 1024 * 1024;
let max_memory = TEN_GIB / dbg!(rayon::current_num_threads());
let grenad_parameters = GrenadParameters {
max_memory: Some(max_memory),
..GrenadParameters::default()
};
2024-09-05 17:36:19 +02:00
{
let span = tracing::trace_span!(target: "indexing::documents::extract", "word_docids");
let _entered = span.enter();
extract_and_send_docids::<WordDocidsExtractor, WordDocids>(
index,
&global_fields_ids_map,
grenad_parameters,
document_changes.clone(),
&extractor_sender,
)?;
}
{
let span = tracing::trace_span!(target: "indexing::documents::extract", "word_fid_docids");
let _entered = span.enter();
extract_and_send_docids::<WordFidDocidsExtractor, WordFidDocids>(
index,
&global_fields_ids_map,
grenad_parameters,
document_changes.clone(),
&extractor_sender,
)?;
}
{
let span = tracing::trace_span!(target: "indexing::documents::extract", "exact_word_docids");
let _entered = span.enter();
extract_and_send_docids::<ExactWordDocidsExtractor, ExactWordDocids>(
index,
&global_fields_ids_map,
grenad_parameters,
document_changes.clone(),
&extractor_sender,
)?;
}
{
let span = tracing::trace_span!(target: "indexing::documents::extract", "word_position_docids");
let _entered = span.enter();
extract_and_send_docids::<WordPositionDocidsExtractor, WordPositionDocids>(
index,
&global_fields_ids_map,
grenad_parameters,
document_changes.clone(),
&extractor_sender,
)?;
}
{
let span = tracing::trace_span!(target: "indexing::documents::extract", "fid_word_count_docids");
let _entered = span.enter();
extract_and_send_docids::<FidWordCountDocidsExtractor, FidWordCountDocids>(
index,
&global_fields_ids_map,
GrenadParameters::default(),
document_changes.clone(),
&extractor_sender,
)?;
}
{
let span = tracing::trace_span!(target: "indexing::documents::extract", "word_pair_proximity_docids");
let _entered = span.enter();
extract_and_send_docids::<
WordPairProximityDocidsExtractor,
WordPairProximityDocids,
>(
index,
&global_fields_ids_map,
grenad_parameters,
document_changes.clone(),
&extractor_sender,
)?;
}
{
let span = tracing::trace_span!(target: "indexing::documents::extract", "FINISH");
let _entered = span.enter();
}
// TODO THIS IS TOO MUCH
// Extract fieldid docid facet number
// Extract fieldid docid facet string
// Extract facetid string fst
// Extract facetid normalized string strings
// TODO Inverted Indexes again
// Extract fieldid facet isempty docids
// Extract fieldid facet isnull docids
// Extract fieldid facet exists docids
// TODO This is the normal system
// Extract fieldid facet number docids
// Extract fieldid facet string docids
Ok(()) as Result<_>
})
})?;
// TODO manage the errors correctly
2024-09-05 17:36:19 +02:00
let current_span = tracing::Span::current();
let handle2 = Builder::new().name(S("indexer-merger")).spawn_scoped(s, move || {
2024-09-05 17:36:19 +02:00
let span =
tracing::trace_span!(target: "indexing::documents", parent: &current_span, "merge");
let _entered = span.enter();
let rtxn = index.read_txn().unwrap();
2024-09-02 15:10:21 +02:00
merge_grenad_entries(merger_receiver, merger_sender, &rtxn, index)
})?;
for operation in writer_receiver {
let database = operation.database(index);
match operation.entry() {
EntryOperation::Delete(e) => {
if !database.delete(wtxn, e.entry())? {
unreachable!("We tried to delete an unknown key")
}
}
EntryOperation::Write(e) => database.put(wtxn, e.key(), e.value())?,
}
}
2024-09-02 15:10:21 +02:00
/// TODO handle the panicking threads
handle.join().unwrap()?;
handle2.join().unwrap()?;
Ok(()) as Result<_>
})?;
let fields_ids_map = fields_ids_map_lock.into_inner().unwrap();
index.put_fields_ids_map(wtxn, &fields_ids_map)?;
Ok(())
}
2024-09-04 12:17:13 +02:00
/// TODO: GrenadParameters::default() should be removed in favor a passed parameter
/// TODO: manage the errors correctly
/// TODO: we must have a single trait that also gives the extractor type
fn extract_and_send_docids<E: SearchableExtractor, D: DatabaseType>(
index: &Index,
fields_ids_map: &GlobalFieldsIdsMap,
indexer: GrenadParameters,
document_changes: impl IntoParallelIterator<Item = Result<DocumentChange>>,
sender: &ExtractorSender,
) -> Result<()> {
let merger = E::run_extraction(index, fields_ids_map, indexer, document_changes)?;
Ok(sender.send_searchable::<D>(merger).unwrap())
}
/// TODO move this elsewhere
pub fn guess_primary_key<'a>(
rtxn: &'a RoTxn<'a>,
index: &Index,
mut cursor: DocumentsBatchCursor<File>,
documents_batch_index: &'a DocumentsBatchIndex,
) -> Result<StdResult<PrimaryKey<'a>, UserError>> {
// The primary key *field id* that has already been set for this index or the one
// we will guess by searching for the first key that contains "id" as a substring.
match index.primary_key(rtxn)? {
Some(primary_key) => match PrimaryKey::new(primary_key, documents_batch_index) {
Some(primary_key) => Ok(Ok(primary_key)),
None => match cursor.next_document()? {
Some(first_document) => Ok(Err(UserError::MissingDocumentId {
primary_key: primary_key.to_string(),
document: obkv_to_object(first_document, documents_batch_index)?,
})),
None => unreachable!("Called with reader.is_empty()"),
},
},
None => {
let mut guesses: Vec<(u16, &str)> = documents_batch_index
.iter()
.filter(|(_, name)| name.to_lowercase().ends_with(DEFAULT_PRIMARY_KEY))
.map(|(field_id, name)| (*field_id, name.as_str()))
.collect();
// sort the keys in a deterministic, obvious way, so that fields are always in the same order.
guesses.sort_by(|(_, left_name), (_, right_name)| {
// shortest name first
left_name.len().cmp(&right_name.len()).then_with(
// then alphabetical order
|| left_name.cmp(right_name),
)
});
match guesses.as_slice() {
[] => Ok(Err(UserError::NoPrimaryKeyCandidateFound)),
[(field_id, name)] => {
tracing::info!("Primary key was not specified in index. Inferred to '{name}'");
Ok(Ok(PrimaryKey::Flat { name, field_id: *field_id }))
}
multiple => Ok(Err(UserError::MultiplePrimaryKeyCandidatesFound {
candidates: multiple
.iter()
.map(|(_, candidate)| candidate.to_string())
.collect(),
})),
}
}
}
}