410 lines
16 KiB
Rust
Raw Normal View History

2024-10-03 18:08:09 +02:00
use std::cell::RefCell;
2024-10-10 22:42:37 +02:00
use std::sync::RwLock;
use std::thread::{self, Builder};
use big_s::S;
2024-10-03 18:08:09 +02:00
use bumpalo::Bump;
use document_changes::{
for_each_document_change, DocumentChanges, Extractor, FullySend, IndexingContext, ThreadLocal,
};
2024-09-02 14:42:27 +02:00
pub use document_deletion::DocumentDeletion;
pub use document_operation::DocumentOperation;
use heed::{RoTxn, RwTxn};
2024-09-02 14:42:27 +02:00
pub use partial_dump::PartialDump;
use rayon::ThreadPool;
use time::OffsetDateTime;
2024-09-02 14:42:27 +02:00
pub use update_by_function::UpdateByFunction;
use super::channel::*;
2024-10-03 18:08:09 +02:00
use super::document::write_to_obkv;
2024-10-10 22:42:37 +02:00
use super::document_change::DocumentChange;
use super::extract::*;
use super::merger::{merge_grenad_entries, FacetFieldIdsDelta};
use super::word_fst_builder::PrefixDelta;
use super::words_prefix_docids::{
compute_word_prefix_docids, compute_word_prefix_fid_docids, compute_word_prefix_position_docids,
};
2024-10-10 22:42:37 +02:00
use super::{StdResult, TopLevelMap};
2024-09-12 15:38:31 +02:00
use crate::documents::{PrimaryKey, DEFAULT_PRIMARY_KEY};
use crate::facet::FacetType;
use crate::proximity::ProximityPrecision;
use crate::update::new::channel::ExtractorSender;
2024-10-03 18:08:09 +02:00
use crate::update::settings::InnerIndexSettings;
use crate::update::{FacetsUpdateBulk, GrenadParameters};
use crate::{Error, FieldsIdsMap, GlobalFieldsIdsMap, Index, Result, UserError};
2024-10-03 18:08:09 +02:00
mod de;
pub mod document_changes;
mod document_deletion;
mod document_operation;
mod partial_dump;
mod update_by_function;
2024-10-03 18:08:09 +02:00
/// Extractor that forwards each document change to a [`DocumentSender`].
///
/// See its `Extractor` implementation for how each kind of change is handled.
struct DocumentExtractor<'a> {
/// Sender used to emit document deletions and insertions.
document_sender: &'a DocumentSender<'a>,
}
impl<'a, 'extractor> Extractor<'extractor> for DocumentExtractor<'a> {
type Data = FullySend<()>;
fn init_data(
&self,
2024-10-10 22:42:37 +02:00
_extractor_alloc: raw_collections::alloc::RefBump<'extractor>,
2024-10-03 18:08:09 +02:00
) -> Result<Self::Data> {
Ok(FullySend(()))
}
fn process(
&self,
change: DocumentChange,
context: &document_changes::DocumentChangeContext<Self::Data>,
) -> Result<()> {
let mut document_buffer = Vec::new();
let new_fields_ids_map = context.new_fields_ids_map.borrow();
let new_fields_ids_map = &*new_fields_ids_map;
let new_fields_ids_map = new_fields_ids_map.local_map();
let external_docid = change.external_docid().to_owned();
// document but we need to create a function that collects and compresses documents.
match change {
DocumentChange::Deletion(deletion) => {
let docid = deletion.docid();
self.document_sender.delete(docid, external_docid).unwrap();
}
/// TODO: change NONE by SOME(vector) when implemented
DocumentChange::Update(update) => {
let docid = update.docid();
let content =
update.new(&context.txn, context.index, &context.db_fields_ids_map)?;
let content =
write_to_obkv(&content, None, new_fields_ids_map, &mut document_buffer)?;
self.document_sender.insert(docid, external_docid, content.boxed()).unwrap();
}
DocumentChange::Insertion(insertion) => {
let docid = insertion.docid();
let content = insertion.new();
let content =
write_to_obkv(&content, None, new_fields_ids_map, &mut document_buffer)?;
self.document_sender.insert(docid, external_docid, content.boxed()).unwrap();
// extracted_dictionary_sender.send(self, dictionary: &[u8]);
}
}
Ok(())
}
}
/// This is the main function of this crate.
///
/// Give it the output of the [`Indexer::document_changes`] method and it will execute it in the [`rayon::ThreadPool`].
///
/// TODO return stats
2024-10-03 18:08:09 +02:00
pub fn index<'pl, 'indexer, 'index, DC>(
wtxn: &mut RwTxn,
2024-10-03 18:08:09 +02:00
index: &'index Index,
db_fields_ids_map: &'indexer FieldsIdsMap,
new_fields_ids_map: FieldsIdsMap,
pool: &ThreadPool,
2024-10-03 18:08:09 +02:00
document_changes: &DC,
) -> Result<()>
where
2024-10-03 18:08:09 +02:00
DC: DocumentChanges<'pl>,
{
2024-09-05 15:12:07 +02:00
let (merger_sender, writer_receiver) = merger_writer_channel(10_000);
// This channel acts as a rendezvous point to ensure that we are one task ahead
2024-09-11 10:20:23 +02:00
let (extractor_sender, merger_receiver) = extractors_merger_channels(4);
2024-10-03 18:08:09 +02:00
let new_fields_ids_map = RwLock::new(new_fields_ids_map);
let fields_ids_map_store = ThreadLocal::with_capacity(pool.current_num_threads());
let mut extractor_allocs = ThreadLocal::with_capacity(pool.current_num_threads());
let doc_allocs = ThreadLocal::with_capacity(pool.current_num_threads());
let indexing_context = IndexingContext {
index,
db_fields_ids_map,
new_fields_ids_map: &new_fields_ids_map,
doc_allocs: &doc_allocs,
fields_ids_map_store: &fields_ids_map_store,
};
thread::scope(|s| {
let indexer_span = tracing::Span::current();
// TODO manage the errors correctly
let handle = Builder::new().name(S("indexer-extractors")).spawn_scoped(s, move || {
pool.in_place_scope(|_s| {
let span = tracing::trace_span!(target: "indexing::documents", parent: &indexer_span, "extract");
2024-09-05 17:36:19 +02:00
let _entered = span.enter();
// document but we need to create a function that collects and compresses documents.
2024-09-12 18:01:02 +02:00
let document_sender = extractor_sender.document_sender();
2024-10-03 18:08:09 +02:00
let document_extractor = DocumentExtractor { document_sender: &document_sender};
let datastore = ThreadLocal::with_capacity(pool.current_num_threads());
for_each_document_change(document_changes, &document_extractor, indexing_context, &mut extractor_allocs, &datastore)?;
2024-09-12 18:01:02 +02:00
document_sender.finish().unwrap();
2024-09-05 15:12:07 +02:00
const TEN_GIB: usize = 10 * 1024 * 1024 * 1024;
let max_memory = TEN_GIB / dbg!(rayon::current_num_threads());
let grenad_parameters = GrenadParameters {
max_memory: Some(max_memory),
..GrenadParameters::default()
};
2024-09-16 09:34:10 +02:00
{
let span = tracing::trace_span!(target: "indexing::documents::extract", "faceted");
let _entered = span.enter();
extract_and_send_docids::<
2024-10-03 18:08:09 +02:00
_,
2024-09-16 09:34:10 +02:00
FacetedDocidsExtractor,
FacetDocids,
>(
grenad_parameters,
2024-10-03 18:08:09 +02:00
document_changes,
indexing_context,
&mut extractor_allocs,
2024-09-16 09:34:10 +02:00
&extractor_sender,
)?;
}
2024-09-05 17:36:19 +02:00
{
let span = tracing::trace_span!(target: "indexing::documents::extract", "word_docids");
let _entered = span.enter();
2024-09-11 10:20:23 +02:00
let WordDocidsMergers {
word_fid_docids,
word_docids,
exact_word_docids,
word_position_docids,
fid_word_count_docids,
2024-10-03 18:08:09 +02:00
} = WordDocidsExtractors::run_extraction(grenad_parameters, document_changes, indexing_context, &mut extractor_allocs)?;
2024-09-11 10:20:23 +02:00
extractor_sender.send_searchable::<WordDocids>(word_docids).unwrap();
extractor_sender.send_searchable::<WordFidDocids>(word_fid_docids).unwrap();
extractor_sender.send_searchable::<ExactWordDocids>(exact_word_docids).unwrap();
extractor_sender.send_searchable::<WordPositionDocids>(word_position_docids).unwrap();
extractor_sender.send_searchable::<FidWordCountDocids>(fid_word_count_docids).unwrap();
2024-09-05 17:36:19 +02:00
}
// run the proximity extraction only if the precision is by word
// this works only if the settings didn't change during this transaction.
let rtxn = index.read_txn().unwrap();
let proximity_precision = index.proximity_precision(&rtxn)?.unwrap_or_default();
if proximity_precision == ProximityPrecision::ByWord {
2024-09-05 17:36:19 +02:00
let span = tracing::trace_span!(target: "indexing::documents::extract", "word_pair_proximity_docids");
let _entered = span.enter();
extract_and_send_docids::<
2024-10-03 18:08:09 +02:00
_,
2024-09-05 17:36:19 +02:00
WordPairProximityDocidsExtractor,
WordPairProximityDocids,
>(
grenad_parameters,
2024-10-03 18:08:09 +02:00
document_changes,
indexing_context,
&mut extractor_allocs,
2024-09-05 17:36:19 +02:00
&extractor_sender,
)?;
}
{
let span = tracing::trace_span!(target: "indexing::documents::extract", "FINISH");
let _entered = span.enter();
}
// TODO THIS IS TOO MUCH
2024-09-16 09:34:10 +02:00
// - [ ] Extract fieldid docid facet number
// - [ ] Extract fieldid docid facet string
// - [ ] Extract facetid string fst
// - [ ] Extract facetid normalized string strings
// TODO Inverted Indexes again
2024-09-16 09:34:10 +02:00
// - [x] Extract fieldid facet isempty docids
// - [x] Extract fieldid facet isnull docids
// - [x] Extract fieldid facet exists docids
// TODO This is the normal system
2024-09-16 09:34:10 +02:00
// - [x] Extract fieldid facet number docids
// - [x] Extract fieldid facet string docids
Ok(()) as Result<_>
})
})?;
2024-10-03 18:08:09 +02:00
let global_fields_ids_map = GlobalFieldsIdsMap::new(&new_fields_ids_map);
let indexer_span = tracing::Span::current();
// TODO manage the errors correctly
let merger_thread = Builder::new().name(S("indexer-merger")).spawn_scoped(s, move || {
2024-09-05 17:36:19 +02:00
let span =
tracing::trace_span!(target: "indexing::documents", parent: &indexer_span, "merge");
2024-09-05 17:36:19 +02:00
let _entered = span.enter();
let rtxn = index.read_txn().unwrap();
2024-09-12 18:01:02 +02:00
merge_grenad_entries(
merger_receiver,
merger_sender,
&rtxn,
index,
2024-10-03 18:08:09 +02:00
global_fields_ids_map,
2024-09-12 18:01:02 +02:00
)
})?;
for operation in writer_receiver {
let database = operation.database(index);
match operation.entry() {
EntryOperation::Delete(e) => {
if !database.delete(wtxn, e.entry())? {
unreachable!("We tried to delete an unknown key")
}
}
EntryOperation::Write(e) => database.put(wtxn, e.key(), e.value())?,
}
}
2024-09-02 15:10:21 +02:00
/// TODO handle the panicking threads
handle.join().unwrap()?;
let merger_result = merger_thread.join().unwrap()?;
if let Some(facet_field_ids_delta) = merger_result.facet_field_ids_delta {
compute_facet_level_database(index, wtxn, facet_field_ids_delta)?;
}
if let Some(prefix_delta) = merger_result.prefix_delta {
compute_prefix_database(index, wtxn, prefix_delta)?;
}
2024-09-02 15:10:21 +02:00
Ok(()) as Result<_>
})?;
2024-10-10 22:42:37 +02:00
// required to into_inner the new_fields_ids_map
2024-10-03 18:08:09 +02:00
drop(fields_ids_map_store);
let fields_ids_map = new_fields_ids_map.into_inner().unwrap();
index.put_fields_ids_map(wtxn, &fields_ids_map)?;
2024-09-30 11:35:03 +02:00
// used to update the localized and weighted maps while sharing the update code with the settings pipeline.
let mut inner_index_settings = InnerIndexSettings::from_index(index, wtxn)?;
inner_index_settings.recompute_facets(wtxn, index)?;
inner_index_settings.recompute_searchables(wtxn, index)?;
index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?;
Ok(())
}
#[tracing::instrument(level = "trace", skip_all, target = "indexing::prefix")]
fn compute_prefix_database(
index: &Index,
wtxn: &mut RwTxn,
prefix_delta: PrefixDelta,
) -> Result<()> {
2024-10-03 10:40:31 +02:00
eprintln!("prefix_delta: {:?}", &prefix_delta);
let PrefixDelta { modified, deleted } = prefix_delta;
// Compute word prefix docids
compute_word_prefix_docids(wtxn, index, &modified, &deleted)?;
// Compute word prefix fid docids
compute_word_prefix_fid_docids(wtxn, index, &modified, &deleted)?;
// Compute word prefix position docids
compute_word_prefix_position_docids(wtxn, index, &modified, &deleted)
}
#[tracing::instrument(level = "trace", skip_all, target = "indexing::facet_field_ids")]
fn compute_facet_level_database(
index: &Index,
wtxn: &mut RwTxn,
facet_field_ids_delta: FacetFieldIdsDelta,
) -> Result<()> {
2024-10-03 10:40:31 +02:00
eprintln!("facet_field_ids_delta: {:?}", &facet_field_ids_delta);
if let Some(modified_facet_string_ids) = facet_field_ids_delta.modified_facet_string_ids() {
let span = tracing::trace_span!(target: "indexing::facet_field_ids", "string");
let _entered = span.enter();
FacetsUpdateBulk::new_not_updating_level_0(
index,
modified_facet_string_ids,
FacetType::String,
)
.execute(wtxn)?;
}
if let Some(modified_facet_number_ids) = facet_field_ids_delta.modified_facet_number_ids() {
let span = tracing::trace_span!(target: "indexing::facet_field_ids", "number");
let _entered = span.enter();
FacetsUpdateBulk::new_not_updating_level_0(
index,
modified_facet_number_ids,
FacetType::Number,
)
.execute(wtxn)?;
}
Ok(())
}
2024-09-04 12:17:13 +02:00
/// TODO: GrenadParameters::default() should be removed in favor a passed parameter
/// TODO: manage the errors correctly
/// TODO: we must have a single trait that also gives the extractor type
2024-10-03 18:08:09 +02:00
fn extract_and_send_docids<
'pl,
'fid,
'indexer,
'index,
DC: DocumentChanges<'pl>,
E: DocidsExtractor,
D: MergerOperationType,
>(
grenad_parameters: GrenadParameters,
document_changes: &DC,
indexing_context: IndexingContext<'fid, 'indexer, 'index>,
extractor_allocs: &mut ThreadLocal<FullySend<RefCell<Bump>>>,
2024-09-04 12:17:13 +02:00
sender: &ExtractorSender,
) -> Result<()> {
2024-10-03 18:08:09 +02:00
let merger =
E::run_extraction(grenad_parameters, document_changes, indexing_context, extractor_allocs)?;
2024-09-30 16:08:29 +02:00
sender.send_searchable::<D>(merger).unwrap();
Ok(())
2024-09-04 12:17:13 +02:00
}
2024-09-11 15:59:30 +02:00
/// Returns the primary key *field id* that has already been set for this index or the
/// one we will guess by searching for the first key that contains "id" as a substring.
/// TODO move this elsewhere
2024-09-11 15:59:30 +02:00
pub fn retrieve_or_guess_primary_key<'a>(
rtxn: &'a RoTxn<'a>,
index: &Index,
2024-09-11 15:59:30 +02:00
fields_ids_map: &mut FieldsIdsMap,
first_document: Option<&'a TopLevelMap<'_>>,
) -> Result<StdResult<PrimaryKey<'a>, UserError>> {
match index.primary_key(rtxn)? {
2024-09-11 15:59:30 +02:00
Some(primary_key) => match PrimaryKey::new(primary_key, fields_ids_map) {
Some(primary_key) => Ok(Ok(primary_key)),
2024-09-11 15:59:30 +02:00
None => unreachable!("Why is the primary key not in the fidmap?"),
},
None => {
2024-09-11 15:59:30 +02:00
let first_document = match first_document {
Some(document) => document,
None => return Ok(Err(UserError::NoPrimaryKeyCandidateFound)),
};
let mut guesses: Vec<&str> = first_document
.keys()
.map(AsRef::as_ref)
.filter(|name| name.to_lowercase().ends_with(DEFAULT_PRIMARY_KEY))
.collect();
2024-09-11 15:59:30 +02:00
// sort the keys in lexicographical order, so that fields are always in the same order.
guesses.sort_unstable();
match guesses.as_slice() {
[] => Ok(Err(UserError::NoPrimaryKeyCandidateFound)),
2024-09-11 15:59:30 +02:00
[name] => {
tracing::info!("Primary key was not specified in index. Inferred to '{name}'");
2024-09-11 15:59:30 +02:00
match fields_ids_map.insert(name) {
Some(field_id) => Ok(Ok(PrimaryKey::Flat { name, field_id })),
None => Ok(Err(UserError::AttributeLimitReached)),
}
}
multiple => Ok(Err(UserError::MultiplePrimaryKeyCandidatesFound {
2024-09-11 15:59:30 +02:00
candidates: multiple.iter().map(|candidate| candidate.to_string()).collect(),
})),
}
}
}
}