2024-10-09 11:35:45 +02:00
|
|
|
use std::cmp::Ordering;
|
2024-10-10 22:42:37 +02:00
|
|
|
use std::sync::RwLock;
|
2024-09-02 15:21:00 +02:00
|
|
|
use std::thread::{self, Builder};
|
2024-09-02 10:42:19 +02:00
|
|
|
|
|
|
|
use big_s::S;
|
2024-10-03 18:08:09 +02:00
|
|
|
use document_changes::{
|
2024-10-09 11:35:45 +02:00
|
|
|
for_each_document_change, DocumentChanges, FullySend, IndexingContext, ThreadLocal,
|
2024-10-03 18:08:09 +02:00
|
|
|
};
|
2024-09-02 14:42:27 +02:00
|
|
|
pub use document_deletion::DocumentDeletion;
|
|
|
|
pub use document_operation::DocumentOperation;
|
2024-10-09 11:35:45 +02:00
|
|
|
use heed::types::{Bytes, DecodeIgnore, Str};
|
2024-09-02 19:39:48 +02:00
|
|
|
use heed::{RoTxn, RwTxn};
|
2024-10-09 11:35:45 +02:00
|
|
|
use itertools::{merge_join_by, EitherOrBoth};
|
2024-09-02 14:42:27 +02:00
|
|
|
pub use partial_dump::PartialDump;
|
2024-09-02 10:42:19 +02:00
|
|
|
use rayon::ThreadPool;
|
2024-10-09 13:53:34 +02:00
|
|
|
use time::OffsetDateTime;
|
2024-09-02 14:42:27 +02:00
|
|
|
pub use update_by_function::UpdateByFunction;
|
2024-09-02 10:42:19 +02:00
|
|
|
|
2024-09-05 10:56:22 +02:00
|
|
|
use super::channel::*;
|
|
|
|
use super::extract::*;
|
2024-10-09 11:35:45 +02:00
|
|
|
use super::merger::{FacetDatabases, FacetFieldIdsDelta};
|
2024-10-01 09:56:49 +02:00
|
|
|
use super::word_fst_builder::PrefixDelta;
|
|
|
|
use super::words_prefix_docids::{
|
|
|
|
compute_word_prefix_docids, compute_word_prefix_fid_docids, compute_word_prefix_position_docids,
|
|
|
|
};
|
2024-10-10 22:42:37 +02:00
|
|
|
use super::{StdResult, TopLevelMap};
|
2024-09-12 15:38:31 +02:00
|
|
|
use crate::documents::{PrimaryKey, DEFAULT_PRIMARY_KEY};
|
2024-10-01 16:13:08 +02:00
|
|
|
use crate::facet::FacetType;
|
2024-10-09 11:35:45 +02:00
|
|
|
use crate::index::main_key::{WORDS_FST_KEY, WORDS_PREFIXES_FST_KEY};
|
2024-10-09 11:35:45 +02:00
|
|
|
use crate::proximity::ProximityPrecision;
|
2024-10-09 11:35:45 +02:00
|
|
|
use crate::update::del_add::DelAdd;
|
|
|
|
use crate::update::new::word_fst_builder::{PrefixData, WordFstBuilder};
|
2024-10-14 11:12:10 +02:00
|
|
|
use crate::update::new::words_prefix_docids::compute_exact_word_prefix_docids;
|
2024-10-09 11:35:45 +02:00
|
|
|
use crate::update::new::{merge_and_send_docids, merge_and_send_facet_docids};
|
2024-10-03 18:08:09 +02:00
|
|
|
use crate::update::settings::InnerIndexSettings;
|
2024-10-01 16:13:08 +02:00
|
|
|
use crate::update::{FacetsUpdateBulk, GrenadParameters};
|
2024-10-09 11:35:45 +02:00
|
|
|
use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, Result, UserError};
|
2024-09-02 10:42:19 +02:00
|
|
|
|
2024-10-09 11:35:45 +02:00
|
|
|
pub mod de;
|
2024-10-03 18:08:09 +02:00
|
|
|
pub mod document_changes;
|
2024-09-02 10:42:19 +02:00
|
|
|
mod document_deletion;
|
|
|
|
mod document_operation;
|
|
|
|
mod partial_dump;
|
|
|
|
mod update_by_function;
|
|
|
|
|
|
|
|
/// This is the main function of this crate.
|
|
|
|
///
|
|
|
|
/// Give it the output of the [`Indexer::document_changes`] method and it will execute it in the [`rayon::ThreadPool`].
|
|
|
|
///
|
|
|
|
/// TODO return stats
|
2024-10-03 18:08:09 +02:00
|
|
|
pub fn index<'pl, 'indexer, 'index, DC>(
|
2024-09-02 10:42:19 +02:00
|
|
|
wtxn: &mut RwTxn,
|
2024-10-03 18:08:09 +02:00
|
|
|
index: &'index Index,
|
|
|
|
db_fields_ids_map: &'indexer FieldsIdsMap,
|
|
|
|
new_fields_ids_map: FieldsIdsMap,
|
2024-10-16 09:27:00 +02:00
|
|
|
new_primary_key: Option<PrimaryKey<'pl>>,
|
2024-09-02 10:42:19 +02:00
|
|
|
pool: &ThreadPool,
|
2024-10-03 18:08:09 +02:00
|
|
|
document_changes: &DC,
|
2024-09-02 10:42:19 +02:00
|
|
|
) -> Result<()>
|
|
|
|
where
|
2024-10-03 18:08:09 +02:00
|
|
|
DC: DocumentChanges<'pl>,
|
2024-09-02 10:42:19 +02:00
|
|
|
{
|
2024-10-09 11:35:45 +02:00
|
|
|
// TODO find a better channel limit
|
|
|
|
let (extractor_sender, writer_receiver) = extractor_writer_channel(10_000);
|
2024-10-03 18:08:09 +02:00
|
|
|
let new_fields_ids_map = RwLock::new(new_fields_ids_map);
|
|
|
|
|
2024-10-09 11:35:45 +02:00
|
|
|
let global_fields_ids_map = GlobalFieldsIdsMap::new(&new_fields_ids_map);
|
2024-10-03 18:08:09 +02:00
|
|
|
let fields_ids_map_store = ThreadLocal::with_capacity(pool.current_num_threads());
|
|
|
|
let mut extractor_allocs = ThreadLocal::with_capacity(pool.current_num_threads());
|
|
|
|
let doc_allocs = ThreadLocal::with_capacity(pool.current_num_threads());
|
|
|
|
|
|
|
|
let indexing_context = IndexingContext {
|
|
|
|
index,
|
|
|
|
db_fields_ids_map,
|
|
|
|
new_fields_ids_map: &new_fields_ids_map,
|
|
|
|
doc_allocs: &doc_allocs,
|
|
|
|
fields_ids_map_store: &fields_ids_map_store,
|
|
|
|
};
|
2024-09-03 12:01:01 +02:00
|
|
|
|
2024-10-09 11:35:45 +02:00
|
|
|
thread::scope(|s| -> crate::Result<_> {
|
2024-10-01 16:13:08 +02:00
|
|
|
let indexer_span = tracing::Span::current();
|
2024-09-02 10:42:19 +02:00
|
|
|
// TODO manage the errors correctly
|
2024-10-09 11:35:45 +02:00
|
|
|
let extractor_handle = Builder::new().name(S("indexer-extractors")).spawn_scoped(s, move || {
|
2024-09-02 15:21:00 +02:00
|
|
|
pool.in_place_scope(|_s| {
|
2024-10-09 11:35:45 +02:00
|
|
|
let span = tracing::trace_span!(target: "indexing::documents", parent: &indexer_span, "extract");
|
|
|
|
let _entered = span.enter();
|
|
|
|
|
|
|
|
// document but we need to create a function that collects and compresses documents.
|
|
|
|
let rtxn = index.read_txn().unwrap();
|
|
|
|
let document_sender = extractor_sender.documents();
|
|
|
|
let document_extractor = DocumentsExtractor::new(&document_sender);
|
|
|
|
let datastore = ThreadLocal::with_capacity(pool.current_num_threads());
|
|
|
|
for_each_document_change(document_changes, &document_extractor, indexing_context, &mut extractor_allocs, &datastore)?;
|
|
|
|
|
|
|
|
let mut documents_ids = index.documents_ids(&rtxn)?;
|
|
|
|
let delta_documents_ids = datastore.into_iter().map(|FullySend(d)| d.into_inner()).reduce(DelAddRoaringBitmap::merge).unwrap_or_default();
|
|
|
|
delta_documents_ids.apply_to(&mut documents_ids);
|
|
|
|
extractor_sender.send_documents_ids(documents_ids).unwrap();
|
|
|
|
|
|
|
|
// document_sender.finish().unwrap();
|
|
|
|
|
|
|
|
const TEN_GIB: usize = 10 * 1024 * 1024 * 1024;
|
|
|
|
let current_num_threads = rayon::current_num_threads();
|
|
|
|
let max_memory = TEN_GIB / current_num_threads;
|
|
|
|
eprintln!("A maximum of {max_memory} bytes will be used for each of the {current_num_threads} threads");
|
|
|
|
let grenad_parameters = GrenadParameters {
|
|
|
|
max_memory: Some(max_memory),
|
|
|
|
..GrenadParameters::default()
|
|
|
|
};
|
|
|
|
|
|
|
|
let facet_field_ids_delta;
|
|
|
|
|
|
|
|
{
|
|
|
|
let span = tracing::trace_span!(target: "indexing::documents::extract", "faceted");
|
2024-09-05 17:36:19 +02:00
|
|
|
let _entered = span.enter();
|
2024-10-09 11:35:45 +02:00
|
|
|
facet_field_ids_delta = merge_and_send_facet_docids(
|
|
|
|
global_fields_ids_map,
|
|
|
|
FacetedDocidsExtractor::run_extraction(grenad_parameters, document_changes, indexing_context, &mut extractor_allocs)?,
|
|
|
|
FacetDatabases::new(index),
|
|
|
|
index,
|
|
|
|
extractor_sender.facet_docids(),
|
|
|
|
)?;
|
|
|
|
}
|
2024-09-05 10:56:22 +02:00
|
|
|
|
2024-10-09 11:35:45 +02:00
|
|
|
{
|
|
|
|
let span = tracing::trace_span!(target: "indexing::documents::extract", "word_docids");
|
|
|
|
let _entered = span.enter();
|
2024-09-12 18:01:02 +02:00
|
|
|
|
2024-10-09 11:35:45 +02:00
|
|
|
let WordDocidsCaches {
|
|
|
|
word_docids,
|
|
|
|
word_fid_docids,
|
|
|
|
exact_word_docids,
|
|
|
|
word_position_docids,
|
|
|
|
fid_word_count_docids,
|
|
|
|
} = WordDocidsExtractors::run_extraction(grenad_parameters, document_changes, indexing_context, &mut extractor_allocs)?;
|
2024-09-16 09:34:10 +02:00
|
|
|
|
2024-10-09 11:35:45 +02:00
|
|
|
// TODO Word Docids Merger
|
|
|
|
// extractor_sender.send_searchable::<WordDocids>(word_docids).unwrap();
|
2024-09-16 09:34:10 +02:00
|
|
|
{
|
2024-10-09 11:35:45 +02:00
|
|
|
let span = tracing::trace_span!(target: "indexing::documents::merge", "word_docids");
|
2024-09-16 09:34:10 +02:00
|
|
|
let _entered = span.enter();
|
2024-10-09 11:35:45 +02:00
|
|
|
merge_and_send_docids(
|
|
|
|
word_docids,
|
|
|
|
index.word_docids.remap_types(),
|
|
|
|
index,
|
|
|
|
extractor_sender.docids::<WordDocids>(),
|
2024-09-16 09:34:10 +02:00
|
|
|
)?;
|
|
|
|
}
|
|
|
|
|
2024-10-09 11:35:45 +02:00
|
|
|
// Word Fid Docids Merging
|
|
|
|
// extractor_sender.send_searchable::<WordFidDocids>(word_fid_docids).unwrap();
|
2024-09-05 17:36:19 +02:00
|
|
|
{
|
2024-10-09 11:35:45 +02:00
|
|
|
let span = tracing::trace_span!(target: "indexing::documents::merge", "word_fid_docids");
|
2024-09-05 17:36:19 +02:00
|
|
|
let _entered = span.enter();
|
2024-10-09 11:35:45 +02:00
|
|
|
merge_and_send_docids(
|
2024-09-11 10:20:23 +02:00
|
|
|
word_fid_docids,
|
2024-10-09 11:35:45 +02:00
|
|
|
index.word_fid_docids.remap_types(),
|
|
|
|
index,
|
|
|
|
extractor_sender.docids::<WordFidDocids>()
|
|
|
|
)?;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Exact Word Docids Merging
|
|
|
|
// extractor_sender.send_searchable::<ExactWordDocids>(exact_word_docids).unwrap();
|
|
|
|
{
|
|
|
|
let span = tracing::trace_span!(target: "indexing::documents::merge", "exact_word_docids");
|
|
|
|
let _entered = span.enter();
|
|
|
|
merge_and_send_docids(
|
2024-09-11 10:20:23 +02:00
|
|
|
exact_word_docids,
|
2024-10-09 11:35:45 +02:00
|
|
|
index.exact_word_docids.remap_types(),
|
|
|
|
index,
|
|
|
|
extractor_sender.docids::<ExactWordDocids>(),
|
|
|
|
)?;
|
2024-09-05 17:36:19 +02:00
|
|
|
}
|
2024-09-10 18:01:17 +01:00
|
|
|
|
2024-10-09 11:35:45 +02:00
|
|
|
// Word Position Docids Merging
|
|
|
|
// extractor_sender.send_searchable::<WordPositionDocids>(word_position_docids).unwrap();
|
|
|
|
{
|
|
|
|
let span = tracing::trace_span!(target: "indexing::documents::merge", "word_position_docids");
|
2024-09-05 17:36:19 +02:00
|
|
|
let _entered = span.enter();
|
2024-10-09 11:35:45 +02:00
|
|
|
merge_and_send_docids(
|
|
|
|
word_position_docids,
|
|
|
|
index.word_position_docids.remap_types(),
|
|
|
|
index,
|
|
|
|
extractor_sender.docids::<WordPositionDocids>(),
|
2024-09-05 17:36:19 +02:00
|
|
|
)?;
|
|
|
|
}
|
|
|
|
|
2024-10-09 11:35:45 +02:00
|
|
|
// Fid Word Count Docids Merging
|
|
|
|
// extractor_sender.send_searchable::<FidWordCountDocids>(fid_word_count_docids).unwrap();
|
2024-09-05 17:36:19 +02:00
|
|
|
{
|
2024-10-09 11:35:45 +02:00
|
|
|
let span = tracing::trace_span!(target: "indexing::documents::merge", "fid_word_count_docids");
|
2024-09-05 17:36:19 +02:00
|
|
|
let _entered = span.enter();
|
2024-10-09 11:35:45 +02:00
|
|
|
merge_and_send_docids(
|
|
|
|
fid_word_count_docids,
|
|
|
|
index.field_id_word_count_docids.remap_types(),
|
|
|
|
index,
|
|
|
|
extractor_sender.docids::<FidWordCountDocids>(),
|
|
|
|
)?;
|
2024-09-05 17:36:19 +02:00
|
|
|
}
|
2024-10-09 11:35:45 +02:00
|
|
|
}
|
2024-09-05 10:56:22 +02:00
|
|
|
|
2024-10-09 11:35:45 +02:00
|
|
|
// run the proximity extraction only if the precision is by word
|
|
|
|
// this works only if the settings didn't change during this transaction.
|
|
|
|
let proximity_precision = index.proximity_precision(&rtxn)?.unwrap_or_default();
|
|
|
|
if proximity_precision == ProximityPrecision::ByWord {
|
|
|
|
let span = tracing::trace_span!(target: "indexing::documents::extract", "word_pair_proximity_docids");
|
|
|
|
let _entered = span.enter();
|
|
|
|
let caches = <WordPairProximityDocidsExtractor as DocidsExtractor>::run_extraction(grenad_parameters, document_changes, indexing_context, &mut extractor_allocs)?;
|
|
|
|
merge_and_send_docids(
|
|
|
|
caches,
|
|
|
|
index.word_pair_proximity_docids.remap_types(),
|
|
|
|
index,
|
|
|
|
extractor_sender.docids::<WordPairProximityDocids>(),
|
|
|
|
)?;
|
|
|
|
}
|
2024-09-05 10:56:22 +02:00
|
|
|
|
2024-10-09 11:35:45 +02:00
|
|
|
{
|
|
|
|
let span = tracing::trace_span!(target: "indexing::documents::extract", "FINISH");
|
|
|
|
let _entered = span.enter();
|
|
|
|
}
|
2024-09-05 10:56:22 +02:00
|
|
|
|
2024-10-09 11:35:45 +02:00
|
|
|
// TODO THIS IS TOO MUCH
|
|
|
|
// - [ ] Extract fieldid docid facet number
|
|
|
|
// - [ ] Extract fieldid docid facet string
|
|
|
|
// - [ ] Extract facetid string fst
|
|
|
|
// - [ ] Extract facetid normalized string strings
|
2024-09-05 10:56:22 +02:00
|
|
|
|
2024-10-09 11:35:45 +02:00
|
|
|
// TODO Inverted Indexes again
|
|
|
|
// - [x] Extract fieldid facet isempty docids
|
|
|
|
// - [x] Extract fieldid facet isnull docids
|
|
|
|
// - [x] Extract fieldid facet exists docids
|
|
|
|
|
|
|
|
// TODO This is the normal system
|
|
|
|
// - [x] Extract fieldid facet number docids
|
|
|
|
// - [x] Extract fieldid facet string docids
|
|
|
|
|
|
|
|
// TODO use None when needed
|
|
|
|
Result::Ok(facet_field_ids_delta)
|
|
|
|
})
|
2024-09-02 15:21:00 +02:00
|
|
|
})?;
|
2024-09-02 10:42:19 +02:00
|
|
|
|
2024-10-03 18:08:09 +02:00
|
|
|
let global_fields_ids_map = GlobalFieldsIdsMap::new(&new_fields_ids_map);
|
2024-10-01 16:13:08 +02:00
|
|
|
let indexer_span = tracing::Span::current();
|
2024-09-02 10:42:19 +02:00
|
|
|
|
|
|
|
for operation in writer_receiver {
|
|
|
|
let database = operation.database(index);
|
2024-09-04 09:59:19 +02:00
|
|
|
match operation.entry() {
|
|
|
|
EntryOperation::Delete(e) => {
|
|
|
|
if !database.delete(wtxn, e.entry())? {
|
|
|
|
unreachable!("We tried to delete an unknown key")
|
|
|
|
}
|
|
|
|
}
|
|
|
|
EntryOperation::Write(e) => database.put(wtxn, e.key(), e.value())?,
|
2024-09-02 10:42:19 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-09-02 15:10:21 +02:00
|
|
|
/// TODO handle the panicking threads
|
2024-10-09 11:35:45 +02:00
|
|
|
let facet_field_ids_delta = extractor_handle.join().unwrap()?;
|
|
|
|
|
|
|
|
let prefix_delta = {
|
|
|
|
let rtxn = index.read_txn()?;
|
|
|
|
let words_fst = index.words_fst(&rtxn)?;
|
|
|
|
let mut word_fst_builder = WordFstBuilder::new(&words_fst)?;
|
|
|
|
let prefix_settings = index.prefix_settings(&rtxn)?;
|
|
|
|
word_fst_builder.with_prefix_settings(prefix_settings);
|
|
|
|
|
2024-11-05 11:26:46 +01:00
|
|
|
let previous_words = index.word_docids.iter(&rtxn)?.remap_data_type::<Bytes>();
|
|
|
|
let current_words = index.word_docids.iter(wtxn)?.remap_data_type::<Bytes>();
|
2024-10-09 11:35:45 +02:00
|
|
|
for eob in merge_join_by(previous_words, current_words, |lhs, rhs| match (lhs, rhs) {
|
|
|
|
(Ok((l, _)), Ok((r, _))) => l.cmp(r),
|
|
|
|
(Err(_), _) | (_, Err(_)) => Ordering::Equal,
|
|
|
|
}) {
|
|
|
|
match eob {
|
|
|
|
EitherOrBoth::Both(lhs, rhs) => {
|
2024-11-05 11:26:46 +01:00
|
|
|
let (word, lhs_bytes) = lhs?;
|
|
|
|
let (_, rhs_bytes) = rhs?;
|
|
|
|
if lhs_bytes != rhs_bytes {
|
|
|
|
word_fst_builder.register_word(DelAdd::Addition, word.as_ref())?;
|
2024-10-09 11:35:45 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
EitherOrBoth::Left(result) => {
|
|
|
|
let (word, _) = result?;
|
|
|
|
word_fst_builder.register_word(DelAdd::Deletion, word.as_ref())?;
|
|
|
|
}
|
|
|
|
EitherOrBoth::Right(result) => {
|
|
|
|
let (word, _) = result?;
|
|
|
|
word_fst_builder.register_word(DelAdd::Addition, word.as_ref())?;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2024-10-01 09:56:49 +02:00
|
|
|
|
2024-10-09 11:35:45 +02:00
|
|
|
let span = tracing::trace_span!(target: "indexing::documents::merge", "words_fst");
|
|
|
|
let _entered = span.enter();
|
|
|
|
|
|
|
|
let (word_fst_mmap, prefix_data) = word_fst_builder.build(index, &rtxn)?;
|
|
|
|
// extractor_sender.main().write_words_fst(word_fst_mmap).unwrap();
|
|
|
|
index.main.remap_types::<Str, Bytes>().put(wtxn, WORDS_FST_KEY, &word_fst_mmap)?;
|
|
|
|
if let Some(PrefixData { prefixes_fst_mmap, prefix_delta }) = prefix_data {
|
|
|
|
// extractor_sender.main().write_words_prefixes_fst(prefixes_fst_mmap).unwrap();
|
|
|
|
index.main.remap_types::<Str, Bytes>().put(
|
|
|
|
wtxn,
|
|
|
|
WORDS_PREFIXES_FST_KEY,
|
|
|
|
&prefixes_fst_mmap,
|
|
|
|
)?;
|
|
|
|
Some(prefix_delta)
|
|
|
|
} else {
|
|
|
|
None
|
|
|
|
}
|
|
|
|
};
|
2024-10-01 09:56:49 +02:00
|
|
|
|
2024-10-09 11:35:45 +02:00
|
|
|
// if let Some(facet_field_ids_delta) = merger_result.facet_field_ids_delta {
|
|
|
|
// compute_facet_level_database(index, wtxn, facet_field_ids_delta)?;
|
|
|
|
// }
|
|
|
|
|
|
|
|
if let Some(prefix_delta) = prefix_delta {
|
2024-10-01 16:13:08 +02:00
|
|
|
compute_prefix_database(index, wtxn, prefix_delta)?;
|
2024-10-01 09:56:49 +02:00
|
|
|
}
|
2024-09-02 15:10:21 +02:00
|
|
|
|
2024-10-09 11:35:45 +02:00
|
|
|
Result::Ok(())
|
2024-09-03 12:01:01 +02:00
|
|
|
})?;
|
|
|
|
|
2024-10-10 22:42:37 +02:00
|
|
|
// required to into_inner the new_fields_ids_map
|
2024-10-03 18:08:09 +02:00
|
|
|
drop(fields_ids_map_store);
|
|
|
|
|
|
|
|
let fields_ids_map = new_fields_ids_map.into_inner().unwrap();
|
2024-09-03 12:01:01 +02:00
|
|
|
index.put_fields_ids_map(wtxn, &fields_ids_map)?;
|
|
|
|
|
2024-10-16 09:27:00 +02:00
|
|
|
if let Some(new_primary_key) = new_primary_key {
|
|
|
|
index.put_primary_key(wtxn, new_primary_key.name())?;
|
|
|
|
}
|
|
|
|
|
2024-09-30 11:35:03 +02:00
|
|
|
// used to update the localized and weighted maps while sharing the update code with the settings pipeline.
|
|
|
|
let mut inner_index_settings = InnerIndexSettings::from_index(index, wtxn)?;
|
|
|
|
inner_index_settings.recompute_facets(wtxn, index)?;
|
|
|
|
inner_index_settings.recompute_searchables(wtxn, index)?;
|
|
|
|
|
2024-10-09 13:53:34 +02:00
|
|
|
index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?;
|
|
|
|
|
2024-09-03 12:01:01 +02:00
|
|
|
Ok(())
|
2024-09-02 10:42:19 +02:00
|
|
|
}
|
2024-09-02 19:39:48 +02:00
|
|
|
|
2024-10-01 16:13:08 +02:00
|
|
|
#[tracing::instrument(level = "trace", skip_all, target = "indexing::prefix")]
|
|
|
|
fn compute_prefix_database(
|
|
|
|
index: &Index,
|
|
|
|
wtxn: &mut RwTxn,
|
|
|
|
prefix_delta: PrefixDelta,
|
|
|
|
) -> Result<()> {
|
2024-10-03 10:40:31 +02:00
|
|
|
eprintln!("prefix_delta: {:?}", &prefix_delta);
|
2024-10-01 16:13:08 +02:00
|
|
|
let PrefixDelta { modified, deleted } = prefix_delta;
|
|
|
|
// Compute word prefix docids
|
|
|
|
compute_word_prefix_docids(wtxn, index, &modified, &deleted)?;
|
2024-10-14 11:12:10 +02:00
|
|
|
// Compute exact word prefix docids
|
|
|
|
compute_exact_word_prefix_docids(wtxn, index, &modified, &deleted)?;
|
2024-10-01 16:13:08 +02:00
|
|
|
// Compute word prefix fid docids
|
|
|
|
compute_word_prefix_fid_docids(wtxn, index, &modified, &deleted)?;
|
|
|
|
// Compute word prefix position docids
|
|
|
|
compute_word_prefix_position_docids(wtxn, index, &modified, &deleted)
|
|
|
|
}
|
|
|
|
|
|
|
|
#[tracing::instrument(level = "trace", skip_all, target = "indexing::facet_field_ids")]
|
|
|
|
fn compute_facet_level_database(
|
|
|
|
index: &Index,
|
|
|
|
wtxn: &mut RwTxn,
|
|
|
|
facet_field_ids_delta: FacetFieldIdsDelta,
|
|
|
|
) -> Result<()> {
|
2024-10-03 10:40:31 +02:00
|
|
|
eprintln!("facet_field_ids_delta: {:?}", &facet_field_ids_delta);
|
2024-10-01 16:13:08 +02:00
|
|
|
if let Some(modified_facet_string_ids) = facet_field_ids_delta.modified_facet_string_ids() {
|
|
|
|
let span = tracing::trace_span!(target: "indexing::facet_field_ids", "string");
|
|
|
|
let _entered = span.enter();
|
|
|
|
FacetsUpdateBulk::new_not_updating_level_0(
|
|
|
|
index,
|
|
|
|
modified_facet_string_ids,
|
|
|
|
FacetType::String,
|
|
|
|
)
|
|
|
|
.execute(wtxn)?;
|
|
|
|
}
|
|
|
|
if let Some(modified_facet_number_ids) = facet_field_ids_delta.modified_facet_number_ids() {
|
|
|
|
let span = tracing::trace_span!(target: "indexing::facet_field_ids", "number");
|
|
|
|
let _entered = span.enter();
|
|
|
|
FacetsUpdateBulk::new_not_updating_level_0(
|
|
|
|
index,
|
|
|
|
modified_facet_number_ids,
|
|
|
|
FacetType::Number,
|
|
|
|
)
|
|
|
|
.execute(wtxn)?;
|
|
|
|
}
|
|
|
|
|
|
|
|
Ok(())
|
|
|
|
}
|
|
|
|
|
2024-10-16 09:27:00 +02:00
|
|
|
/// Returns the primary key that has already been set for this index or the
|
|
|
|
/// one we will guess by searching for the first key that contains "id" as a substring,
|
|
|
|
/// and whether the primary key changed
|
2024-09-02 19:39:48 +02:00
|
|
|
/// TODO move this elsewhere
|
2024-09-11 15:59:30 +02:00
|
|
|
pub fn retrieve_or_guess_primary_key<'a>(
|
2024-09-02 19:39:48 +02:00
|
|
|
rtxn: &'a RoTxn<'a>,
|
|
|
|
index: &Index,
|
2024-10-16 09:27:00 +02:00
|
|
|
new_fields_ids_map: &mut FieldsIdsMap,
|
|
|
|
primary_key_from_op: Option<&'a str>,
|
|
|
|
first_document: Option<&'a TopLevelMap<'a>>,
|
|
|
|
) -> Result<StdResult<(PrimaryKey<'a>, bool), UserError>> {
|
|
|
|
// make sure that we have a declared primary key, either fetching it from the index or attempting to guess it.
|
|
|
|
|
|
|
|
// do we have an existing declared primary key?
|
|
|
|
let (primary_key, has_changed) = if let Some(primary_key_from_db) = index.primary_key(rtxn)? {
|
|
|
|
// did we request a primary key in the operation?
|
|
|
|
match primary_key_from_op {
|
|
|
|
// we did, and it is different from the DB one
|
|
|
|
Some(primary_key_from_op) if primary_key_from_op != primary_key_from_db => {
|
|
|
|
// is the index empty?
|
|
|
|
if index.number_of_documents(rtxn)? == 0 {
|
|
|
|
// change primary key
|
|
|
|
(primary_key_from_op, true)
|
|
|
|
} else {
|
|
|
|
return Ok(Err(UserError::PrimaryKeyCannotBeChanged(
|
|
|
|
primary_key_from_db.to_string(),
|
|
|
|
)));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
_ => (primary_key_from_db, false),
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
// no primary key in the DB => let's set one
|
|
|
|
// did we request a primary key in the operation?
|
|
|
|
let primary_key = if let Some(primary_key_from_op) = primary_key_from_op {
|
|
|
|
// set primary key from operation
|
|
|
|
primary_key_from_op
|
|
|
|
} else {
|
|
|
|
// guess primary key
|
2024-09-11 15:59:30 +02:00
|
|
|
let first_document = match first_document {
|
|
|
|
Some(document) => document,
|
2024-10-16 09:27:00 +02:00
|
|
|
// previous indexer when no pk is set + we send an empty payload => index_primary_key_no_candidate_found
|
2024-09-11 15:59:30 +02:00
|
|
|
None => return Ok(Err(UserError::NoPrimaryKeyCandidateFound)),
|
|
|
|
};
|
|
|
|
|
|
|
|
let mut guesses: Vec<&str> = first_document
|
|
|
|
.keys()
|
|
|
|
.map(AsRef::as_ref)
|
|
|
|
.filter(|name| name.to_lowercase().ends_with(DEFAULT_PRIMARY_KEY))
|
2024-09-02 19:39:48 +02:00
|
|
|
.collect();
|
|
|
|
|
2024-09-11 15:59:30 +02:00
|
|
|
// sort the keys in lexicographical order, so that fields are always in the same order.
|
|
|
|
guesses.sort_unstable();
|
2024-09-02 19:39:48 +02:00
|
|
|
|
|
|
|
match guesses.as_slice() {
|
2024-10-16 09:27:00 +02:00
|
|
|
[] => return Ok(Err(UserError::NoPrimaryKeyCandidateFound)),
|
2024-09-11 15:59:30 +02:00
|
|
|
[name] => {
|
2024-09-02 19:39:48 +02:00
|
|
|
tracing::info!("Primary key was not specified in index. Inferred to '{name}'");
|
2024-10-16 09:27:00 +02:00
|
|
|
*name
|
|
|
|
}
|
|
|
|
multiple => {
|
|
|
|
return Ok(Err(UserError::MultiplePrimaryKeyCandidatesFound {
|
|
|
|
candidates: multiple
|
|
|
|
.iter()
|
|
|
|
.map(|candidate| candidate.to_string())
|
|
|
|
.collect(),
|
|
|
|
}))
|
2024-09-02 19:39:48 +02:00
|
|
|
}
|
|
|
|
}
|
2024-10-16 09:27:00 +02:00
|
|
|
};
|
|
|
|
(primary_key, true)
|
|
|
|
};
|
|
|
|
|
|
|
|
match PrimaryKey::new_or_insert(primary_key, new_fields_ids_map) {
|
|
|
|
Ok(primary_key) => Ok(Ok((primary_key, has_changed))),
|
|
|
|
Err(err) => Ok(Err(err)),
|
2024-09-02 19:39:48 +02:00
|
|
|
}
|
|
|
|
}
|