mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-11-25 19:45:05 +08:00
Find a temporary solution to par into iter on an HashMap
Spoiler: Do not use an HashMap but drain it into a Vec
This commit is contained in:
parent
9b7858fb90
commit
bcb1aa3d22
3
Cargo.lock
generated
3
Cargo.lock
generated
@ -4657,8 +4657,7 @@ dependencies = [
|
|||||||
[[package]]
|
[[package]]
|
||||||
name = "roaring"
|
name = "roaring"
|
||||||
version = "0.10.6"
|
version = "0.10.6"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "git+https://github.com/RoaringBitmap/roaring-rs?branch=clone-iter-slice#348e58c2312fc37c0f351373cc7338cea86cf828"
|
||||||
checksum = "8f4b84ba6e838ceb47b41de5194a60244fac43d9fe03b71dbe8c5a201081d6d1"
|
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"bytemuck",
|
"bytemuck",
|
||||||
"byteorder",
|
"byteorder",
|
||||||
|
@ -64,3 +64,6 @@ opt-level = 3
|
|||||||
opt-level = 3
|
opt-level = 3
|
||||||
[profile.bench.package.yada]
|
[profile.bench.package.yada]
|
||||||
opt-level = 3
|
opt-level = 3
|
||||||
|
|
||||||
|
[patch.crates-io]
|
||||||
|
roaring = { git = "https://github.com/RoaringBitmap/roaring-rs", branch = "clone-iter-slice" }
|
||||||
|
@ -22,19 +22,21 @@ use std::ffi::OsStr;
|
|||||||
use std::fmt;
|
use std::fmt;
|
||||||
use std::fs::{self, File};
|
use std::fs::{self, File};
|
||||||
use std::io::BufWriter;
|
use std::io::BufWriter;
|
||||||
|
use std::sync::RwLock;
|
||||||
|
|
||||||
use dump::IndexMetadata;
|
use dump::IndexMetadata;
|
||||||
use meilisearch_types::error::Code;
|
use meilisearch_types::error::Code;
|
||||||
use meilisearch_types::heed::{RoTxn, RwTxn};
|
use meilisearch_types::heed::{RoTxn, RwTxn};
|
||||||
use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader};
|
use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader, PrimaryKey};
|
||||||
use meilisearch_types::milli::heed::CompactionOption;
|
use meilisearch_types::milli::heed::CompactionOption;
|
||||||
|
use meilisearch_types::milli::update::new::indexer::{self, guess_primary_key, DocumentChanges};
|
||||||
use meilisearch_types::milli::update::{
|
use meilisearch_types::milli::update::{
|
||||||
IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings as MilliSettings,
|
self, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings as MilliSettings,
|
||||||
};
|
};
|
||||||
use meilisearch_types::milli::vector::parsed_vectors::{
|
use meilisearch_types::milli::vector::parsed_vectors::{
|
||||||
ExplicitVectors, VectorOrArrayOfVectors, RESERVED_VECTORS_FIELD_NAME,
|
ExplicitVectors, VectorOrArrayOfVectors, RESERVED_VECTORS_FIELD_NAME,
|
||||||
};
|
};
|
||||||
use meilisearch_types::milli::{self, Filter, Object};
|
use meilisearch_types::milli::{self, Filter, Object, UserError};
|
||||||
use meilisearch_types::settings::{apply_settings_to_builder, Settings, Unchecked};
|
use meilisearch_types::settings::{apply_settings_to_builder, Settings, Unchecked};
|
||||||
use meilisearch_types::tasks::{Details, IndexSwap, Kind, KindWithContent, Status, Task};
|
use meilisearch_types::tasks::{Details, IndexSwap, Kind, KindWithContent, Status, Task};
|
||||||
use meilisearch_types::{compression, Index, VERSION_FILE_NAME};
|
use meilisearch_types::{compression, Index, VERSION_FILE_NAME};
|
||||||
@ -1284,58 +1286,72 @@ impl IndexScheduler {
|
|||||||
let must_stop_processing = self.must_stop_processing.clone();
|
let must_stop_processing = self.must_stop_processing.clone();
|
||||||
let indexer_config = self.index_mapper.indexer_config();
|
let indexer_config = self.index_mapper.indexer_config();
|
||||||
|
|
||||||
if let Some(primary_key) = primary_key {
|
/// TODO manage errors correctly
|
||||||
match index.primary_key(index_wtxn)? {
|
let rtxn = index.read_txn()?;
|
||||||
// if a primary key was set AND had already been defined in the index
|
let first_addition_uuid = operations
|
||||||
// but to a different value, we can make the whole batch fail.
|
.iter()
|
||||||
Some(pk) => {
|
.find_map(|op| match op {
|
||||||
if primary_key != pk {
|
DocumentOperation::Add(content_uuid) => Some(content_uuid),
|
||||||
return Err(milli::Error::from(
|
_ => None,
|
||||||
milli::UserError::PrimaryKeyCannotBeChanged(pk.to_string()),
|
})
|
||||||
)
|
.unwrap();
|
||||||
.into());
|
let content_file = self.file_store.get_update(*first_addition_uuid)?;
|
||||||
}
|
let reader =
|
||||||
}
|
DocumentsBatchReader::from_reader(content_file).map_err(milli::Error::from)?;
|
||||||
// if the primary key was set and there was no primary key set for this index
|
let (cursor, documents_batch_index) = reader.into_cursor_and_fields_index();
|
||||||
// we set it to the received value before starting the indexing process.
|
let primary_key =
|
||||||
None => {
|
guess_primary_key(&rtxn, index, cursor, &documents_batch_index)?.unwrap();
|
||||||
let mut builder =
|
|
||||||
milli::update::Settings::new(index_wtxn, index, indexer_config);
|
|
||||||
builder.set_primary_key(primary_key);
|
|
||||||
builder.execute(
|
|
||||||
|indexing_step| tracing::debug!(update = ?indexing_step),
|
|
||||||
|| must_stop_processing.clone().get(),
|
|
||||||
)?;
|
|
||||||
primary_key_has_been_set = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let config = IndexDocumentsConfig { update_method: method, ..Default::default() };
|
// if let Some(primary_key) = primary_key {
|
||||||
|
// match index.primary_key(index_wtxn)? {
|
||||||
|
// // if a primary key was set AND had already been defined in the index
|
||||||
|
// // but to a different value, we can make the whole batch fail.
|
||||||
|
// Some(pk) => {
|
||||||
|
// if primary_key != pk {
|
||||||
|
// return Err(milli::Error::from(
|
||||||
|
// milli::UserError::PrimaryKeyCannotBeChanged(pk.to_string()),
|
||||||
|
// )
|
||||||
|
// .into());
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
// // if the primary key was set and there was no primary key set for this index
|
||||||
|
// // we set it to the received value before starting the indexing process.
|
||||||
|
// None => {
|
||||||
|
// todo!();
|
||||||
|
// let mut builder =
|
||||||
|
// milli::update::Settings::new(index_wtxn, index, indexer_config);
|
||||||
|
// builder.set_primary_key(primary_key);
|
||||||
|
// builder.execute(
|
||||||
|
// |indexing_step| tracing::debug!(update = ?indexing_step),
|
||||||
|
// || must_stop_processing.clone().get(),
|
||||||
|
// )?;
|
||||||
|
// primary_key_has_been_set = true;
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
|
||||||
let embedder_configs = index.embedding_configs(index_wtxn)?;
|
// let config = IndexDocumentsConfig { update_method: method, ..Default::default() };
|
||||||
// TODO: consider Arc'ing the map too (we only need read access + we'll be cloning it multiple times, so really makes sense)
|
|
||||||
let embedders = self.embedders(embedder_configs)?;
|
|
||||||
|
|
||||||
let mut builder = milli::update::IndexDocuments::new(
|
// let embedder_configs = index.embedding_configs(index_wtxn)?;
|
||||||
index_wtxn,
|
// // TODO: consider Arc'ing the map too (we only need read access + we'll be cloning it multiple times, so really makes sense)
|
||||||
index,
|
// let embedders = self.embedders(embedder_configs)?;
|
||||||
indexer_config,
|
|
||||||
config,
|
|
||||||
|indexing_step| tracing::trace!(?indexing_step, "Update"),
|
|
||||||
|| must_stop_processing.get(),
|
|
||||||
)?;
|
|
||||||
|
|
||||||
|
// let mut builder = milli::update::IndexDocuments::new(
|
||||||
|
// index_wtxn,
|
||||||
|
// index,
|
||||||
|
// indexer_config,
|
||||||
|
// config,
|
||||||
|
// |indexing_step| tracing::trace!(?indexing_step, "Update"),
|
||||||
|
// || must_stop_processing.get(),
|
||||||
|
// )?;
|
||||||
|
|
||||||
|
let mut indexer = indexer::DocumentOperation::new(method);
|
||||||
for (operation, task) in operations.into_iter().zip(tasks.iter_mut()) {
|
for (operation, task) in operations.into_iter().zip(tasks.iter_mut()) {
|
||||||
match operation {
|
match operation {
|
||||||
DocumentOperation::Add(content_uuid) => {
|
DocumentOperation::Add(content_uuid) => {
|
||||||
let content_file = self.file_store.get_update(content_uuid)?;
|
let content_file = self.file_store.get_update(content_uuid)?;
|
||||||
let reader = DocumentsBatchReader::from_reader(content_file)
|
let stats = indexer.add_documents(content_file)?;
|
||||||
.map_err(milli::Error::from)?;
|
// builder = builder.with_embedders(embedders.clone());
|
||||||
let (new_builder, user_result) = builder.add_documents(reader)?;
|
|
||||||
builder = new_builder;
|
|
||||||
|
|
||||||
builder = builder.with_embedders(embedders.clone());
|
|
||||||
|
|
||||||
let received_documents =
|
let received_documents =
|
||||||
if let Some(Details::DocumentAdditionOrUpdate {
|
if let Some(Details::DocumentAdditionOrUpdate {
|
||||||
@ -1349,30 +1365,17 @@ impl IndexScheduler {
|
|||||||
unreachable!();
|
unreachable!();
|
||||||
};
|
};
|
||||||
|
|
||||||
match user_result {
|
|
||||||
Ok(count) => {
|
|
||||||
task.status = Status::Succeeded;
|
task.status = Status::Succeeded;
|
||||||
task.details = Some(Details::DocumentAdditionOrUpdate {
|
task.details = Some(Details::DocumentAdditionOrUpdate {
|
||||||
received_documents,
|
received_documents,
|
||||||
indexed_documents: Some(count),
|
indexed_documents: Some(stats.document_count as u64),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
Err(e) => {
|
|
||||||
task.status = Status::Failed;
|
|
||||||
task.details = Some(Details::DocumentAdditionOrUpdate {
|
|
||||||
received_documents,
|
|
||||||
indexed_documents: Some(0),
|
|
||||||
});
|
|
||||||
task.error = Some(milli::Error::from(e).into());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
DocumentOperation::Delete(document_ids) => {
|
DocumentOperation::Delete(document_ids) => {
|
||||||
let (new_builder, user_result) =
|
let count = document_ids.len();
|
||||||
builder.remove_documents(document_ids)?;
|
indexer.delete_documents(document_ids);
|
||||||
builder = new_builder;
|
|
||||||
// Uses Invariant: remove documents actually always returns Ok for the inner result
|
// Uses Invariant: remove documents actually always returns Ok for the inner result
|
||||||
let count = user_result.unwrap();
|
// let count = user_result.unwrap();
|
||||||
let provided_ids =
|
let provided_ids =
|
||||||
if let Some(Details::DocumentDeletion { provided_ids, .. }) =
|
if let Some(Details::DocumentDeletion { provided_ids, .. }) =
|
||||||
task.details
|
task.details
|
||||||
@ -1386,15 +1389,26 @@ impl IndexScheduler {
|
|||||||
task.status = Status::Succeeded;
|
task.status = Status::Succeeded;
|
||||||
task.details = Some(Details::DocumentDeletion {
|
task.details = Some(Details::DocumentDeletion {
|
||||||
provided_ids,
|
provided_ids,
|
||||||
deleted_documents: Some(count),
|
deleted_documents: Some(count as u64),
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if !tasks.iter().all(|res| res.error.is_some()) {
|
if !tasks.iter().all(|res| res.error.is_some()) {
|
||||||
let addition = builder.execute()?;
|
let mut fields_ids_map = index.fields_ids_map(&rtxn)?;
|
||||||
tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done");
|
/// TODO create a pool if needed
|
||||||
|
// let pool = indexer_config.thread_pool.unwrap();
|
||||||
|
let pool = rayon::ThreadPoolBuilder::new().build().unwrap();
|
||||||
|
// let fields_ids_map = RwLock::new(fields_ids_map);
|
||||||
|
let param = (index, &rtxn, &mut fields_ids_map, &primary_key);
|
||||||
|
let document_changes = indexer.document_changes(param)?;
|
||||||
|
indexer::index(index_wtxn, index, &pool, document_changes)?;
|
||||||
|
|
||||||
|
/// TODO we must store it or not?
|
||||||
|
let fields_ids_map = fields_ids_map;
|
||||||
|
|
||||||
|
// tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done");
|
||||||
} else if primary_key_has_been_set {
|
} else if primary_key_has_been_set {
|
||||||
// Everything failed but we've set a primary key.
|
// Everything failed but we've set a primary key.
|
||||||
// We need to remove it.
|
// We need to remove it.
|
||||||
|
@ -4,6 +4,8 @@ use serde::{Deserialize, Serialize};
|
|||||||
|
|
||||||
use crate::FieldId;
|
use crate::FieldId;
|
||||||
|
|
||||||
|
mod global;
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
pub struct FieldsIdsMap {
|
pub struct FieldsIdsMap {
|
||||||
names_ids: BTreeMap<String, FieldId>,
|
names_ids: BTreeMap<String, FieldId>,
|
||||||
|
84
milli/src/fields_ids_map/global.rs
Normal file
84
milli/src/fields_ids_map/global.rs
Normal file
@ -0,0 +1,84 @@
|
|||||||
|
use std::collections::BTreeMap;
|
||||||
|
use std::sync::RwLock;
|
||||||
|
|
||||||
|
use crate::{FieldId, FieldsIdsMap};
|
||||||
|
|
||||||
|
/// A fields ids map that can be globally updated to add fields
|
||||||
|
pub struct GlobalFieldsIdsMap<'indexing> {
|
||||||
|
global: &'indexing RwLock<FieldsIdsMap>,
|
||||||
|
local: LocalFieldsIdsMap,
|
||||||
|
}
|
||||||
|
|
||||||
|
struct LocalFieldsIdsMap {
|
||||||
|
names_ids: BTreeMap<String, FieldId>,
|
||||||
|
ids_names: BTreeMap<FieldId, String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl LocalFieldsIdsMap {
|
||||||
|
fn new(global: &RwLock<FieldsIdsMap>) -> Self {
|
||||||
|
let global = global.read().unwrap();
|
||||||
|
Self { names_ids: global.names_ids.clone(), ids_names: global.ids_names.clone() }
|
||||||
|
}
|
||||||
|
|
||||||
|
fn insert(&mut self, name: &str, field_id: FieldId) {
|
||||||
|
self.names_ids.insert(name.to_owned(), field_id);
|
||||||
|
self.ids_names.insert(field_id, name.to_owned());
|
||||||
|
}
|
||||||
|
|
||||||
|
fn name(&self, id: FieldId) -> Option<&str> {
|
||||||
|
self.ids_names.get(&id).map(String::as_str)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn id(&self, name: &str) -> Option<FieldId> {
|
||||||
|
self.names_ids.get(name).copied()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'indexing> GlobalFieldsIdsMap<'indexing> {
|
||||||
|
pub fn new(global: &'indexing RwLock<FieldsIdsMap>) -> Self {
|
||||||
|
Self { local: LocalFieldsIdsMap::new(global), global }
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns the field id related to a field name, it will create a new field id if the
|
||||||
|
/// name is not already known. Returns `None` if the maximum field id as been reached.
|
||||||
|
pub fn id_or_insert(&mut self, name: &str) -> Option<FieldId> {
|
||||||
|
if let Some(field_id) = self.local.id(name) {
|
||||||
|
return Some(field_id);
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
// optimistically lookup the global map
|
||||||
|
let global = self.global.read().unwrap();
|
||||||
|
|
||||||
|
if let Some(field_id) = global.id(name) {
|
||||||
|
self.local.insert(name, field_id);
|
||||||
|
return Some(field_id);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
let mut global = self.global.write().unwrap();
|
||||||
|
|
||||||
|
if let Some(field_id) = global.id(name) {
|
||||||
|
self.local.insert(name, field_id);
|
||||||
|
return Some(field_id);
|
||||||
|
}
|
||||||
|
|
||||||
|
let field_id = global.insert(name)?;
|
||||||
|
self.local.insert(name, field_id);
|
||||||
|
Some(field_id)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get the name of a field based on its id.
|
||||||
|
pub fn name(&mut self, id: FieldId) -> Option<&str> {
|
||||||
|
if self.local.name(id).is_none() {
|
||||||
|
let global = self.global.read().unwrap();
|
||||||
|
|
||||||
|
let name = global.name(id)?;
|
||||||
|
self.local.insert(name, id);
|
||||||
|
}
|
||||||
|
|
||||||
|
self.local.name(id)
|
||||||
|
}
|
||||||
|
}
|
@ -1,65 +0,0 @@
|
|||||||
use std::sync::{Arc, RwLock};
|
|
||||||
|
|
||||||
use crate::{FieldId, FieldsIdsMap};
|
|
||||||
|
|
||||||
/// A fields ids map that can be globally updated to add fields
|
|
||||||
pub struct GlobalFieldsIdsMap {
|
|
||||||
global: Arc<RwLock<FieldsIdsMap>>,
|
|
||||||
local: FieldsIdsMap,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl GlobalFieldsIdsMap {
|
|
||||||
pub fn new(global: FieldsIdsMap) -> Self {
|
|
||||||
Self { local: global.clone(), global: Arc::new(RwLock::new(global)) }
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Returns the number of fields ids in the map.
|
|
||||||
pub fn global_len(&self) -> usize {
|
|
||||||
todo!()
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Returns `true` if the map is empty.
|
|
||||||
pub fn global_is_empty(&self) -> bool {
|
|
||||||
todo!()
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Returns the field id related to a field name, it will create a new field id if the
|
|
||||||
/// name is not already known. Returns `None` if the maximum field id as been reached.
|
|
||||||
pub fn insert(&mut self, name: &str) -> Option<FieldId> {
|
|
||||||
match self.names_ids.get(name) {
|
|
||||||
Some(id) => Some(*id),
|
|
||||||
None => {
|
|
||||||
let id = self.next_id?;
|
|
||||||
self.next_id = id.checked_add(1);
|
|
||||||
self.names_ids.insert(name.to_owned(), id);
|
|
||||||
self.ids_names.insert(id, name.to_owned());
|
|
||||||
Some(id)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Get the id of a field based on its name.
|
|
||||||
pub fn id(&self, name: &str) -> Option<FieldId> {
|
|
||||||
self.names_ids.get(name).copied()
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Get the name of a field based on its id.
|
|
||||||
pub fn name(&self, id: FieldId) -> Option<&str> {
|
|
||||||
self.ids_names.get(&id).map(String::as_str)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Iterate over the ids and names in the ids order.
|
|
||||||
pub fn iter(&self) -> impl Iterator<Item = (FieldId, &str)> {
|
|
||||||
self.ids_names.iter().map(|(id, name)| (*id, name.as_str()))
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Iterate over the ids in the order of the ids.
|
|
||||||
pub fn ids(&'_ self) -> impl Iterator<Item = FieldId> + '_ {
|
|
||||||
self.ids_names.keys().copied()
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Iterate over the names in the order of the ids.
|
|
||||||
pub fn names(&self) -> impl Iterator<Item = &str> {
|
|
||||||
self.ids_names.values().map(AsRef::as_ref)
|
|
||||||
}
|
|
||||||
}
|
|
@ -28,7 +28,7 @@ impl<'p> DocumentChanges<'p> for DocumentDeletion {
|
|||||||
fn document_changes(
|
fn document_changes(
|
||||||
self,
|
self,
|
||||||
param: Self::Parameter,
|
param: Self::Parameter,
|
||||||
) -> Result<impl ParallelIterator<Item = Result<DocumentChange>> + 'p> {
|
) -> Result<impl ParallelIterator<Item = Result<DocumentChange>> + Clone + 'p> {
|
||||||
let (index, fields, primary_key) = param;
|
let (index, fields, primary_key) = param;
|
||||||
let items = Arc::new(ItemsPool::new(|| index.read_txn().map_err(crate::Error::from)));
|
let items = Arc::new(ItemsPool::new(|| index.read_txn().map_err(crate::Error::from)));
|
||||||
Ok(self.to_delete.into_iter().par_bridge().map_with(items, |items, docid| {
|
Ok(self.to_delete.into_iter().par_bridge().map_with(items, |items, docid| {
|
||||||
|
@ -34,6 +34,7 @@ pub struct PayloadStats {
|
|||||||
pub bytes: u64,
|
pub bytes: u64,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Clone)]
|
||||||
enum InnerDocOp {
|
enum InnerDocOp {
|
||||||
Addition(DocumentOffset),
|
Addition(DocumentOffset),
|
||||||
Deletion,
|
Deletion,
|
||||||
@ -41,6 +42,7 @@ enum InnerDocOp {
|
|||||||
|
|
||||||
/// Represents an offset where a document lives
|
/// Represents an offset where a document lives
|
||||||
/// in an mmapped grenad reader file.
|
/// in an mmapped grenad reader file.
|
||||||
|
#[derive(Clone)]
|
||||||
pub struct DocumentOffset {
|
pub struct DocumentOffset {
|
||||||
/// The mmapped grenad reader file.
|
/// The mmapped grenad reader file.
|
||||||
pub content: Arc<Mmap>, // grenad::Reader
|
pub content: Arc<Mmap>, // grenad::Reader
|
||||||
@ -76,7 +78,7 @@ impl<'p> DocumentChanges<'p> for DocumentOperation {
|
|||||||
fn document_changes(
|
fn document_changes(
|
||||||
self,
|
self,
|
||||||
param: Self::Parameter,
|
param: Self::Parameter,
|
||||||
) -> Result<impl ParallelIterator<Item = Result<DocumentChange>> + 'p> {
|
) -> Result<impl ParallelIterator<Item = Result<DocumentChange>> + Clone + 'p> {
|
||||||
let (index, rtxn, fields_ids_map, primary_key) = param;
|
let (index, rtxn, fields_ids_map, primary_key) = param;
|
||||||
|
|
||||||
let documents_ids = index.documents_ids(rtxn)?;
|
let documents_ids = index.documents_ids(rtxn)?;
|
||||||
@ -170,6 +172,11 @@ impl<'p> DocumentChanges<'p> for DocumentOperation {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// TODO is it the best way to provide FieldsIdsMap to the parallel iterator?
|
||||||
|
let fields_ids_map = fields_ids_map.clone();
|
||||||
|
// We must drain the HashMap into a Vec because rayon::hash_map::IntoIter: !Clone
|
||||||
|
let docids_version_offsets: Vec<_> = docids_version_offsets.drain().collect();
|
||||||
|
|
||||||
Ok(docids_version_offsets
|
Ok(docids_version_offsets
|
||||||
.into_par_iter()
|
.into_par_iter()
|
||||||
.map_with(
|
.map_with(
|
||||||
@ -177,6 +184,7 @@ impl<'p> DocumentChanges<'p> for DocumentOperation {
|
|||||||
move |context_pool, (external_docid, (internal_docid, operations))| {
|
move |context_pool, (external_docid, (internal_docid, operations))| {
|
||||||
context_pool.with(|rtxn| {
|
context_pool.with(|rtxn| {
|
||||||
use IndexDocumentsMethod as Idm;
|
use IndexDocumentsMethod as Idm;
|
||||||
|
|
||||||
let document_merge_function = match self.index_documents_method {
|
let document_merge_function = match self.index_documents_method {
|
||||||
Idm::ReplaceDocuments => merge_document_for_replacements,
|
Idm::ReplaceDocuments => merge_document_for_replacements,
|
||||||
Idm::UpdateDocuments => merge_document_for_updates,
|
Idm::UpdateDocuments => merge_document_for_updates,
|
||||||
@ -185,7 +193,7 @@ impl<'p> DocumentChanges<'p> for DocumentOperation {
|
|||||||
document_merge_function(
|
document_merge_function(
|
||||||
rtxn,
|
rtxn,
|
||||||
index,
|
index,
|
||||||
fields_ids_map,
|
&fields_ids_map,
|
||||||
internal_docid,
|
internal_docid,
|
||||||
external_docid,
|
external_docid,
|
||||||
&operations,
|
&operations,
|
||||||
|
@ -1,9 +1,10 @@
|
|||||||
|
use std::fs::File;
|
||||||
use std::thread::{self, Builder};
|
use std::thread::{self, Builder};
|
||||||
|
|
||||||
use big_s::S;
|
use big_s::S;
|
||||||
pub use document_deletion::DocumentDeletion;
|
pub use document_deletion::DocumentDeletion;
|
||||||
pub use document_operation::DocumentOperation;
|
pub use document_operation::DocumentOperation;
|
||||||
use heed::RwTxn;
|
use heed::{RoTxn, RwTxn};
|
||||||
pub use partial_dump::PartialDump;
|
pub use partial_dump::PartialDump;
|
||||||
use rayon::iter::{IntoParallelIterator, ParallelIterator};
|
use rayon::iter::{IntoParallelIterator, ParallelIterator};
|
||||||
use rayon::ThreadPool;
|
use rayon::ThreadPool;
|
||||||
@ -15,7 +16,11 @@ use super::channel::{
|
|||||||
};
|
};
|
||||||
use super::document_change::DocumentChange;
|
use super::document_change::DocumentChange;
|
||||||
use super::merger::merge_grenad_entries;
|
use super::merger::merge_grenad_entries;
|
||||||
use crate::{Index, Result};
|
use super::StdResult;
|
||||||
|
use crate::documents::{
|
||||||
|
obkv_to_object, DocumentsBatchCursor, DocumentsBatchIndex, PrimaryKey, DEFAULT_PRIMARY_KEY,
|
||||||
|
};
|
||||||
|
use crate::{Index, Result, UserError};
|
||||||
|
|
||||||
mod document_deletion;
|
mod document_deletion;
|
||||||
mod document_operation;
|
mod document_operation;
|
||||||
@ -28,7 +33,7 @@ pub trait DocumentChanges<'p> {
|
|||||||
fn document_changes(
|
fn document_changes(
|
||||||
self,
|
self,
|
||||||
param: Self::Parameter,
|
param: Self::Parameter,
|
||||||
) -> Result<impl ParallelIterator<Item = Result<DocumentChange>> + 'p>;
|
) -> Result<impl ParallelIterator<Item = Result<DocumentChange>> + Clone + 'p>;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// This is the main function of this crate.
|
/// This is the main function of this crate.
|
||||||
@ -40,7 +45,7 @@ pub fn index<PI>(
|
|||||||
wtxn: &mut RwTxn,
|
wtxn: &mut RwTxn,
|
||||||
index: &Index,
|
index: &Index,
|
||||||
pool: &ThreadPool,
|
pool: &ThreadPool,
|
||||||
document_changes: PI,
|
_document_changes: PI,
|
||||||
) -> Result<()>
|
) -> Result<()>
|
||||||
where
|
where
|
||||||
PI: IntoParallelIterator<Item = Result<DocumentChange>> + Send,
|
PI: IntoParallelIterator<Item = Result<DocumentChange>> + Send,
|
||||||
@ -88,3 +93,56 @@ where
|
|||||||
Ok(())
|
Ok(())
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// TODO move this elsewhere
|
||||||
|
pub fn guess_primary_key<'a>(
|
||||||
|
rtxn: &'a RoTxn<'a>,
|
||||||
|
index: &Index,
|
||||||
|
mut cursor: DocumentsBatchCursor<File>,
|
||||||
|
documents_batch_index: &'a DocumentsBatchIndex,
|
||||||
|
) -> Result<StdResult<PrimaryKey<'a>, UserError>> {
|
||||||
|
// The primary key *field id* that has already been set for this index or the one
|
||||||
|
// we will guess by searching for the first key that contains "id" as a substring.
|
||||||
|
match index.primary_key(rtxn)? {
|
||||||
|
Some(primary_key) => match PrimaryKey::new(primary_key, documents_batch_index) {
|
||||||
|
Some(primary_key) => Ok(Ok(primary_key)),
|
||||||
|
None => match cursor.next_document()? {
|
||||||
|
Some(first_document) => Ok(Err(UserError::MissingDocumentId {
|
||||||
|
primary_key: primary_key.to_string(),
|
||||||
|
document: obkv_to_object(first_document, documents_batch_index)?,
|
||||||
|
})),
|
||||||
|
None => unreachable!("Called with reader.is_empty()"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
None => {
|
||||||
|
let mut guesses: Vec<(u16, &str)> = documents_batch_index
|
||||||
|
.iter()
|
||||||
|
.filter(|(_, name)| name.to_lowercase().ends_with(DEFAULT_PRIMARY_KEY))
|
||||||
|
.map(|(field_id, name)| (*field_id, name.as_str()))
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
// sort the keys in a deterministic, obvious way, so that fields are always in the same order.
|
||||||
|
guesses.sort_by(|(_, left_name), (_, right_name)| {
|
||||||
|
// shortest name first
|
||||||
|
left_name.len().cmp(&right_name.len()).then_with(
|
||||||
|
// then alphabetical order
|
||||||
|
|| left_name.cmp(right_name),
|
||||||
|
)
|
||||||
|
});
|
||||||
|
|
||||||
|
match guesses.as_slice() {
|
||||||
|
[] => Ok(Err(UserError::NoPrimaryKeyCandidateFound)),
|
||||||
|
[(field_id, name)] => {
|
||||||
|
tracing::info!("Primary key was not specified in index. Inferred to '{name}'");
|
||||||
|
Ok(Ok(PrimaryKey::Flat { name, field_id: *field_id }))
|
||||||
|
}
|
||||||
|
multiple => Ok(Err(UserError::MultiplePrimaryKeyCandidatesFound {
|
||||||
|
candidates: multiple
|
||||||
|
.iter()
|
||||||
|
.map(|(_, candidate)| candidate.to_string())
|
||||||
|
.collect(),
|
||||||
|
})),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
@ -7,7 +7,7 @@ use crate::update::new::{DocumentChange, Insertion, KvWriterFieldId};
|
|||||||
use crate::{all_obkv_to_json, Error, FieldsIdsMap, Object, Result, UserError};
|
use crate::{all_obkv_to_json, Error, FieldsIdsMap, Object, Result, UserError};
|
||||||
|
|
||||||
pub struct PartialDump<I> {
|
pub struct PartialDump<I> {
|
||||||
pub iter: I,
|
iter: I,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<I> PartialDump<I> {
|
impl<I> PartialDump<I> {
|
||||||
@ -19,7 +19,7 @@ impl<I> PartialDump<I> {
|
|||||||
impl<'p, I> DocumentChanges<'p> for PartialDump<I>
|
impl<'p, I> DocumentChanges<'p> for PartialDump<I>
|
||||||
where
|
where
|
||||||
I: IntoIterator<Item = Object>,
|
I: IntoIterator<Item = Object>,
|
||||||
I::IntoIter: Send + 'p,
|
I::IntoIter: Send + Clone + 'p,
|
||||||
I::Item: Send,
|
I::Item: Send,
|
||||||
{
|
{
|
||||||
type Parameter = (&'p FieldsIdsMap, &'p ConcurrentAvailableIds, &'p PrimaryKey<'p>);
|
type Parameter = (&'p FieldsIdsMap, &'p ConcurrentAvailableIds, &'p PrimaryKey<'p>);
|
||||||
@ -31,7 +31,7 @@ where
|
|||||||
fn document_changes(
|
fn document_changes(
|
||||||
self,
|
self,
|
||||||
param: Self::Parameter,
|
param: Self::Parameter,
|
||||||
) -> Result<impl ParallelIterator<Item = Result<DocumentChange>> + 'p> {
|
) -> Result<impl ParallelIterator<Item = Result<DocumentChange>> + Clone + 'p> {
|
||||||
let (fields_ids_map, concurrent_available_ids, primary_key) = param;
|
let (fields_ids_map, concurrent_available_ids, primary_key) = param;
|
||||||
|
|
||||||
Ok(self.iter.into_iter().par_bridge().map(|object| {
|
Ok(self.iter.into_iter().par_bridge().map(|object| {
|
||||||
|
@ -12,8 +12,7 @@ impl<'p> DocumentChanges<'p> for UpdateByFunction {
|
|||||||
fn document_changes(
|
fn document_changes(
|
||||||
self,
|
self,
|
||||||
_param: Self::Parameter,
|
_param: Self::Parameter,
|
||||||
) -> Result<impl ParallelIterator<Item = Result<DocumentChange>> + 'p> {
|
) -> Result<impl ParallelIterator<Item = Result<DocumentChange>> + Clone + 'p> {
|
||||||
todo!();
|
Ok((0..100).into_par_iter().map(|_| todo!()))
|
||||||
Ok(vec![].into_par_iter())
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -7,8 +7,8 @@ use crate::FieldId;
|
|||||||
mod document_change;
|
mod document_change;
|
||||||
mod merger;
|
mod merger;
|
||||||
// mod extract;
|
// mod extract;
|
||||||
// mod global_fields_ids_map;
|
|
||||||
mod channel;
|
mod channel;
|
||||||
|
//mod global_fields_ids_map;
|
||||||
pub mod indexer;
|
pub mod indexer;
|
||||||
mod items_pool;
|
mod items_pool;
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user