mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-11-26 12:05:05 +08:00
finish work
This commit is contained in:
parent
b5e4a55af6
commit
02c3d6b265
@ -34,6 +34,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
|||||||
let max_positions_per_attributes = max_positions_per_attributes
|
let max_positions_per_attributes = max_positions_per_attributes
|
||||||
.map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE));
|
.map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE));
|
||||||
let max_memory = indexer.max_memory_by_thread();
|
let max_memory = indexer.max_memory_by_thread();
|
||||||
|
let force_reindexing = settings_diff.reindex_searchable();
|
||||||
|
|
||||||
// initialize destination values.
|
// initialize destination values.
|
||||||
let mut documents_ids = RoaringBitmap::new();
|
let mut documents_ids = RoaringBitmap::new();
|
||||||
@ -54,12 +55,15 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
|||||||
let mut value_buffer = Vec::new();
|
let mut value_buffer = Vec::new();
|
||||||
|
|
||||||
// initialize tokenizer.
|
// initialize tokenizer.
|
||||||
// TODO: Fix ugly allocation
|
/// TODO: Fix ugly allocation
|
||||||
let old_stop_words = settings_diff.old.stop_words.as_ref();
|
let old_stop_words = settings_diff.old.stop_words.as_ref();
|
||||||
let old_separators: Option<Vec<_>> =
|
let old_separators: Option<Vec<_>> = settings_diff
|
||||||
settings_diff.old.allowed_separators.map(|s| s.iter().map(String::as_str).collect());
|
.old
|
||||||
|
.allowed_separators
|
||||||
|
.as_ref()
|
||||||
|
.map(|s| s.iter().map(String::as_str).collect());
|
||||||
let old_dictionary: Option<Vec<_>> =
|
let old_dictionary: Option<Vec<_>> =
|
||||||
settings_diff.old.dictionary.map(|s| s.iter().map(String::as_str).collect());
|
settings_diff.old.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
|
||||||
let mut del_builder = tokenizer_builder(
|
let mut del_builder = tokenizer_builder(
|
||||||
old_stop_words,
|
old_stop_words,
|
||||||
old_separators.as_deref(),
|
old_separators.as_deref(),
|
||||||
@ -68,12 +72,15 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
|||||||
);
|
);
|
||||||
let del_tokenizer = del_builder.build();
|
let del_tokenizer = del_builder.build();
|
||||||
|
|
||||||
// TODO: Fix ugly allocation
|
/// TODO: Fix ugly allocation
|
||||||
let new_stop_words = settings_diff.new.stop_words.as_ref();
|
let new_stop_words = settings_diff.new.stop_words.as_ref();
|
||||||
let new_separators: Option<Vec<_>> =
|
let new_separators: Option<Vec<_>> = settings_diff
|
||||||
settings_diff.new.allowed_separators.map(|s| s.iter().map(String::as_str).collect());
|
.new
|
||||||
|
.allowed_separators
|
||||||
|
.as_ref()
|
||||||
|
.map(|s| s.iter().map(String::as_str).collect());
|
||||||
let new_dictionary: Option<Vec<_>> =
|
let new_dictionary: Option<Vec<_>> =
|
||||||
settings_diff.new.dictionary.map(|s| s.iter().map(String::as_str).collect());
|
settings_diff.new.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
|
||||||
let mut add_builder = tokenizer_builder(
|
let mut add_builder = tokenizer_builder(
|
||||||
new_stop_words,
|
new_stop_words,
|
||||||
new_separators.as_deref(),
|
new_separators.as_deref(),
|
||||||
@ -92,10 +99,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
|||||||
let obkv = KvReader::<FieldId>::new(value);
|
let obkv = KvReader::<FieldId>::new(value);
|
||||||
|
|
||||||
// if the searchable fields didn't change, skip the searchable indexing for this document.
|
// if the searchable fields didn't change, skip the searchable indexing for this document.
|
||||||
if !searchable_fields_changed(
|
if !force_reindexing && !searchable_fields_changed(&obkv, settings_diff) {
|
||||||
&KvReader::<FieldId>::new(value),
|
|
||||||
&settings_diff.new.searchable_fields_ids,
|
|
||||||
) {
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -180,8 +184,9 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
|||||||
/// Check if any searchable fields of a document changed.
|
/// Check if any searchable fields of a document changed.
|
||||||
fn searchable_fields_changed(
|
fn searchable_fields_changed(
|
||||||
obkv: &KvReader<FieldId>,
|
obkv: &KvReader<FieldId>,
|
||||||
searchable_fields: &Option<Vec<FieldId>>,
|
settings_diff: &InnerIndexSettingsDiff,
|
||||||
) -> bool {
|
) -> bool {
|
||||||
|
let searchable_fields = &settings_diff.new.searchable_fields_ids;
|
||||||
for (field_id, field_bytes) in obkv.iter() {
|
for (field_id, field_bytes) in obkv.iter() {
|
||||||
if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) {
|
if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) {
|
||||||
let del_add = KvReaderDelAdd::new(field_bytes);
|
let del_add = KvReaderDelAdd::new(field_bytes);
|
||||||
@ -262,12 +267,14 @@ fn lang_safe_tokens_from_document<'a>(
|
|||||||
// then we don't rerun the extraction.
|
// then we don't rerun the extraction.
|
||||||
if !script_language.is_empty() {
|
if !script_language.is_empty() {
|
||||||
// build a new temporary tokenizer including the allow list.
|
// build a new temporary tokenizer including the allow list.
|
||||||
// TODO: Fix ugly allocation
|
/// TODO: Fix ugly allocation
|
||||||
let stop_words = settings.stop_words.as_ref();
|
let stop_words = settings.stop_words.as_ref();
|
||||||
let separators: Option<Vec<_>> =
|
let separators: Option<Vec<_>> = settings
|
||||||
settings.allowed_separators.map(|s| s.iter().map(String::as_str).collect());
|
.allowed_separators
|
||||||
|
.as_ref()
|
||||||
|
.map(|s| s.iter().map(String::as_str).collect());
|
||||||
let dictionary: Option<Vec<_>> =
|
let dictionary: Option<Vec<_>> =
|
||||||
settings.dictionary.map(|s| s.iter().map(String::as_str).collect());
|
settings.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
|
||||||
let mut builder =
|
let mut builder =
|
||||||
tokenizer_builder(stop_words, separators.as_deref(), dictionary.as_deref(), None);
|
tokenizer_builder(stop_words, separators.as_deref(), dictionary.as_deref(), None);
|
||||||
let tokenizer = builder.build();
|
let tokenizer = builder.build();
|
||||||
|
@ -17,8 +17,9 @@ use crate::error::UserError;
|
|||||||
use crate::prompt::Prompt;
|
use crate::prompt::Prompt;
|
||||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||||
use crate::update::index_documents::helpers::try_split_at;
|
use crate::update::index_documents::helpers::try_split_at;
|
||||||
|
use crate::update::settings::InnerIndexSettingsDiff;
|
||||||
use crate::vector::Embedder;
|
use crate::vector::Embedder;
|
||||||
use crate::{DocumentId, FieldsIdsMap, InternalError, Result, VectorOrArrayOfVectors};
|
use crate::{DocumentId, InternalError, Result, VectorOrArrayOfVectors};
|
||||||
|
|
||||||
/// The length of the elements that are always in the buffer when inserting new values.
|
/// The length of the elements that are always in the buffer when inserting new values.
|
||||||
const TRUNCATE_SIZE: usize = size_of::<DocumentId>();
|
const TRUNCATE_SIZE: usize = size_of::<DocumentId>();
|
||||||
@ -71,12 +72,15 @@ impl VectorStateDelta {
|
|||||||
pub fn extract_vector_points<R: io::Read + io::Seek>(
|
pub fn extract_vector_points<R: io::Read + io::Seek>(
|
||||||
obkv_documents: grenad::Reader<R>,
|
obkv_documents: grenad::Reader<R>,
|
||||||
indexer: GrenadParameters,
|
indexer: GrenadParameters,
|
||||||
field_id_map: &FieldsIdsMap,
|
settings_diff: &InnerIndexSettingsDiff,
|
||||||
prompt: &Prompt,
|
prompt: &Prompt,
|
||||||
embedder_name: &str,
|
embedder_name: &str,
|
||||||
) -> Result<ExtractedVectorPoints> {
|
) -> Result<ExtractedVectorPoints> {
|
||||||
puffin::profile_function!();
|
puffin::profile_function!();
|
||||||
|
|
||||||
|
let old_fields_ids_map = &settings_diff.old.fields_ids_map;
|
||||||
|
let new_fields_ids_map = &settings_diff.new.fields_ids_map;
|
||||||
|
|
||||||
// (docid, _index) -> KvWriterDelAdd -> Vector
|
// (docid, _index) -> KvWriterDelAdd -> Vector
|
||||||
let mut manual_vectors_writer = create_writer(
|
let mut manual_vectors_writer = create_writer(
|
||||||
indexer.chunk_compression_type,
|
indexer.chunk_compression_type,
|
||||||
@ -98,8 +102,6 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
|||||||
tempfile::tempfile()?,
|
tempfile::tempfile()?,
|
||||||
);
|
);
|
||||||
|
|
||||||
let vectors_fid = field_id_map.id("_vectors");
|
|
||||||
|
|
||||||
let mut key_buffer = Vec::new();
|
let mut key_buffer = Vec::new();
|
||||||
let mut cursor = obkv_documents.into_cursor()?;
|
let mut cursor = obkv_documents.into_cursor()?;
|
||||||
while let Some((key, value)) = cursor.move_on_next()? {
|
while let Some((key, value)) = cursor.move_on_next()? {
|
||||||
@ -116,15 +118,29 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
|||||||
// lazily get it when needed
|
// lazily get it when needed
|
||||||
let document_id = || -> Value { from_utf8(external_id_bytes).unwrap().into() };
|
let document_id = || -> Value { from_utf8(external_id_bytes).unwrap().into() };
|
||||||
|
|
||||||
let vectors_field = vectors_fid
|
// the vector field id may have changed
|
||||||
.and_then(|vectors_fid| obkv.get(vectors_fid))
|
let old_vectors_fid = old_fields_ids_map.id("_vectors");
|
||||||
.map(KvReaderDelAdd::new)
|
// filter out the old vector fid if the settings have been changed, forcing a reindexing.
|
||||||
.map(|obkv| to_vector_maps(obkv, document_id))
|
let old_vectors_fid = old_vectors_fid.filter(|_| !settings_diff.reindex_vectors());
|
||||||
.transpose()?;
|
|
||||||
|
|
||||||
let (del_map, add_map) = vectors_field.unzip();
|
let new_vectors_fid = new_fields_ids_map.id("_vectors");
|
||||||
let del_map = del_map.flatten();
|
let vectors_field = {
|
||||||
let add_map = add_map.flatten();
|
let del = old_vectors_fid
|
||||||
|
.and_then(|vectors_fid| obkv.get(vectors_fid))
|
||||||
|
.map(KvReaderDelAdd::new)
|
||||||
|
.map(|obkv| to_vector_map(obkv, DelAdd::Deletion, &document_id))
|
||||||
|
.transpose()?
|
||||||
|
.flatten();
|
||||||
|
let add = new_vectors_fid
|
||||||
|
.and_then(|vectors_fid| obkv.get(vectors_fid))
|
||||||
|
.map(KvReaderDelAdd::new)
|
||||||
|
.map(|obkv| to_vector_map(obkv, DelAdd::Addition, &document_id))
|
||||||
|
.transpose()?
|
||||||
|
.flatten();
|
||||||
|
(del, add)
|
||||||
|
};
|
||||||
|
|
||||||
|
let (del_map, add_map) = vectors_field;
|
||||||
|
|
||||||
let del_value = del_map.and_then(|mut map| map.remove(embedder_name));
|
let del_value = del_map.and_then(|mut map| map.remove(embedder_name));
|
||||||
let add_value = add_map.and_then(|mut map| map.remove(embedder_name));
|
let add_value = add_map.and_then(|mut map| map.remove(embedder_name));
|
||||||
@ -155,7 +171,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
|||||||
VectorStateDelta::NowGenerated(prompt.render(
|
VectorStateDelta::NowGenerated(prompt.render(
|
||||||
obkv,
|
obkv,
|
||||||
DelAdd::Addition,
|
DelAdd::Addition,
|
||||||
field_id_map,
|
&new_fields_ids_map,
|
||||||
)?)
|
)?)
|
||||||
} else {
|
} else {
|
||||||
VectorStateDelta::NowRemoved
|
VectorStateDelta::NowRemoved
|
||||||
@ -182,9 +198,10 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
|||||||
|
|
||||||
if document_is_kept {
|
if document_is_kept {
|
||||||
// Don't give up if the old prompt was failing
|
// Don't give up if the old prompt was failing
|
||||||
let old_prompt =
|
let old_prompt = prompt
|
||||||
prompt.render(obkv, DelAdd::Deletion, field_id_map).unwrap_or_default();
|
.render(obkv, DelAdd::Deletion, &old_fields_ids_map)
|
||||||
let new_prompt = prompt.render(obkv, DelAdd::Addition, field_id_map)?;
|
.unwrap_or_default();
|
||||||
|
let new_prompt = prompt.render(obkv, DelAdd::Addition, &new_fields_ids_map)?;
|
||||||
if old_prompt != new_prompt {
|
if old_prompt != new_prompt {
|
||||||
tracing::trace!(
|
tracing::trace!(
|
||||||
"🚀 Changing prompt from\n{old_prompt}\n===to===\n{new_prompt}"
|
"🚀 Changing prompt from\n{old_prompt}\n===to===\n{new_prompt}"
|
||||||
@ -220,15 +237,6 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
fn to_vector_maps(
|
|
||||||
obkv: KvReaderDelAdd,
|
|
||||||
document_id: impl Fn() -> Value,
|
|
||||||
) -> Result<(Option<serde_json::Map<String, Value>>, Option<serde_json::Map<String, Value>>)> {
|
|
||||||
let del = to_vector_map(obkv, DelAdd::Deletion, &document_id)?;
|
|
||||||
let add = to_vector_map(obkv, DelAdd::Addition, &document_id)?;
|
|
||||||
Ok((del, add))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn to_vector_map(
|
fn to_vector_map(
|
||||||
obkv: KvReaderDelAdd,
|
obkv: KvReaderDelAdd,
|
||||||
side: DelAdd,
|
side: DelAdd,
|
||||||
|
@ -121,16 +121,16 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
|
|||||||
let (w, fid) = StrBEU16Codec::bytes_decode(key)
|
let (w, fid) = StrBEU16Codec::bytes_decode(key)
|
||||||
.map_err(|_| SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
.map_err(|_| SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
||||||
|
|
||||||
if let Some(word) = word {
|
if let Some(current) = word.as_ref() {
|
||||||
if word.as_str() != w {
|
if current != w {
|
||||||
docids_into_writers(&word, &deletions, &additions, &mut word_docids_writer);
|
docids_into_writers(&current, &deletions, &additions, &mut word_docids_writer)?;
|
||||||
docids_into_writers(
|
docids_into_writers(
|
||||||
&word,
|
&current,
|
||||||
&exact_deletions,
|
&exact_deletions,
|
||||||
&exact_additions,
|
&exact_additions,
|
||||||
&mut exact_word_docids_writer,
|
&mut exact_word_docids_writer,
|
||||||
);
|
)?;
|
||||||
let word = Some(w.to_string());
|
word = Some(w.to_string());
|
||||||
// clear buffers
|
// clear buffers
|
||||||
deletions.clear();
|
deletions.clear();
|
||||||
additions.clear();
|
additions.clear();
|
||||||
@ -138,7 +138,7 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
|
|||||||
exact_additions.clear();
|
exact_additions.clear();
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
let word = Some(w.to_string());
|
word = Some(w.to_string());
|
||||||
}
|
}
|
||||||
|
|
||||||
// merge all deletions
|
// merge all deletions
|
||||||
@ -169,13 +169,13 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
|
|||||||
}
|
}
|
||||||
|
|
||||||
if let Some(word) = word {
|
if let Some(word) = word {
|
||||||
docids_into_writers(&word, &deletions, &additions, &mut word_docids_writer);
|
docids_into_writers(&word, &deletions, &additions, &mut word_docids_writer)?;
|
||||||
docids_into_writers(
|
docids_into_writers(
|
||||||
&word,
|
&word,
|
||||||
&exact_deletions,
|
&exact_deletions,
|
||||||
&exact_additions,
|
&exact_additions,
|
||||||
&mut exact_word_docids_writer,
|
&mut exact_word_docids_writer,
|
||||||
);
|
)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok((
|
Ok((
|
||||||
@ -253,7 +253,7 @@ where
|
|||||||
CboRoaringBitmapCodec::bytes_encode(deletions).map_err(|_| {
|
CboRoaringBitmapCodec::bytes_encode(deletions).map_err(|_| {
|
||||||
SerializationError::Encoding { db_name: Some(DOCID_WORD_POSITIONS) }
|
SerializationError::Encoding { db_name: Some(DOCID_WORD_POSITIONS) }
|
||||||
})?,
|
})?,
|
||||||
);
|
)?;
|
||||||
}
|
}
|
||||||
// additions:
|
// additions:
|
||||||
if !additions.is_empty() {
|
if !additions.is_empty() {
|
||||||
@ -262,7 +262,7 @@ where
|
|||||||
CboRoaringBitmapCodec::bytes_encode(additions).map_err(|_| {
|
CboRoaringBitmapCodec::bytes_encode(additions).map_err(|_| {
|
||||||
SerializationError::Encoding { db_name: Some(DOCID_WORD_POSITIONS) }
|
SerializationError::Encoding { db_name: Some(DOCID_WORD_POSITIONS) }
|
||||||
})?,
|
})?,
|
||||||
);
|
)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
// insert everything in the same writer.
|
// insert everything in the same writer.
|
||||||
|
@ -11,7 +11,7 @@ use super::helpers::{
|
|||||||
};
|
};
|
||||||
use crate::error::SerializationError;
|
use crate::error::SerializationError;
|
||||||
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
||||||
use crate::proximity::{index_proximity, MAX_DISTANCE};
|
use crate::proximity::{index_proximity, ProximityPrecision, MAX_DISTANCE};
|
||||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||||
use crate::update::settings::InnerIndexSettingsDiff;
|
use crate::update::settings::InnerIndexSettingsDiff;
|
||||||
use crate::{DocumentId, Result};
|
use crate::{DocumentId, Result};
|
||||||
@ -24,9 +24,20 @@ use crate::{DocumentId, Result};
|
|||||||
pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
|
pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
|
||||||
docid_word_positions: grenad::Reader<R>,
|
docid_word_positions: grenad::Reader<R>,
|
||||||
indexer: GrenadParameters,
|
indexer: GrenadParameters,
|
||||||
_settings_diff: &InnerIndexSettingsDiff,
|
settings_diff: &InnerIndexSettingsDiff,
|
||||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||||
puffin::profile_function!();
|
puffin::profile_function!();
|
||||||
|
let any_deletion = settings_diff.old.proximity_precision == ProximityPrecision::ByWord;
|
||||||
|
let any_addition = settings_diff.new.proximity_precision == ProximityPrecision::ByWord;
|
||||||
|
|
||||||
|
// early return if the data shouldn't be deleted nor created.
|
||||||
|
if !any_deletion && !any_addition {
|
||||||
|
return tempfile::tempfile()
|
||||||
|
.map_err(Into::into)
|
||||||
|
.map(BufReader::new)
|
||||||
|
.and_then(grenad::Reader::new)
|
||||||
|
.map_err(Into::into);
|
||||||
|
}
|
||||||
|
|
||||||
let max_memory = indexer.max_memory_by_thread();
|
let max_memory = indexer.max_memory_by_thread();
|
||||||
|
|
||||||
@ -79,6 +90,10 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
|
|||||||
|
|
||||||
let (del, add): (Result<_>, Result<_>) = rayon::join(
|
let (del, add): (Result<_>, Result<_>) = rayon::join(
|
||||||
|| {
|
|| {
|
||||||
|
if !any_deletion {
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
|
||||||
// deletions
|
// deletions
|
||||||
if let Some(deletion) = KvReaderDelAdd::new(value).get(DelAdd::Deletion) {
|
if let Some(deletion) = KvReaderDelAdd::new(value).get(DelAdd::Deletion) {
|
||||||
for (position, word) in KvReaderU16::new(deletion).iter() {
|
for (position, word) in KvReaderU16::new(deletion).iter() {
|
||||||
@ -108,6 +123,10 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
|
|||||||
Ok(())
|
Ok(())
|
||||||
},
|
},
|
||||||
|| {
|
|| {
|
||||||
|
if !any_addition {
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
|
||||||
// additions
|
// additions
|
||||||
if let Some(addition) = KvReaderDelAdd::new(value).get(DelAdd::Addition) {
|
if let Some(addition) = KvReaderDelAdd::new(value).get(DelAdd::Addition) {
|
||||||
for (position, word) in KvReaderU16::new(addition).iter() {
|
for (position, word) in KvReaderU16::new(addition).iter() {
|
||||||
|
@ -9,7 +9,6 @@ mod extract_word_docids;
|
|||||||
mod extract_word_pair_proximity_docids;
|
mod extract_word_pair_proximity_docids;
|
||||||
mod extract_word_position_docids;
|
mod extract_word_position_docids;
|
||||||
|
|
||||||
use std::collections::HashSet;
|
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io::BufReader;
|
use std::io::BufReader;
|
||||||
|
|
||||||
@ -30,7 +29,6 @@ use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids
|
|||||||
use self::extract_word_position_docids::extract_word_position_docids;
|
use self::extract_word_position_docids::extract_word_position_docids;
|
||||||
use super::helpers::{as_cloneable_grenad, CursorClonableMmap, GrenadParameters};
|
use super::helpers::{as_cloneable_grenad, CursorClonableMmap, GrenadParameters};
|
||||||
use super::{helpers, TypedChunk};
|
use super::{helpers, TypedChunk};
|
||||||
use crate::proximity::ProximityPrecision;
|
|
||||||
use crate::update::settings::InnerIndexSettingsDiff;
|
use crate::update::settings::InnerIndexSettingsDiff;
|
||||||
use crate::{FieldId, Result};
|
use crate::{FieldId, Result};
|
||||||
|
|
||||||
@ -200,12 +198,14 @@ fn run_extraction_task<FE, FS, M>(
|
|||||||
M: Send,
|
M: Send,
|
||||||
{
|
{
|
||||||
let current_span = tracing::Span::current();
|
let current_span = tracing::Span::current();
|
||||||
|
/// TODO: remove clone
|
||||||
|
let settings_diff = settings_diff.clone();
|
||||||
|
|
||||||
rayon::spawn(move || {
|
rayon::spawn(move || {
|
||||||
let child_span = tracing::trace_span!(target: "indexing::extract::details", parent: &current_span, "extract_multiple_chunks");
|
let child_span = tracing::trace_span!(target: "indexing::extract::details", parent: &current_span, "extract_multiple_chunks");
|
||||||
let _entered = child_span.enter();
|
let _entered = child_span.enter();
|
||||||
puffin::profile_scope!("extract_multiple_chunks", name);
|
puffin::profile_scope!("extract_multiple_chunks", name);
|
||||||
match extract_fn(chunk, indexer, settings_diff) {
|
match extract_fn(chunk, indexer, &settings_diff) {
|
||||||
Ok(chunk) => {
|
Ok(chunk) => {
|
||||||
let _ = lmdb_writer_sx.send(Ok(serialize_fn(chunk)));
|
let _ = lmdb_writer_sx.send(Ok(serialize_fn(chunk)));
|
||||||
}
|
}
|
||||||
@ -235,50 +235,54 @@ fn send_original_documents_data(
|
|||||||
.thread_name(|index| format!("embedding-request-{index}"))
|
.thread_name(|index| format!("embedding-request-{index}"))
|
||||||
.build()?;
|
.build()?;
|
||||||
|
|
||||||
rayon::spawn(move || {
|
if settings_diff.reindex_vectors() || !settings_diff.settings_update_only() {
|
||||||
for (name, (embedder, prompt)) in embedders {
|
/// TODO: remove clone
|
||||||
let result = extract_vector_points(
|
let settings_diff = settings_diff.clone();
|
||||||
documents_chunk_cloned.clone(),
|
rayon::spawn(move || {
|
||||||
indexer,
|
for (name, (embedder, prompt)) in settings_diff.new.embedding_configs.clone() {
|
||||||
&field_id_map,
|
let result = extract_vector_points(
|
||||||
&prompt,
|
documents_chunk_cloned.clone(),
|
||||||
&name,
|
indexer,
|
||||||
);
|
&settings_diff,
|
||||||
match result {
|
&prompt,
|
||||||
Ok(ExtractedVectorPoints { manual_vectors, remove_vectors, prompts }) => {
|
&name,
|
||||||
let embeddings = match extract_embeddings(
|
);
|
||||||
|
match result {
|
||||||
|
Ok(ExtractedVectorPoints { manual_vectors, remove_vectors, prompts }) => {
|
||||||
|
let embeddings = match extract_embeddings(
|
||||||
prompts,
|
prompts,
|
||||||
indexer,
|
indexer,
|
||||||
embedder.clone(),
|
embedder.clone(),
|
||||||
&request_threads,
|
&request_threads,
|
||||||
) {
|
) {
|
||||||
Ok(results) => Some(results),
|
Ok(results) => Some(results),
|
||||||
Err(error) => {
|
Err(error) => {
|
||||||
let _ = lmdb_writer_sx_cloned.send(Err(error));
|
let _ = lmdb_writer_sx_cloned.send(Err(error));
|
||||||
None
|
None
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
if !(remove_vectors.is_empty()
|
if !(remove_vectors.is_empty()
|
||||||
&& manual_vectors.is_empty()
|
&& manual_vectors.is_empty()
|
||||||
&& embeddings.as_ref().map_or(true, |e| e.is_empty()))
|
&& embeddings.as_ref().map_or(true, |e| e.is_empty()))
|
||||||
{
|
{
|
||||||
let _ = lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints {
|
let _ = lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints {
|
||||||
remove_vectors,
|
remove_vectors,
|
||||||
embeddings,
|
embeddings,
|
||||||
expected_dimension: embedder.dimensions(),
|
expected_dimension: embedder.dimensions(),
|
||||||
manual_vectors,
|
manual_vectors,
|
||||||
embedder_name: name,
|
embedder_name: name,
|
||||||
}));
|
}));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Err(error) => {
|
||||||
|
let _ = lmdb_writer_sx_cloned.send(Err(error));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Err(error) => {
|
|
||||||
let _ = lmdb_writer_sx_cloned.send(Err(error));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
});
|
}
|
||||||
|
|
||||||
// TODO: create a custom internal error
|
// TODO: create a custom internal error
|
||||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::Documents(original_documents_chunk)));
|
let _ = lmdb_writer_sx.send(Ok(TypedChunk::Documents(original_documents_chunk)));
|
||||||
|
@ -6,7 +6,6 @@ mod typed_chunk;
|
|||||||
|
|
||||||
use std::collections::{HashMap, HashSet};
|
use std::collections::{HashMap, HashSet};
|
||||||
use std::io::{Read, Seek};
|
use std::io::{Read, Seek};
|
||||||
use std::iter::FromIterator;
|
|
||||||
use std::num::NonZeroU32;
|
use std::num::NonZeroU32;
|
||||||
use std::result::Result as StdResult;
|
use std::result::Result as StdResult;
|
||||||
|
|
||||||
@ -281,7 +280,7 @@ where
|
|||||||
|
|
||||||
let TransformOutput {
|
let TransformOutput {
|
||||||
primary_key,
|
primary_key,
|
||||||
settings_diff,
|
mut settings_diff,
|
||||||
field_distribution,
|
field_distribution,
|
||||||
documents_count,
|
documents_count,
|
||||||
original_documents,
|
original_documents,
|
||||||
@ -319,13 +318,8 @@ where
|
|||||||
) = crossbeam_channel::unbounded();
|
) = crossbeam_channel::unbounded();
|
||||||
|
|
||||||
// get the primary key field id
|
// get the primary key field id
|
||||||
let primary_key_id = output.settings_diff.new.fields_ids_map.id(&primary_key).unwrap();
|
let primary_key_id = settings_diff.new.fields_ids_map.id(&primary_key).unwrap();
|
||||||
|
|
||||||
// get searchable fields for word databases
|
|
||||||
let searchable_fields =
|
|
||||||
self.index.searchable_fields_ids(self.wtxn)?.map(HashSet::from_iter);
|
|
||||||
// get filterable fields for facet databases
|
|
||||||
let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?;
|
|
||||||
// get the fid of the `_geo.lat` and `_geo.lng` fields.
|
// get the fid of the `_geo.lat` and `_geo.lng` fields.
|
||||||
let mut field_id_map = self.index.fields_ids_map(self.wtxn)?;
|
let mut field_id_map = self.index.fields_ids_map(self.wtxn)?;
|
||||||
|
|
||||||
@ -348,12 +342,6 @@ where
|
|||||||
None => None,
|
None => None,
|
||||||
};
|
};
|
||||||
|
|
||||||
let stop_words = self.index.stop_words(self.wtxn)?;
|
|
||||||
let separators = self.index.allowed_separators(self.wtxn)?;
|
|
||||||
let dictionary = self.index.dictionary(self.wtxn)?;
|
|
||||||
let exact_attributes = self.index.exact_attributes_ids(self.wtxn)?;
|
|
||||||
let proximity_precision = self.index.proximity_precision(self.wtxn)?.unwrap_or_default();
|
|
||||||
|
|
||||||
let pool_params = GrenadParameters {
|
let pool_params = GrenadParameters {
|
||||||
chunk_compression_type: self.indexer_config.chunk_compression_type,
|
chunk_compression_type: self.indexer_config.chunk_compression_type,
|
||||||
chunk_compression_level: self.indexer_config.chunk_compression_level,
|
chunk_compression_level: self.indexer_config.chunk_compression_level,
|
||||||
|
@ -1,12 +1,11 @@
|
|||||||
use std::borrow::Cow;
|
use std::borrow::Cow;
|
||||||
use std::collections::btree_map::Entry as BEntry;
|
use std::collections::btree_map::Entry as BEntry;
|
||||||
use std::collections::hash_map::Entry as HEntry;
|
use std::collections::hash_map::Entry as HEntry;
|
||||||
use std::collections::{HashMap, HashSet};
|
use std::collections::HashMap;
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io::{Read, Seek};
|
use std::io::{Read, Seek};
|
||||||
|
|
||||||
use fxhash::FxHashMap;
|
use fxhash::FxHashMap;
|
||||||
use heed::RoTxn;
|
|
||||||
use itertools::Itertools;
|
use itertools::Itertools;
|
||||||
use obkv::{KvReader, KvReaderU16, KvWriter};
|
use obkv::{KvReader, KvReaderU16, KvWriter};
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
@ -814,7 +813,8 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
let settings_diff = InnerIndexSettingsDiff {
|
let settings_diff = InnerIndexSettingsDiff {
|
||||||
old: old_inner_settings,
|
old: old_inner_settings,
|
||||||
new: new_inner_settings,
|
new: new_inner_settings,
|
||||||
embedding_configs_updated: true,
|
embedding_configs_updated: false,
|
||||||
|
settings_update_only: false,
|
||||||
};
|
};
|
||||||
|
|
||||||
Ok(TransformOutput {
|
Ok(TransformOutput {
|
||||||
@ -844,13 +844,16 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
obkv_writer.insert(id, val)?;
|
obkv_writer.insert(id, val)?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
let new_obkv = KvReader::<FieldId>::new(&obkv_writer.into_inner()?);
|
let data = obkv_writer.into_inner()?;
|
||||||
|
let new_obkv = KvReader::<FieldId>::new(&data);
|
||||||
|
|
||||||
// take the non-flattened version if flatten_from_fields_ids_map returns None.
|
// take the non-flattened version if flatten_from_fields_ids_map returns None.
|
||||||
let old_flattened = Self::flatten_from_fields_ids_map(&old_obkv, &mut old_fields_ids_map)?
|
let old_flattened = Self::flatten_from_fields_ids_map(&old_obkv, &mut old_fields_ids_map)?;
|
||||||
.map_or_else(|| old_obkv, |bytes| KvReader::<FieldId>::new(&bytes));
|
let old_flattened =
|
||||||
let new_flattened = Self::flatten_from_fields_ids_map(&new_obkv, &mut new_fields_ids_map)?
|
old_flattened.as_deref().map_or_else(|| old_obkv, KvReader::<FieldId>::new);
|
||||||
.map_or_else(|| new_obkv, |bytes| KvReader::<FieldId>::new(&bytes));
|
let new_flattened = Self::flatten_from_fields_ids_map(&new_obkv, &mut new_fields_ids_map)?;
|
||||||
|
let new_flattened =
|
||||||
|
new_flattened.as_deref().map_or_else(|| new_obkv, KvReader::<FieldId>::new);
|
||||||
|
|
||||||
original_obkv_buffer.clear();
|
original_obkv_buffer.clear();
|
||||||
flattened_obkv_buffer.clear();
|
flattened_obkv_buffer.clear();
|
||||||
|
@ -1010,6 +1010,13 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
|||||||
}
|
}
|
||||||
Setting::NotSet => false,
|
Setting::NotSet => false,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// if any changes force a reindexing
|
||||||
|
// clear the vector database.
|
||||||
|
if update {
|
||||||
|
self.index.vector_arroy.clear(self.wtxn)?;
|
||||||
|
}
|
||||||
|
|
||||||
Ok(update)
|
Ok(update)
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1077,6 +1084,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
|||||||
old: old_inner_settings,
|
old: old_inner_settings,
|
||||||
new: new_inner_settings,
|
new: new_inner_settings,
|
||||||
embedding_configs_updated,
|
embedding_configs_updated,
|
||||||
|
settings_update_only: true,
|
||||||
};
|
};
|
||||||
|
|
||||||
if inner_settings_diff.any_reindexing_needed() {
|
if inner_settings_diff.any_reindexing_needed() {
|
||||||
@ -1087,20 +1095,23 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) struct InnerIndexSettingsDiff {
|
#[derive(Clone)]
|
||||||
pub old: InnerIndexSettings,
|
pub struct InnerIndexSettingsDiff {
|
||||||
pub new: InnerIndexSettings,
|
pub(crate) old: InnerIndexSettings,
|
||||||
|
pub(crate) new: InnerIndexSettings,
|
||||||
|
|
||||||
// TODO: compare directly the embedders.
|
// TODO: compare directly the embedders.
|
||||||
pub embedding_configs_updated: bool,
|
pub(crate) embedding_configs_updated: bool,
|
||||||
|
|
||||||
|
pub(crate) settings_update_only: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl InnerIndexSettingsDiff {
|
impl InnerIndexSettingsDiff {
|
||||||
fn any_reindexing_needed(&self) -> bool {
|
pub fn any_reindexing_needed(&self) -> bool {
|
||||||
self.reindex_searchable() || self.reindex_facets() || self.reindex_vectors()
|
self.reindex_searchable() || self.reindex_facets() || self.reindex_vectors()
|
||||||
}
|
}
|
||||||
|
|
||||||
fn reindex_searchable(&self) -> bool {
|
pub fn reindex_searchable(&self) -> bool {
|
||||||
self.old
|
self.old
|
||||||
.fields_ids_map
|
.fields_ids_map
|
||||||
.iter()
|
.iter()
|
||||||
@ -1115,13 +1126,13 @@ impl InnerIndexSettingsDiff {
|
|||||||
|| self.old.proximity_precision != self.new.proximity_precision
|
|| self.old.proximity_precision != self.new.proximity_precision
|
||||||
}
|
}
|
||||||
|
|
||||||
fn reindex_facets(&self) -> bool {
|
pub fn reindex_facets(&self) -> bool {
|
||||||
let existing_fields = self.new.existing_fields;
|
let existing_fields = &self.new.existing_fields;
|
||||||
if existing_fields.iter().any(|field| field.contains('.')) {
|
if existing_fields.iter().any(|field| field.contains('.')) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
let old_faceted_fields = self.old.user_defined_faceted_fields;
|
let old_faceted_fields = &self.old.user_defined_faceted_fields;
|
||||||
if old_faceted_fields.iter().any(|field| field.contains('.')) {
|
if old_faceted_fields.iter().any(|field| field.contains('.')) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@ -1129,13 +1140,13 @@ impl InnerIndexSettingsDiff {
|
|||||||
// If there are new faceted fields we indicate that we must reindex as we must
|
// If there are new faceted fields we indicate that we must reindex as we must
|
||||||
// index new fields as facets. It means that the distinct attribute,
|
// index new fields as facets. It means that the distinct attribute,
|
||||||
// an Asc/Desc criterion or a filtered attribute has been added or removed.
|
// an Asc/Desc criterion or a filtered attribute has been added or removed.
|
||||||
let new_faceted_fields = self.new.user_defined_faceted_fields;
|
let new_faceted_fields = &self.new.user_defined_faceted_fields;
|
||||||
if new_faceted_fields.iter().any(|field| field.contains('.')) {
|
if new_faceted_fields.iter().any(|field| field.contains('.')) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
let faceted_updated =
|
let faceted_updated =
|
||||||
(&existing_fields - &old_faceted_fields) != (&existing_fields - &new_faceted_fields);
|
(existing_fields - old_faceted_fields) != (existing_fields - new_faceted_fields);
|
||||||
|
|
||||||
self.old
|
self.old
|
||||||
.fields_ids_map
|
.fields_ids_map
|
||||||
@ -1145,9 +1156,13 @@ impl InnerIndexSettingsDiff {
|
|||||||
|| faceted_updated
|
|| faceted_updated
|
||||||
}
|
}
|
||||||
|
|
||||||
fn reindex_vectors(&self) -> bool {
|
pub fn reindex_vectors(&self) -> bool {
|
||||||
self.embedding_configs_updated
|
self.embedding_configs_updated
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn settings_update_only(&self) -> bool {
|
||||||
|
self.settings_update_only
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
|
Loading…
Reference in New Issue
Block a user