finish work

ManyTheFish 2024-04-03 11:19:45 +02:00
parent b5e4a55af6
commit 02c3d6b265
8 changed files with 171 additions and 127 deletions

View File

@@ -34,6 +34,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
     let max_positions_per_attributes = max_positions_per_attributes
         .map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE));
     let max_memory = indexer.max_memory_by_thread();
+    let force_reindexing = settings_diff.reindex_searchable();

     // initialize destination values.
     let mut documents_ids = RoaringBitmap::new();
@@ -54,12 +55,15 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
     let mut value_buffer = Vec::new();

     // initialize tokenizer.
-    // TODO: Fix ugly allocation
+    /// TODO: Fix ugly allocation
     let old_stop_words = settings_diff.old.stop_words.as_ref();
-    let old_separators: Option<Vec<_>> =
-        settings_diff.old.allowed_separators.map(|s| s.iter().map(String::as_str).collect());
+    let old_separators: Option<Vec<_>> = settings_diff
+        .old
+        .allowed_separators
+        .as_ref()
+        .map(|s| s.iter().map(String::as_str).collect());
     let old_dictionary: Option<Vec<_>> =
-        settings_diff.old.dictionary.map(|s| s.iter().map(String::as_str).collect());
+        settings_diff.old.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
     let mut del_builder = tokenizer_builder(
         old_stop_words,
         old_separators.as_deref(),
@@ -68,12 +72,15 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
     );
     let del_tokenizer = del_builder.build();

-    // TODO: Fix ugly allocation
+    /// TODO: Fix ugly allocation
     let new_stop_words = settings_diff.new.stop_words.as_ref();
-    let new_separators: Option<Vec<_>> =
-        settings_diff.new.allowed_separators.map(|s| s.iter().map(String::as_str).collect());
+    let new_separators: Option<Vec<_>> = settings_diff
+        .new
+        .allowed_separators
+        .as_ref()
+        .map(|s| s.iter().map(String::as_str).collect());
     let new_dictionary: Option<Vec<_>> =
-        settings_diff.new.dictionary.map(|s| s.iter().map(String::as_str).collect());
+        settings_diff.new.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
     let mut add_builder = tokenizer_builder(
         new_stop_words,
         new_separators.as_deref(),
@@ -92,10 +99,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
         let obkv = KvReader::<FieldId>::new(value);

         // if the searchable fields didn't change, skip the searchable indexing for this document.
-        if !searchable_fields_changed(
-            &KvReader::<FieldId>::new(value),
-            &settings_diff.new.searchable_fields_ids,
-        ) {
+        if !force_reindexing && !searchable_fields_changed(&obkv, settings_diff) {
             continue;
         }
@@ -180,8 +184,9 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
 /// Check if any searchable fields of a document changed.
 fn searchable_fields_changed(
     obkv: &KvReader<FieldId>,
-    searchable_fields: &Option<Vec<FieldId>>,
+    settings_diff: &InnerIndexSettingsDiff,
 ) -> bool {
+    let searchable_fields = &settings_diff.new.searchable_fields_ids;
     for (field_id, field_bytes) in obkv.iter() {
         if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) {
             let del_add = KvReaderDelAdd::new(field_bytes);
@@ -262,12 +267,14 @@ fn lang_safe_tokens_from_document<'a>(
         // then we don't rerun the extraction.
         if !script_language.is_empty() {
             // build a new temporary tokenizer including the allow list.
-            // TODO: Fix ugly allocation
+            /// TODO: Fix ugly allocation
             let stop_words = settings.stop_words.as_ref();
-            let separators: Option<Vec<_>> =
-                settings.allowed_separators.map(|s| s.iter().map(String::as_str).collect());
+            let separators: Option<Vec<_>> = settings
+                .allowed_separators
+                .as_ref()
+                .map(|s| s.iter().map(String::as_str).collect());
             let dictionary: Option<Vec<_>> =
-                settings.dictionary.map(|s| s.iter().map(String::as_str).collect());
+                settings.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
             let mut builder =
                 tokenizer_builder(stop_words, separators.as_deref(), dictionary.as_deref(), None);
             let tokenizer = builder.build();

View File

@@ -17,8 +17,9 @@ use crate::error::UserError;
 use crate::prompt::Prompt;
 use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
 use crate::update::index_documents::helpers::try_split_at;
+use crate::update::settings::InnerIndexSettingsDiff;
 use crate::vector::Embedder;
-use crate::{DocumentId, FieldsIdsMap, InternalError, Result, VectorOrArrayOfVectors};
+use crate::{DocumentId, InternalError, Result, VectorOrArrayOfVectors};

 /// The length of the elements that are always in the buffer when inserting new values.
 const TRUNCATE_SIZE: usize = size_of::<DocumentId>();
@@ -71,12 +72,15 @@ impl VectorStateDelta {
 pub fn extract_vector_points<R: io::Read + io::Seek>(
     obkv_documents: grenad::Reader<R>,
     indexer: GrenadParameters,
-    field_id_map: &FieldsIdsMap,
+    settings_diff: &InnerIndexSettingsDiff,
     prompt: &Prompt,
     embedder_name: &str,
 ) -> Result<ExtractedVectorPoints> {
     puffin::profile_function!();

+    let old_fields_ids_map = &settings_diff.old.fields_ids_map;
+    let new_fields_ids_map = &settings_diff.new.fields_ids_map;
+
     // (docid, _index) -> KvWriterDelAdd -> Vector
     let mut manual_vectors_writer = create_writer(
         indexer.chunk_compression_type,
@@ -98,8 +102,6 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
         tempfile::tempfile()?,
     );

-    let vectors_fid = field_id_map.id("_vectors");
-
     let mut key_buffer = Vec::new();
     let mut cursor = obkv_documents.into_cursor()?;
     while let Some((key, value)) = cursor.move_on_next()? {
@@ -116,15 +118,29 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
         // lazily get it when needed
         let document_id = || -> Value { from_utf8(external_id_bytes).unwrap().into() };

-        let vectors_field = vectors_fid
-            .and_then(|vectors_fid| obkv.get(vectors_fid))
-            .map(KvReaderDelAdd::new)
-            .map(|obkv| to_vector_maps(obkv, document_id))
-            .transpose()?;
-
-        let (del_map, add_map) = vectors_field.unzip();
-        let del_map = del_map.flatten();
-        let add_map = add_map.flatten();
+        // the vector field id may have changed
+        let old_vectors_fid = old_fields_ids_map.id("_vectors");
+        // filter the old vector fid if the settings has been changed forcing reindexing.
+        let old_vectors_fid = old_vectors_fid.filter(|_| !settings_diff.reindex_vectors());
+
+        let new_vectors_fid = new_fields_ids_map.id("_vectors");
+        let vectors_field = {
+            let del = old_vectors_fid
+                .and_then(|vectors_fid| obkv.get(vectors_fid))
+                .map(KvReaderDelAdd::new)
+                .map(|obkv| to_vector_map(obkv, DelAdd::Deletion, &document_id))
+                .transpose()?
+                .flatten();
+            let add = new_vectors_fid
+                .and_then(|vectors_fid| obkv.get(vectors_fid))
+                .map(KvReaderDelAdd::new)
+                .map(|obkv| to_vector_map(obkv, DelAdd::Addition, &document_id))
+                .transpose()?
+                .flatten();
+            (del, add)
+        };
+        let (del_map, add_map) = vectors_field;

         let del_value = del_map.and_then(|mut map| map.remove(embedder_name));
         let add_value = add_map.and_then(|mut map| map.remove(embedder_name));
@@ -155,7 +171,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
                     VectorStateDelta::NowGenerated(prompt.render(
                         obkv,
                         DelAdd::Addition,
-                        field_id_map,
+                        &new_fields_ids_map,
                     )?)
                 } else {
                     VectorStateDelta::NowRemoved
@@ -182,9 +198,10 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
                 if document_is_kept {
                     // Don't give up if the old prompt was failing
-                    let old_prompt =
-                        prompt.render(obkv, DelAdd::Deletion, field_id_map).unwrap_or_default();
-                    let new_prompt = prompt.render(obkv, DelAdd::Addition, field_id_map)?;
+                    let old_prompt = prompt
+                        .render(obkv, DelAdd::Deletion, &old_fields_ids_map)
+                        .unwrap_or_default();
+                    let new_prompt = prompt.render(obkv, DelAdd::Addition, &new_fields_ids_map)?;
                     if old_prompt != new_prompt {
                         tracing::trace!(
                             "🚀 Changing prompt from\n{old_prompt}\n===to===\n{new_prompt}"
@@ -220,15 +237,6 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
     })
 }

-fn to_vector_maps(
-    obkv: KvReaderDelAdd,
-    document_id: impl Fn() -> Value,
-) -> Result<(Option<serde_json::Map<String, Value>>, Option<serde_json::Map<String, Value>>)> {
-    let del = to_vector_map(obkv, DelAdd::Deletion, &document_id)?;
-    let add = to_vector_map(obkv, DelAdd::Addition, &document_id)?;
-    Ok((del, add))
-}
-
 fn to_vector_map(
     obkv: KvReaderDelAdd,
     side: DelAdd,

View File

@@ -121,16 +121,16 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
         let (w, fid) = StrBEU16Codec::bytes_decode(key)
             .map_err(|_| SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;

-        if let Some(word) = word {
-            if word.as_str() != w {
-                docids_into_writers(&word, &deletions, &additions, &mut word_docids_writer);
+        if let Some(current) = word.as_ref() {
+            if current != w {
+                docids_into_writers(&current, &deletions, &additions, &mut word_docids_writer)?;
                 docids_into_writers(
-                    &word,
+                    &current,
                     &exact_deletions,
                     &exact_additions,
                     &mut exact_word_docids_writer,
-                );
-                let word = Some(w.to_string());
+                )?;
+                word = Some(w.to_string());
                 // clear buffers
                 deletions.clear();
                 additions.clear();
@@ -138,7 +138,7 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
                 exact_additions.clear();
             }
         } else {
-            let word = Some(w.to_string());
+            word = Some(w.to_string());
         }

         // merge all deletions
@@ -169,13 +169,13 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
     }

     if let Some(word) = word {
-        docids_into_writers(&word, &deletions, &additions, &mut word_docids_writer);
+        docids_into_writers(&word, &deletions, &additions, &mut word_docids_writer)?;
         docids_into_writers(
             &word,
             &exact_deletions,
             &exact_additions,
             &mut exact_word_docids_writer,
-        );
+        )?;
     }

     Ok((
@@ -253,7 +253,7 @@ where
             CboRoaringBitmapCodec::bytes_encode(deletions).map_err(|_| {
                 SerializationError::Encoding { db_name: Some(DOCID_WORD_POSITIONS) }
             })?,
-        );
+        )?;
     }

     // additions:
     if !additions.is_empty() {
@@ -262,7 +262,7 @@ where
             CboRoaringBitmapCodec::bytes_encode(additions).map_err(|_| {
                 SerializationError::Encoding { db_name: Some(DOCID_WORD_POSITIONS) }
             })?,
-        );
+        )?;
     }

     // insert everything in the same writer.
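
Aside (not part of the commit): the extract_word_docids hunks above fix two easy-to-miss bugs in the streaming group-by-word loop: `let word = Some(...)` inside the loop shadowed the outer accumulator instead of updating it, and the `docids_into_writers` results were dropped instead of being propagated with `?`. A minimal, self-contained sketch of the corrected pattern, using a plain Vec and a hypothetical `flush` helper in place of the real grenad writers and bitmap buffers:

    // Illustrative only: `flush` and `pending` stand in for the real writers and buffers.
    fn group_by_word(pairs: &[(String, u32)]) -> std::io::Result<Vec<(String, Vec<u32>)>> {
        let mut out = Vec::new();
        let mut word: Option<String> = None;
        let mut pending: Vec<u32> = Vec::new();

        for (w, docid) in pairs {
            if let Some(current) = word.as_ref() {
                if current != w {
                    // flush the finished word; `?` surfaces a failed write instead of ignoring it
                    flush(&mut out, current, &pending)?;
                    pending.clear();
                    // assign the outer binding, do not shadow it with a new `let`
                    word = Some(w.clone());
                }
            } else {
                word = Some(w.clone());
            }
            pending.push(*docid);
        }

        // flush the last accumulated word
        if let Some(word) = word {
            flush(&mut out, &word, &pending)?;
        }
        Ok(out)
    }

    fn flush(out: &mut Vec<(String, Vec<u32>)>, word: &str, docids: &[u32]) -> std::io::Result<()> {
        out.push((word.to_string(), docids.to_vec()));
        Ok(())
    }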

View File

@@ -11,7 +11,7 @@ use super::helpers::{
 };
 use crate::error::SerializationError;
 use crate::index::db_name::DOCID_WORD_POSITIONS;
-use crate::proximity::{index_proximity, MAX_DISTANCE};
+use crate::proximity::{index_proximity, ProximityPrecision, MAX_DISTANCE};
 use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
 use crate::update::settings::InnerIndexSettingsDiff;
 use crate::{DocumentId, Result};
@@ -24,9 +24,20 @@ use crate::{DocumentId, Result};
 pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
     docid_word_positions: grenad::Reader<R>,
     indexer: GrenadParameters,
-    _settings_diff: &InnerIndexSettingsDiff,
+    settings_diff: &InnerIndexSettingsDiff,
 ) -> Result<grenad::Reader<BufReader<File>>> {
     puffin::profile_function!();

+    let any_deletion = settings_diff.old.proximity_precision == ProximityPrecision::ByWord;
+    let any_addition = settings_diff.new.proximity_precision == ProximityPrecision::ByWord;
+
+    // early return if the data shouldn't be deleted nor created.
+    if !any_deletion && !any_addition {
+        return tempfile::tempfile()
+            .map_err(Into::into)
+            .map(BufReader::new)
+            .and_then(grenad::Reader::new)
+            .map_err(Into::into);
+    }
+
     let max_memory = indexer.max_memory_by_thread();
@@ -79,6 +90,10 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
         let (del, add): (Result<_>, Result<_>) = rayon::join(
             || {
+                if !any_deletion {
+                    return Ok(());
+                }
+
                 // deletions
                 if let Some(deletion) = KvReaderDelAdd::new(value).get(DelAdd::Deletion) {
                     for (position, word) in KvReaderU16::new(deletion).iter() {
@@ -108,6 +123,10 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
                 Ok(())
             },
             || {
+                if !any_addition {
+                    return Ok(());
+                }
+
                 // additions
                 if let Some(addition) = KvReaderDelAdd::new(value).get(DelAdd::Addition) {
                     for (position, word) in KvReaderU16::new(addition).iter() {

View File

@@ -9,7 +9,6 @@ mod extract_word_docids;
 mod extract_word_pair_proximity_docids;
 mod extract_word_position_docids;

-use std::collections::HashSet;
 use std::fs::File;
 use std::io::BufReader;
@@ -30,7 +29,6 @@ use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids;
 use self::extract_word_position_docids::extract_word_position_docids;
 use super::helpers::{as_cloneable_grenad, CursorClonableMmap, GrenadParameters};
 use super::{helpers, TypedChunk};
-use crate::proximity::ProximityPrecision;
 use crate::update::settings::InnerIndexSettingsDiff;
 use crate::{FieldId, Result};
@@ -200,12 +198,14 @@ fn run_extraction_task<FE, FS, M>(
     M: Send,
 {
     let current_span = tracing::Span::current();

+    /// TODO: remove clone
+    let settings_diff = settings_diff.clone();
     rayon::spawn(move || {
         let child_span = tracing::trace_span!(target: "indexing::extract::details", parent: &current_span, "extract_multiple_chunks");
         let _entered = child_span.enter();
         puffin::profile_scope!("extract_multiple_chunks", name);
-        match extract_fn(chunk, indexer, settings_diff) {
+        match extract_fn(chunk, indexer, &settings_diff) {
             Ok(chunk) => {
                 let _ = lmdb_writer_sx.send(Ok(serialize_fn(chunk)));
             }
@@ -235,50 +235,54 @@ fn send_original_documents_data(
         .thread_name(|index| format!("embedding-request-{index}"))
         .build()?;

-    rayon::spawn(move || {
-        for (name, (embedder, prompt)) in embedders {
-            let result = extract_vector_points(
-                documents_chunk_cloned.clone(),
-                indexer,
-                &field_id_map,
-                &prompt,
-                &name,
-            );
-            match result {
-                Ok(ExtractedVectorPoints { manual_vectors, remove_vectors, prompts }) => {
-                    let embeddings = match extract_embeddings(
-                        prompts,
-                        indexer,
-                        embedder.clone(),
-                        &request_threads,
-                    ) {
-                        Ok(results) => Some(results),
-                        Err(error) => {
-                            let _ = lmdb_writer_sx_cloned.send(Err(error));
-                            None
-                        }
-                    };
-                    if !(remove_vectors.is_empty()
-                        && manual_vectors.is_empty()
-                        && embeddings.as_ref().map_or(true, |e| e.is_empty()))
-                    {
-                        let _ = lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints {
-                            remove_vectors,
-                            embeddings,
-                            expected_dimension: embedder.dimensions(),
-                            manual_vectors,
-                            embedder_name: name,
-                        }));
-                    }
-                }
-                Err(error) => {
-                    let _ = lmdb_writer_sx_cloned.send(Err(error));
-                }
-            }
-        }
-    });
+    if settings_diff.reindex_vectors() || !settings_diff.settings_update_only() {
+        /// TODO: remove clone
+        let settings_diff = settings_diff.clone();
+        rayon::spawn(move || {
+            for (name, (embedder, prompt)) in settings_diff.new.embedding_configs.clone() {
+                let result = extract_vector_points(
+                    documents_chunk_cloned.clone(),
+                    indexer,
+                    &settings_diff,
+                    &prompt,
+                    &name,
+                );
+                match result {
+                    Ok(ExtractedVectorPoints { manual_vectors, remove_vectors, prompts }) => {
+                        let embeddings = match extract_embeddings(
+                            prompts,
+                            indexer,
+                            embedder.clone(),
+                            &request_threads,
+                        ) {
+                            Ok(results) => Some(results),
+                            Err(error) => {
+                                let _ = lmdb_writer_sx_cloned.send(Err(error));
+                                None
+                            }
+                        };
+                        if !(remove_vectors.is_empty()
+                            && manual_vectors.is_empty()
+                            && embeddings.as_ref().map_or(true, |e| e.is_empty()))
+                        {
+                            let _ = lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints {
+                                remove_vectors,
+                                embeddings,
+                                expected_dimension: embedder.dimensions(),
+                                manual_vectors,
+                                embedder_name: name,
+                            }));
+                        }
+                    }
+                    Err(error) => {
+                        let _ = lmdb_writer_sx_cloned.send(Err(error));
+                    }
+                }
+            }
+        });
+    }

     // TODO: create a custom internal error
     let _ = lmdb_writer_sx.send(Ok(TypedChunk::Documents(original_documents_chunk)));

View File

@@ -6,7 +6,6 @@ mod typed_chunk;
 use std::collections::{HashMap, HashSet};
 use std::io::{Read, Seek};
-use std::iter::FromIterator;
 use std::num::NonZeroU32;
 use std::result::Result as StdResult;
@@ -281,7 +280,7 @@ where
         let TransformOutput {
             primary_key,
-            settings_diff,
+            mut settings_diff,
             field_distribution,
             documents_count,
             original_documents,
@@ -319,13 +318,8 @@ where
         ) = crossbeam_channel::unbounded();

         // get the primary key field id
-        let primary_key_id = output.settings_diff.new.fields_ids_map.id(&primary_key).unwrap();
+        let primary_key_id = settings_diff.new.fields_ids_map.id(&primary_key).unwrap();

-        // get searchable fields for word databases
-        let searchable_fields =
-            self.index.searchable_fields_ids(self.wtxn)?.map(HashSet::from_iter);
-        // get filterable fields for facet databases
-        let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?;
         // get the fid of the `_geo.lat` and `_geo.lng` fields.
         let mut field_id_map = self.index.fields_ids_map(self.wtxn)?;
@@ -348,12 +342,6 @@ where
             None => None,
         };

-        let stop_words = self.index.stop_words(self.wtxn)?;
-        let separators = self.index.allowed_separators(self.wtxn)?;
-        let dictionary = self.index.dictionary(self.wtxn)?;
-        let exact_attributes = self.index.exact_attributes_ids(self.wtxn)?;
-        let proximity_precision = self.index.proximity_precision(self.wtxn)?.unwrap_or_default();
-
         let pool_params = GrenadParameters {
             chunk_compression_type: self.indexer_config.chunk_compression_type,
             chunk_compression_level: self.indexer_config.chunk_compression_level,

View File

@@ -1,12 +1,11 @@
 use std::borrow::Cow;
 use std::collections::btree_map::Entry as BEntry;
 use std::collections::hash_map::Entry as HEntry;
-use std::collections::{HashMap, HashSet};
+use std::collections::HashMap;
 use std::fs::File;
 use std::io::{Read, Seek};

 use fxhash::FxHashMap;
-use heed::RoTxn;
 use itertools::Itertools;
 use obkv::{KvReader, KvReaderU16, KvWriter};
 use roaring::RoaringBitmap;
@@ -814,7 +813,8 @@ impl<'a, 'i> Transform<'a, 'i> {
         let settings_diff = InnerIndexSettingsDiff {
             old: old_inner_settings,
             new: new_inner_settings,
-            embedding_configs_updated: true,
+            embedding_configs_updated: false,
+            settings_update_only: false,
         };

         Ok(TransformOutput {
@@ -844,13 +844,16 @@ impl<'a, 'i> Transform<'a, 'i> {
                 obkv_writer.insert(id, val)?;
             }
         }
-        let new_obkv = KvReader::<FieldId>::new(&obkv_writer.into_inner()?);
+        let data = obkv_writer.into_inner()?;
+        let new_obkv = KvReader::<FieldId>::new(&data);

         // take the non-flattened version if flatten_from_fields_ids_map returns None.
-        let old_flattened = Self::flatten_from_fields_ids_map(&old_obkv, &mut old_fields_ids_map)?
-            .map_or_else(|| old_obkv, |bytes| KvReader::<FieldId>::new(&bytes));
-        let new_flattened = Self::flatten_from_fields_ids_map(&new_obkv, &mut new_fields_ids_map)?
-            .map_or_else(|| new_obkv, |bytes| KvReader::<FieldId>::new(&bytes));
+        let old_flattened = Self::flatten_from_fields_ids_map(&old_obkv, &mut old_fields_ids_map)?;
+        let old_flattened =
+            old_flattened.as_deref().map_or_else(|| old_obkv, KvReader::<FieldId>::new);
+        let new_flattened = Self::flatten_from_fields_ids_map(&new_obkv, &mut new_fields_ids_map)?;
+        let new_flattened =
+            new_flattened.as_deref().map_or_else(|| new_obkv, KvReader::<FieldId>::new);

         original_obkv_buffer.clear();
         flattened_obkv_buffer.clear();

View File

@@ -1010,6 +1010,13 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
             }
             Setting::NotSet => false,
         };
+
+        // if any changes force a reindexing
+        // clear the vector database.
+        if update {
+            self.index.vector_arroy.clear(self.wtxn)?;
+        }
+
         Ok(update)
     }
@@ -1077,6 +1084,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
             old: old_inner_settings,
             new: new_inner_settings,
             embedding_configs_updated,
+            settings_update_only: true,
         };

         if inner_settings_diff.any_reindexing_needed() {
@@ -1087,20 +1095,23 @@
     }
 }

-pub(crate) struct InnerIndexSettingsDiff {
-    pub old: InnerIndexSettings,
-    pub new: InnerIndexSettings,
+#[derive(Clone)]
+pub struct InnerIndexSettingsDiff {
+    pub(crate) old: InnerIndexSettings,
+    pub(crate) new: InnerIndexSettings,
     // TODO: compare directly the embedders.
-    pub embedding_configs_updated: bool,
+    pub(crate) embedding_configs_updated: bool,
+    pub(crate) settings_update_only: bool,
 }

 impl InnerIndexSettingsDiff {
-    fn any_reindexing_needed(&self) -> bool {
+    pub fn any_reindexing_needed(&self) -> bool {
         self.reindex_searchable() || self.reindex_facets() || self.reindex_vectors()
     }

-    fn reindex_searchable(&self) -> bool {
+    pub fn reindex_searchable(&self) -> bool {
         self.old
             .fields_ids_map
             .iter()
@@ -1115,13 +1126,13 @@ impl InnerIndexSettingsDiff {
             || self.old.proximity_precision != self.new.proximity_precision
     }

-    fn reindex_facets(&self) -> bool {
-        let existing_fields = self.new.existing_fields;
+    pub fn reindex_facets(&self) -> bool {
+        let existing_fields = &self.new.existing_fields;
         if existing_fields.iter().any(|field| field.contains('.')) {
             return true;
         }

-        let old_faceted_fields = self.old.user_defined_faceted_fields;
+        let old_faceted_fields = &self.old.user_defined_faceted_fields;
         if old_faceted_fields.iter().any(|field| field.contains('.')) {
             return true;
         }
@@ -1129,13 +1140,13 @@ impl InnerIndexSettingsDiff {
         // If there is new faceted fields we indicate that we must reindex as we must
         // index new fields as facets. It means that the distinct attribute,
         // an Asc/Desc criterion or a filtered attribute as be added or removed.
-        let new_faceted_fields = self.new.user_defined_faceted_fields;
+        let new_faceted_fields = &self.new.user_defined_faceted_fields;
         if new_faceted_fields.iter().any(|field| field.contains('.')) {
             return true;
         }

         let faceted_updated =
-            (&existing_fields - &old_faceted_fields) != (&existing_fields - &new_faceted_fields);
+            (existing_fields - old_faceted_fields) != (existing_fields - new_faceted_fields);

         self.old
             .fields_ids_map
@@ -1145,9 +1156,13 @@ impl InnerIndexSettingsDiff {
             || faceted_updated
     }

-    fn reindex_vectors(&self) -> bool {
+    pub fn reindex_vectors(&self) -> bool {
         self.embedding_configs_updated
     }
+
+    pub fn settings_update_only(&self) -> bool {
+        self.settings_update_only
+    }
 }

 #[derive(Clone)]
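
Aside (not part of the commit): the reindex_facets hunks above only switch the field sets from owned values to borrows; the decision rule itself is unchanged. A small sketch of that rule, assuming plain HashSet<String> fields:

    use std::collections::HashSet;

    // Facets must be rebuilt when the set of existing-but-not-faceted fields differs
    // between the old and the new settings.
    fn facets_changed(
        existing: &HashSet<String>,
        old_faceted: &HashSet<String>,
        new_faceted: &HashSet<String>,
    ) -> bool {
        // `&HashSet - &HashSet` yields a new owned set, so borrowed inputs are enough here.
        (existing - old_faceted) != (existing - new_faceted)
    }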