From 8fff5fc28105c6ca34c28b8fc9e7af31c75cb416 Mon Sep 17 00:00:00 2001
From: ManyTheFish
Date: Tue, 19 Mar 2024 12:11:46 +0100
Subject: [PATCH 01/40] update tests

---
 milli/src/update/settings.rs | 64 ++++++++++++++++++++++++++++++++++++
 1 file changed, 64 insertions(+)

diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs
index beca4fe51..569938ccf 100644
--- a/milli/src/update/settings.rs
+++ b/milli/src/update/settings.rs
@@ -1643,6 +1643,70 @@ mod tests {
             .unwrap()
             .count();
         assert_eq!(count, 4);
+
+        // Set the filterable fields to be the age and the name.
+        index
+            .update_settings(|settings| {
+                settings.set_filterable_fields(hashset! { S("age"), S("name") });
+            })
+            .unwrap();
+
+        // Check that the filterable fields are correctly set.
+        let rtxn = index.read_txn().unwrap();
+        let fields_ids = index.filterable_fields(&rtxn).unwrap();
+        assert_eq!(fields_ids, hashset! { S("age"), S("name") });
+
+        let rtxn = index.read_txn().unwrap();
+        // Only count the field_id 0 and level 0 facet values.
+        let count = index
+            .facet_id_f64_docids
+            .remap_key_type::()
+            .prefix_iter(&rtxn, &[0, 1, 0])
+            .unwrap()
+            .count();
+        assert_eq!(count, 4);
+
+        let rtxn = index.read_txn().unwrap();
+        // Only count the field_id 0 and level 0 facet values.
+        let count = index
+            .facet_id_string_docids
+            .remap_key_type::()
+            .prefix_iter(&rtxn, &[0, 1, 0])
+            .unwrap()
+            .count();
+        assert_eq!(count, 5);
+
+        // Remove the age from the filterable fields.
+        index
+            .update_settings(|settings| {
+                settings.set_filterable_fields(hashset! { S("name") });
+            })
+            .unwrap();
+
+        // Check that the filterable fields are correctly set.
+        let rtxn = index.read_txn().unwrap();
+        let fields_ids = index.filterable_fields(&rtxn).unwrap();
+        assert_eq!(fields_ids, hashset! { S("name") });
+
+        let rtxn = index.read_txn().unwrap();
+        // Only count the field_id 0 and level 0 facet values.
+        let count = index
+            .facet_id_f64_docids
+            .remap_key_type::()
+            .prefix_iter(&rtxn, &[0, 1, 0])
+            .unwrap()
+            .count();
+        assert_eq!(count, 0);
+
+        let rtxn = index.read_txn().unwrap();
+        // Only count the field_id 0 and level 0 facet values.
+        let count = index
+            .facet_id_string_docids
+            .remap_key_type::()
+            .prefix_iter(&rtxn, &[0, 1, 0])
+            .unwrap()
+            .count();
+        assert_eq!(count, 5);
     }
 
     #[test]

From 64079fc8946c1a86c0defe2f098eb84a5fc7d202 Mon Sep 17 00:00:00 2001
From: ManyTheFish
Date: Tue, 19 Mar 2024 12:17:49 +0100
Subject: [PATCH 02/40] Do more iterations on the settings benchmarks

---
 workloads/settings-add-remove-filters.json | 2 +-
 workloads/settings-proximity-precision.json | 2 +-
 workloads/settings-remove-add-swap-searchable.json | 2 +-
 workloads/settings-typo.json | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/workloads/settings-add-remove-filters.json b/workloads/settings-add-remove-filters.json
index 04a57c707..12493a8fc 100644
--- a/workloads/settings-add-remove-filters.json
+++ b/workloads/settings-add-remove-filters.json
@@ -1,6 +1,6 @@
 {
     "name": "settings-add-remove-filters.json",
-    "run_count": 2,
+    "run_count": 5,
     "extra_cli_args": [
         "--max-indexing-threads=4"
     ],
diff --git a/workloads/settings-proximity-precision.json b/workloads/settings-proximity-precision.json
index 48cfad49d..384f99e37 100644
--- a/workloads/settings-proximity-precision.json
+++ b/workloads/settings-proximity-precision.json
@@ -1,6 +1,6 @@
 {
     "name": "settings-proximity-precision.json",
-    "run_count": 2,
+    "run_count": 5,
     "extra_cli_args": [
         "--max-indexing-threads=4"
     ],
diff --git a/workloads/settings-remove-add-swap-searchable.json b/workloads/settings-remove-add-swap-searchable.json
index ba315680f..61db8822e 100644
--- a/workloads/settings-remove-add-swap-searchable.json
+++ b/workloads/settings-remove-add-swap-searchable.json
@@ -1,6 +1,6 @@
 {
     "name": "settings-remove-add-swap-searchable.json",
-    "run_count": 2,
+    "run_count": 5,
     "extra_cli_args": [
         "--max-indexing-threads=4"
     ],
diff --git a/workloads/settings-typo.json b/workloads/settings-typo.json
index a272e6d1f..45163bc98 100644
--- a/workloads/settings-typo.json
+++ b/workloads/settings-typo.json
@@ -1,6 +1,6 @@
 {
     "name": "settings-typo.json",
-    "run_count": 2,
+    "run_count": 5,
     "extra_cli_args": [
         "--max-indexing-threads=4"
     ],

From aabce52b1b6d7ddcbe8fdefeda84857dfc8b2d2d Mon Sep 17 00:00:00 2001
From: ManyTheFish
Date: Tue, 19 Mar 2024 14:20:46 +0100
Subject: [PATCH 03/40] Fix test

---
 milli/src/update/settings.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs
index 569938ccf..be5a449b9 100644
--- a/milli/src/update/settings.rs
+++ b/milli/src/update/settings.rs
@@ -1671,7 +1671,7 @@ mod tests {
         let count = index
             .facet_id_string_docids
             .remap_key_type::()
-            .prefix_iter(&rtxn, &[0, 1, 0])
+            .prefix_iter(&rtxn, &[0, 0])
             .unwrap()
             .count();
         assert_eq!(count, 5);
@@ -1703,7 +1703,7 @@ mod tests {
         let count = index
             .facet_id_string_docids
             .remap_key_type::()
-            .prefix_iter(&rtxn, &[0, 1, 0])
+            .prefix_iter(&rtxn, &[0, 0])
             .unwrap()
             .count();
         assert_eq!(count, 5);

From 893200ab87ac77aaeeebf0d585bb1fbc31adf327 Mon Sep 17 00:00:00 2001
From: ManyTheFish
Date: Tue, 19 Mar 2024 14:33:32 +0100
Subject: [PATCH 04/40] Avoid clearing documents in transform

---
 milli/src/update/index_documents/transform.rs | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs
index e5392092f..09bf94ace 100644
--- a/milli/src/update/index_documents/transform.rs
+++ b/milli/src/update/index_documents/transform.rs
@@ -23,7 +23,7 @@ use crate::error::{Error, InternalError, UserError};
 use
crate::index::{db_name, main_key}; use crate::update::del_add::{into_del_add_obkv, DelAdd, DelAddOperation, KvReaderDelAdd}; use crate::update::index_documents::GrenadParameters; -use crate::update::{AvailableDocumentsIds, ClearDocuments, UpdateIndexingStep}; +use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; use crate::{FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result}; pub struct TransformOutput { @@ -875,7 +875,7 @@ impl<'a, 'i> Transform<'a, 'i> { document_sorter_value_buffer.clear(); into_del_add_obkv( KvReaderU16::new(buffer), - DelAddOperation::Addition, + DelAddOperation::DeletionAndAddition, &mut document_sorter_value_buffer, )?; original_sorter.insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?; @@ -916,7 +916,7 @@ impl<'a, 'i> Transform<'a, 'i> { document_sorter_value_buffer.clear(); into_del_add_obkv( KvReaderU16::new(&buffer), - DelAddOperation::Addition, + DelAddOperation::DeletionAndAddition, &mut document_sorter_value_buffer, )?; flattened_sorter.insert(docid.to_be_bytes(), &document_sorter_value_buffer)?; @@ -946,9 +946,6 @@ impl<'a, 'i> Transform<'a, 'i> { let new_facets = output.compute_real_facets(wtxn, self.index)?; self.index.put_faceted_fields(wtxn, &new_facets)?; - // We clear the full database (words-fst, documents ids and documents content). - ClearDocuments::new(wtxn, self.index).execute()?; - Ok(output) } } From a7e368aaa6fd73747d07d2fc441819bf7f52bd37 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 20 Mar 2024 13:37:26 +0100 Subject: [PATCH 05/40] Create InnerIndexSettingsDiffs struct and populate it --- milli/src/update/settings.rs | 167 ++++++++++++++++++++++++++++------- 1 file changed, 133 insertions(+), 34 deletions(-) diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index be5a449b9..5b1788242 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -20,7 +20,7 @@ use crate::update::index_documents::IndexDocumentsMethod; use crate::update::{IndexDocuments, UpdateIndexingStep}; use crate::vector::settings::{check_set, check_unset, EmbedderSource, EmbeddingSettings}; use crate::vector::{Embedder, EmbeddingConfig, EmbeddingConfigs}; -use crate::{FieldsIdsMap, Index, Result}; +use crate::{FieldId, FieldsIdsMap, Index, Result}; #[derive(Debug, Clone, PartialEq, Eq, Copy)] pub enum Setting { @@ -1066,20 +1066,11 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { { self.index.set_updated_at(self.wtxn, &OffsetDateTime::now_utc())?; - // Note: this MUST be before `update_sortable` so that we can get the old value to compare with the updated value afterwards - - let existing_fields: HashSet<_> = self - .index - .field_distribution(self.wtxn)? 
- .into_iter() - .filter_map(|(field, count)| (count != 0).then_some(field)) - .collect(); - let old_faceted_fields = self.index.user_defined_faceted_fields(self.wtxn)?; + let old_inner_settings = InnerIndexSettings::from_index(&self.index, &self.wtxn)?; let old_fields_ids_map = self.index.fields_ids_map(self.wtxn)?; + // never trigger re-indexing self.update_displayed()?; - self.update_filterable()?; - self.update_sortable()?; self.update_distinct_field()?; self.update_criteria()?; self.update_primary_key()?; @@ -1089,16 +1080,19 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { self.update_max_values_per_facet()?; self.update_sort_facet_values_by()?; self.update_pagination_max_total_hits()?; + self.update_search_cutoff()?; - let faceted_updated = self.update_faceted(existing_fields, old_faceted_fields)?; - let stop_words_updated = self.update_stop_words()?; - let non_separator_tokens_updated = self.update_non_separator_tokens()?; - let separator_tokens_updated = self.update_separator_tokens()?; - let dictionary_updated = self.update_dictionary()?; - let synonyms_updated = self.update_synonyms()?; - let searchable_updated = self.update_searchable()?; - let exact_attributes_updated = self.update_exact_attributes()?; - let proximity_precision = self.update_proximity_precision()?; + // could trigger re-indexing + self.update_filterable()?; + self.update_sortable()?; + self.update_stop_words()?; + self.update_non_separator_tokens()?; + self.update_separator_tokens()?; + self.update_dictionary()?; + self.update_synonyms()?; + self.update_searchable()?; + self.update_exact_attributes()?; + self.update_proximity_precision()?; // TODO: very rough approximation of the needs for reindexing where any change will result in // a full reindexing. // What can be done instead: @@ -1107,20 +1101,14 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { // 3. Keep the old vectors but reattempt indexing on a prompt change: only actually changed prompt will need embedding + storage let embedding_configs_updated = self.update_embedding_configs()?; - // never trigger re-indexing - self.update_search_cutoff()?; + let new_inner_settings = InnerIndexSettings::from_index(&self.index, &self.wtxn)?; + let inner_settings_diff = InnerIndexSettingsDiff { + old: old_inner_settings, + new: new_inner_settings, + embedding_configs_updated, + }; - if stop_words_updated - || non_separator_tokens_updated - || separator_tokens_updated - || dictionary_updated - || faceted_updated - || synonyms_updated - || searchable_updated - || exact_attributes_updated - || proximity_precision - || embedding_configs_updated - { + if inner_settings_diff.any_reindexing_needed() { self.reindex(&progress_callback, &should_abort, old_fields_ids_map)?; } @@ -1156,6 +1144,117 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { } } +pub(crate) struct InnerIndexSettingsDiff { + old: InnerIndexSettings, + new: InnerIndexSettings, + + // TODO: compare directly the embedders. 
+ embedding_configs_updated: bool, +} + +impl InnerIndexSettingsDiff { + fn any_reindexing_needed(&self) -> bool { + self.reindex_searchable() || self.reindex_facets() || self.reindex_vectors() + } + + fn reindex_searchable(&self) -> bool { + self.old + .fields_ids_map + .iter() + .zip(self.new.fields_ids_map.iter()) + .any(|(old, new)| old != new) + || self.old.stop_words.as_ref().map(|set| set.as_fst().as_bytes()) + != self.new.stop_words.as_ref().map(|set| set.as_fst().as_bytes()) + || self.old.allowed_separators != self.new.allowed_separators + || self.old.dictionary != self.new.dictionary + || self.old.searchable_fields != self.new.searchable_fields + || self.old.exact_attributes != self.new.exact_attributes + || self.old.proximity_precision != self.new.proximity_precision + } + + fn reindex_facets(&self) -> bool { + let existing_fields = self.new.existing_fields; + if existing_fields.iter().any(|field| field.contains('.')) { + return true; + } + + let old_faceted_fields = self.old.user_defined_faceted_fields; + if old_faceted_fields.iter().any(|field| field.contains('.')) { + return true; + } + + // If there is new faceted fields we indicate that we must reindex as we must + // index new fields as facets. It means that the distinct attribute, + // an Asc/Desc criterion or a filtered attribute as be added or removed. + let new_faceted_fields = self.new.user_defined_faceted_fields; + if new_faceted_fields.iter().any(|field| field.contains('.')) { + return true; + } + + let faceted_updated = + (&existing_fields - &old_faceted_fields) != (&existing_fields - &new_faceted_fields); + + self.old + .fields_ids_map + .iter() + .zip(self.new.fields_ids_map.iter()) + .any(|(old, new)| old != new) + || faceted_updated + } + + fn reindex_vectors(&self) -> bool { + self.embedding_configs_updated + } +} + +#[derive(Clone, Debug)] +pub(crate) struct InnerIndexSettings { + stop_words: Option>>, + allowed_separators: Option>, + dictionary: Option>, + fields_ids_map: FieldsIdsMap, + faceted_fields: HashSet, + searchable_fields: Option>, + exact_attributes: HashSet, + proximity_precision: ProximityPrecision, + embedding_configs: Vec<(String, crate::vector::EmbeddingConfig)>, + existing_fields: HashSet, +} + +impl InnerIndexSettings { + fn from_index(index: &Index, rtxn: &heed::RoTxn) -> Result { + let stop_words = index.stop_words(rtxn)?; + let stop_words = stop_words.map(|sw| sw.map_data(Vec::from).unwrap()); + let allowed_separators = index.allowed_separators(rtxn)?; + let dictionary = index.dictionary(rtxn)?; + let fields_ids_map = index.fields_ids_map(rtxn)?; + let searchable_fields = index.searchable_fields_ids(rtxn)?; + let searchable_fields = searchable_fields.map(|sf| sf.into_iter().collect()); + let faceted_fields = index.faceted_fields_ids(rtxn)?; + let exact_attributes = index.exact_attributes_ids(rtxn)?; + let proximity_precision = index.proximity_precision(rtxn)?.unwrap_or_default(); + let embedding_configs = index.embedding_configs(rtxn)?; + let existing_fields: HashSet<_> = index + .field_distribution(rtxn)? 
+ .into_iter() + .filter_map(|(field, count)| (count != 0).then_some(field)) + .collect(); + + Ok(Self { + stop_words, + allowed_separators, + dictionary, + fields_ids_map, + faceted_fields, + searchable_fields, + exact_attributes, + proximity_precision, + embedding_configs, + existing_fields, + }) + } +} + fn validate_prompt( name: &str, new: Setting, From b5e4a55af694e19f9748d16c3f1b5a7da878361f Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 26 Mar 2024 13:27:43 +0100 Subject: [PATCH 06/40] refactor faceted and searchable pipeline --- milli/src/index.rs | 21 ++- milli/src/update/del_add.rs | 4 +- .../extract/extract_docid_word_positions.rs | 88 +++++---- .../extract/extract_facet_number_docids.rs | 2 + .../extract/extract_facet_string_docids.rs | 2 + .../extract/extract_fid_docid_facet_values.rs | 14 +- .../extract/extract_fid_word_count_docids.rs | 2 + .../extract/extract_word_docids.rs | 156 +++++++++++---- .../extract_word_pair_proximity_docids.rs | 2 + .../extract/extract_word_position_docids.rs | 2 + .../src/update/index_documents/extract/mod.rs | 78 +++----- milli/src/update/index_documents/mod.rs | 45 +---- milli/src/update/index_documents/transform.rs | 178 ++++++++---------- milli/src/update/settings.rs | 165 ++++++++-------- 14 files changed, 420 insertions(+), 339 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index db31c953a..27b273393 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -678,6 +678,23 @@ impl Index { .get(rtxn, main_key::USER_DEFINED_SEARCHABLE_FIELDS_KEY) } + /// Identical to `user_defined_searchable_fields`, but returns ids instead. + pub fn user_defined_searchable_fields_ids(&self, rtxn: &RoTxn) -> Result>> { + match self.user_defined_searchable_fields(rtxn)? { + Some(fields) => { + let fields_ids_map = self.fields_ids_map(rtxn)?; + let mut fields_ids = Vec::new(); + for name in fields { + if let Some(field_id) = fields_ids_map.id(name) { + fields_ids.push(field_id); + } + } + Ok(Some(fields_ids)) + } + None => Ok(None), + } + } + /* filterable fields */ /// Writes the filterable fields names in the database. @@ -824,11 +841,11 @@ impl Index { /// Identical to `user_defined_faceted_fields`, but returns ids instead. 
pub fn user_defined_faceted_fields_ids(&self, rtxn: &RoTxn) -> Result> { - let fields = self.faceted_fields(rtxn)?; + let fields = self.user_defined_faceted_fields(rtxn)?; let fields_ids_map = self.fields_ids_map(rtxn)?; let mut fields_ids = HashSet::new(); - for name in fields.into_iter() { + for name in fields { if let Some(field_id) = fields_ids_map.id(&name) { fields_ids.insert(field_id); } diff --git a/milli/src/update/del_add.rs b/milli/src/update/del_add.rs index 794beb5df..0288858ed 100644 --- a/milli/src/update/del_add.rs +++ b/milli/src/update/del_add.rs @@ -71,8 +71,8 @@ pub enum DelAddOperation { /// putting each deletion obkv's keys under an DelAdd::Deletion /// and putting each addition obkv's keys under an DelAdd::Addition pub fn del_add_from_two_obkvs( - deletion: obkv::KvReader, - addition: obkv::KvReader, + deletion: &obkv::KvReader, + addition: &obkv::KvReader, buffer: &mut Vec, ) -> Result<(), std::io::Error> { use itertools::merge_join_by; diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index dc4886f00..b1a6bb5a6 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -1,4 +1,4 @@ -use std::collections::{HashMap, HashSet}; +use std::collections::HashMap; use std::convert::TryInto; use std::fs::File; use std::io::BufReader; @@ -12,6 +12,7 @@ use serde_json::Value; use super::helpers::{create_sorter, keep_latest_obkv, sorter_into_reader, GrenadParameters}; use crate::error::{InternalError, SerializationError}; use crate::update::del_add::{del_add_from_two_obkvs, DelAdd, KvReaderDelAdd}; +use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff}; use crate::{FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH}; pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>; @@ -25,10 +26,7 @@ pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), (RoaringBitmap, R pub fn extract_docid_word_positions( obkv_documents: grenad::Reader, indexer: GrenadParameters, - searchable_fields: &Option>, - stop_words: Option<&fst::Set>>, - allowed_separators: Option<&[&str]>, - dictionary: Option<&[&str]>, + settings_diff: &InnerIndexSettingsDiff, max_positions_per_attributes: Option, ) -> Result<(grenad::Reader>, ScriptLanguageDocidsMap)> { puffin::profile_function!(); @@ -56,8 +54,33 @@ pub fn extract_docid_word_positions( let mut value_buffer = Vec::new(); // initialize tokenizer. 
- let mut builder = tokenizer_builder(stop_words, allowed_separators, dictionary, None); - let tokenizer = builder.build(); + // TODO: Fix ugly allocation + let old_stop_words = settings_diff.old.stop_words.as_ref(); + let old_separators: Option> = + settings_diff.old.allowed_separators.map(|s| s.iter().map(String::as_str).collect()); + let old_dictionary: Option> = + settings_diff.old.dictionary.map(|s| s.iter().map(String::as_str).collect()); + let mut del_builder = tokenizer_builder( + old_stop_words, + old_separators.as_deref(), + old_dictionary.as_deref(), + None, + ); + let del_tokenizer = del_builder.build(); + + // TODO: Fix ugly allocation + let new_stop_words = settings_diff.new.stop_words.as_ref(); + let new_separators: Option> = + settings_diff.new.allowed_separators.map(|s| s.iter().map(String::as_str).collect()); + let new_dictionary: Option> = + settings_diff.new.dictionary.map(|s| s.iter().map(String::as_str).collect()); + let mut add_builder = tokenizer_builder( + new_stop_words, + new_separators.as_deref(), + new_dictionary.as_deref(), + None, + ); + let add_tokenizer = add_builder.build(); // iterate over documents. let mut cursor = obkv_documents.into_cursor()?; @@ -69,7 +92,10 @@ pub fn extract_docid_word_positions( let obkv = KvReader::::new(value); // if the searchable fields didn't change, skip the searchable indexing for this document. - if !searchable_fields_changed(&KvReader::::new(value), searchable_fields) { + if !searchable_fields_changed( + &KvReader::::new(value), + &settings_diff.new.searchable_fields_ids, + ) { continue; } @@ -85,11 +111,8 @@ pub fn extract_docid_word_positions( // deletions lang_safe_tokens_from_document( &obkv, - searchable_fields, - &tokenizer, - stop_words, - allowed_separators, - dictionary, + &settings_diff.old, + &del_tokenizer, max_positions_per_attributes, DelAdd::Deletion, &mut del_buffers, @@ -99,11 +122,8 @@ pub fn extract_docid_word_positions( // additions lang_safe_tokens_from_document( &obkv, - searchable_fields, - &tokenizer, - stop_words, - allowed_separators, - dictionary, + &settings_diff.new, + &add_tokenizer, max_positions_per_attributes, DelAdd::Addition, &mut add_buffers, @@ -118,8 +138,8 @@ pub fn extract_docid_word_positions( // transforming two KV> into one KV>> value_buffer.clear(); del_add_from_two_obkvs( - KvReader::::new(del_obkv), - KvReader::::new(add_obkv), + &KvReader::::new(del_obkv), + &KvReader::::new(add_obkv), &mut value_buffer, )?; @@ -160,7 +180,7 @@ pub fn extract_docid_word_positions( /// Check if any searchable fields of a document changed. fn searchable_fields_changed( obkv: &KvReader, - searchable_fields: &Option>, + searchable_fields: &Option>, ) -> bool { for (field_id, field_bytes) in obkv.iter() { if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) { @@ -206,14 +226,10 @@ fn tokenizer_builder<'a>( /// Extract words mapped with their positions of a document, /// ensuring no Language detection mistakes was made. 
-#[allow(clippy::too_many_arguments)] // FIXME: consider grouping arguments in a struct fn lang_safe_tokens_from_document<'a>( obkv: &KvReader, - searchable_fields: &Option>, + settings: &InnerIndexSettings, tokenizer: &Tokenizer, - stop_words: Option<&fst::Set>>, - allowed_separators: Option<&[&str]>, - dictionary: Option<&[&str]>, max_positions_per_attributes: u32, del_add: DelAdd, buffers: &'a mut Buffers, @@ -222,7 +238,7 @@ fn lang_safe_tokens_from_document<'a>( tokens_from_document( obkv, - searchable_fields, + &settings.searchable_fields_ids, tokenizer, max_positions_per_attributes, del_add, @@ -246,12 +262,14 @@ fn lang_safe_tokens_from_document<'a>( // then we don't rerun the extraction. if !script_language.is_empty() { // build a new temporary tokenizer including the allow list. - let mut builder = tokenizer_builder( - stop_words, - allowed_separators, - dictionary, - Some(&script_language), - ); + // TODO: Fix ugly allocation + let stop_words = settings.stop_words.as_ref(); + let separators: Option> = + settings.allowed_separators.map(|s| s.iter().map(String::as_str).collect()); + let dictionary: Option> = + settings.dictionary.map(|s| s.iter().map(String::as_str).collect()); + let mut builder = + tokenizer_builder(stop_words, separators.as_deref(), dictionary.as_deref(), None); let tokenizer = builder.build(); script_language_word_count.clear(); @@ -259,7 +277,7 @@ fn lang_safe_tokens_from_document<'a>( // rerun the extraction. tokens_from_document( obkv, - searchable_fields, + &settings.searchable_fields_ids, &tokenizer, max_positions_per_attributes, del_add, @@ -276,7 +294,7 @@ fn lang_safe_tokens_from_document<'a>( /// Extract words mapped with their positions of a document. fn tokens_from_document<'a>( obkv: &KvReader, - searchable_fields: &Option>, + searchable_fields: &Option>, tokenizer: &Tokenizer, max_positions_per_attributes: u32, del_add: DelAdd, diff --git a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs index 33def5abd..1848a085f 100644 --- a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs @@ -10,6 +10,7 @@ use crate::heed_codec::facet::{ FacetGroupKey, FacetGroupKeyCodec, FieldDocIdFacetF64Codec, OrderedF64Codec, }; use crate::update::del_add::{KvReaderDelAdd, KvWriterDelAdd}; +use crate::update::settings::InnerIndexSettingsDiff; use crate::Result; /// Extracts the facet number and the documents ids where this facet number appear. 
@@ -20,6 +21,7 @@ use crate::Result; pub fn extract_facet_number_docids( fid_docid_facet_number: grenad::Reader, indexer: GrenadParameters, + _settings_diff: &InnerIndexSettingsDiff, ) -> Result>> { puffin::profile_function!(); diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs index 8fdd11ee7..abffe17ab 100644 --- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs @@ -15,6 +15,7 @@ use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; use crate::update::index_documents::helpers::{ merge_deladd_btreeset_string, merge_deladd_cbo_roaring_bitmaps, }; +use crate::update::settings::InnerIndexSettingsDiff; use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH}; /// Extracts the facet string and the documents ids where this facet string appear. @@ -25,6 +26,7 @@ use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH}; pub fn extract_facet_string_docids( docid_fid_facet_string: grenad::Reader, indexer: GrenadParameters, + _settings_diff: &InnerIndexSettingsDiff, ) -> Result<(grenad::Reader>, grenad::Reader>)> { puffin::profile_function!(); diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index 1f8af372d..030303cd9 100644 --- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -1,5 +1,5 @@ use std::borrow::Cow; -use std::collections::{BTreeMap, HashSet}; +use std::collections::BTreeMap; use std::convert::TryInto; use std::fs::File; use std::io::{self, BufReader}; @@ -20,6 +20,7 @@ use crate::error::InternalError; use crate::facet::value_encoding::f64_into_bytes; use crate::update::del_add::{DelAdd, KvWriterDelAdd}; use crate::update::index_documents::{create_writer, writer_into_reader}; +use crate::update::settings::InnerIndexSettingsDiff; use crate::{CboRoaringBitmapCodec, DocumentId, Error, FieldId, Result, MAX_FACET_VALUE_LENGTH}; /// The length of the elements that are always in the buffer when inserting new values. 
@@ -43,7 +44,7 @@ pub struct ExtractedFacetValues { pub fn extract_fid_docid_facet_values( obkv_documents: grenad::Reader, indexer: GrenadParameters, - faceted_fields: &HashSet, + settings_diff: &InnerIndexSettingsDiff, geo_fields_ids: Option<(FieldId, FieldId)>, ) -> Result { puffin::profile_function!(); @@ -82,7 +83,9 @@ pub fn extract_fid_docid_facet_values( let obkv = obkv::KvReader::new(value); for (field_id, field_bytes) in obkv.iter() { - if faceted_fields.contains(&field_id) { + let delete_faceted = settings_diff.old.faceted_fields_ids.contains(&field_id); + let add_faceted = settings_diff.new.faceted_fields_ids.contains(&field_id); + if delete_faceted || add_faceted { numbers_key_buffer.clear(); strings_key_buffer.clear(); @@ -99,11 +102,12 @@ pub fn extract_fid_docid_facet_values( strings_key_buffer.extend_from_slice(docid_bytes); let del_add_obkv = obkv::KvReader::new(field_bytes); - let del_value = match del_add_obkv.get(DelAdd::Deletion) { + let del_value = match del_add_obkv.get(DelAdd::Deletion).filter(|_| delete_faceted) + { Some(bytes) => Some(from_slice(bytes).map_err(InternalError::SerdeJson)?), None => None, }; - let add_value = match del_add_obkv.get(DelAdd::Addition) { + let add_value = match del_add_obkv.get(DelAdd::Addition).filter(|_| add_faceted) { Some(bytes) => Some(from_slice(bytes).map_err(InternalError::SerdeJson)?), None => None, }; diff --git a/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs index 305af3630..51e0642da 100644 --- a/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs +++ b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs @@ -10,6 +10,7 @@ use super::helpers::{ use crate::error::SerializationError; use crate::index::db_name::DOCID_WORD_POSITIONS; use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; +use crate::update::settings::InnerIndexSettingsDiff; use crate::Result; const MAX_COUNTED_WORDS: usize = 30; @@ -23,6 +24,7 @@ const MAX_COUNTED_WORDS: usize = 30; pub fn extract_fid_word_count_docids( docid_word_positions: grenad::Reader, indexer: GrenadParameters, + _settings_diff: &InnerIndexSettingsDiff, ) -> Result>> { puffin::profile_function!(); diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs index f38701dac..2b1f02326 100644 --- a/milli/src/update/index_documents/extract/extract_word_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_docids.rs @@ -1,20 +1,22 @@ -use std::collections::{BTreeSet, HashSet}; +use std::collections::BTreeSet; use std::fs::File; use std::io::{self, BufReader}; -use heed::BytesDecode; +use heed::{BytesDecode, BytesEncode}; use obkv::KvReaderU16; +use roaring::RoaringBitmap; use super::helpers::{ - create_sorter, create_writer, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader, - try_split_array_at, writer_into_reader, GrenadParameters, + create_sorter, create_writer, merge_deladd_cbo_roaring_bitmaps, try_split_array_at, + writer_into_reader, GrenadParameters, }; use crate::error::SerializationError; use crate::heed_codec::StrBEU16Codec; use crate::index::db_name::DOCID_WORD_POSITIONS; use crate::update::del_add::{is_noop_del_add_obkv, DelAdd, KvReaderDelAdd, KvWriterDelAdd}; +use crate::update::settings::InnerIndexSettingsDiff; use crate::update::MergeFn; -use crate::{DocumentId, FieldId, Result}; +use crate::{CboRoaringBitmapCodec, 
DocumentId, FieldId, Result}; /// Extracts the word and the documents ids where this word appear. /// @@ -27,7 +29,7 @@ use crate::{DocumentId, FieldId, Result}; pub fn extract_word_docids( docid_word_positions: grenad::Reader, indexer: GrenadParameters, - exact_attributes: &HashSet, + settings_diff: &InnerIndexSettingsDiff, ) -> Result<( grenad::Reader>, grenad::Reader>, @@ -43,7 +45,7 @@ pub fn extract_word_docids( indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, - max_memory.map(|x| x / 3), + max_memory, ); let mut key_buffer = Vec::new(); let mut del_words = BTreeSet::new(); @@ -85,30 +87,29 @@ pub fn extract_word_docids( add_words.clear(); } - let mut word_docids_sorter = create_sorter( - grenad::SortAlgorithm::Unstable, - merge_deladd_cbo_roaring_bitmaps, - indexer.chunk_compression_type, - indexer.chunk_compression_level, - indexer.max_nb_chunks, - max_memory.map(|x| x / 3), - ); - - let mut exact_word_docids_sorter = create_sorter( - grenad::SortAlgorithm::Unstable, - merge_deladd_cbo_roaring_bitmaps, - indexer.chunk_compression_type, - indexer.chunk_compression_level, - indexer.max_nb_chunks, - max_memory.map(|x| x / 3), - ); - let mut word_fid_docids_writer = create_writer( indexer.chunk_compression_type, indexer.chunk_compression_level, tempfile::tempfile()?, ); + let mut word_docids_writer = create_writer( + indexer.chunk_compression_type, + indexer.chunk_compression_level, + tempfile::tempfile()?, + ); + + let mut exact_word_docids_writer = create_writer( + indexer.chunk_compression_type, + indexer.chunk_compression_level, + tempfile::tempfile()?, + ); + + let mut word: Option = None; + let mut deletions = RoaringBitmap::new(); + let mut additions = RoaringBitmap::new(); + let mut exact_deletions = RoaringBitmap::new(); + let mut exact_additions = RoaringBitmap::new(); let mut iter = word_fid_docids_sorter.into_stream_merger_iter()?; // TODO: replace sorters by writers by accumulating values into a buffer before inserting them. while let Some((key, value)) = iter.next()? { @@ -117,20 +118,69 @@ pub fn extract_word_docids( word_fid_docids_writer.insert(key, value)?; } - let (word, fid) = StrBEU16Codec::bytes_decode(key) + let (w, fid) = StrBEU16Codec::bytes_decode(key) .map_err(|_| SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; - // every words contained in an attribute set to exact must be pushed in the exact_words list. 
- if exact_attributes.contains(&fid) { - exact_word_docids_sorter.insert(word.as_bytes(), value)?; + if let Some(word) = word { + if word.as_str() != w { + docids_into_writers(&word, &deletions, &additions, &mut word_docids_writer); + docids_into_writers( + &word, + &exact_deletions, + &exact_additions, + &mut exact_word_docids_writer, + ); + let word = Some(w.to_string()); + // clear buffers + deletions.clear(); + additions.clear(); + exact_deletions.clear(); + exact_additions.clear(); + } } else { - word_docids_sorter.insert(word.as_bytes(), value)?; + let word = Some(w.to_string()); + } + + // merge all deletions + let obkv = KvReaderDelAdd::new(value); + if let Some(value) = obkv.get(DelAdd::Deletion) { + let delete_from_exact = settings_diff.old.exact_attributes.contains(&fid); + let docids = CboRoaringBitmapCodec::bytes_decode(value).map_err(|_| { + SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) } + })?; + if delete_from_exact { + exact_deletions |= docids; + } else { + deletions |= docids + } + } + // merge all additions + if let Some(value) = obkv.get(DelAdd::Addition) { + let add_in_exact = settings_diff.new.exact_attributes.contains(&fid); + let docids = CboRoaringBitmapCodec::bytes_decode(value).map_err(|_| { + SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) } + })?; + if add_in_exact { + exact_additions |= docids; + } else { + additions |= docids + } } } + if let Some(word) = word { + docids_into_writers(&word, &deletions, &additions, &mut word_docids_writer); + docids_into_writers( + &word, + &exact_deletions, + &exact_additions, + &mut exact_word_docids_writer, + ); + } + Ok(( - sorter_into_reader(word_docids_sorter, indexer)?, - sorter_into_reader(exact_word_docids_sorter, indexer)?, + writer_into_reader(word_docids_writer)?, + writer_into_reader(exact_word_docids_writer)?, writer_into_reader(word_fid_docids_writer)?, )) } @@ -178,3 +228,45 @@ fn words_into_sorter( Ok(()) } + +#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")] +fn docids_into_writers( + word: &str, + deletions: &RoaringBitmap, + additions: &RoaringBitmap, + writer: &mut grenad::Writer, +) -> Result<()> +where + W: std::io::Write, +{ + if deletions == additions { + // if the same value is deleted and added, do nothing. + return Ok(()); + } + + // Write each value in the same KvDelAdd before inserting it in the final writer. + let mut obkv = KvWriterDelAdd::memory(); + // deletions: + if !deletions.is_empty() && !deletions.is_subset(additions) { + obkv.insert( + DelAdd::Deletion, + CboRoaringBitmapCodec::bytes_encode(deletions).map_err(|_| { + SerializationError::Encoding { db_name: Some(DOCID_WORD_POSITIONS) } + })?, + ); + } + // additions: + if !additions.is_empty() { + obkv.insert( + DelAdd::Addition, + CboRoaringBitmapCodec::bytes_encode(additions).map_err(|_| { + SerializationError::Encoding { db_name: Some(DOCID_WORD_POSITIONS) } + })?, + ); + } + + // insert everything in the same writer. 
+ writer.insert(word.as_bytes(), obkv.into_inner().unwrap())?; + + Ok(()) +} diff --git a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs index 82a94ce00..d86d09bc8 100644 --- a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs @@ -13,6 +13,7 @@ use crate::error::SerializationError; use crate::index::db_name::DOCID_WORD_POSITIONS; use crate::proximity::{index_proximity, MAX_DISTANCE}; use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; +use crate::update::settings::InnerIndexSettingsDiff; use crate::{DocumentId, Result}; /// Extracts the best proximity between pairs of words and the documents ids where this pair appear. @@ -23,6 +24,7 @@ use crate::{DocumentId, Result}; pub fn extract_word_pair_proximity_docids( docid_word_positions: grenad::Reader, indexer: GrenadParameters, + _settings_diff: &InnerIndexSettingsDiff, ) -> Result>> { puffin::profile_function!(); diff --git a/milli/src/update/index_documents/extract/extract_word_position_docids.rs b/milli/src/update/index_documents/extract/extract_word_position_docids.rs index 4bc553d9a..45a05b0d0 100644 --- a/milli/src/update/index_documents/extract/extract_word_position_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_position_docids.rs @@ -11,6 +11,7 @@ use super::helpers::{ use crate::error::SerializationError; use crate::index::db_name::DOCID_WORD_POSITIONS; use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; +use crate::update::settings::InnerIndexSettingsDiff; use crate::update::MergeFn; use crate::{bucketed_position, DocumentId, Result}; @@ -22,6 +23,7 @@ use crate::{bucketed_position, DocumentId, Result}; pub fn extract_word_position_docids( docid_word_positions: grenad::Reader, indexer: GrenadParameters, + _settings_diff: &InnerIndexSettingsDiff, ) -> Result>> { puffin::profile_function!(); diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 82486f3a8..a6b73efde 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -31,8 +31,8 @@ use self::extract_word_position_docids::extract_word_position_docids; use super::helpers::{as_cloneable_grenad, CursorClonableMmap, GrenadParameters}; use super::{helpers, TypedChunk}; use crate::proximity::ProximityPrecision; -use crate::vector::EmbeddingConfigs; -use crate::{FieldId, FieldsIdsMap, Result}; +use crate::update::settings::InnerIndexSettingsDiff; +use crate::{FieldId, Result}; /// Extract data for each databases from obkv documents in parallel. /// Send data in grenad file over provided Sender. 
@@ -43,18 +43,10 @@ pub(crate) fn data_from_obkv_documents( flattened_obkv_chunks: impl Iterator>>> + Send, indexer: GrenadParameters, lmdb_writer_sx: Sender>, - searchable_fields: Option>, - faceted_fields: HashSet, primary_key_id: FieldId, geo_fields_ids: Option<(FieldId, FieldId)>, - field_id_map: FieldsIdsMap, - stop_words: Option>>, - allowed_separators: Option<&[&str]>, - dictionary: Option<&[&str]>, + settings_diff: &InnerIndexSettingsDiff, max_positions_per_attributes: Option, - exact_attributes: HashSet, - proximity_precision: ProximityPrecision, - embedders: EmbeddingConfigs, ) -> Result<()> { puffin::profile_function!(); @@ -67,8 +59,7 @@ pub(crate) fn data_from_obkv_documents( original_documents_chunk, indexer, lmdb_writer_sx.clone(), - field_id_map.clone(), - embedders.clone(), + settings_diff, ) }) .collect::>() @@ -81,13 +72,9 @@ pub(crate) fn data_from_obkv_documents( flattened_obkv_chunks, indexer, lmdb_writer_sx.clone(), - &searchable_fields, - &faceted_fields, primary_key_id, geo_fields_ids, - &stop_words, - &allowed_separators, - &dictionary, + settings_diff, max_positions_per_attributes, ) }) @@ -100,13 +87,12 @@ pub(crate) fn data_from_obkv_documents( run_extraction_task::<_, _, grenad::Reader>>( docid_word_positions_chunk.clone(), indexer, + settings_diff, lmdb_writer_sx.clone(), extract_fid_word_count_docids, TypedChunk::FieldIdWordCountDocids, "field-id-wordcount-docids", ); - - let exact_attributes = exact_attributes.clone(); run_extraction_task::< _, _, @@ -118,10 +104,9 @@ pub(crate) fn data_from_obkv_documents( >( docid_word_positions_chunk.clone(), indexer, + settings_diff, lmdb_writer_sx.clone(), - move |doc_word_pos, indexer| { - extract_word_docids(doc_word_pos, indexer, &exact_attributes) - }, + extract_word_docids, |( word_docids_reader, exact_word_docids_reader, @@ -139,6 +124,7 @@ pub(crate) fn data_from_obkv_documents( run_extraction_task::<_, _, grenad::Reader>>( docid_word_positions_chunk.clone(), indexer, + settings_diff, lmdb_writer_sx.clone(), extract_word_position_docids, TypedChunk::WordPositionDocids, @@ -152,6 +138,7 @@ pub(crate) fn data_from_obkv_documents( >( fid_docid_facet_strings_chunk.clone(), indexer, + settings_diff, lmdb_writer_sx.clone(), extract_facet_string_docids, TypedChunk::FieldIdFacetStringDocids, @@ -161,22 +148,22 @@ pub(crate) fn data_from_obkv_documents( run_extraction_task::<_, _, grenad::Reader>>( fid_docid_facet_numbers_chunk.clone(), indexer, + settings_diff, lmdb_writer_sx.clone(), extract_facet_number_docids, TypedChunk::FieldIdFacetNumberDocids, "field-id-facet-number-docids", ); - if proximity_precision == ProximityPrecision::ByWord { - run_extraction_task::<_, _, grenad::Reader>>( - docid_word_positions_chunk.clone(), - indexer, - lmdb_writer_sx.clone(), - extract_word_pair_proximity_docids, - TypedChunk::WordPairProximityDocids, - "word-pair-proximity-docids", - ); - } + run_extraction_task::<_, _, grenad::Reader>>( + docid_word_positions_chunk.clone(), + indexer, + settings_diff, + lmdb_writer_sx.clone(), + extract_word_pair_proximity_docids, + TypedChunk::WordPairProximityDocids, + "word-pair-proximity-docids", + ); } Ok(()) @@ -195,12 +182,17 @@ pub(crate) fn data_from_obkv_documents( fn run_extraction_task( chunk: grenad::Reader, indexer: GrenadParameters, + settings_diff: &InnerIndexSettingsDiff, lmdb_writer_sx: Sender>, extract_fn: FE, serialize_fn: FS, name: &'static str, ) where - FE: Fn(grenad::Reader, GrenadParameters) -> Result + FE: Fn( + grenad::Reader, + GrenadParameters, + &InnerIndexSettingsDiff, 
+ ) -> Result + Sync + Send + 'static, @@ -213,7 +205,7 @@ fn run_extraction_task( let child_span = tracing::trace_span!(target: "indexing::extract::details", parent: ¤t_span, "extract_multiple_chunks"); let _entered = child_span.enter(); puffin::profile_scope!("extract_multiple_chunks", name); - match extract_fn(chunk, indexer) { + match extract_fn(chunk, indexer, settings_diff) { Ok(chunk) => { let _ = lmdb_writer_sx.send(Ok(serialize_fn(chunk))); } @@ -230,8 +222,7 @@ fn send_original_documents_data( original_documents_chunk: Result>>, indexer: GrenadParameters, lmdb_writer_sx: Sender>, - field_id_map: FieldsIdsMap, - embedders: EmbeddingConfigs, + settings_diff: &InnerIndexSettingsDiff, ) -> Result<()> { let original_documents_chunk = original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?; @@ -306,13 +297,9 @@ fn send_and_extract_flattened_documents_data( flattened_documents_chunk: Result>>, indexer: GrenadParameters, lmdb_writer_sx: Sender>, - searchable_fields: &Option>, - faceted_fields: &HashSet, primary_key_id: FieldId, geo_fields_ids: Option<(FieldId, FieldId)>, - stop_words: &Option>>, - allowed_separators: &Option<&[&str]>, - dictionary: &Option<&[&str]>, + settings_diff: &InnerIndexSettingsDiff, max_positions_per_attributes: Option, ) -> Result<( grenad::Reader, @@ -341,10 +328,7 @@ fn send_and_extract_flattened_documents_data( extract_docid_word_positions( flattened_documents_chunk.clone(), indexer, - searchable_fields, - stop_words.as_ref(), - *allowed_separators, - *dictionary, + settings_diff, max_positions_per_attributes, )?; @@ -367,7 +351,7 @@ fn send_and_extract_flattened_documents_data( } = extract_fid_docid_facet_values( flattened_documents_chunk.clone(), indexer, - faceted_fields, + settings_diff, geo_fields_ids, )?; diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index d534661da..6bc5b6ff9 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -253,27 +253,12 @@ where let number_of_documents = self.index.number_of_documents(self.wtxn)?; return Ok(DocumentAdditionResult { indexed_documents: 0, number_of_documents }); } - let output = self + let mut output = self .transform .take() .expect("Invalid document addition state") .output_from_sorter(self.wtxn, &self.progress)?; - let new_facets = output.compute_real_facets(self.wtxn, self.index)?; - self.index.put_faceted_fields(self.wtxn, &new_facets)?; - - // in case new fields were introduced we're going to recreate the searchable fields. - if let Some(faceted_fields) = self.index.user_defined_searchable_fields(self.wtxn)? { - // we can't keep references on the faceted fields while we update the index thus we need to own it. - let faceted_fields: Vec = - faceted_fields.into_iter().map(str::to_string).collect(); - self.index.put_all_searchable_fields_from_fields_ids_map( - self.wtxn, - &faceted_fields.iter().map(String::as_ref).collect::>(), - &output.fields_ids_map, - )?; - } - let indexed_documents = output.documents_count as u64; let number_of_documents = self.execute_raw(output)?; @@ -296,16 +281,17 @@ where let TransformOutput { primary_key, - fields_ids_map, + settings_diff, field_distribution, documents_count, original_documents, flattened_documents, } = output; - // The fields_ids_map is put back to the store now so the rest of the transaction sees an - // up to date field map. 
- self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; + // update the internal facet and searchable list, + // because they might have changed due to the nested documents flattening. + settings_diff.new.recompute_facets(self.wtxn, self.index)?; + settings_diff.new.recompute_searchables(self.wtxn, self.index)?; let backup_pool; let pool = match self.indexer_config.thread_pool { @@ -333,7 +319,7 @@ where ) = crossbeam_channel::unbounded(); // get the primary key field id - let primary_key_id = fields_ids_map.id(&primary_key).unwrap(); + let primary_key_id = output.settings_diff.new.fields_ids_map.id(&primary_key).unwrap(); // get searchable fields for word databases let searchable_fields = @@ -400,8 +386,6 @@ where let max_positions_per_attributes = self.indexer_config.max_positions_per_attributes; - let cloned_embedder = self.embedders.clone(); - let mut final_documents_ids = RoaringBitmap::new(); let mut databases_seen = 0; let mut word_position_docids = None; @@ -410,7 +394,6 @@ where let mut exact_word_docids = None; let mut chunk_accumulator = ChunkAccumulator::default(); let mut dimension = HashMap::new(); - let stop_words = stop_words.map(|sw| sw.map_data(Vec::from).unwrap()); let current_span = tracing::Span::current(); @@ -428,10 +411,6 @@ where let flattened_chunk_iter = grenad_obkv_into_chunks(flattened_documents, pool_params, documents_chunk_size); - let separators: Option> = - separators.as_ref().map(|x| x.iter().map(String::as_str).collect()); - let dictionary: Option> = - dictionary.as_ref().map(|x| x.iter().map(String::as_str).collect()); let result = original_chunk_iter.and_then(|original_chunk| { let flattened_chunk = flattened_chunk_iter?; // extract all databases from the chunked obkv douments @@ -440,18 +419,10 @@ where flattened_chunk, pool_params, lmdb_writer_sx.clone(), - searchable_fields, - faceted_fields, primary_key_id, geo_fields_ids, - field_id_map, - stop_words, - separators.as_deref(), - dictionary.as_deref(), + &settings_diff, max_positions_per_attributes, - exact_attributes, - proximity_precision, - cloned_embedder, ) }); diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 09bf94ace..003353793 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -21,14 +21,17 @@ use super::{IndexDocumentsMethod, IndexerConfig}; use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader}; use crate::error::{Error, InternalError, UserError}; use crate::index::{db_name, main_key}; -use crate::update::del_add::{into_del_add_obkv, DelAdd, DelAddOperation, KvReaderDelAdd}; +use crate::update::del_add::{ + del_add_from_two_obkvs, into_del_add_obkv, DelAdd, DelAddOperation, KvReaderDelAdd, +}; use crate::update::index_documents::GrenadParameters; +use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff}; use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; use crate::{FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result}; pub struct TransformOutput { pub primary_key: String, - pub fields_ids_map: FieldsIdsMap, + pub settings_diff: InnerIndexSettingsDiff, pub field_distribution: FieldDistribution, pub documents_count: usize, pub original_documents: File, @@ -282,7 +285,9 @@ impl<'a, 'i> Transform<'a, 'i> { self.original_sorter .insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?; let base_obkv = KvReader::new(base_obkv); - if let Some(flattened_obkv) = 
self.flatten_from_fields_ids_map(base_obkv)? { + if let Some(flattened_obkv) = + Self::flatten_from_fields_ids_map(&base_obkv, &mut self.fields_ids_map)? + { // we recreate our buffer with the flattened documents document_sorter_value_buffer.clear(); document_sorter_value_buffer.push(Operation::Addition as u8); @@ -315,7 +320,9 @@ impl<'a, 'i> Transform<'a, 'i> { .insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?; let flattened_obkv = KvReader::new(&obkv_buffer); - if let Some(obkv) = self.flatten_from_fields_ids_map(flattened_obkv)? { + if let Some(obkv) = + Self::flatten_from_fields_ids_map(&flattened_obkv, &mut self.fields_ids_map)? + { document_sorter_value_buffer.clear(); document_sorter_value_buffer.push(Operation::Addition as u8); into_del_add_obkv( @@ -524,7 +531,9 @@ impl<'a, 'i> Transform<'a, 'i> { // flatten it and push it as to delete in the flattened_sorter let flattened_obkv = KvReader::new(base_obkv); - if let Some(obkv) = self.flatten_from_fields_ids_map(flattened_obkv)? { + if let Some(obkv) = + Self::flatten_from_fields_ids_map(&flattened_obkv, &mut self.fields_ids_map)? + { // we recreate our buffer with the flattened documents document_sorter_value_buffer.clear(); document_sorter_value_buffer.push(Operation::Deletion as u8); @@ -541,8 +550,15 @@ impl<'a, 'i> Transform<'a, 'i> { // Flatten a document from the fields ids map contained in self and insert the new // created fields. Returns `None` if the document doesn't need to be flattened. - #[tracing::instrument(level = "trace", skip(self, obkv), target = "indexing::transform")] - fn flatten_from_fields_ids_map(&mut self, obkv: KvReader) -> Result>> { + #[tracing::instrument( + level = "trace", + skip(obkv, fields_ids_map), + target = "indexing::transform" + )] + fn flatten_from_fields_ids_map( + obkv: &KvReader, + fields_ids_map: &mut FieldsIdsMap, + ) -> Result>> { if obkv .iter() .all(|(_, value)| !json_depth_checker::should_flatten_from_unchecked_slice(value)) @@ -563,7 +579,7 @@ impl<'a, 'i> Transform<'a, 'i> { // all the raw values get inserted directly in the `key_value` vec. for (key, value) in obkv.iter() { if json_depth_checker::should_flatten_from_unchecked_slice(value) { - let key = self.fields_ids_map.name(key).ok_or(FieldIdMapMissingEntry::FieldId { + let key = fields_ids_map.name(key).ok_or(FieldIdMapMissingEntry::FieldId { field_id: key, process: "Flatten from fields ids map.", })?; @@ -581,7 +597,7 @@ impl<'a, 'i> Transform<'a, 'i> { // Once we have the flattened version we insert all the new generated fields_ids // (if any) in the fields ids map and serialize the value. 
for (key, value) in flattened.into_iter() { - let fid = self.fields_ids_map.insert(&key).ok_or(UserError::AttributeLimitReached)?; + let fid = fields_ids_map.insert(&key).ok_or(UserError::AttributeLimitReached)?; let value = serde_json::to_vec(&value).map_err(InternalError::SerdeJson)?; key_value.push((fid, value.into())); } @@ -792,9 +808,18 @@ impl<'a, 'i> Transform<'a, 'i> { fst_new_external_documents_ids_builder.insert(key, value) })?; + let old_inner_settings = InnerIndexSettings::from_index(&self.index, wtxn)?; + let mut new_inner_settings = old_inner_settings.clone(); + new_inner_settings.fields_ids_map = self.fields_ids_map; + let settings_diff = InnerIndexSettingsDiff { + old: old_inner_settings, + new: new_inner_settings, + embedding_configs_updated: true, + }; + Ok(TransformOutput { primary_key, - fields_ids_map: self.fields_ids_map, + settings_diff, field_distribution, documents_count: self.documents_count, original_documents: original_documents.into_inner().map_err(|err| err.into_error())?, @@ -804,6 +829,38 @@ impl<'a, 'i> Transform<'a, 'i> { }) } + fn rebind_existing_document( + old_obkv: KvReader, + settings_diff: &InnerIndexSettingsDiff, + original_obkv_buffer: &mut Vec, + flattened_obkv_buffer: &mut Vec, + ) -> Result<()> { + let mut old_fields_ids_map = settings_diff.old.fields_ids_map.clone(); + let mut new_fields_ids_map = settings_diff.new.fields_ids_map.clone(); + let mut obkv_writer = KvWriter::<_, FieldId>::memory(); + // We iterate over the new `FieldsIdsMap` ids in order and construct the new obkv. + for (id, name) in new_fields_ids_map.iter() { + if let Some(val) = old_fields_ids_map.id(name).and_then(|id| old_obkv.get(id)) { + obkv_writer.insert(id, val)?; + } + } + let new_obkv = KvReader::::new(&obkv_writer.into_inner()?); + + // take the non-flattened version if flatten_from_fields_ids_map returns None. + let old_flattened = Self::flatten_from_fields_ids_map(&old_obkv, &mut old_fields_ids_map)? + .map_or_else(|| old_obkv, |bytes| KvReader::::new(&bytes)); + let new_flattened = Self::flatten_from_fields_ids_map(&new_obkv, &mut new_fields_ids_map)? + .map_or_else(|| new_obkv, |bytes| KvReader::::new(&bytes)); + + original_obkv_buffer.clear(); + flattened_obkv_buffer.clear(); + + del_add_from_two_obkvs(&old_obkv, &new_obkv, original_obkv_buffer)?; + del_add_from_two_obkvs(&old_flattened, &new_flattened, flattened_obkv_buffer)?; + + Ok(()) + } + /// Clear all databases. Returns a `TransformOutput` with a file that contains the documents /// of the index with the attributes reordered accordingly to the `FieldsIdsMap` given as argument. /// @@ -811,8 +868,7 @@ impl<'a, 'i> Transform<'a, 'i> { pub fn prepare_for_documents_reindexing( self, wtxn: &mut heed::RwTxn<'i>, - old_fields_ids_map: FieldsIdsMap, - mut new_fields_ids_map: FieldsIdsMap, + settings_diff: InnerIndexSettingsDiff, ) -> Result { // There already has been a document addition, the primary key should be set by now. let primary_key = self @@ -848,78 +904,27 @@ impl<'a, 'i> Transform<'a, 'i> { self.indexer_settings.max_memory.map(|mem| mem / 2), ); - let mut obkv_buffer = Vec::new(); + let mut original_obkv_buffer = Vec::new(); + let mut flattened_obkv_buffer = Vec::new(); let mut document_sorter_key_buffer = Vec::new(); - let mut document_sorter_value_buffer = Vec::new(); for result in self.index.external_documents_ids().iter(wtxn)? 
{ let (external_id, docid) = result?; - let obkv = self.index.documents.get(wtxn, &docid)?.ok_or( + let old_obkv = self.index.documents.get(wtxn, &docid)?.ok_or( InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None }, )?; - obkv_buffer.clear(); - let mut obkv_writer = KvWriter::<_, FieldId>::new(&mut obkv_buffer); - - // We iterate over the new `FieldsIdsMap` ids in order and construct the new obkv. - for (id, name) in new_fields_ids_map.iter() { - if let Some(val) = old_fields_ids_map.id(name).and_then(|id| obkv.get(id)) { - obkv_writer.insert(id, val)?; - } - } - - let buffer = obkv_writer.into_inner()?; + Self::rebind_existing_document( + old_obkv, + &settings_diff, + &mut original_obkv_buffer, + &mut flattened_obkv_buffer, + )?; document_sorter_key_buffer.clear(); document_sorter_key_buffer.extend_from_slice(&docid.to_be_bytes()); document_sorter_key_buffer.extend_from_slice(external_id.as_bytes()); - document_sorter_value_buffer.clear(); - into_del_add_obkv( - KvReaderU16::new(buffer), - DelAddOperation::DeletionAndAddition, - &mut document_sorter_value_buffer, - )?; - original_sorter.insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?; - - // Once we have the document. We're going to flatten it - // and insert it in the flattened sorter. - let mut doc = serde_json::Map::new(); - - let reader = obkv::KvReader::new(buffer); - for (k, v) in reader.iter() { - let key = new_fields_ids_map.name(k).ok_or(FieldIdMapMissingEntry::FieldId { - field_id: k, - process: "Accessing field distribution in transform.", - })?; - let value = serde_json::from_slice::(v) - .map_err(InternalError::SerdeJson)?; - doc.insert(key.to_string(), value); - } - - let flattened = flatten_serde_json::flatten(&doc); - - // Once we have the flattened version we can convert it back to obkv and - // insert all the new generated fields_ids (if any) in the fields ids map. 
- let mut buffer: Vec = Vec::new(); - let mut writer = KvWriter::new(&mut buffer); - let mut flattened: Vec<_> = flattened.into_iter().collect(); - // we reorder the field to get all the known field first - flattened.sort_unstable_by_key(|(key, _)| { - new_fields_ids_map.id(key).unwrap_or(FieldId::MAX) - }); - - for (key, value) in flattened { - let fid = - new_fields_ids_map.insert(&key).ok_or(UserError::AttributeLimitReached)?; - let value = serde_json::to_vec(&value).map_err(InternalError::SerdeJson)?; - writer.insert(fid, &value)?; - } - document_sorter_value_buffer.clear(); - into_del_add_obkv( - KvReaderU16::new(&buffer), - DelAddOperation::DeletionAndAddition, - &mut document_sorter_value_buffer, - )?; - flattened_sorter.insert(docid.to_be_bytes(), &document_sorter_value_buffer)?; + original_sorter.insert(&document_sorter_key_buffer, &original_obkv_buffer)?; + flattened_sorter.insert(docid.to_be_bytes(), &flattened_obkv_buffer)?; } let grenad_params = GrenadParameters { @@ -934,19 +939,14 @@ impl<'a, 'i> Transform<'a, 'i> { let flattened_documents = sorter_into_reader(flattened_sorter, grenad_params)?; - let output = TransformOutput { + Ok(TransformOutput { primary_key, - fields_ids_map: new_fields_ids_map, field_distribution, + settings_diff, documents_count, original_documents: original_documents.into_inner().into_inner(), flattened_documents: flattened_documents.into_inner().into_inner(), - }; - - let new_facets = output.compute_real_facets(wtxn, self.index)?; - self.index.put_faceted_fields(wtxn, &new_facets)?; - - Ok(output) + }) } } @@ -961,20 +961,6 @@ fn drop_and_reuse(mut vec: Vec) -> Vec { vec.into_iter().map(|_| unreachable!()).collect() } -impl TransformOutput { - // find and insert the new field ids - pub fn compute_real_facets(&self, rtxn: &RoTxn, index: &Index) -> Result> { - let user_defined_facets = index.user_defined_faceted_fields(rtxn)?; - - Ok(self - .fields_ids_map - .names() - .filter(|&field| crate::is_faceted(field, &user_defined_facets)) - .map(|field| field.to_string()) - .collect()) - } -} - #[cfg(test)] mod test { use super::*; diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 5b1788242..ae4304fce 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -385,14 +385,14 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { #[tracing::instrument( level = "trace" - skip(self, progress_callback, should_abort, old_fields_ids_map), + skip(self, progress_callback, should_abort, settings_diff), target = "indexing::documents" )] fn reindex( &mut self, progress_callback: &FP, should_abort: &FA, - old_fields_ids_map: FieldsIdsMap, + settings_diff: InnerIndexSettingsDiff, ) -> Result<()> where FP: Fn(UpdateIndexingStep) + Sync, @@ -416,14 +416,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { )?; // We clear the databases and remap the documents fields based on the new `FieldsIdsMap`. - let output = transform.prepare_for_documents_reindexing( - self.wtxn, - old_fields_ids_map, - fields_ids_map, - )?; - - let embedder_configs = self.index.embedding_configs(self.wtxn)?; - let embedders = self.embedders(embedder_configs)?; + let output = transform.prepare_for_documents_reindexing(self.wtxn, settings_diff)?; // We index the generated `TransformOutput` which must contain // all the documents with fields in the newly defined searchable order. 
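A minimal sketch of the call pattern the hunks above are building toward: the old pair of `FieldsIdsMap` arguments is folded into a single `InnerIndexSettingsDiff` shared by the settings update and the transform. Only the type and method names come from this patch series; the surrounding variables and plumbing are illustrative assumptions, not part of the diff:

    // Snapshot the inner settings before and after applying the user's update.
    let old = InnerIndexSettings::from_index(index, wtxn)?;
    let new = InnerIndexSettings::from_index(index, wtxn)?; // read back after the update methods ran
    let settings_diff = InnerIndexSettingsDiff {
        old,
        new,
        embedding_configs_updated: false, // true when an embedder definition changed
    };
    // Reindex only when the diff reports a change that invalidates stored data.
    if settings_diff.any_reindexing_needed() {
        let output = transform.prepare_for_documents_reindexing(wtxn, settings_diff)?;
        indexing_builder.execute_raw(output)?;
    }
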
@@ -436,32 +429,11 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { &should_abort, )?; - let indexing_builder = indexing_builder.with_embedders(embedders); indexing_builder.execute_raw(output)?; Ok(()) } - fn embedders( - &self, - embedding_configs: Vec<(String, EmbeddingConfig)>, - ) -> Result { - let res: Result<_> = embedding_configs - .into_iter() - .map(|(name, EmbeddingConfig { embedder_options, prompt })| { - let prompt = Arc::new(prompt.try_into().map_err(crate::Error::from)?); - - let embedder = Arc::new( - Embedder::new(embedder_options.clone()) - .map_err(crate::vector::Error::from) - .map_err(crate::Error::from)?, - ); - Ok((name, (embedder, prompt))) - }) - .collect(); - res.map(EmbeddingConfigs::new) - } - fn update_displayed(&mut self) -> Result { match self.displayed_fields { Setting::Set(ref fields) => { @@ -1067,7 +1039,6 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { self.index.set_updated_at(self.wtxn, &OffsetDateTime::now_utc())?; let old_inner_settings = InnerIndexSettings::from_index(&self.index, &self.wtxn)?; - let old_fields_ids_map = self.index.fields_ids_map(self.wtxn)?; // never trigger re-indexing self.update_displayed()?; @@ -1109,47 +1080,19 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { }; if inner_settings_diff.any_reindexing_needed() { - self.reindex(&progress_callback, &should_abort, old_fields_ids_map)?; + self.reindex(&progress_callback, &should_abort, inner_settings_diff)?; } Ok(()) } - - fn update_faceted( - &self, - existing_fields: HashSet, - old_faceted_fields: HashSet, - ) -> Result { - if existing_fields.iter().any(|field| field.contains('.')) { - return Ok(true); - } - - if old_faceted_fields.iter().any(|field| field.contains('.')) { - return Ok(true); - } - - // If there is new faceted fields we indicate that we must reindex as we must - // index new fields as facets. It means that the distinct attribute, - // an Asc/Desc criterion or a filtered attribute as be added or removed. - let new_faceted_fields = self.index.user_defined_faceted_fields(self.wtxn)?; - - if new_faceted_fields.iter().any(|field| field.contains('.')) { - return Ok(true); - } - - let faceted_updated = - (&existing_fields - &old_faceted_fields) != (&existing_fields - &new_faceted_fields); - - Ok(faceted_updated) - } } pub(crate) struct InnerIndexSettingsDiff { - old: InnerIndexSettings, - new: InnerIndexSettings, + pub old: InnerIndexSettings, + pub new: InnerIndexSettings, // TODO: compare directly the embedders. 
- embedding_configs_updated: bool, + pub embedding_configs_updated: bool, } impl InnerIndexSettingsDiff { @@ -1167,7 +1110,7 @@ impl InnerIndexSettingsDiff { != self.new.stop_words.as_ref().map(|set| set.as_fst().as_bytes()) || self.old.allowed_separators != self.new.allowed_separators || self.old.dictionary != self.new.dictionary - || self.old.searchable_fields != self.new.searchable_fields + || self.old.user_defined_searchable_fields != self.new.user_defined_searchable_fields || self.old.exact_attributes != self.new.exact_attributes || self.old.proximity_precision != self.new.proximity_precision } @@ -1207,33 +1150,38 @@ impl InnerIndexSettingsDiff { } } -#[derive(Clone, Debug)] +#[derive(Clone)] pub(crate) struct InnerIndexSettings { - stop_words: Option>>, - allowed_separators: Option>, - dictionary: Option>, - fields_ids_map: FieldsIdsMap, - faceted_fields: HashSet, - searchable_fields: Option>, - exact_attributes: HashSet, - proximity_precision: ProximityPrecision, - embedding_configs: Vec<(String, crate::vector::EmbeddingConfig)>, - existing_fields: HashSet, + pub stop_words: Option>>, + pub allowed_separators: Option>, + pub dictionary: Option>, + pub fields_ids_map: FieldsIdsMap, + pub user_defined_faceted_fields: HashSet, + pub user_defined_searchable_fields: Option>, + pub faceted_fields_ids: HashSet, + pub searchable_fields_ids: Option>, + pub exact_attributes: HashSet, + pub proximity_precision: ProximityPrecision, + pub embedding_configs: EmbeddingConfigs, + pub existing_fields: HashSet, } impl InnerIndexSettings { - fn from_index(index: &Index, rtxn: &heed::RoTxn) -> Result { + pub fn from_index(index: &Index, rtxn: &heed::RoTxn) -> Result { let stop_words = index.stop_words(rtxn)?; let stop_words = stop_words.map(|sw| sw.map_data(Vec::from).unwrap()); let allowed_separators = index.allowed_separators(rtxn)?; let dictionary = index.dictionary(rtxn)?; let fields_ids_map = index.fields_ids_map(rtxn)?; - let searchable_fields = index.searchable_fields_ids(rtxn)?; - let searchable_fields = searchable_fields.map(|sf| sf.into_iter().collect()); - let faceted_fields = index.faceted_fields_ids(rtxn)?; + let user_defined_searchable_fields = index.user_defined_searchable_fields(rtxn)?; + let user_defined_searchable_fields = + user_defined_searchable_fields.map(|sf| sf.into_iter().map(String::from).collect()); + let user_defined_faceted_fields = index.user_defined_faceted_fields(rtxn)?; + let searchable_fields_ids = index.searchable_fields_ids(rtxn)?; + let faceted_fields_ids = index.faceted_fields_ids(rtxn)?; let exact_attributes = index.exact_attributes_ids(rtxn)?; let proximity_precision = index.proximity_precision(rtxn)?.unwrap_or_default(); - let embedding_configs = index.embedding_configs(rtxn)?; + let embedding_configs = embedders(index.embedding_configs(rtxn)?)?; let existing_fields: HashSet<_> = index .field_distribution(rtxn)? 
.into_iter() @@ -1245,14 +1193,65 @@ impl InnerIndexSettings { allowed_separators, dictionary, fields_ids_map, - faceted_fields, - searchable_fields, + user_defined_faceted_fields, + user_defined_searchable_fields, + faceted_fields_ids, + searchable_fields_ids, exact_attributes, proximity_precision, embedding_configs, existing_fields, }) } + + // find and insert the new field ids + pub fn recompute_facets(&mut self, wtxn: &mut heed::RwTxn, index: &Index) -> Result<()> { + let new_facets = self + .fields_ids_map + .names() + .filter(|&field| crate::is_faceted(field, &self.user_defined_faceted_fields)) + .map(|field| field.to_string()) + .collect(); + index.put_faceted_fields(wtxn, &new_facets)?; + + self.faceted_fields_ids = index.faceted_fields_ids(wtxn)?; + Ok(()) + } + + // find and insert the new field ids + pub fn recompute_searchables(&mut self, wtxn: &mut heed::RwTxn, index: &Index) -> Result<()> { + // in case new fields were introduced we're going to recreate the searchable fields. + if let Some(searchable_fields) = self.user_defined_searchable_fields.as_ref() { + let searchable_fields = + searchable_fields.iter().map(String::as_ref).collect::>(); + index.put_all_searchable_fields_from_fields_ids_map( + wtxn, + &searchable_fields, + &self.fields_ids_map, + )?; + let searchable_fields_ids = index.searchable_fields_ids(wtxn)?; + self.searchable_fields_ids = searchable_fields_ids; + } + + Ok(()) + } +} + +fn embedders(embedding_configs: Vec<(String, EmbeddingConfig)>) -> Result { + let res: Result<_> = embedding_configs + .into_iter() + .map(|(name, EmbeddingConfig { embedder_options, prompt })| { + let prompt = Arc::new(prompt.try_into().map_err(crate::Error::from)?); + + let embedder = Arc::new( + Embedder::new(embedder_options.clone()) + .map_err(crate::vector::Error::from) + .map_err(crate::Error::from)?, + ); + Ok((name, (embedder, prompt))) + }) + .collect(); + res.map(EmbeddingConfigs::new) } fn validate_prompt( From 02c3d6b26546a9f6f6b4406b3d7d077316d800d9 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 3 Apr 2024 11:19:45 +0200 Subject: [PATCH 07/40] finish work --- .../extract/extract_docid_word_positions.rs | 41 ++++++---- .../extract/extract_vector_points.rs | 58 ++++++++------ .../extract/extract_word_docids.rs | 22 ++--- .../extract_word_pair_proximity_docids.rs | 23 +++++- .../src/update/index_documents/extract/mod.rs | 80 ++++++++++--------- milli/src/update/index_documents/mod.rs | 16 +--- milli/src/update/index_documents/transform.rs | 19 +++-- milli/src/update/settings.rs | 39 ++++++--- 8 files changed, 171 insertions(+), 127 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index b1a6bb5a6..6cf7b3167 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -34,6 +34,7 @@ pub fn extract_docid_word_positions( let max_positions_per_attributes = max_positions_per_attributes .map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE)); let max_memory = indexer.max_memory_by_thread(); + let force_reindexing = settings_diff.reindex_searchable(); // initialize destination values. let mut documents_ids = RoaringBitmap::new(); @@ -54,12 +55,15 @@ pub fn extract_docid_word_positions( let mut value_buffer = Vec::new(); // initialize tokenizer. 
- // TODO: Fix ugly allocation + /// TODO: Fix ugly allocation let old_stop_words = settings_diff.old.stop_words.as_ref(); - let old_separators: Option> = - settings_diff.old.allowed_separators.map(|s| s.iter().map(String::as_str).collect()); + let old_separators: Option> = settings_diff + .old + .allowed_separators + .as_ref() + .map(|s| s.iter().map(String::as_str).collect()); let old_dictionary: Option> = - settings_diff.old.dictionary.map(|s| s.iter().map(String::as_str).collect()); + settings_diff.old.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect()); let mut del_builder = tokenizer_builder( old_stop_words, old_separators.as_deref(), @@ -68,12 +72,15 @@ pub fn extract_docid_word_positions( ); let del_tokenizer = del_builder.build(); - // TODO: Fix ugly allocation + /// TODO: Fix ugly allocation let new_stop_words = settings_diff.new.stop_words.as_ref(); - let new_separators: Option> = - settings_diff.new.allowed_separators.map(|s| s.iter().map(String::as_str).collect()); + let new_separators: Option> = settings_diff + .new + .allowed_separators + .as_ref() + .map(|s| s.iter().map(String::as_str).collect()); let new_dictionary: Option> = - settings_diff.new.dictionary.map(|s| s.iter().map(String::as_str).collect()); + settings_diff.new.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect()); let mut add_builder = tokenizer_builder( new_stop_words, new_separators.as_deref(), @@ -92,10 +99,7 @@ pub fn extract_docid_word_positions( let obkv = KvReader::::new(value); // if the searchable fields didn't change, skip the searchable indexing for this document. - if !searchable_fields_changed( - &KvReader::::new(value), - &settings_diff.new.searchable_fields_ids, - ) { + if !force_reindexing && !searchable_fields_changed(&obkv, settings_diff) { continue; } @@ -180,8 +184,9 @@ pub fn extract_docid_word_positions( /// Check if any searchable fields of a document changed. fn searchable_fields_changed( obkv: &KvReader, - searchable_fields: &Option>, + settings_diff: &InnerIndexSettingsDiff, ) -> bool { + let searchable_fields = &settings_diff.new.searchable_fields_ids; for (field_id, field_bytes) in obkv.iter() { if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) { let del_add = KvReaderDelAdd::new(field_bytes); @@ -262,12 +267,14 @@ fn lang_safe_tokens_from_document<'a>( // then we don't rerun the extraction. if !script_language.is_empty() { // build a new temporary tokenizer including the allow list. 
- // TODO: Fix ugly allocation + /// TODO: Fix ugly allocation let stop_words = settings.stop_words.as_ref(); - let separators: Option> = - settings.allowed_separators.map(|s| s.iter().map(String::as_str).collect()); + let separators: Option> = settings + .allowed_separators + .as_ref() + .map(|s| s.iter().map(String::as_str).collect()); let dictionary: Option> = - settings.dictionary.map(|s| s.iter().map(String::as_str).collect()); + settings.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect()); let mut builder = tokenizer_builder(stop_words, separators.as_deref(), dictionary.as_deref(), None); let tokenizer = builder.build(); diff --git a/milli/src/update/index_documents/extract/extract_vector_points.rs b/milli/src/update/index_documents/extract/extract_vector_points.rs index 40b32bf9c..fc79a861f 100644 --- a/milli/src/update/index_documents/extract/extract_vector_points.rs +++ b/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -17,8 +17,9 @@ use crate::error::UserError; use crate::prompt::Prompt; use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; use crate::update::index_documents::helpers::try_split_at; +use crate::update::settings::InnerIndexSettingsDiff; use crate::vector::Embedder; -use crate::{DocumentId, FieldsIdsMap, InternalError, Result, VectorOrArrayOfVectors}; +use crate::{DocumentId, InternalError, Result, VectorOrArrayOfVectors}; /// The length of the elements that are always in the buffer when inserting new values. const TRUNCATE_SIZE: usize = size_of::(); @@ -71,12 +72,15 @@ impl VectorStateDelta { pub fn extract_vector_points( obkv_documents: grenad::Reader, indexer: GrenadParameters, - field_id_map: &FieldsIdsMap, + settings_diff: &InnerIndexSettingsDiff, prompt: &Prompt, embedder_name: &str, ) -> Result { puffin::profile_function!(); + let old_fields_ids_map = &settings_diff.old.fields_ids_map; + let new_fields_ids_map = &settings_diff.new.fields_ids_map; + // (docid, _index) -> KvWriterDelAdd -> Vector let mut manual_vectors_writer = create_writer( indexer.chunk_compression_type, @@ -98,8 +102,6 @@ pub fn extract_vector_points( tempfile::tempfile()?, ); - let vectors_fid = field_id_map.id("_vectors"); - let mut key_buffer = Vec::new(); let mut cursor = obkv_documents.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? { @@ -116,15 +118,29 @@ pub fn extract_vector_points( // lazily get it when needed let document_id = || -> Value { from_utf8(external_id_bytes).unwrap().into() }; - let vectors_field = vectors_fid - .and_then(|vectors_fid| obkv.get(vectors_fid)) - .map(KvReaderDelAdd::new) - .map(|obkv| to_vector_maps(obkv, document_id)) - .transpose()?; + // the vector field id may have changed + let old_vectors_fid = old_fields_ids_map.id("_vectors"); + // filter the old vector fid if the settings has been changed forcing reindexing. + let old_vectors_fid = old_vectors_fid.filter(|_| !settings_diff.reindex_vectors()); - let (del_map, add_map) = vectors_field.unzip(); - let del_map = del_map.flatten(); - let add_map = add_map.flatten(); + let new_vectors_fid = new_fields_ids_map.id("_vectors"); + let vectors_field = { + let del = old_vectors_fid + .and_then(|vectors_fid| obkv.get(vectors_fid)) + .map(KvReaderDelAdd::new) + .map(|obkv| to_vector_map(obkv, DelAdd::Deletion, &document_id)) + .transpose()? 
+ .flatten(); + let add = new_vectors_fid + .and_then(|vectors_fid| obkv.get(vectors_fid)) + .map(KvReaderDelAdd::new) + .map(|obkv| to_vector_map(obkv, DelAdd::Addition, &document_id)) + .transpose()? + .flatten(); + (del, add) + }; + + let (del_map, add_map) = vectors_field; let del_value = del_map.and_then(|mut map| map.remove(embedder_name)); let add_value = add_map.and_then(|mut map| map.remove(embedder_name)); @@ -155,7 +171,7 @@ pub fn extract_vector_points( VectorStateDelta::NowGenerated(prompt.render( obkv, DelAdd::Addition, - field_id_map, + &new_fields_ids_map, )?) } else { VectorStateDelta::NowRemoved @@ -182,9 +198,10 @@ pub fn extract_vector_points( if document_is_kept { // Don't give up if the old prompt was failing - let old_prompt = - prompt.render(obkv, DelAdd::Deletion, field_id_map).unwrap_or_default(); - let new_prompt = prompt.render(obkv, DelAdd::Addition, field_id_map)?; + let old_prompt = prompt + .render(obkv, DelAdd::Deletion, &old_fields_ids_map) + .unwrap_or_default(); + let new_prompt = prompt.render(obkv, DelAdd::Addition, &new_fields_ids_map)?; if old_prompt != new_prompt { tracing::trace!( "🚀 Changing prompt from\n{old_prompt}\n===to===\n{new_prompt}" @@ -220,15 +237,6 @@ pub fn extract_vector_points( }) } -fn to_vector_maps( - obkv: KvReaderDelAdd, - document_id: impl Fn() -> Value, -) -> Result<(Option>, Option>)> { - let del = to_vector_map(obkv, DelAdd::Deletion, &document_id)?; - let add = to_vector_map(obkv, DelAdd::Addition, &document_id)?; - Ok((del, add)) -} - fn to_vector_map( obkv: KvReaderDelAdd, side: DelAdd, diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs index 2b1f02326..2be41bb86 100644 --- a/milli/src/update/index_documents/extract/extract_word_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_docids.rs @@ -121,16 +121,16 @@ pub fn extract_word_docids( let (w, fid) = StrBEU16Codec::bytes_decode(key) .map_err(|_| SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; - if let Some(word) = word { - if word.as_str() != w { - docids_into_writers(&word, &deletions, &additions, &mut word_docids_writer); + if let Some(current) = word.as_ref() { + if current != w { + docids_into_writers(¤t, &deletions, &additions, &mut word_docids_writer)?; docids_into_writers( - &word, + ¤t, &exact_deletions, &exact_additions, &mut exact_word_docids_writer, - ); - let word = Some(w.to_string()); + )?; + word = Some(w.to_string()); // clear buffers deletions.clear(); additions.clear(); @@ -138,7 +138,7 @@ pub fn extract_word_docids( exact_additions.clear(); } } else { - let word = Some(w.to_string()); + word = Some(w.to_string()); } // merge all deletions @@ -169,13 +169,13 @@ pub fn extract_word_docids( } if let Some(word) = word { - docids_into_writers(&word, &deletions, &additions, &mut word_docids_writer); + docids_into_writers(&word, &deletions, &additions, &mut word_docids_writer)?; docids_into_writers( &word, &exact_deletions, &exact_additions, &mut exact_word_docids_writer, - ); + )?; } Ok(( @@ -253,7 +253,7 @@ where CboRoaringBitmapCodec::bytes_encode(deletions).map_err(|_| { SerializationError::Encoding { db_name: Some(DOCID_WORD_POSITIONS) } })?, - ); + )?; } // additions: if !additions.is_empty() { @@ -262,7 +262,7 @@ where CboRoaringBitmapCodec::bytes_encode(additions).map_err(|_| { SerializationError::Encoding { db_name: Some(DOCID_WORD_POSITIONS) } })?, - ); + )?; } // insert everything in the same writer. 
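As a side note on the values being funneled into that single writer: a rough sketch of one merged word entry, with made-up variable names, where only `KvWriterDelAdd`, `DelAdd`, `CboRoaringBitmapCodec` and the error mapping mirror the surrounding code:

    // One entry per word: a tiny obkv holding an optional Deletion side and an
    // optional Addition side, each encoded as a CboRoaringBitmap of document ids.
    let mut value_buffer = Vec::new();
    let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
    if !del_docids.is_empty() {
        let bytes = CboRoaringBitmapCodec::bytes_encode(&del_docids)
            .map_err(|_| SerializationError::Encoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
        value_writer.insert(DelAdd::Deletion, bytes)?;
    }
    if !add_docids.is_empty() {
        let bytes = CboRoaringBitmapCodec::bytes_encode(&add_docids)
            .map_err(|_| SerializationError::Encoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
        value_writer.insert(DelAdd::Addition, bytes)?;
    }
    word_docids_writer.insert(word.as_bytes(), value_writer.into_inner()?)?;
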
diff --git a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs index d86d09bc8..e185566ca 100644 --- a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs @@ -11,7 +11,7 @@ use super::helpers::{ }; use crate::error::SerializationError; use crate::index::db_name::DOCID_WORD_POSITIONS; -use crate::proximity::{index_proximity, MAX_DISTANCE}; +use crate::proximity::{index_proximity, ProximityPrecision, MAX_DISTANCE}; use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; use crate::update::settings::InnerIndexSettingsDiff; use crate::{DocumentId, Result}; @@ -24,9 +24,20 @@ use crate::{DocumentId, Result}; pub fn extract_word_pair_proximity_docids( docid_word_positions: grenad::Reader, indexer: GrenadParameters, - _settings_diff: &InnerIndexSettingsDiff, + settings_diff: &InnerIndexSettingsDiff, ) -> Result>> { puffin::profile_function!(); + let any_deletion = settings_diff.old.proximity_precision == ProximityPrecision::ByWord; + let any_addition = settings_diff.new.proximity_precision == ProximityPrecision::ByWord; + + // early return if the data shouldn't be deleted nor created. + if !any_deletion && !any_addition { + return tempfile::tempfile() + .map_err(Into::into) + .map(BufReader::new) + .and_then(grenad::Reader::new) + .map_err(Into::into); + } let max_memory = indexer.max_memory_by_thread(); @@ -79,6 +90,10 @@ pub fn extract_word_pair_proximity_docids( let (del, add): (Result<_>, Result<_>) = rayon::join( || { + if !any_deletion { + return Ok(()); + } + // deletions if let Some(deletion) = KvReaderDelAdd::new(value).get(DelAdd::Deletion) { for (position, word) in KvReaderU16::new(deletion).iter() { @@ -108,6 +123,10 @@ pub fn extract_word_pair_proximity_docids( Ok(()) }, || { + if !any_addition { + return Ok(()); + } + // additions if let Some(addition) = KvReaderDelAdd::new(value).get(DelAdd::Addition) { for (position, word) in KvReaderU16::new(addition).iter() { diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index a6b73efde..924561dea 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -9,7 +9,6 @@ mod extract_word_docids; mod extract_word_pair_proximity_docids; mod extract_word_position_docids; -use std::collections::HashSet; use std::fs::File; use std::io::BufReader; @@ -30,7 +29,6 @@ use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids use self::extract_word_position_docids::extract_word_position_docids; use super::helpers::{as_cloneable_grenad, CursorClonableMmap, GrenadParameters}; use super::{helpers, TypedChunk}; -use crate::proximity::ProximityPrecision; use crate::update::settings::InnerIndexSettingsDiff; use crate::{FieldId, Result}; @@ -200,12 +198,14 @@ fn run_extraction_task( M: Send, { let current_span = tracing::Span::current(); + /// TODO: remove clone + let settings_diff = settings_diff.clone(); rayon::spawn(move || { let child_span = tracing::trace_span!(target: "indexing::extract::details", parent: ¤t_span, "extract_multiple_chunks"); let _entered = child_span.enter(); puffin::profile_scope!("extract_multiple_chunks", name); - match extract_fn(chunk, indexer, settings_diff) { + match extract_fn(chunk, indexer, &settings_diff) { Ok(chunk) => { let _ = 
lmdb_writer_sx.send(Ok(serialize_fn(chunk))); } @@ -235,50 +235,54 @@ fn send_original_documents_data( .thread_name(|index| format!("embedding-request-{index}")) .build()?; - rayon::spawn(move || { - for (name, (embedder, prompt)) in embedders { - let result = extract_vector_points( - documents_chunk_cloned.clone(), - indexer, - &field_id_map, - &prompt, - &name, - ); - match result { - Ok(ExtractedVectorPoints { manual_vectors, remove_vectors, prompts }) => { - let embeddings = match extract_embeddings( + if settings_diff.reindex_vectors() || !settings_diff.settings_update_only() { + /// TODO: remove clone + let settings_diff = settings_diff.clone(); + rayon::spawn(move || { + for (name, (embedder, prompt)) in settings_diff.new.embedding_configs.clone() { + let result = extract_vector_points( + documents_chunk_cloned.clone(), + indexer, + &settings_diff, + &prompt, + &name, + ); + match result { + Ok(ExtractedVectorPoints { manual_vectors, remove_vectors, prompts }) => { + let embeddings = match extract_embeddings( prompts, indexer, embedder.clone(), &request_threads, ) { - Ok(results) => Some(results), - Err(error) => { - let _ = lmdb_writer_sx_cloned.send(Err(error)); - None - } - }; + Ok(results) => Some(results), + Err(error) => { + let _ = lmdb_writer_sx_cloned.send(Err(error)); + None + } + }; - if !(remove_vectors.is_empty() - && manual_vectors.is_empty() - && embeddings.as_ref().map_or(true, |e| e.is_empty())) - { - let _ = lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints { - remove_vectors, - embeddings, - expected_dimension: embedder.dimensions(), - manual_vectors, - embedder_name: name, - })); + if !(remove_vectors.is_empty() + && manual_vectors.is_empty() + && embeddings.as_ref().map_or(true, |e| e.is_empty())) + { + let _ = lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints { + remove_vectors, + embeddings, + expected_dimension: embedder.dimensions(), + manual_vectors, + embedder_name: name, + })); + } + } + + Err(error) => { + let _ = lmdb_writer_sx_cloned.send(Err(error)); } } - - Err(error) => { - let _ = lmdb_writer_sx_cloned.send(Err(error)); - } } - } - }); + }); + } // TODO: create a custom internal error let _ = lmdb_writer_sx.send(Ok(TypedChunk::Documents(original_documents_chunk))); diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 6bc5b6ff9..c3b081c37 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -6,7 +6,6 @@ mod typed_chunk; use std::collections::{HashMap, HashSet}; use std::io::{Read, Seek}; -use std::iter::FromIterator; use std::num::NonZeroU32; use std::result::Result as StdResult; @@ -281,7 +280,7 @@ where let TransformOutput { primary_key, - settings_diff, + mut settings_diff, field_distribution, documents_count, original_documents, @@ -319,13 +318,8 @@ where ) = crossbeam_channel::unbounded(); // get the primary key field id - let primary_key_id = output.settings_diff.new.fields_ids_map.id(&primary_key).unwrap(); + let primary_key_id = settings_diff.new.fields_ids_map.id(&primary_key).unwrap(); - // get searchable fields for word databases - let searchable_fields = - self.index.searchable_fields_ids(self.wtxn)?.map(HashSet::from_iter); - // get filterable fields for facet databases - let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?; // get the fid of the `_geo.lat` and `_geo.lng` fields. 
let mut field_id_map = self.index.fields_ids_map(self.wtxn)?; @@ -348,12 +342,6 @@ where None => None, }; - let stop_words = self.index.stop_words(self.wtxn)?; - let separators = self.index.allowed_separators(self.wtxn)?; - let dictionary = self.index.dictionary(self.wtxn)?; - let exact_attributes = self.index.exact_attributes_ids(self.wtxn)?; - let proximity_precision = self.index.proximity_precision(self.wtxn)?.unwrap_or_default(); - let pool_params = GrenadParameters { chunk_compression_type: self.indexer_config.chunk_compression_type, chunk_compression_level: self.indexer_config.chunk_compression_level, diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 003353793..e82600683 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -1,12 +1,11 @@ use std::borrow::Cow; use std::collections::btree_map::Entry as BEntry; use std::collections::hash_map::Entry as HEntry; -use std::collections::{HashMap, HashSet}; +use std::collections::HashMap; use std::fs::File; use std::io::{Read, Seek}; use fxhash::FxHashMap; -use heed::RoTxn; use itertools::Itertools; use obkv::{KvReader, KvReaderU16, KvWriter}; use roaring::RoaringBitmap; @@ -814,7 +813,8 @@ impl<'a, 'i> Transform<'a, 'i> { let settings_diff = InnerIndexSettingsDiff { old: old_inner_settings, new: new_inner_settings, - embedding_configs_updated: true, + embedding_configs_updated: false, + settings_update_only: false, }; Ok(TransformOutput { @@ -844,13 +844,16 @@ impl<'a, 'i> Transform<'a, 'i> { obkv_writer.insert(id, val)?; } } - let new_obkv = KvReader::::new(&obkv_writer.into_inner()?); + let data = obkv_writer.into_inner()?; + let new_obkv = KvReader::::new(&data); // take the non-flattened version if flatten_from_fields_ids_map returns None. - let old_flattened = Self::flatten_from_fields_ids_map(&old_obkv, &mut old_fields_ids_map)? - .map_or_else(|| old_obkv, |bytes| KvReader::::new(&bytes)); - let new_flattened = Self::flatten_from_fields_ids_map(&new_obkv, &mut new_fields_ids_map)? - .map_or_else(|| new_obkv, |bytes| KvReader::::new(&bytes)); + let old_flattened = Self::flatten_from_fields_ids_map(&old_obkv, &mut old_fields_ids_map)?; + let old_flattened = + old_flattened.as_deref().map_or_else(|| old_obkv, KvReader::::new); + let new_flattened = Self::flatten_from_fields_ids_map(&new_obkv, &mut new_fields_ids_map)?; + let new_flattened = + new_flattened.as_deref().map_or_else(|| new_obkv, KvReader::::new); original_obkv_buffer.clear(); flattened_obkv_buffer.clear(); diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index ae4304fce..6c770c0a1 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -1010,6 +1010,13 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { } Setting::NotSet => false, }; + + // if any changes force a reindexing + // clear the vector database. 
+ if update { + self.index.vector_arroy.clear(self.wtxn)?; + } + Ok(update) } @@ -1077,6 +1084,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { old: old_inner_settings, new: new_inner_settings, embedding_configs_updated, + settings_update_only: true, }; if inner_settings_diff.any_reindexing_needed() { @@ -1087,20 +1095,23 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { } } -pub(crate) struct InnerIndexSettingsDiff { - pub old: InnerIndexSettings, - pub new: InnerIndexSettings, +#[derive(Clone)] +pub struct InnerIndexSettingsDiff { + pub(crate) old: InnerIndexSettings, + pub(crate) new: InnerIndexSettings, // TODO: compare directly the embedders. - pub embedding_configs_updated: bool, + pub(crate) embedding_configs_updated: bool, + + pub(crate) settings_update_only: bool, } impl InnerIndexSettingsDiff { - fn any_reindexing_needed(&self) -> bool { + pub fn any_reindexing_needed(&self) -> bool { self.reindex_searchable() || self.reindex_facets() || self.reindex_vectors() } - fn reindex_searchable(&self) -> bool { + pub fn reindex_searchable(&self) -> bool { self.old .fields_ids_map .iter() @@ -1115,13 +1126,13 @@ impl InnerIndexSettingsDiff { || self.old.proximity_precision != self.new.proximity_precision } - fn reindex_facets(&self) -> bool { - let existing_fields = self.new.existing_fields; + pub fn reindex_facets(&self) -> bool { + let existing_fields = &self.new.existing_fields; if existing_fields.iter().any(|field| field.contains('.')) { return true; } - let old_faceted_fields = self.old.user_defined_faceted_fields; + let old_faceted_fields = &self.old.user_defined_faceted_fields; if old_faceted_fields.iter().any(|field| field.contains('.')) { return true; } @@ -1129,13 +1140,13 @@ impl InnerIndexSettingsDiff { // If there is new faceted fields we indicate that we must reindex as we must // index new fields as facets. It means that the distinct attribute, // an Asc/Desc criterion or a filtered attribute as be added or removed. 
- let new_faceted_fields = self.new.user_defined_faceted_fields; + let new_faceted_fields = &self.new.user_defined_faceted_fields; if new_faceted_fields.iter().any(|field| field.contains('.')) { return true; } let faceted_updated = - (&existing_fields - &old_faceted_fields) != (&existing_fields - &new_faceted_fields); + (existing_fields - old_faceted_fields) != (existing_fields - new_faceted_fields); self.old .fields_ids_map @@ -1145,9 +1156,13 @@ impl InnerIndexSettingsDiff { || faceted_updated } - fn reindex_vectors(&self) -> bool { + pub fn reindex_vectors(&self) -> bool { self.embedding_configs_updated } + + pub fn settings_update_only(&self) -> bool { + self.settings_update_only + } } #[derive(Clone)] From a489b406b4b3ff60d3bd97edef3702e91e2767a6 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 3 Apr 2024 19:10:19 +0200 Subject: [PATCH 08/40] fix test --- meilisearch/tests/settings/get_settings.rs | 4 +++- .../extract/extract_docid_word_positions.rs | 3 --- .../src/update/index_documents/extract/mod.rs | 24 +++++++++---------- milli/src/update/index_documents/mod.rs | 2 +- milli/src/update/settings.rs | 1 - 5 files changed, 15 insertions(+), 19 deletions(-) diff --git a/meilisearch/tests/settings/get_settings.rs b/meilisearch/tests/settings/get_settings.rs index 980ef3064..042dcca41 100644 --- a/meilisearch/tests/settings/get_settings.rs +++ b/meilisearch/tests/settings/get_settings.rs @@ -113,7 +113,8 @@ async fn secrets_are_hidden_in_settings() { "default": { "source": "rest", "url": "https://localhost:7777", - "apiKey": "My super secret value you will never guess" + "apiKey": "My super secret value you will never guess", + "dimensions": 4, } } })) @@ -184,6 +185,7 @@ async fn secrets_are_hidden_in_settings() { "default": { "source": "rest", "apiKey": "My suXXXXXX...", + "dimensions": 4, "documentTemplate": "{% for field in fields %} {{ field.name }}: {{ field.value }}\n{% endfor %}", "url": "https://localhost:7777", "query": null, diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index 6cf7b3167..6af5bba6d 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -55,7 +55,6 @@ pub fn extract_docid_word_positions( let mut value_buffer = Vec::new(); // initialize tokenizer. - /// TODO: Fix ugly allocation let old_stop_words = settings_diff.old.stop_words.as_ref(); let old_separators: Option> = settings_diff .old @@ -72,7 +71,6 @@ pub fn extract_docid_word_positions( ); let del_tokenizer = del_builder.build(); - /// TODO: Fix ugly allocation let new_stop_words = settings_diff.new.stop_words.as_ref(); let new_separators: Option> = settings_diff .new @@ -267,7 +265,6 @@ fn lang_safe_tokens_from_document<'a>( // then we don't rerun the extraction. if !script_language.is_empty() { // build a new temporary tokenizer including the allow list. 
- /// TODO: Fix ugly allocation let stop_words = settings.stop_words.as_ref(); let separators: Option> = settings .allowed_separators diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 924561dea..341cdc9f9 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -198,7 +198,6 @@ fn run_extraction_task( M: Send, { let current_span = tracing::Span::current(); - /// TODO: remove clone let settings_diff = settings_diff.clone(); rayon::spawn(move || { @@ -236,7 +235,6 @@ fn send_original_documents_data( .build()?; if settings_diff.reindex_vectors() || !settings_diff.settings_update_only() { - /// TODO: remove clone let settings_diff = settings_diff.clone(); rayon::spawn(move || { for (name, (embedder, prompt)) in settings_diff.new.embedding_configs.clone() { @@ -250,17 +248,17 @@ fn send_original_documents_data( match result { Ok(ExtractedVectorPoints { manual_vectors, remove_vectors, prompts }) => { let embeddings = match extract_embeddings( - prompts, - indexer, - embedder.clone(), - &request_threads, - ) { - Ok(results) => Some(results), - Err(error) => { - let _ = lmdb_writer_sx_cloned.send(Err(error)); - None - } - }; + prompts, + indexer, + embedder.clone(), + &request_threads, + ) { + Ok(results) => Some(results), + Err(error) => { + let _ = lmdb_writer_sx_cloned.send(Err(error)); + None + } + }; if !(remove_vectors.is_empty() && manual_vectors.is_empty() diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index c3b081c37..47f1e9f19 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -252,7 +252,7 @@ where let number_of_documents = self.index.number_of_documents(self.wtxn)?; return Ok(DocumentAdditionResult { indexed_documents: 0, number_of_documents }); } - let mut output = self + let output = self .transform .take() .expect("Invalid document addition state") diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 6c770c0a1..ae9ae2801 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -400,7 +400,6 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { { puffin::profile_function!(); - let fields_ids_map = self.index.fields_ids_map(self.wtxn)?; // if the settings are set before any document update, we don't need to do anything, and // will set the primary key during the first document addition. if self.index.number_of_documents(self.wtxn)? 
== 0 { From bad46f88d606dc83de287291fa593e7b1ad6a035 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 4 Apr 2024 11:01:50 +0200 Subject: [PATCH 09/40] Fix embedder test --- index-scheduler/src/lib.rs | 1 + .../test_settings_update/after_registering_settings_task.snap | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index 5901e45f8..5704f5354 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -3041,6 +3041,7 @@ mod tests { source: Setting::Set(milli::vector::settings::EmbedderSource::Rest), api_key: Setting::Set(S("My super secret")), url: Setting::Set(S("http://localhost:7777")), + dimensions: Setting::Set(4), ..Default::default() }; embedders.insert(S("default"), Setting::Set(embedding_settings)); diff --git a/index-scheduler/src/snapshots/lib.rs/test_settings_update/after_registering_settings_task.snap b/index-scheduler/src/snapshots/lib.rs/test_settings_update/after_registering_settings_task.snap index 8c081b84b..205200965 100644 --- a/index-scheduler/src/snapshots/lib.rs/test_settings_update/after_registering_settings_task.snap +++ b/index-scheduler/src/snapshots/lib.rs/test_settings_update/after_registering_settings_task.snap @@ -6,7 +6,7 @@ source: index-scheduler/src/lib.rs [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: NotSet, searchable_attributes: NotSet, filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: NotSet, document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: NotSet, searchable_attributes: NotSet, filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: NotSet, document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: NotSet, searchable_attributes: NotSet, filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: 
NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: NotSet, searchable_attributes: NotSet, filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: enqueued [0,] From e5ae337aae71354d338aa8475bc6a819a2f1af9c Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 4 Apr 2024 17:03:18 +0200 Subject: [PATCH 10/40] Comeback to sorters in extract_word_docids using buffers and merge the keys manually is less efficient --- .../extract/extract_word_docids.rs | 79 ++++++------------- 1 file changed, 26 insertions(+), 53 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs index 2be41bb86..5699f2fb6 100644 --- a/milli/src/update/index_documents/extract/extract_word_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_docids.rs @@ -14,6 +14,7 @@ use crate::error::SerializationError; use crate::heed_codec::StrBEU16Codec; use crate::index::db_name::DOCID_WORD_POSITIONS; use crate::update::del_add::{is_noop_del_add_obkv, DelAdd, KvReaderDelAdd, KvWriterDelAdd}; +use crate::update::index_documents::helpers::sorter_into_reader; use crate::update::settings::InnerIndexSettingsDiff; use crate::update::MergeFn; use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result}; @@ -45,7 +46,7 @@ pub fn extract_word_docids( indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, - max_memory, + max_memory.map(|m| m / 3), ); let mut key_buffer = Vec::new(); let mut del_words = BTreeSet::new(); @@ -93,25 +94,27 @@ pub fn extract_word_docids( tempfile::tempfile()?, ); - let mut word_docids_writer = create_writer( + let mut word_docids_sorter = create_sorter( + grenad::SortAlgorithm::Unstable, + merge_deladd_cbo_roaring_bitmaps, indexer.chunk_compression_type, indexer.chunk_compression_level, - tempfile::tempfile()?, + indexer.max_nb_chunks, + max_memory.map(|m| m / 3), ); - let mut exact_word_docids_writer = create_writer( + let mut exact_word_docids_sorter = create_sorter( + grenad::SortAlgorithm::Unstable, + merge_deladd_cbo_roaring_bitmaps, indexer.chunk_compression_type, indexer.chunk_compression_level, - tempfile::tempfile()?, + indexer.max_nb_chunks, + max_memory.map(|m| m / 3), ); - let mut word: Option = None; - let mut deletions = RoaringBitmap::new(); - let 
mut additions = RoaringBitmap::new(); - let mut exact_deletions = RoaringBitmap::new(); - let mut exact_additions = RoaringBitmap::new(); let mut iter = word_fid_docids_sorter.into_stream_merger_iter()?; - // TODO: replace sorters by writers by accumulating values into a buffer before inserting them. + let mut buffer = Vec::new(); + // NOTE: replacing sorters by bitmap merging is less efficient, so, use sorters. while let Some((key, value)) = iter.next()? { // only keep the value if their is a change to apply in the DB. if !is_noop_del_add_obkv(KvReaderDelAdd::new(value)) { @@ -121,66 +124,36 @@ pub fn extract_word_docids( let (w, fid) = StrBEU16Codec::bytes_decode(key) .map_err(|_| SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; - if let Some(current) = word.as_ref() { - if current != w { - docids_into_writers(¤t, &deletions, &additions, &mut word_docids_writer)?; - docids_into_writers( - ¤t, - &exact_deletions, - &exact_additions, - &mut exact_word_docids_writer, - )?; - word = Some(w.to_string()); - // clear buffers - deletions.clear(); - additions.clear(); - exact_deletions.clear(); - exact_additions.clear(); - } - } else { - word = Some(w.to_string()); - } - // merge all deletions let obkv = KvReaderDelAdd::new(value); if let Some(value) = obkv.get(DelAdd::Deletion) { let delete_from_exact = settings_diff.old.exact_attributes.contains(&fid); - let docids = CboRoaringBitmapCodec::bytes_decode(value).map_err(|_| { - SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) } - })?; + buffer.clear(); + let mut obkv = KvWriterDelAdd::new(&mut buffer); + obkv.insert(DelAdd::Deletion, value)?; if delete_from_exact { - exact_deletions |= docids; + exact_word_docids_sorter.insert(w, obkv.into_inner().unwrap())?; } else { - deletions |= docids + word_docids_sorter.insert(w, obkv.into_inner().unwrap())?; } } // merge all additions if let Some(value) = obkv.get(DelAdd::Addition) { let add_in_exact = settings_diff.new.exact_attributes.contains(&fid); - let docids = CboRoaringBitmapCodec::bytes_decode(value).map_err(|_| { - SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) } - })?; + buffer.clear(); + let mut obkv = KvWriterDelAdd::new(&mut buffer); + obkv.insert(DelAdd::Addition, value)?; if add_in_exact { - exact_additions |= docids; + exact_word_docids_sorter.insert(w, obkv.into_inner().unwrap())?; } else { - additions |= docids + word_docids_sorter.insert(w, obkv.into_inner().unwrap())?; } } } - if let Some(word) = word { - docids_into_writers(&word, &deletions, &additions, &mut word_docids_writer)?; - docids_into_writers( - &word, - &exact_deletions, - &exact_additions, - &mut exact_word_docids_writer, - )?; - } - Ok(( - writer_into_reader(word_docids_writer)?, - writer_into_reader(exact_word_docids_writer)?, + sorter_into_reader(word_docids_sorter, indexer)?, + sorter_into_reader(exact_word_docids_sorter, indexer)?, writer_into_reader(word_fid_docids_writer)?, )) } From 5ab901dd30da4cfd4d474dfd0e6793eea5ec4c9c Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 4 Apr 2024 17:23:36 +0200 Subject: [PATCH 11/40] Fix tests --- .../src/snapshots/index_scheduler__tests__settings_update.snap | 1 + 1 file changed, 1 insertion(+) diff --git a/index-scheduler/src/snapshots/index_scheduler__tests__settings_update.snap b/index-scheduler/src/snapshots/index_scheduler__tests__settings_update.snap index 85f0926b9..72a25f915 100644 --- a/index-scheduler/src/snapshots/index_scheduler__tests__settings_update.snap +++ 
b/index-scheduler/src/snapshots/index_scheduler__tests__settings_update.snap
@@ -7,6 +7,7 @@ expression: task.details
     "default": {
       "source": "rest",
       "apiKey": "MyXXXX...",
+      "dimensions": 4,
       "url": "http://localhost:7777"
     }
   }

From eaf113ef34c84e213dfcc96cdc2bc08837a45390 Mon Sep 17 00:00:00 2001
From: ManyTheFish
Date: Mon, 15 Apr 2024 11:28:30 +0200
Subject: [PATCH 12/40] Fix word pair proximity error when nothing has to be extracted

---
 .../extract/extract_word_pair_proximity_docids.rs | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs
index e185566ca..23f70ccd2 100644
--- a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs
@@ -32,11 +32,12 @@ pub fn extract_word_pair_proximity_docids(
 
     // early return if the data shouldn't be deleted nor created.
     if !any_deletion && !any_addition {
-        return tempfile::tempfile()
-            .map_err(Into::into)
-            .map(BufReader::new)
-            .and_then(grenad::Reader::new)
-            .map_err(Into::into);
+        let writer = create_writer(
+            indexer.chunk_compression_type,
+            indexer.chunk_compression_level,
+            tempfile::tempfile()?,
+        );
+        return writer_into_reader(writer);
     }
 
     let max_memory = indexer.max_memory_by_thread();

From 87a93ba47db3f1dc3b916bd686488306a4b489e5 Mon Sep 17 00:00:00 2001
From: ManyTheFish
Date: Mon, 15 Apr 2024 13:30:41 +0200
Subject: [PATCH 13/40] fix clippy

---
 .../update/index_documents/extract/extract_vector_points.rs | 6 +++---
 milli/src/update/index_documents/transform.rs | 2 +-
 milli/src/update/settings.rs | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/milli/src/update/index_documents/extract/extract_vector_points.rs b/milli/src/update/index_documents/extract/extract_vector_points.rs
index fc79a861f..23f945c7a 100644
--- a/milli/src/update/index_documents/extract/extract_vector_points.rs
+++ b/milli/src/update/index_documents/extract/extract_vector_points.rs
@@ -171,7 +171,7 @@ pub fn extract_vector_points(
                         VectorStateDelta::NowGenerated(prompt.render(
                             obkv,
                             DelAdd::Addition,
-                            &new_fields_ids_map,
+                            new_fields_ids_map,
                         )?)
} else { VectorStateDelta::NowRemoved @@ -199,9 +199,9 @@ pub fn extract_vector_points( if document_is_kept { // Don't give up if the old prompt was failing let old_prompt = prompt - .render(obkv, DelAdd::Deletion, &old_fields_ids_map) + .render(obkv, DelAdd::Deletion, old_fields_ids_map) .unwrap_or_default(); - let new_prompt = prompt.render(obkv, DelAdd::Addition, &new_fields_ids_map)?; + let new_prompt = prompt.render(obkv, DelAdd::Addition, new_fields_ids_map)?; if old_prompt != new_prompt { tracing::trace!( "🚀 Changing prompt from\n{old_prompt}\n===to===\n{new_prompt}" diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index e82600683..90c3dbcc0 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -807,7 +807,7 @@ impl<'a, 'i> Transform<'a, 'i> { fst_new_external_documents_ids_builder.insert(key, value) })?; - let old_inner_settings = InnerIndexSettings::from_index(&self.index, wtxn)?; + let old_inner_settings = InnerIndexSettings::from_index(self.index, wtxn)?; let mut new_inner_settings = old_inner_settings.clone(); new_inner_settings.fields_ids_map = self.fields_ids_map; let settings_diff = InnerIndexSettingsDiff { diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index ae9ae2801..8ded6f03c 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -1044,7 +1044,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { { self.index.set_updated_at(self.wtxn, &OffsetDateTime::now_utc())?; - let old_inner_settings = InnerIndexSettings::from_index(&self.index, &self.wtxn)?; + let old_inner_settings = InnerIndexSettings::from_index(self.index, self.wtxn)?; // never trigger re-indexing self.update_displayed()?; @@ -1078,7 +1078,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { // 3. 
Keep the old vectors but reattempt indexing on a prompt change: only actually changed prompt will need embedding + storage let embedding_configs_updated = self.update_embedding_configs()?; - let new_inner_settings = InnerIndexSettings::from_index(&self.index, &self.wtxn)?; + let new_inner_settings = InnerIndexSettings::from_index(self.index, self.wtxn)?; let inner_settings_diff = InnerIndexSettingsDiff { old: old_inner_settings, new: new_inner_settings, From a1ea224da97753e32edfe10eb4cffa203a3180f3 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 16 Apr 2024 14:51:21 +0200 Subject: [PATCH 14/40] Fix tests --- .../snapshots/index_scheduler__tests__settings_update-2.snap | 1 + .../snapshots/index_scheduler__tests__settings_update-3.snap | 2 +- .../test_settings_update/after_registering_settings_task.snap | 2 +- .../lib.rs/test_settings_update/settings_update_processed.snap | 2 +- meilisearch/tests/settings/get_settings.rs | 1 + 5 files changed, 5 insertions(+), 3 deletions(-) diff --git a/index-scheduler/src/snapshots/index_scheduler__tests__settings_update-2.snap b/index-scheduler/src/snapshots/index_scheduler__tests__settings_update-2.snap index 85f0926b9..72a25f915 100644 --- a/index-scheduler/src/snapshots/index_scheduler__tests__settings_update-2.snap +++ b/index-scheduler/src/snapshots/index_scheduler__tests__settings_update-2.snap @@ -7,6 +7,7 @@ expression: task.details "default": { "source": "rest", "apiKey": "MyXXXX...", + "dimensions": 4, "url": "http://localhost:7777" } } diff --git a/index-scheduler/src/snapshots/index_scheduler__tests__settings_update-3.snap b/index-scheduler/src/snapshots/index_scheduler__tests__settings_update-3.snap index 50a42d678..f7ae1c00a 100644 --- a/index-scheduler/src/snapshots/index_scheduler__tests__settings_update-3.snap +++ b/index-scheduler/src/snapshots/index_scheduler__tests__settings_update-3.snap @@ -6,7 +6,7 @@ expression: embedding_config.embedder_options "Rest": { "api_key": "My super secret", "distribution": null, - "dimensions": null, + "dimensions": 4, "url": "http://localhost:7777", "query": null, "input_field": [ diff --git a/index-scheduler/src/snapshots/lib.rs/test_settings_update/after_registering_settings_task.snap b/index-scheduler/src/snapshots/lib.rs/test_settings_update/after_registering_settings_task.snap index 205200965..f3b94fb3c 100644 --- a/index-scheduler/src/snapshots/lib.rs/test_settings_update/after_registering_settings_task.snap +++ b/index-scheduler/src/snapshots/lib.rs/test_settings_update/after_registering_settings_task.snap @@ -6,7 +6,7 @@ source: index-scheduler/src/lib.rs [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: NotSet, searchable_attributes: NotSet, filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: 
"doggos", new_settings: Settings { displayed_attributes: NotSet, searchable_attributes: NotSet, filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: NotSet, searchable_attributes: NotSet, filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: NotSet, searchable_attributes: NotSet, filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: enqueued [0,] diff --git a/index-scheduler/src/snapshots/lib.rs/test_settings_update/settings_update_processed.snap b/index-scheduler/src/snapshots/lib.rs/test_settings_update/settings_update_processed.snap index f6fb6a186..830331f61 100644 --- a/index-scheduler/src/snapshots/lib.rs/test_settings_update/settings_update_processed.snap +++ b/index-scheduler/src/snapshots/lib.rs/test_settings_update/settings_update_processed.snap @@ -6,7 +6,7 @@ source: index-scheduler/src/lib.rs [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: NotSet, searchable_attributes: NotSet, filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: 
NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: NotSet, document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: NotSet, searchable_attributes: NotSet, filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: NotSet, document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: NotSet, searchable_attributes: NotSet, filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: NotSet, searchable_attributes: NotSet, filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: enqueued [] diff --git a/meilisearch/tests/settings/get_settings.rs b/meilisearch/tests/settings/get_settings.rs index 042dcca41..cd31d4959 100644 --- a/meilisearch/tests/settings/get_settings.rs +++ b/meilisearch/tests/settings/get_settings.rs @@ -213,6 +213,7 @@ async fn 
secrets_are_hidden_in_settings() { "default": { "source": "rest", "apiKey": "My suXXXXXX...", + "dimensions": 4, "url": "https://localhost:7777" } } From 19137be0ea05c36bb5e9c5c88aec3b1a2af8a834 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 16 Apr 2024 18:09:49 +0200 Subject: [PATCH 15/40] increase the default search time budget from 150ms to 1.5s --- milli/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 22816787b..cd44f2f2e 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -128,7 +128,7 @@ impl fmt::Debug for TimeBudget { impl Default for TimeBudget { fn default() -> Self { - Self::new(std::time::Duration::from_millis(150)) + Self::new(std::time::Duration::from_millis(1500)) } } From 70ce0095ea3853d72d3817f4ef4cf8e2d61902cc Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 16 Apr 2024 18:48:03 +0200 Subject: [PATCH 16/40] remove the Version Seen analytic --- meilisearch/src/routes/mod.rs | 4 ---- 1 file changed, 4 deletions(-) diff --git a/meilisearch/src/routes/mod.rs b/meilisearch/src/routes/mod.rs index 7cf886017..f2356fb7f 100644 --- a/meilisearch/src/routes/mod.rs +++ b/meilisearch/src/routes/mod.rs @@ -355,11 +355,7 @@ struct VersionResponse { async fn get_version( _index_scheduler: GuardedData, Data>, - req: HttpRequest, - analytics: web::Data, ) -> HttpResponse { - analytics.publish("Version Seen".to_string(), json!(null), Some(&req)); - let build_info = build_info::BuildInfo::from_build(); HttpResponse::Ok().json(VersionResponse { From abae31aee06874b62b215ff5fbef36337d94cbcb Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 16 Apr 2024 18:48:10 +0200 Subject: [PATCH 17/40] remove the Task Seen analytic --- meilisearch/src/analytics/mock_analytics.rs | 2 - meilisearch/src/analytics/mod.rs | 4 - .../src/analytics/segment_analytics.rs | 132 ------------------ meilisearch/src/routes/tasks.rs | 8 -- 4 files changed, 146 deletions(-) diff --git a/meilisearch/src/analytics/mock_analytics.rs b/meilisearch/src/analytics/mock_analytics.rs index 8fa1916a5..aa8989c3e 100644 --- a/meilisearch/src/analytics/mock_analytics.rs +++ b/meilisearch/src/analytics/mock_analytics.rs @@ -7,7 +7,6 @@ use serde_json::Value; use super::{find_user_id, Analytics, DocumentDeletionKind, DocumentFetchKind}; use crate::routes::indexes::documents::UpdateDocumentsQuery; -use crate::routes::tasks::TasksFilterQuery; use crate::Opt; pub struct MockAnalytics { @@ -86,6 +85,5 @@ impl Analytics for MockAnalytics { } fn get_fetch_documents(&self, _documents_query: &DocumentFetchKind, _request: &HttpRequest) {} fn post_fetch_documents(&self, _documents_query: &DocumentFetchKind, _request: &HttpRequest) {} - fn get_tasks(&self, _query: &TasksFilterQuery, _request: &HttpRequest) {} fn health_seen(&self, _request: &HttpRequest) {} } diff --git a/meilisearch/src/analytics/mod.rs b/meilisearch/src/analytics/mod.rs index d21838dbb..d6e8f1449 100644 --- a/meilisearch/src/analytics/mod.rs +++ b/meilisearch/src/analytics/mod.rs @@ -14,7 +14,6 @@ use platform_dirs::AppDirs; use serde_json::Value; use crate::routes::indexes::documents::UpdateDocumentsQuery; -use crate::routes::tasks::TasksFilterQuery; // if the analytics feature is disabled // the `SegmentAnalytics` point to the mock instead of the real analytics @@ -118,9 +117,6 @@ pub trait Analytics: Sync + Send { request: &HttpRequest, ); - // this method should be called to aggregate the get tasks requests. 
- fn get_tasks(&self, query: &TasksFilterQuery, request: &HttpRequest); - // this method should be called to aggregate a add documents request fn health_seen(&self, request: &HttpRequest); } diff --git a/meilisearch/src/analytics/segment_analytics.rs b/meilisearch/src/analytics/segment_analytics.rs index c49a04576..0dc471e78 100644 --- a/meilisearch/src/analytics/segment_analytics.rs +++ b/meilisearch/src/analytics/segment_analytics.rs @@ -33,7 +33,6 @@ use crate::option::{ }; use crate::routes::indexes::documents::UpdateDocumentsQuery; use crate::routes::indexes::facet_search::FacetSearchQuery; -use crate::routes::tasks::TasksFilterQuery; use crate::routes::{create_all_stats, Stats}; use crate::search::{ FacetSearchResult, MatchingStrategy, SearchQuery, SearchQueryWithIndex, SearchResult, @@ -81,7 +80,6 @@ pub enum AnalyticsMsg { AggregateUpdateDocuments(DocumentsAggregator), AggregateGetFetchDocuments(DocumentsFetchAggregator), AggregatePostFetchDocuments(DocumentsFetchAggregator), - AggregateTasks(TasksAggregator), AggregateHealth(HealthAggregator), } @@ -152,7 +150,6 @@ impl SegmentAnalytics { update_documents_aggregator: DocumentsAggregator::default(), get_fetch_documents_aggregator: DocumentsFetchAggregator::default(), post_fetch_documents_aggregator: DocumentsFetchAggregator::default(), - get_tasks_aggregator: TasksAggregator::default(), health_aggregator: HealthAggregator::default(), }); tokio::spawn(segment.run(index_scheduler.clone(), auth_controller.clone())); @@ -232,11 +229,6 @@ impl super::Analytics for SegmentAnalytics { let _ = self.sender.try_send(AnalyticsMsg::AggregatePostFetchDocuments(aggregate)); } - fn get_tasks(&self, query: &TasksFilterQuery, request: &HttpRequest) { - let aggregate = TasksAggregator::from_query(query, request); - let _ = self.sender.try_send(AnalyticsMsg::AggregateTasks(aggregate)); - } - fn health_seen(&self, request: &HttpRequest) { let aggregate = HealthAggregator::from_query(request); let _ = self.sender.try_send(AnalyticsMsg::AggregateHealth(aggregate)); @@ -394,7 +386,6 @@ pub struct Segment { update_documents_aggregator: DocumentsAggregator, get_fetch_documents_aggregator: DocumentsFetchAggregator, post_fetch_documents_aggregator: DocumentsFetchAggregator, - get_tasks_aggregator: TasksAggregator, health_aggregator: HealthAggregator, } @@ -458,7 +449,6 @@ impl Segment { Some(AnalyticsMsg::AggregateUpdateDocuments(agreg)) => self.update_documents_aggregator.aggregate(agreg), Some(AnalyticsMsg::AggregateGetFetchDocuments(agreg)) => self.get_fetch_documents_aggregator.aggregate(agreg), Some(AnalyticsMsg::AggregatePostFetchDocuments(agreg)) => self.post_fetch_documents_aggregator.aggregate(agreg), - Some(AnalyticsMsg::AggregateTasks(agreg)) => self.get_tasks_aggregator.aggregate(agreg), Some(AnalyticsMsg::AggregateHealth(agreg)) => self.health_aggregator.aggregate(agreg), None => (), } @@ -513,7 +503,6 @@ impl Segment { update_documents_aggregator, get_fetch_documents_aggregator, post_fetch_documents_aggregator, - get_tasks_aggregator, health_aggregator, } = self; @@ -562,9 +551,6 @@ impl Segment { { let _ = self.batcher.push(post_fetch_documents).await; } - if let Some(get_tasks) = take(get_tasks_aggregator).into_event(user, "Tasks Seen") { - let _ = self.batcher.push(get_tasks).await; - } if let Some(health) = take(health_aggregator).into_event(user, "Health Seen") { let _ = self.batcher.push(health).await; } @@ -1503,124 +1489,6 @@ impl DocumentsDeletionAggregator { } } -#[derive(Default, Serialize)] -pub struct TasksAggregator { - 
#[serde(skip)] - timestamp: Option, - - // context - #[serde(rename = "user-agent")] - user_agents: HashSet, - - filtered_by_uid: bool, - filtered_by_index_uid: bool, - filtered_by_type: bool, - filtered_by_status: bool, - filtered_by_canceled_by: bool, - filtered_by_before_enqueued_at: bool, - filtered_by_after_enqueued_at: bool, - filtered_by_before_started_at: bool, - filtered_by_after_started_at: bool, - filtered_by_before_finished_at: bool, - filtered_by_after_finished_at: bool, - total_received: usize, -} - -impl TasksAggregator { - pub fn from_query(query: &TasksFilterQuery, request: &HttpRequest) -> Self { - let TasksFilterQuery { - limit: _, - from: _, - uids, - index_uids, - types, - statuses, - canceled_by, - before_enqueued_at, - after_enqueued_at, - before_started_at, - after_started_at, - before_finished_at, - after_finished_at, - } = query; - - Self { - timestamp: Some(OffsetDateTime::now_utc()), - user_agents: extract_user_agents(request).into_iter().collect(), - filtered_by_uid: uids.is_some(), - filtered_by_index_uid: index_uids.is_some(), - filtered_by_type: types.is_some(), - filtered_by_status: statuses.is_some(), - filtered_by_canceled_by: canceled_by.is_some(), - filtered_by_before_enqueued_at: before_enqueued_at.is_some(), - filtered_by_after_enqueued_at: after_enqueued_at.is_some(), - filtered_by_before_started_at: before_started_at.is_some(), - filtered_by_after_started_at: after_started_at.is_some(), - filtered_by_before_finished_at: before_finished_at.is_some(), - filtered_by_after_finished_at: after_finished_at.is_some(), - total_received: 1, - } - } - - /// Aggregate one [TasksAggregator] into another. - pub fn aggregate(&mut self, other: Self) { - let Self { - timestamp, - user_agents, - total_received, - filtered_by_uid, - filtered_by_index_uid, - filtered_by_type, - filtered_by_status, - filtered_by_canceled_by, - filtered_by_before_enqueued_at, - filtered_by_after_enqueued_at, - filtered_by_before_started_at, - filtered_by_after_started_at, - filtered_by_before_finished_at, - filtered_by_after_finished_at, - } = other; - - if self.timestamp.is_none() { - self.timestamp = timestamp; - } - - // we can't create a union because there is no `into_union` method - for user_agent in user_agents { - self.user_agents.insert(user_agent); - } - - self.filtered_by_uid |= filtered_by_uid; - self.filtered_by_index_uid |= filtered_by_index_uid; - self.filtered_by_type |= filtered_by_type; - self.filtered_by_status |= filtered_by_status; - self.filtered_by_canceled_by |= filtered_by_canceled_by; - self.filtered_by_before_enqueued_at |= filtered_by_before_enqueued_at; - self.filtered_by_after_enqueued_at |= filtered_by_after_enqueued_at; - self.filtered_by_before_started_at |= filtered_by_before_started_at; - self.filtered_by_after_started_at |= filtered_by_after_started_at; - self.filtered_by_before_finished_at |= filtered_by_before_finished_at; - self.filtered_by_after_finished_at |= filtered_by_after_finished_at; - self.filtered_by_after_finished_at |= filtered_by_after_finished_at; - - self.total_received = self.total_received.saturating_add(total_received); - } - - pub fn into_event(self, user: &User, event_name: &str) -> Option { - // if we had no timestamp it means we never encountered any events and - // thus we don't need to send this event. 
- let timestamp = self.timestamp?; - - Some(Track { - timestamp: Some(timestamp), - user: user.clone(), - event: event_name.to_string(), - properties: serde_json::to_value(self).ok()?, - ..Default::default() - }) - } -} - #[derive(Default, Serialize)] pub struct HealthAggregator { #[serde(skip)] diff --git a/meilisearch/src/routes/tasks.rs b/meilisearch/src/routes/tasks.rs index f35d97fe6..a4cf68d57 100644 --- a/meilisearch/src/routes/tasks.rs +++ b/meilisearch/src/routes/tasks.rs @@ -270,12 +270,8 @@ pub struct AllTasks { async fn get_tasks( index_scheduler: GuardedData, Data>, params: AwebQueryParameter, - req: HttpRequest, - analytics: web::Data, ) -> Result { let mut params = params.into_inner(); - analytics.get_tasks(¶ms, &req); - // We +1 just to know if there is more after this "page" or not. params.limit.0 = params.limit.0.saturating_add(1); let limit = params.limit.0; @@ -298,8 +294,6 @@ async fn get_tasks( async fn get_task( index_scheduler: GuardedData, Data>, task_uid: web::Path, - req: HttpRequest, - analytics: web::Data, ) -> Result { let task_uid_string = task_uid.into_inner(); @@ -310,8 +304,6 @@ async fn get_task( } }; - analytics.publish("Tasks Seen".to_string(), json!({ "per_task_uid": true }), Some(&req)); - let query = index_scheduler::Query { uids: Some(vec![task_uid]), ..Query::default() }; let filters = index_scheduler.filters(); let (tasks, _) = index_scheduler.get_tasks_from_authorized_indexes(query, filters)?; From e1f27de51aca1a5c1187de196405d02810216e70 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 16 Apr 2024 18:49:41 +0200 Subject: [PATCH 18/40] remove the Stats Seen analytic --- meilisearch/src/routes/indexes/mod.rs | 4 ---- meilisearch/src/routes/mod.rs | 3 --- 2 files changed, 7 deletions(-) diff --git a/meilisearch/src/routes/indexes/mod.rs b/meilisearch/src/routes/indexes/mod.rs index 59fa02dff..651977723 100644 --- a/meilisearch/src/routes/indexes/mod.rs +++ b/meilisearch/src/routes/indexes/mod.rs @@ -269,12 +269,8 @@ impl From for IndexStats { pub async fn get_index_stats( index_scheduler: GuardedData, Data>, index_uid: web::Path, - req: HttpRequest, - analytics: web::Data, ) -> Result { let index_uid = IndexUid::try_from(index_uid.into_inner())?; - analytics.publish("Stats Seen".to_string(), json!({ "per_index_uid": true }), Some(&req)); - let stats = IndexStats::from(index_scheduler.index_stats(&index_uid)?); debug!(returns = ?stats, "Get index stats"); diff --git a/meilisearch/src/routes/mod.rs b/meilisearch/src/routes/mod.rs index f2356fb7f..92d4af8e2 100644 --- a/meilisearch/src/routes/mod.rs +++ b/meilisearch/src/routes/mod.rs @@ -296,10 +296,7 @@ pub struct Stats { async fn get_stats( index_scheduler: GuardedData, Data>, auth_controller: GuardedData, Data>, - req: HttpRequest, - analytics: web::Data, ) -> Result { - analytics.publish("Stats Seen".to_string(), json!({ "per_index_uid": false }), Some(&req)); let filters = index_scheduler.filters(); let stats = create_all_stats((*index_scheduler).clone(), (*auth_controller).clone(), filters)?; From 3acfab2eb784526c43beae1f8192bc9b2b4816be Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 17 Apr 2024 10:54:48 +0200 Subject: [PATCH 19/40] Fix PR comments --- milli/src/update/index_documents/extract/mod.rs | 7 ++++--- milli/src/update/index_documents/mod.rs | 3 +++ milli/src/update/index_documents/transform.rs | 3 +++ milli/src/update/settings.rs | 1 - 4 files changed, 10 insertions(+), 4 deletions(-) diff --git a/milli/src/update/index_documents/extract/mod.rs 
b/milli/src/update/index_documents/extract/mod.rs index 341cdc9f9..bf533cfc9 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -11,6 +11,7 @@ mod extract_word_position_docids; use std::fs::File; use std::io::BufReader; +use std::sync::Arc; use crossbeam_channel::Sender; use rayon::prelude::*; @@ -43,7 +44,7 @@ pub(crate) fn data_from_obkv_documents( lmdb_writer_sx: Sender>, primary_key_id: FieldId, geo_fields_ids: Option<(FieldId, FieldId)>, - settings_diff: &InnerIndexSettingsDiff, + settings_diff: &Arc, max_positions_per_attributes: Option, ) -> Result<()> { puffin::profile_function!(); @@ -180,7 +181,7 @@ pub(crate) fn data_from_obkv_documents( fn run_extraction_task( chunk: grenad::Reader, indexer: GrenadParameters, - settings_diff: &InnerIndexSettingsDiff, + settings_diff: &Arc, lmdb_writer_sx: Sender>, extract_fn: FE, serialize_fn: FS, @@ -221,7 +222,7 @@ fn send_original_documents_data( original_documents_chunk: Result>>, indexer: GrenadParameters, lmdb_writer_sx: Sender>, - settings_diff: &InnerIndexSettingsDiff, + settings_diff: &Arc, ) -> Result<()> { let original_documents_chunk = original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?; diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 47f1e9f19..070f31c73 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -8,6 +8,7 @@ use std::collections::{HashMap, HashSet}; use std::io::{Read, Seek}; use std::num::NonZeroU32; use std::result::Result as StdResult; +use std::sync::Arc; use crossbeam_channel::{Receiver, Sender}; use grenad::{Merger, MergerBuilder}; @@ -292,6 +293,8 @@ where settings_diff.new.recompute_facets(self.wtxn, self.index)?; settings_diff.new.recompute_searchables(self.wtxn, self.index)?; + let settings_diff = Arc::new(settings_diff); + let backup_pool; let pool = match self.indexer_config.thread_pool { Some(ref pool) => pool, diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 90c3dbcc0..8a3463e6f 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -829,6 +829,9 @@ impl<'a, 'i> Transform<'a, 'i> { }) } + /// Rebind the field_ids of the provided document to their values + /// based on the field_ids_maps difference between the old and the new settings, + /// then fill the provided buffers with delta documents using KvWritterDelAdd. 
fn rebind_existing_document( old_obkv: KvReader, settings_diff: &InnerIndexSettingsDiff, diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 8ded6f03c..1997e966e 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -1094,7 +1094,6 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { } } -#[derive(Clone)] pub struct InnerIndexSettingsDiff { pub(crate) old: InnerIndexSettings, pub(crate) new: InnerIndexSettings, From 2dd9dd6d0ae09f4970737e3a9eff83205f1f177f Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 17 Apr 2024 11:43:40 +0200 Subject: [PATCH 20/40] remove the Health Seen analytic --- meilisearch/src/analytics/mock_analytics.rs | 1 - meilisearch/src/analytics/mod.rs | 3 - .../src/analytics/segment_analytics.rs | 65 ------------------- meilisearch/src/routes/mod.rs | 6 -- 4 files changed, 75 deletions(-) diff --git a/meilisearch/src/analytics/mock_analytics.rs b/meilisearch/src/analytics/mock_analytics.rs index aa8989c3e..1687e9e19 100644 --- a/meilisearch/src/analytics/mock_analytics.rs +++ b/meilisearch/src/analytics/mock_analytics.rs @@ -85,5 +85,4 @@ impl Analytics for MockAnalytics { } fn get_fetch_documents(&self, _documents_query: &DocumentFetchKind, _request: &HttpRequest) {} fn post_fetch_documents(&self, _documents_query: &DocumentFetchKind, _request: &HttpRequest) {} - fn health_seen(&self, _request: &HttpRequest) {} } diff --git a/meilisearch/src/analytics/mod.rs b/meilisearch/src/analytics/mod.rs index d6e8f1449..09c0a05df 100644 --- a/meilisearch/src/analytics/mod.rs +++ b/meilisearch/src/analytics/mod.rs @@ -116,7 +116,4 @@ pub trait Analytics: Sync + Send { index_creation: bool, request: &HttpRequest, ); - - // this method should be called to aggregate a add documents request - fn health_seen(&self, request: &HttpRequest); } diff --git a/meilisearch/src/analytics/segment_analytics.rs b/meilisearch/src/analytics/segment_analytics.rs index 0dc471e78..8c20c82c2 100644 --- a/meilisearch/src/analytics/segment_analytics.rs +++ b/meilisearch/src/analytics/segment_analytics.rs @@ -80,7 +80,6 @@ pub enum AnalyticsMsg { AggregateUpdateDocuments(DocumentsAggregator), AggregateGetFetchDocuments(DocumentsFetchAggregator), AggregatePostFetchDocuments(DocumentsFetchAggregator), - AggregateHealth(HealthAggregator), } pub struct SegmentAnalytics { @@ -150,7 +149,6 @@ impl SegmentAnalytics { update_documents_aggregator: DocumentsAggregator::default(), get_fetch_documents_aggregator: DocumentsFetchAggregator::default(), post_fetch_documents_aggregator: DocumentsFetchAggregator::default(), - health_aggregator: HealthAggregator::default(), }); tokio::spawn(segment.run(index_scheduler.clone(), auth_controller.clone())); @@ -228,11 +226,6 @@ impl super::Analytics for SegmentAnalytics { let aggregate = DocumentsFetchAggregator::from_query(documents_query, request); let _ = self.sender.try_send(AnalyticsMsg::AggregatePostFetchDocuments(aggregate)); } - - fn health_seen(&self, request: &HttpRequest) { - let aggregate = HealthAggregator::from_query(request); - let _ = self.sender.try_send(AnalyticsMsg::AggregateHealth(aggregate)); - } } /// This structure represent the `infos` field we send in the analytics. 
@@ -386,7 +379,6 @@ pub struct Segment { update_documents_aggregator: DocumentsAggregator, get_fetch_documents_aggregator: DocumentsFetchAggregator, post_fetch_documents_aggregator: DocumentsFetchAggregator, - health_aggregator: HealthAggregator, } impl Segment { @@ -449,7 +441,6 @@ impl Segment { Some(AnalyticsMsg::AggregateUpdateDocuments(agreg)) => self.update_documents_aggregator.aggregate(agreg), Some(AnalyticsMsg::AggregateGetFetchDocuments(agreg)) => self.get_fetch_documents_aggregator.aggregate(agreg), Some(AnalyticsMsg::AggregatePostFetchDocuments(agreg)) => self.post_fetch_documents_aggregator.aggregate(agreg), - Some(AnalyticsMsg::AggregateHealth(agreg)) => self.health_aggregator.aggregate(agreg), None => (), } } @@ -503,7 +494,6 @@ impl Segment { update_documents_aggregator, get_fetch_documents_aggregator, post_fetch_documents_aggregator, - health_aggregator, } = self; if let Some(get_search) = @@ -551,9 +541,6 @@ impl Segment { { let _ = self.batcher.push(post_fetch_documents).await; } - if let Some(health) = take(health_aggregator).into_event(user, "Health Seen") { - let _ = self.batcher.push(health).await; - } let _ = self.batcher.flush().await; } } @@ -1489,58 +1476,6 @@ impl DocumentsDeletionAggregator { } } -#[derive(Default, Serialize)] -pub struct HealthAggregator { - #[serde(skip)] - timestamp: Option, - - // context - #[serde(rename = "user-agent")] - user_agents: HashSet, - - #[serde(rename = "requests.total_received")] - total_received: usize, -} - -impl HealthAggregator { - pub fn from_query(request: &HttpRequest) -> Self { - Self { - timestamp: Some(OffsetDateTime::now_utc()), - user_agents: extract_user_agents(request).into_iter().collect(), - total_received: 1, - } - } - - /// Aggregate one [HealthAggregator] into another. - pub fn aggregate(&mut self, other: Self) { - let Self { timestamp, user_agents, total_received } = other; - - if self.timestamp.is_none() { - self.timestamp = timestamp; - } - - // we can't create a union because there is no `into_union` method - for user_agent in user_agents { - self.user_agents.insert(user_agent); - } - self.total_received = self.total_received.saturating_add(total_received); - } - - pub fn into_event(self, user: &User, event_name: &str) -> Option { - // if we had no timestamp it means we never encountered any events and - // thus we don't need to send this event. 
- let timestamp = self.timestamp?; - - Some(Track { - timestamp: Some(timestamp), - user: user.clone(), - event: event_name.to_string(), - properties: serde_json::to_value(self).ok()?, - ..Default::default() - }) - } -} - #[derive(Default, Serialize)] pub struct DocumentsFetchAggregator { #[serde(skip)] diff --git a/meilisearch/src/routes/mod.rs b/meilisearch/src/routes/mod.rs index 92d4af8e2..0e6ab7765 100644 --- a/meilisearch/src/routes/mod.rs +++ b/meilisearch/src/routes/mod.rs @@ -8,11 +8,9 @@ use meilisearch_types::error::{Code, ResponseError}; use meilisearch_types::settings::{Settings, Unchecked}; use meilisearch_types::tasks::{Kind, Status, Task, TaskId}; use serde::{Deserialize, Serialize}; -use serde_json::json; use time::OffsetDateTime; use tracing::debug; -use crate::analytics::Analytics; use crate::extractors::authentication::policies::*; use crate::extractors::authentication::GuardedData; use crate::search_queue::SearchQueue; @@ -376,14 +374,10 @@ struct KeysResponse { } pub async fn get_health( - req: HttpRequest, index_scheduler: Data, auth_controller: Data, search_queue: Data, - analytics: web::Data, ) -> Result { - analytics.health_seen(&req); - search_queue.health().unwrap(); index_scheduler.health().unwrap(); auth_controller.health().unwrap(); From df29ba709a1d016fb000182fce116457d5f5b403 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 17 Apr 2024 12:33:25 +0200 Subject: [PATCH 21/40] Make some cleaning in Arcs --- .../src/update/index_documents/extract/mod.rs | 29 +++++++++---------- milli/src/update/index_documents/mod.rs | 2 +- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index bf533cfc9..bc6fe2aff 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -44,7 +44,7 @@ pub(crate) fn data_from_obkv_documents( lmdb_writer_sx: Sender>, primary_key_id: FieldId, geo_fields_ids: Option<(FieldId, FieldId)>, - settings_diff: &Arc, + settings_diff: Arc, max_positions_per_attributes: Option, ) -> Result<()> { puffin::profile_function!(); @@ -58,7 +58,7 @@ pub(crate) fn data_from_obkv_documents( original_documents_chunk, indexer, lmdb_writer_sx.clone(), - settings_diff, + settings_diff.clone(), ) }) .collect::>() @@ -73,7 +73,7 @@ pub(crate) fn data_from_obkv_documents( lmdb_writer_sx.clone(), primary_key_id, geo_fields_ids, - settings_diff, + settings_diff.clone(), max_positions_per_attributes, ) }) @@ -86,7 +86,7 @@ pub(crate) fn data_from_obkv_documents( run_extraction_task::<_, _, grenad::Reader>>( docid_word_positions_chunk.clone(), indexer, - settings_diff, + settings_diff.clone(), lmdb_writer_sx.clone(), extract_fid_word_count_docids, TypedChunk::FieldIdWordCountDocids, @@ -103,7 +103,7 @@ pub(crate) fn data_from_obkv_documents( >( docid_word_positions_chunk.clone(), indexer, - settings_diff, + settings_diff.clone(), lmdb_writer_sx.clone(), extract_word_docids, |( @@ -123,7 +123,7 @@ pub(crate) fn data_from_obkv_documents( run_extraction_task::<_, _, grenad::Reader>>( docid_word_positions_chunk.clone(), indexer, - settings_diff, + settings_diff.clone(), lmdb_writer_sx.clone(), extract_word_position_docids, TypedChunk::WordPositionDocids, @@ -137,7 +137,7 @@ pub(crate) fn data_from_obkv_documents( >( fid_docid_facet_strings_chunk.clone(), indexer, - settings_diff, + settings_diff.clone(), lmdb_writer_sx.clone(), extract_facet_string_docids, TypedChunk::FieldIdFacetStringDocids, @@ -147,7 
+147,7 @@ pub(crate) fn data_from_obkv_documents( run_extraction_task::<_, _, grenad::Reader>>( fid_docid_facet_numbers_chunk.clone(), indexer, - settings_diff, + settings_diff.clone(), lmdb_writer_sx.clone(), extract_facet_number_docids, TypedChunk::FieldIdFacetNumberDocids, @@ -157,7 +157,7 @@ pub(crate) fn data_from_obkv_documents( run_extraction_task::<_, _, grenad::Reader>>( docid_word_positions_chunk.clone(), indexer, - settings_diff, + settings_diff.clone(), lmdb_writer_sx.clone(), extract_word_pair_proximity_docids, TypedChunk::WordPairProximityDocids, @@ -181,7 +181,7 @@ pub(crate) fn data_from_obkv_documents( fn run_extraction_task( chunk: grenad::Reader, indexer: GrenadParameters, - settings_diff: &Arc, + settings_diff: Arc, lmdb_writer_sx: Sender>, extract_fn: FE, serialize_fn: FS, @@ -199,7 +199,6 @@ fn run_extraction_task( M: Send, { let current_span = tracing::Span::current(); - let settings_diff = settings_diff.clone(); rayon::spawn(move || { let child_span = tracing::trace_span!(target: "indexing::extract::details", parent: ¤t_span, "extract_multiple_chunks"); @@ -222,7 +221,7 @@ fn send_original_documents_data( original_documents_chunk: Result>>, indexer: GrenadParameters, lmdb_writer_sx: Sender>, - settings_diff: &Arc, + settings_diff: Arc, ) -> Result<()> { let original_documents_chunk = original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?; @@ -302,7 +301,7 @@ fn send_and_extract_flattened_documents_data( lmdb_writer_sx: Sender>, primary_key_id: FieldId, geo_fields_ids: Option<(FieldId, FieldId)>, - settings_diff: &InnerIndexSettingsDiff, + settings_diff: Arc, max_positions_per_attributes: Option, ) -> Result<( grenad::Reader, @@ -331,7 +330,7 @@ fn send_and_extract_flattened_documents_data( extract_docid_word_positions( flattened_documents_chunk.clone(), indexer, - settings_diff, + &settings_diff, max_positions_per_attributes, )?; @@ -354,7 +353,7 @@ fn send_and_extract_flattened_documents_data( } = extract_fid_docid_facet_values( flattened_documents_chunk.clone(), indexer, - settings_diff, + &settings_diff, geo_fields_ids, )?; diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 070f31c73..aa9789a1a 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -412,7 +412,7 @@ where lmdb_writer_sx.clone(), primary_key_id, geo_fields_ids, - &settings_diff, + settings_diff.clone(), max_positions_per_attributes, ) }); From 206887c7a2994b8b59a49c43d96ae7b0559e06d2 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 17 Apr 2024 12:57:19 +0200 Subject: [PATCH 22/40] =?UTF-8?q?update=20the=20SearchQuery=20Debug=20impl?= =?UTF-8?q?ementation=20so=20it=E2=80=99s=20smaller=20and=20gives=20the=20?= =?UTF-8?q?most=20important=20informations=20first?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- meilisearch/src/search.rs | 107 +++++++++++++++++++++++++++++++++++++- 1 file changed, 106 insertions(+), 1 deletion(-) diff --git a/meilisearch/src/search.rs b/meilisearch/src/search.rs index 47c2b5f4b..001ad1ebd 100644 --- a/meilisearch/src/search.rs +++ b/meilisearch/src/search.rs @@ -1,3 +1,4 @@ +use core::fmt; use std::cmp::min; use std::collections::{BTreeMap, BTreeSet, HashSet}; use std::str::FromStr; @@ -39,7 +40,7 @@ pub const DEFAULT_HIGHLIGHT_PRE_TAG: fn() -> String = || "".to_string(); pub const DEFAULT_HIGHLIGHT_POST_TAG: fn() -> String = || "".to_string(); pub const DEFAULT_SEMANTIC_RATIO: fn() -> SemanticRatio = || 
SemanticRatio(0.5); -#[derive(Debug, Clone, Default, PartialEq, Deserr)] +#[derive(Clone, Default, PartialEq, Deserr)] #[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] pub struct SearchQuery { #[deserr(default, error = DeserrJsonError)] @@ -88,6 +89,110 @@ pub struct SearchQuery { pub attributes_to_search_on: Option>, } +// Since this structure is logged A LOT we're going to reduce the number of things it logs to the bare minimum. +// - Only what IS used, we know everything else is set to None so there is no need to print it +// - Re-order the most important field to debug first +impl fmt::Debug for SearchQuery { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let Self { + q, + vector, + hybrid, + offset, + limit, + page, + hits_per_page, + attributes_to_retrieve, + attributes_to_crop, + crop_length, + attributes_to_highlight, + show_matches_position, + show_ranking_score, + show_ranking_score_details, + filter, + sort, + facets, + highlight_pre_tag, + highlight_post_tag, + crop_marker, + matching_strategy, + attributes_to_search_on, + } = self; + + let mut debug = f.debug_struct("SearchQuery"); + + // First, everything related to the number of documents to retrieve + debug.field("limit", &limit).field("offset", &offset); + if let Some(page) = page { + debug.field("page", &page); + } + if let Some(hits_per_page) = hits_per_page { + debug.field("hits_per_page", &hits_per_page); + } + + // Then, everything related to the queries + if let Some(q) = q { + debug.field("q", &q); + } + if let Some(v) = vector { + if v.len() < 10 { + debug.field("vector", &v); + } else { + debug.field( + "vector", + &format!("[{}, {}, {}, ... {} dimensions]", v[0], v[1], v[2], v.len()), + ); + } + } + if let Some(hybrid) = hybrid { + debug.field("hybrid", &hybrid); + } + if let Some(attributes_to_search_on) = attributes_to_search_on { + debug.field("attributes_to_search_on", &attributes_to_search_on); + } + if let Some(filter) = filter { + debug.field("filter", &filter); + } + if let Some(sort) = sort { + debug.field("sort", &sort); + } + if let Some(facets) = facets { + debug.field("facets", &facets); + } + debug.field("matching_strategy", &matching_strategy); + + // Then everything related to the formatting + debug.field("crop_length", &crop_length); + if *show_matches_position { + debug.field("show_matches_position", show_matches_position); + } + if *show_ranking_score { + debug.field("show_ranking_score", show_ranking_score); + } + if *show_ranking_score_details { + debug.field("self.show_ranking_score_details", show_ranking_score_details); + } + debug.field("crop_length", &crop_length); + if let Some(facets) = facets { + debug.field("facets", &facets); + } + if let Some(attributes_to_retrieve) = attributes_to_retrieve { + debug.field("attributes_to_retrieve", &attributes_to_retrieve); + } + if let Some(attributes_to_crop) = attributes_to_crop { + debug.field("attributes_to_crop", &attributes_to_crop); + } + if let Some(attributes_to_highlight) = attributes_to_highlight { + debug.field("attributes_to_highlight", &attributes_to_highlight); + } + debug.field("highlight_pre_tag", &highlight_pre_tag); + debug.field("highlight_post_tag", &highlight_post_tag); + debug.field("crop_marker", &crop_marker); + + debug.finish() + } +} + #[derive(Debug, Clone, Default, PartialEq, Deserr)] #[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] pub struct HybridQuery { From 4a68e9f6aee395e321d4a0a188eb47d112ebb47f Mon Sep 17 00:00:00 2001 From: Tamo Date: 
Wed, 17 Apr 2024 13:42:10 +0200 Subject: [PATCH 23/40] reorganize the debug implementation of the search results and only display the meaningful information --- meilisearch/src/search.rs | 42 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/meilisearch/src/search.rs b/meilisearch/src/search.rs index 001ad1ebd..a383434a2 100644 --- a/meilisearch/src/search.rs +++ b/meilisearch/src/search.rs @@ -475,7 +475,7 @@ pub struct SearchHit { pub ranking_score_details: Option>, } -#[derive(Serialize, Debug, Clone, PartialEq)] +#[derive(Serialize, Clone, PartialEq)] #[serde(rename_all = "camelCase")] pub struct SearchResult { pub hits: Vec, @@ -498,6 +498,46 @@ pub struct SearchResult { pub used_negative_operator: bool, } +impl fmt::Debug for SearchResult { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let SearchResult { + hits, + query, + processing_time_ms, + hits_info, + facet_distribution, + facet_stats, + semantic_hit_count, + degraded, + used_negative_operator, + } = self; + + let mut debug = f.debug_struct("SearchResult"); + // The most important thing when looking at a search result is the time it took to process + debug.field("processing_time_ms", &processing_time_ms); + debug.field("hits", &format!("[{} hits returned]", hits.len())); + debug.field("query", &query); + debug.field("hits_info", &hits_info); + if *used_negative_operator { + debug.field("used_negative_operator", used_negative_operator); + } + if *degraded { + debug.field("degraded", degraded); + } + if let Some(facet_distribution) = facet_distribution { + debug.field("facet_distribution", &facet_distribution); + } + if let Some(facet_stats) = facet_stats { + debug.field("facet_stats", &facet_stats); + } + if let Some(semantic_hit_count) = semantic_hit_count { + debug.field("semantic_hit_count", &semantic_hit_count); + } + + debug.finish() + } +} + #[derive(Serialize, Debug, Clone, PartialEq)] #[serde(rename_all = "camelCase")] pub struct SearchResultWithIndex { From c923adf222f55f8d07cbd8cff7f613dc8d30eb69 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 17 Apr 2024 16:26:04 +0200 Subject: [PATCH 24/40] Fix facet distribution for alpha on facet numbers --- milli/src/search/facet/facet_distribution.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index 90da16797..09ef5d4ee 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -97,6 +97,7 @@ impl<'a> FacetDistribution<'a> { ) -> heed::Result<()> { match facet_type { FacetType::Number => { + let mut lexicographic_distribution = BTreeMap::new(); let mut key_buffer: Vec<_> = field_id.to_be_bytes().to_vec(); let distribution_prelength = distribution.len(); @@ -111,14 +112,17 @@ impl<'a> FacetDistribution<'a> { for result in iter { let ((_, _, value), ()) = result?; - *distribution.entry(value.to_string()).or_insert(0) += 1; + *lexicographic_distribution.entry(value.to_string()).or_insert(0) += 1; - if distribution.len() - distribution_prelength == self.max_values_per_facet + if lexicographic_distribution.len() - distribution_prelength + == self.max_values_per_facet { break; } } } + + distribution.extend(lexicographic_distribution); } FacetType::String => { let mut normalized_distribution = BTreeMap::new(); From c71b5d09ff9e607b477592d58cfc732e672c5677 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 18 Apr 
2024 11:38:26 +0200 Subject: [PATCH 25/40] Updatre charabia v0.8.9 --- Cargo.lock | 247 ++++++++++++++++++++++++++++------- meilisearch-types/Cargo.toml | 1 + meilisearch/Cargo.toml | 1 + milli/Cargo.toml | 3 +- 4 files changed, 205 insertions(+), 47 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 70ebc71b9..e2ddfd2ed 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -354,9 +354,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.80" +version = "1.0.82" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ad32ce52e4161730f7098c077cd2ed6229b5804ccf99e5366be1ab72a98b4e1" +checksum = "f538837af36e6f6a9be0faa67f9a314f8119e4e4b5867c6ab40ed60360142519" dependencies = [ "backtrace", ] @@ -889,9 +889,9 @@ dependencies = [ [[package]] name = "charabia" -version = "0.8.8" +version = "0.8.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60dc1a562fc8cb53d552d371758a4ecd76d15cc7489d2b968529cd9cadcbd854" +checksum = "f6a65052f308636e5d5e1777f0dbc07919f5fbac24b6c8ad3e140472e5520de9" dependencies = [ "aho-corasick", "cow-utils", @@ -901,9 +901,7 @@ dependencies = [ "fst", "irg-kvariants", "jieba-rs", - "lindera-core", - "lindera-dictionary", - "lindera-tokenizer", + "lindera", "litemap", "once_cell", "pinyin", @@ -1715,9 +1713,9 @@ dependencies = [ [[package]] name = "env_logger" -version = "0.11.2" +version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c012a26a7f605efc424dd53697843a72be7dc86ad2d01f7814337794a12231d" +checksum = "38b35839ba51819680ba087cd351788c9a3c476841207e0b8cee0b04722343b9" dependencies = [ "anstream", "anstyle", @@ -2661,6 +2659,15 @@ dependencies = [ "simple_asn1", ] +[[package]] +name = "kanaria" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0f9d9652540055ac4fded998a73aca97d965899077ab1212587437da44196ff" +dependencies = [ + "bitflags 1.3.2", +] + [[package]] name = "kstring" version = "2.0.0" @@ -2766,10 +2773,67 @@ dependencies = [ ] [[package]] -name = "lindera-cc-cedict-builder" -version = "0.28.0" +name = "lindera" +version = "0.30.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca21f2ee3ca40e7f3ebbd568d041be1531c2c28dbf540e737aeba934ab53f330" +checksum = "a1bbf252ea3490053dc397539ece0b510924f2f72605fa28d3e858d86f43ec88" +dependencies = [ + "lindera-analyzer", + "lindera-core", + "lindera-dictionary", + "lindera-filter", + "lindera-tokenizer", +] + +[[package]] +name = "lindera-analyzer" +version = "0.30.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87febfec0e2859ce2154fb90dd6f66b774ddb0b6e264b44f8e3d1303c9dcedd7" +dependencies = [ + "anyhow", + "bincode", + "byteorder", + "encoding", + "kanaria", + "lindera-cc-cedict-builder", + "lindera-core", + "lindera-dictionary", + "lindera-filter", + "lindera-ipadic-builder", + "lindera-ko-dic-builder", + "lindera-tokenizer", + "lindera-unidic-builder", + "once_cell", + "regex", + "serde", + "serde_json", + "thiserror", + "unicode-blocks", + "unicode-normalization", + "unicode-segmentation", + "yada", +] + +[[package]] +name = "lindera-cc-cedict" +version = "0.30.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fcb91bb8a93ab0f95dbc3c43b5105354bb059134ef731154f75a64b5d919e71d" +dependencies = [ + "bincode", + "byteorder", + "lindera-cc-cedict-builder", + "lindera-core", + "lindera-decompress", + "once_cell", +] + +[[package]] +name = "lindera-cc-cedict-builder" +version 
= "0.30.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6022a8309a287dbef425fd09a61585351670c83001d74f6c089979e2330b683" dependencies = [ "anyhow", "bincode", @@ -2778,6 +2842,7 @@ dependencies = [ "encoding", "env_logger", "glob", + "lindera-compress", "lindera-core", "lindera-decompress", "log", @@ -2786,9 +2851,9 @@ dependencies = [ [[package]] name = "lindera-compress" -version = "0.28.0" +version = "0.30.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34da125091f3b3a49351f418484a16cb2a23f6888cd53fe219edad19d263da5d" +checksum = "32363cbcf433f915e7d77c2a0c410db2d6b23442e80715cf2cf6b9864078a500" dependencies = [ "anyhow", "flate2", @@ -2797,9 +2862,9 @@ dependencies = [ [[package]] name = "lindera-core" -version = "0.28.0" +version = "0.30.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09d4b717a8a31b73a3cbd3552e0abda14e0c85d97dc8b911035342533defdbad" +checksum = "d9a0e858753a02b1a3524fae4fbb11ca4b3a947128fd7854b797386562678be8" dependencies = [ "anyhow", "bincode", @@ -2814,9 +2879,9 @@ dependencies = [ [[package]] name = "lindera-decompress" -version = "0.28.0" +version = "0.30.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "98f4476c99cb4ffa54fbfc42953adf69ada7276cfbb594bce9829547de012058" +checksum = "0e406345f6f8b665b9a129c67079c18ca9d97e9d171d102b4106a64a592c285e" dependencies = [ "anyhow", "flate2", @@ -2825,29 +2890,73 @@ dependencies = [ [[package]] name = "lindera-dictionary" -version = "0.28.0" +version = "0.30.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a45b92f0ce331c2202c6cec3135e4bfce29525ab3bb97a613c27c8e0a29fa967" +checksum = "3e2a3ec0e5fd6768a27c6ec1040e8470d3a5926418f7afe065859e98aabb3bfe" dependencies = [ "anyhow", "bincode", "byteorder", + "lindera-cc-cedict", "lindera-cc-cedict-builder", "lindera-core", + "lindera-ipadic", "lindera-ipadic-builder", + "lindera-ipadic-neologd", "lindera-ipadic-neologd-builder", "lindera-ko-dic", "lindera-ko-dic-builder", "lindera-unidic", "lindera-unidic-builder", "serde", + "strum", + "strum_macros", +] + +[[package]] +name = "lindera-filter" +version = "0.30.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1badaf51bad051185ea4917ba91bbbf2d6f8167e155647e21e0eaaef0982a95d" +dependencies = [ + "anyhow", + "csv", + "kanaria", + "lindera-cc-cedict-builder", + "lindera-core", + "lindera-dictionary", + "lindera-ipadic-builder", + "lindera-ko-dic-builder", + "lindera-unidic-builder", + "once_cell", + "regex", + "serde", + "serde_json", + "unicode-blocks", + "unicode-normalization", + "unicode-segmentation", + "yada", +] + +[[package]] +name = "lindera-ipadic" +version = "0.30.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "129ec16366354998f9791467ad38731539197747f649e573ead845358271ce25" +dependencies = [ + "bincode", + "byteorder", + "lindera-core", + "lindera-decompress", + "lindera-ipadic-builder", + "once_cell", ] [[package]] name = "lindera-ipadic-builder" -version = "0.28.0" +version = "0.30.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "642dee52201852df209cb43423ff1ca4d161a329f5cdba049a7b5820118345f2" +checksum = "7f0979a56bc57e9c9be2996dff232c47aa146a2e7baebf5dd567e388eba3dd90" dependencies = [ "anyhow", "bincode", @@ -2857,6 +2966,7 @@ dependencies = [ "encoding_rs_io", "env_logger", "glob", + "lindera-compress", "lindera-core", "lindera-decompress", "log", @@ 
-2865,10 +2975,24 @@ dependencies = [ ] [[package]] -name = "lindera-ipadic-neologd-builder" -version = "0.28.0" +name = "lindera-ipadic-neologd" +version = "0.30.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "325144b154e68159373e944d1cd7f67c6ff9965a2af41240a8e41732b3fdb3af" +checksum = "20076660c4e79ef0316735b44e18ec7644e54786acdee8946c972d5f97086d0f" +dependencies = [ + "bincode", + "byteorder", + "lindera-core", + "lindera-decompress", + "lindera-ipadic-neologd-builder", + "once_cell", +] + +[[package]] +name = "lindera-ipadic-neologd-builder" +version = "0.30.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eccd18ed5f65d1d64ac0cbfa1d6827bfbbaf6530520ae6847e6a91ee38f47e20" dependencies = [ "anyhow", "bincode", @@ -2878,6 +3002,7 @@ dependencies = [ "encoding_rs_io", "env_logger", "glob", + "lindera-compress", "lindera-core", "lindera-decompress", "log", @@ -2887,9 +3012,9 @@ dependencies = [ [[package]] name = "lindera-ko-dic" -version = "0.28.0" +version = "0.30.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b484a2f9964e7424264fda304beb6ff6ad883c347accfe1115e777dedef3661d" +checksum = "59073171566c3e498ca048e84c2d0a7e117a42f36c8eb7d7163e65ac38bd6d48" dependencies = [ "bincode", "byteorder", @@ -2900,13 +3025,14 @@ dependencies = [ "lindera-ko-dic-builder", "once_cell", "tar", + "ureq", ] [[package]] name = "lindera-ko-dic-builder" -version = "0.28.0" +version = "0.30.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9413d4d9bf7af921f5ac64414a290c7ba81695e8ba08dd2f6c950b57c281a69" +checksum = "ae176afa8535ca2a5ee9471873f85d531db0a6c32a3c42b41084506aac22b577" dependencies = [ "anyhow", "bincode", @@ -2924,9 +3050,9 @@ dependencies = [ [[package]] name = "lindera-tokenizer" -version = "0.28.0" +version = "0.30.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9987c818462d51ca67e131e40f0386e25e8c557e195059b1257f95731561185d" +checksum = "457285bdde84571aa510c9e05371904305a55e8a541fa1473d4393062f06932d" dependencies = [ "bincode", "lindera-core", @@ -2938,26 +3064,27 @@ dependencies = [ [[package]] name = "lindera-unidic" -version = "0.28.0" +version = "0.30.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c379cf436b2627cd7d3498642e491eadbff9b3e01231c516ce9f9b1893ab7c3" +checksum = "5839980be552dfa639b70964c61914a9ad014148663679b0e148aa72e5e30f23" dependencies = [ "bincode", "byteorder", "encoding", + "flate2", "lindera-core", "lindera-decompress", "lindera-unidic-builder", "once_cell", + "tar", "ureq", - "zip", ] [[package]] name = "lindera-unidic-builder" -version = "0.28.0" +version = "0.30.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "601ec33b5174141396a7a4ca066278863840221fec32d0be19091e7fae91ed94" +checksum = "dcaab8f061d5b944b1e424f49c7efbf8f276e8a72e4f4ff956d01e46d481f008" dependencies = [ "anyhow", "bincode", @@ -4214,9 +4341,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.10.2" +version = "1.10.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "380b951a9c5e80ddfd6136919eef32310721aa4aacd4889a8d39124b026ab343" +checksum = "c117dbdfde9c8308975b6a18d71f3f385c89461f7b3fb054288ecf2a2058ba4c" dependencies = [ "aho-corasick", "memchr", @@ -4226,9 +4353,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.3" +version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "5f804c7828047e88b2d32e2d7fe5a105da8ee3264f01902f796c8e067dc2483f" +checksum = "86b83b8b9847f9bf95ef68afb0b8e6cdb80f498442f5179a29fad448fcc1eaea" dependencies = [ "aho-corasick", "memchr", @@ -4795,6 +4922,28 @@ version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" +[[package]] +name = "strum" +version = "0.26.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d8cec3501a5194c432b2b7976db6b7d10ec95c253208b45f83f7136aa985e29" +dependencies = [ + "strum_macros", +] + +[[package]] +name = "strum_macros" +version = "0.26.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c6cf59daf282c0a494ba14fd21610a0325f9f90ec9d1231dea26bcb1d696c946" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "rustversion", + "syn 2.0.58", +] + [[package]] name = "subtle" version = "2.5.0" @@ -5324,6 +5473,12 @@ version = "0.3.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460" +[[package]] +name = "unicode-blocks" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b12e05d9e06373163a9bb6bb8c263c261b396643a99445fe6b9811fd376581b" + [[package]] name = "unicode-ident" version = "1.0.12" @@ -5332,9 +5487,9 @@ checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" [[package]] name = "unicode-normalization" -version = "0.1.22" +version = "0.1.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c5713f0fc4b5db668a2ac63cdb7bb4469d8c9fed047b1d0292cc7b0ce2ba921" +checksum = "a56d1686db2308d901306f92a263857ef59ea39678a5458e7cb17f01415101f5" dependencies = [ "tinyvec", ] @@ -5350,9 +5505,9 @@ dependencies = [ [[package]] name = "unicode-segmentation" -version = "1.10.1" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36" +checksum = "d4c87d22b6e3f4a18d4d40ef354e97c90fcb14dd91d7dc0aa9d8a1172ebf7202" [[package]] name = "unicode-width" @@ -5942,9 +6097,9 @@ dependencies = [ [[package]] name = "yada" -version = "0.5.0" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6d12cb7a57bbf2ab670ed9545bae3648048547f9039279a89ce000208e585c1" +checksum = "aed111bd9e48a802518765906cbdadf0b45afb72b9c81ab049a3b86252adffdd" [[package]] name = "yaml-rust" diff --git a/meilisearch-types/Cargo.toml b/meilisearch-types/Cargo.toml index 7709d33d7..1973a2034 100644 --- a/meilisearch-types/Cargo.toml +++ b/meilisearch-types/Cargo.toml @@ -44,6 +44,7 @@ all-tokenizations = ["milli/all-tokenizations"] # chinese specialized tokenization chinese = ["milli/chinese"] +chinese-pinyin = ["milli/chinese-pinyin"] # hebrew specialized tokenization hebrew = ["milli/hebrew"] # japanese specialized tokenization diff --git a/meilisearch/Cargo.toml b/meilisearch/Cargo.toml index 04b919904..6b8db4144 100644 --- a/meilisearch/Cargo.toml +++ b/meilisearch/Cargo.toml @@ -149,6 +149,7 @@ mini-dashboard = [ "zip", ] chinese = ["meilisearch-types/chinese"] +chinese-pinyin = ["meilisearch-types/chinese-pinyin"] hebrew = ["meilisearch-types/hebrew"] japanese = ["meilisearch-types/japanese"] thai = ["meilisearch-types/thai"] diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 08dbce869..9423a854e 100644 --- a/milli/Cargo.toml +++ 
b/milli/Cargo.toml @@ -17,7 +17,7 @@ bincode = "1.3.3" bstr = "1.9.0" bytemuck = { version = "1.14.0", features = ["extern_crate_alloc"] } byteorder = "1.5.0" -charabia = { version = "0.8.8", default-features = false } +charabia = { version = "0.8.9", default-features = false } concat-arrays = "0.1.2" crossbeam-channel = "0.5.11" deserr = "0.6.1" @@ -115,6 +115,7 @@ lmdb-posix-sem = ["heed/posix-sem"] # allow chinese specialized tokenization chinese = ["charabia/chinese"] +chinese-pinyin = ["chinese", "charabia/chinese-normalization-pinyin"] # allow hebrew specialized tokenization hebrew = ["charabia/hebrew"] From a1aa99902607bae6df318d71ecc885cf2cadb6bb Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 22 Apr 2024 14:18:35 +0200 Subject: [PATCH 26/40] Add conditions reducing wrok --- .../extract/extract_vector_points.rs | 37 +++++++++++++------ 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_vector_points.rs b/milli/src/update/index_documents/extract/extract_vector_points.rs index 23f945c7a..8a7bcb1f9 100644 --- a/milli/src/update/index_documents/extract/extract_vector_points.rs +++ b/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -198,11 +198,16 @@ pub fn extract_vector_points( if document_is_kept { // Don't give up if the old prompt was failing - let old_prompt = prompt - .render(obkv, DelAdd::Deletion, old_fields_ids_map) - .unwrap_or_default(); + let old_prompt = Some(prompt) + // TODO: this filter works because we erase the vec database when a embedding setting changes. + // When vector pipeline will be optimized, this should be removed. + .filter(|_| !settings_diff.reindex_vectors()) + .map(|p| { + p.render(obkv, DelAdd::Deletion, old_fields_ids_map).unwrap_or_default() + }); let new_prompt = prompt.render(obkv, DelAdd::Addition, new_fields_ids_map)?; - if old_prompt != new_prompt { + if old_prompt.as_ref() != Some(&new_prompt) { + let old_prompt = old_prompt.unwrap_or_default(); tracing::trace!( "🚀 Changing prompt from\n{old_prompt}\n===to===\n{new_prompt}" ); @@ -224,6 +229,7 @@ pub fn extract_vector_points( &mut manual_vectors_writer, &mut key_buffer, delta, + settings_diff, )?; } @@ -264,10 +270,15 @@ fn push_vectors_diff( manual_vectors_writer: &mut Writer>, key_buffer: &mut Vec, delta: VectorStateDelta, + settings_diff: &InnerIndexSettingsDiff, ) -> Result<()> { puffin::profile_function!(); let (must_remove, prompt, (mut del_vectors, mut add_vectors)) = delta.into_values(); - if must_remove { + if must_remove + // TODO: the below condition works because we erase the vec database when a embedding setting changes. + // When vector pipeline will be optimized, this should be removed. + && !settings_diff.reindex_vectors() + { key_buffer.truncate(TRUNCATE_SIZE); remove_vectors_writer.insert(&key_buffer, [])?; } @@ -295,12 +306,16 @@ fn push_vectors_diff( match eob { EitherOrBoth::Both(_, _) => (), // no need to touch anything EitherOrBoth::Left(vector) => { - // We insert only the Del part of the Obkv to inform - // that we only want to remove all those vectors. - let mut obkv = KvWriterDelAdd::memory(); - obkv.insert(DelAdd::Deletion, cast_slice(&vector))?; - let bytes = obkv.into_inner()?; - manual_vectors_writer.insert(&key_buffer, bytes)?; + // TODO: the below condition works because we erase the vec database when a embedding setting changes. + // When vector pipeline will be optimized, this should be removed. 
+ if !settings_diff.reindex_vectors() { + // We insert only the Del part of the Obkv to inform + // that we only want to remove all those vectors. + let mut obkv = KvWriterDelAdd::memory(); + obkv.insert(DelAdd::Deletion, cast_slice(&vector))?; + let bytes = obkv.into_inner()?; + manual_vectors_writer.insert(&key_buffer, bytes)?; + } } EitherOrBoth::Right(vector) => { // We insert only the Add part of the Obkv to inform From 0c7003c5df5deff109806ba212d4af779f59e4e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 22 Apr 2024 16:25:58 +0200 Subject: [PATCH 27/40] Introduce an atomic to catch panics in thread pools --- meilisearch/src/option.rs | 9 +++++++++ milli/src/update/index_documents/mod.rs | 25 +++++++++++++++---------- milli/src/update/indexer_config.rs | 7 +++++++ 3 files changed, 31 insertions(+), 10 deletions(-) diff --git a/meilisearch/src/option.rs b/meilisearch/src/option.rs index 651af7336..d3d9150d6 100644 --- a/meilisearch/src/option.rs +++ b/meilisearch/src/option.rs @@ -6,6 +6,7 @@ use std::num::ParseIntError; use std::ops::Deref; use std::path::PathBuf; use std::str::FromStr; +use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Arc; use std::{env, fmt, fs}; @@ -666,15 +667,23 @@ impl TryFrom<&IndexerOpts> for IndexerConfig { type Error = anyhow::Error; fn try_from(other: &IndexerOpts) -> Result { + let pool_panic_catched = Arc::new(AtomicBool::new(false)); let thread_pool = rayon::ThreadPoolBuilder::new() .thread_name(|index| format!("indexing-thread:{index}")) .num_threads(*other.max_indexing_threads) + .panic_handler({ + // TODO What should we do with this Box. + // So, let's just set a value to true to cancel the task with a message for now. + let panic_cathed = pool_panic_catched.clone(); + move |_result| panic_cathed.store(true, Ordering::SeqCst) + }) .build()?; Ok(Self { log_every_n: Some(DEFAULT_LOG_EVERY_N), max_memory: other.max_indexing_memory.map(|b| b.get_bytes() as usize), thread_pool: Some(thread_pool), + pool_panic_catched, max_positions_per_attributes: None, skip_index_budget: other.skip_index_budget, ..Default::default() diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index aa9789a1a..3b27f96f4 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -8,6 +8,7 @@ use std::collections::{HashMap, HashSet}; use std::io::{Read, Seek}; use std::num::NonZeroU32; use std::result::Result as StdResult; +use std::sync::atomic::Ordering; use std::sync::Arc; use crossbeam_channel::{Receiver, Sender}; @@ -296,20 +297,24 @@ where let settings_diff = Arc::new(settings_diff); let backup_pool; + let pool_catched_panic = self.indexer_config.pool_panic_catched.clone(); let pool = match self.indexer_config.thread_pool { Some(ref pool) => pool, - #[cfg(not(test))] None => { - // We initialize a bakcup pool with the default + // We initialize a backup pool with the default // settings if none have already been set. - backup_pool = rayon::ThreadPoolBuilder::new().build()?; - &backup_pool - } - #[cfg(test)] - None => { - // We initialize a bakcup pool with the default - // settings if none have already been set. 
- backup_pool = rayon::ThreadPoolBuilder::new().num_threads(1).build()?; + let mut pool_builder = rayon::ThreadPoolBuilder::new(); + pool_builder = pool_builder.panic_handler({ + let catched_panic = pool_catched_panic.clone(); + move |_result| catched_panic.store(true, Ordering::SeqCst) + }); + + #[cfg(test)] + { + pool_builder = pool_builder.num_threads(1); + } + + backup_pool = pool_builder.build()?; &backup_pool } }; diff --git a/milli/src/update/indexer_config.rs b/milli/src/update/indexer_config.rs index ff7942fdb..b23d8e700 100644 --- a/milli/src/update/indexer_config.rs +++ b/milli/src/update/indexer_config.rs @@ -1,3 +1,6 @@ +use std::sync::atomic::AtomicBool; +use std::sync::Arc; + use grenad::CompressionType; use rayon::ThreadPool; @@ -10,6 +13,9 @@ pub struct IndexerConfig { pub chunk_compression_type: CompressionType, pub chunk_compression_level: Option, pub thread_pool: Option, + /// Set to true if the thread pool catched a panic + /// and we must abort the task + pub pool_panic_catched: Arc, pub max_positions_per_attributes: Option, pub skip_index_budget: bool, } @@ -24,6 +30,7 @@ impl Default for IndexerConfig { chunk_compression_type: CompressionType::None, chunk_compression_level: None, thread_pool: None, + pool_panic_catched: Arc::default(), max_positions_per_attributes: None, skip_index_budget: false, } From 96cc5319c89b4743cfb406838d57b126b038b918 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 22 Apr 2024 15:46:16 +0200 Subject: [PATCH 28/40] Introduce a new internal error type to categorize panics --- milli/src/error.rs | 2 ++ milli/src/update/index_documents/mod.rs | 10 ++++++++++ 2 files changed, 12 insertions(+) diff --git a/milli/src/error.rs b/milli/src/error.rs index 1d61bef63..a5ed47c7d 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -49,6 +49,8 @@ pub enum InternalError { InvalidDatabaseTyping, #[error(transparent)] RayonThreadPool(#[from] ThreadPoolBuildError), + #[error("A panic occured. Read the logs to find more information about it.")] + PanicInThreadPool, #[error(transparent)] SerdeJson(#[from] serde_json::Error), #[error(transparent)] diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 3b27f96f4..c61c83757 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -540,6 +540,11 @@ where Ok(()) })?; + // While reseting the pool panic catcher we return an error if we catched one. + if pool_catched_panic.swap(false, Ordering::SeqCst) { + return Err(InternalError::PanicInThreadPool.into()); + } + // We write the field distribution into the main database self.index.put_field_distribution(self.wtxn, &field_distribution)?; @@ -568,6 +573,11 @@ where } Result::Ok(()) })?; + + // While reseting the pool panic catcher we return an error if we catched one. 
+ if pool_catched_panic.swap(false, Ordering::SeqCst) { + return Err(InternalError::PanicInThreadPool.into()); + } } self.execute_prefix_databases( From b3173d0423592a12937c2a316cb7b862b0457b2d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 22 Apr 2024 15:50:43 +0200 Subject: [PATCH 29/40] Remove useless dots in the error messages --- milli/src/error.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/milli/src/error.rs b/milli/src/error.rs index a5ed47c7d..02691a5cf 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -39,17 +39,17 @@ pub enum InternalError { Fst(#[from] fst::Error), #[error(transparent)] DocumentsError(#[from] documents::Error), - #[error("Invalid compression type have been specified to grenad.")] + #[error("Invalid compression type have been specified to grenad")] GrenadInvalidCompressionType, - #[error("Invalid grenad file with an invalid version format.")] + #[error("Invalid grenad file with an invalid version format")] GrenadInvalidFormatVersion, - #[error("Invalid merge while processing {process}.")] + #[error("Invalid merge while processing {process}")] IndexingMergingKeys { process: &'static str }, #[error("{}", HeedError::InvalidDatabaseTyping)] InvalidDatabaseTyping, #[error(transparent)] RayonThreadPool(#[from] ThreadPoolBuildError), - #[error("A panic occured. Read the logs to find more information about it.")] + #[error("A panic occured. Read the logs to find more information about it")] PanicInThreadPool, #[error(transparent)] SerdeJson(#[from] serde_json::Error), @@ -59,9 +59,9 @@ pub enum InternalError { Store(#[from] MdbError), #[error(transparent)] Utf8(#[from] str::Utf8Error), - #[error("An indexation process was explicitly aborted.")] + #[error("An indexation process was explicitly aborted")] AbortedIndexation, - #[error("The matching words list contains at least one invalid member.")] + #[error("The matching words list contains at least one invalid member")] InvalidMatchingWords, #[error(transparent)] ArroyError(#[from] arroy::Error), From 6247e95dc31ca83b74857b620b3376f930ba9e0e Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 23 Apr 2024 17:42:20 +0200 Subject: [PATCH 30/40] Add benchmark for embeddings --- workloads/movies-subset-hf-embeddings.json | 68 ++++++++++++++++++++ workloads/settings-add-embeddings.json | 72 ++++++++++++++++++++++ 2 files changed, 140 insertions(+) create mode 100644 workloads/movies-subset-hf-embeddings.json create mode 100644 workloads/settings-add-embeddings.json diff --git a/workloads/movies-subset-hf-embeddings.json b/workloads/movies-subset-hf-embeddings.json new file mode 100644 index 000000000..d24bc752c --- /dev/null +++ b/workloads/movies-subset-hf-embeddings.json @@ -0,0 +1,68 @@ +{ + "name": "movies-subset-hf-embeddings", + "run_count": 5, + "extra_cli_args": [ + "--max-indexing-threads=4" + ], + "assets": { + "movies-100.json": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/movies-100.json", + "sha256": "d215e395e4240f12f03b8f1f68901eac82d9e7ded5b462cbf4a6b8efde76c6c6" + } + }, + "commands": [ + { + "route": "experimental-features", + "method": "PATCH", + "body": { + "inline": { + "vectorStore": true + } + }, + "synchronous": "DontWait" + }, + { + "route": "indexes/movies/settings", + "method": "PATCH", + "body": { + "inline": { + "searchableAttributes": [ + "title", + "overview" + ], + "filterableAttributes": [ + "genres", + "release_date" + ], + "sortableAttributes": 
[ + "release_date" + ] + } + }, + "synchronous": "WaitForTask" + }, + { + "route": "indexes/movies/settings", + "method": "PATCH", + "body": { + "inline": { + "embedders": { + "default": { + "source": "huggingFace" + } + } + } + }, + "synchronous": "WaitForTask" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "movies-100.json" + }, + "synchronous": "WaitForTask" + } + ] +} \ No newline at end of file diff --git a/workloads/settings-add-embeddings.json b/workloads/settings-add-embeddings.json new file mode 100644 index 000000000..f87286943 --- /dev/null +++ b/workloads/settings-add-embeddings.json @@ -0,0 +1,72 @@ +{ + "name": "settings-add-embeddings-hf", + "run_count": 5, + "extra_cli_args": [ + "--max-indexing-threads=4" + ], + "assets": { + "movies-100.json": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/movies-100.json", + "sha256": "d215e395e4240f12f03b8f1f68901eac82d9e7ded5b462cbf4a6b8efde76c6c6" + } + }, + "commands": [ + { + "route": "experimental-features", + "method": "PATCH", + "body": { + "inline": { + "vectorStore": true + } + }, + "synchronous": "DontWait" + }, + { + "route": "indexes/movies/settings", + "method": "PATCH", + "body": { + "inline": { + "searchableAttributes": [ + "title", + "overview" + ], + "filterableAttributes": [ + "genres", + "release_date" + ], + "sortableAttributes": [ + "release_date" + ] + } + }, + "synchronous": "DontWait" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "movies-100.json" + }, + "synchronous": "WaitForTask" + }, + { + "route": "indexes/movies/settings", + "method": "PATCH", + "body": { + "inline": { + "embedders": { + "default": { + "source": "huggingFace", + "model": null, + "revision": null, + "documentTemplate": null, + "distribution": null + } + } + } + }, + "synchronous": "WaitForTask" + } + ] +} \ No newline at end of file From 9b765018757fbd07aa36b2a6fe0db97d2e20f9e5 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 24 Apr 2024 12:31:35 +0200 Subject: [PATCH 31/40] Display set API key for Ollama embedder --- milli/src/vector/settings.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/vector/settings.rs b/milli/src/vector/settings.rs index 18a86368f..5707f9533 100644 --- a/milli/src/vector/settings.rs +++ b/milli/src/vector/settings.rs @@ -335,7 +335,7 @@ impl From for EmbeddingSettings { source: Setting::Set(EmbedderSource::Ollama), model: Setting::Set(options.embedding_model.to_owned()), revision: Setting::NotSet, - api_key: Setting::NotSet, + api_key: options.api_key.map(Setting::Set).unwrap_or_default(), dimensions: Setting::NotSet, document_template: Setting::Set(prompt.template), url: Setting::NotSet, From e87cb373de85ae1299d1b745676d835b3eb07cab Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 24 Apr 2024 12:32:34 +0200 Subject: [PATCH 32/40] Avoid intermediate serializing when displaying settings --- meilisearch/src/routes/indexes/settings.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/meilisearch/src/routes/indexes/settings.rs b/meilisearch/src/routes/indexes/settings.rs index 0918444ef..e35ebc930 100644 --- a/meilisearch/src/routes/indexes/settings.rs +++ b/meilisearch/src/routes/indexes/settings.rs @@ -137,10 +137,8 @@ macro_rules! 
make_setting_route { let settings = settings(&index, &rtxn, meilisearch_types::settings::SecretPolicy::HideSecrets)?; debug!(returns = ?settings, "Update settings"); - let mut json = serde_json::json!(&settings); - let val = json[$camelcase_attr].take(); - Ok(HttpResponse::Ok().json(val)) + Ok(HttpResponse::Ok().json(settings.$attr)) } pub fn resources() -> Resource { From d4aeff92d054c7f183d24bf813d4b73988cb15b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 24 Apr 2024 16:40:12 +0200 Subject: [PATCH 33/40] Introduce the ThreadPoolNoAbort wrapper --- meilisearch/src/option.rs | 12 +--- milli/src/error.rs | 5 +- milli/src/lib.rs | 2 + milli/src/thread_pool_no_abort.rs | 69 +++++++++++++++++++ .../extract/extract_vector_points.rs | 4 +- .../src/update/index_documents/extract/mod.rs | 4 +- milli/src/update/index_documents/mod.rs | 25 ++----- milli/src/update/indexer_config.rs | 12 +--- milli/src/vector/error.rs | 3 + milli/src/vector/mod.rs | 3 +- milli/src/vector/ollama.rs | 15 ++-- milli/src/vector/openai.rs | 15 ++-- milli/src/vector/rest.rs | 16 +++-- .../src/processor/firefox_profiler.rs | 4 +- 14 files changed, 129 insertions(+), 60 deletions(-) create mode 100644 milli/src/thread_pool_no_abort.rs diff --git a/meilisearch/src/option.rs b/meilisearch/src/option.rs index d3d9150d6..fed824079 100644 --- a/meilisearch/src/option.rs +++ b/meilisearch/src/option.rs @@ -6,7 +6,6 @@ use std::num::ParseIntError; use std::ops::Deref; use std::path::PathBuf; use std::str::FromStr; -use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Arc; use std::{env, fmt, fs}; @@ -14,6 +13,7 @@ use byte_unit::{Byte, ByteError}; use clap::Parser; use meilisearch_types::features::InstanceTogglableFeatures; use meilisearch_types::milli::update::IndexerConfig; +use meilisearch_types::milli::ThreadPoolNoAbortBuilder; use rustls::server::{ AllowAnyAnonymousOrAuthenticatedClient, AllowAnyAuthenticatedClient, ServerSessionMemoryCache, }; @@ -667,23 +667,15 @@ impl TryFrom<&IndexerOpts> for IndexerConfig { type Error = anyhow::Error; fn try_from(other: &IndexerOpts) -> Result { - let pool_panic_catched = Arc::new(AtomicBool::new(false)); - let thread_pool = rayon::ThreadPoolBuilder::new() + let thread_pool = ThreadPoolNoAbortBuilder::new() .thread_name(|index| format!("indexing-thread:{index}")) .num_threads(*other.max_indexing_threads) - .panic_handler({ - // TODO What should we do with this Box. - // So, let's just set a value to true to cancel the task with a message for now. - let panic_cathed = pool_panic_catched.clone(); - move |_result| panic_cathed.store(true, Ordering::SeqCst) - }) .build()?; Ok(Self { log_every_n: Some(DEFAULT_LOG_EVERY_N), max_memory: other.max_indexing_memory.map(|b| b.get_bytes() as usize), thread_pool: Some(thread_pool), - pool_panic_catched, max_positions_per_attributes: None, skip_index_budget: other.skip_index_budget, ..Default::default() diff --git a/milli/src/error.rs b/milli/src/error.rs index 02691a5cf..e4550de1f 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -9,6 +9,7 @@ use serde_json::Value; use thiserror::Error; use crate::documents::{self, DocumentsBatchCursorError}; +use crate::thread_pool_no_abort::PanicCatched; use crate::{CriterionError, DocumentId, FieldId, Object, SortError}; pub fn is_reserved_keyword(keyword: &str) -> bool { @@ -49,8 +50,8 @@ pub enum InternalError { InvalidDatabaseTyping, #[error(transparent)] RayonThreadPool(#[from] ThreadPoolBuildError), - #[error("A panic occured. 
Read the logs to find more information about it")] - PanicInThreadPool, + #[error(transparent)] + PanicInThreadPool(#[from] PanicCatched), #[error(transparent)] SerdeJson(#[from] serde_json::Error), #[error(transparent)] diff --git a/milli/src/lib.rs b/milli/src/lib.rs index cd44f2f2e..a1e240464 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -21,6 +21,7 @@ pub mod prompt; pub mod proximity; pub mod score_details; mod search; +mod thread_pool_no_abort; pub mod update; pub mod vector; @@ -42,6 +43,7 @@ pub use search::new::{ SearchLogger, VisualSearchLogger, }; use serde_json::Value; +pub use thread_pool_no_abort::{PanicCatched, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder}; pub use {charabia as tokenizer, heed}; pub use self::asc_desc::{AscDesc, AscDescError, Member, SortError}; diff --git a/milli/src/thread_pool_no_abort.rs b/milli/src/thread_pool_no_abort.rs new file mode 100644 index 000000000..14e5b0491 --- /dev/null +++ b/milli/src/thread_pool_no_abort.rs @@ -0,0 +1,69 @@ +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::Arc; + +use rayon::{ThreadPool, ThreadPoolBuilder}; +use thiserror::Error; + +/// A rayon ThreadPool wrapper that can catch panics in the pool +/// and modifies the install function accordingly. +#[derive(Debug)] +pub struct ThreadPoolNoAbort { + thread_pool: ThreadPool, + /// Set to true if the thread pool catched a panic. + pool_catched_panic: Arc, +} + +impl ThreadPoolNoAbort { + pub fn install(&self, op: OP) -> Result + where + OP: FnOnce() -> R + Send, + R: Send, + { + let output = self.thread_pool.install(op); + // While reseting the pool panic catcher we return an error if we catched one. + if self.pool_catched_panic.swap(false, Ordering::SeqCst) { + Err(PanicCatched) + } else { + Ok(output) + } + } + + pub fn current_num_threads(&self) -> usize { + self.thread_pool.current_num_threads() + } +} + +#[derive(Error, Debug)] +#[error("A panic occured. 
Read the logs to find more information about it")] +pub struct PanicCatched; + +#[derive(Default)] +pub struct ThreadPoolNoAbortBuilder(ThreadPoolBuilder); + +impl ThreadPoolNoAbortBuilder { + pub fn new() -> ThreadPoolNoAbortBuilder { + ThreadPoolNoAbortBuilder::default() + } + + pub fn thread_name(mut self, closure: F) -> Self + where + F: FnMut(usize) -> String + 'static, + { + self.0 = self.0.thread_name(closure); + self + } + + pub fn num_threads(mut self, num_threads: usize) -> ThreadPoolNoAbortBuilder { + self.0 = self.0.num_threads(num_threads); + self + } + + pub fn build(mut self) -> Result { + let pool_catched_panic = Arc::new(AtomicBool::new(false)); + self.0 = self.0.panic_handler({ + let catched_panic = pool_catched_panic.clone(); + move |_result| catched_panic.store(true, Ordering::SeqCst) + }); + Ok(ThreadPoolNoAbort { thread_pool: self.0.build()?, pool_catched_panic }) + } +} diff --git a/milli/src/update/index_documents/extract/extract_vector_points.rs b/milli/src/update/index_documents/extract/extract_vector_points.rs index 23f945c7a..9d0e7e360 100644 --- a/milli/src/update/index_documents/extract/extract_vector_points.rs +++ b/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -19,7 +19,7 @@ use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; use crate::update::index_documents::helpers::try_split_at; use crate::update::settings::InnerIndexSettingsDiff; use crate::vector::Embedder; -use crate::{DocumentId, InternalError, Result, VectorOrArrayOfVectors}; +use crate::{DocumentId, InternalError, Result, ThreadPoolNoAbort, VectorOrArrayOfVectors}; /// The length of the elements that are always in the buffer when inserting new values. const TRUNCATE_SIZE: usize = size_of::(); @@ -347,7 +347,7 @@ pub fn extract_embeddings( prompt_reader: grenad::Reader, indexer: GrenadParameters, embedder: Arc, - request_threads: &rayon::ThreadPool, + request_threads: &ThreadPoolNoAbort, ) -> Result>> { puffin::profile_function!(); let n_chunks = embedder.chunk_count_hint(); // chunk level parallelism diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index bc6fe2aff..573e0898a 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -31,7 +31,7 @@ use self::extract_word_position_docids::extract_word_position_docids; use super::helpers::{as_cloneable_grenad, CursorClonableMmap, GrenadParameters}; use super::{helpers, TypedChunk}; use crate::update::settings::InnerIndexSettingsDiff; -use crate::{FieldId, Result}; +use crate::{FieldId, Result, ThreadPoolNoAbortBuilder}; /// Extract data for each databases from obkv documents in parallel. /// Send data in grenad file over provided Sender. 
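The ThreadPoolNoAbort wrapper introduced in this patch behaves like a plain Rayon pool, except that install() reports a panic caught inside the pool as Err(PanicCatched) instead of letting it take the whole process down, which is what lets the indexing code turn a worker panic into a task error. A minimal usage sketch follows; the builder, install() and PanicCatched are the items added by this patch, while the surrounding function, pool size and data are made up for illustration.

    // Sketch only: exercises the API added in this patch under assumed inputs.
    use milli::{PanicCatched, ThreadPoolNoAbortBuilder};
    use rayon::prelude::*;

    fn sum_in_parallel(values: &[u64]) -> Result<u64, PanicCatched> {
        // Built like a regular rayon::ThreadPoolBuilder; the wrapper adds a
        // panic handler that only records that a panic occurred in the pool.
        let pool = ThreadPoolNoAbortBuilder::new()
            .thread_name(|i| format!("example-thread:{i}"))
            .num_threads(4)
            .build()
            .expect("failed to build the thread pool");

        // install() checks that flag after running the closure and returns
        // Err(PanicCatched) if a panic was recorded, so the caller can map it
        // to an internal error instead of aborting.
        pool.install(|| values.par_iter().copied().sum::<u64>())
    }
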
@@ -229,7 +229,7 @@ fn send_original_documents_data( let documents_chunk_cloned = original_documents_chunk.clone(); let lmdb_writer_sx_cloned = lmdb_writer_sx.clone(); - let request_threads = rayon::ThreadPoolBuilder::new() + let request_threads = ThreadPoolNoAbortBuilder::new() .num_threads(crate::vector::REQUEST_PARALLELISM) .thread_name(|index| format!("embedding-request-{index}")) .build()?; diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index c61c83757..bb180a7ee 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -8,7 +8,6 @@ use std::collections::{HashMap, HashSet}; use std::io::{Read, Seek}; use std::num::NonZeroU32; use std::result::Result as StdResult; -use std::sync::atomic::Ordering; use std::sync::Arc; use crossbeam_channel::{Receiver, Sender}; @@ -34,6 +33,7 @@ use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; pub use self::transform::{Transform, TransformOutput}; use crate::documents::{obkv_to_object, DocumentsBatchReader}; use crate::error::{Error, InternalError, UserError}; +use crate::thread_pool_no_abort::ThreadPoolNoAbortBuilder; pub use crate::update::index_documents::helpers::CursorClonableMmap; use crate::update::{ IndexerConfig, UpdateIndexingStep, WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst, @@ -297,17 +297,13 @@ where let settings_diff = Arc::new(settings_diff); let backup_pool; - let pool_catched_panic = self.indexer_config.pool_panic_catched.clone(); let pool = match self.indexer_config.thread_pool { Some(ref pool) => pool, None => { // We initialize a backup pool with the default // settings if none have already been set. - let mut pool_builder = rayon::ThreadPoolBuilder::new(); - pool_builder = pool_builder.panic_handler({ - let catched_panic = pool_catched_panic.clone(); - move |_result| catched_panic.store(true, Ordering::SeqCst) - }); + #[allow(unused_mut)] + let mut pool_builder = ThreadPoolNoAbortBuilder::new(); #[cfg(test)] { @@ -538,12 +534,7 @@ where } Ok(()) - })?; - - // While reseting the pool panic catcher we return an error if we catched one. - if pool_catched_panic.swap(false, Ordering::SeqCst) { - return Err(InternalError::PanicInThreadPool.into()); - } + }).map_err(InternalError::from)??; // We write the field distribution into the main database self.index.put_field_distribution(self.wtxn, &field_distribution)?; @@ -572,12 +563,8 @@ where writer.build(wtxn, &mut rng, None)?; } Result::Ok(()) - })?; - - // While reseting the pool panic catcher we return an error if we catched one. 
- if pool_catched_panic.swap(false, Ordering::SeqCst) { - return Err(InternalError::PanicInThreadPool.into()); - } + }) + .map_err(InternalError::from)??; } self.execute_prefix_databases( diff --git a/milli/src/update/indexer_config.rs b/milli/src/update/indexer_config.rs index b23d8e700..115059a1d 100644 --- a/milli/src/update/indexer_config.rs +++ b/milli/src/update/indexer_config.rs @@ -1,8 +1,6 @@ -use std::sync::atomic::AtomicBool; -use std::sync::Arc; - use grenad::CompressionType; -use rayon::ThreadPool; + +use crate::thread_pool_no_abort::ThreadPoolNoAbort; #[derive(Debug)] pub struct IndexerConfig { @@ -12,10 +10,7 @@ pub struct IndexerConfig { pub max_memory: Option, pub chunk_compression_type: CompressionType, pub chunk_compression_level: Option, - pub thread_pool: Option, - /// Set to true if the thread pool catched a panic - /// and we must abort the task - pub pool_panic_catched: Arc, + pub thread_pool: Option, pub max_positions_per_attributes: Option, pub skip_index_budget: bool, } @@ -30,7 +25,6 @@ impl Default for IndexerConfig { chunk_compression_type: CompressionType::None, chunk_compression_level: None, thread_pool: None, - pool_panic_catched: Arc::default(), max_positions_per_attributes: None, skip_index_budget: false, } diff --git a/milli/src/vector/error.rs b/milli/src/vector/error.rs index d3369ef3d..650e1171e 100644 --- a/milli/src/vector/error.rs +++ b/milli/src/vector/error.rs @@ -3,6 +3,7 @@ use std::path::PathBuf; use hf_hub::api::sync::ApiError; use crate::error::FaultSource; +use crate::PanicCatched; #[derive(Debug, thiserror::Error)] #[error("Error while generating embeddings: {inner}")] @@ -80,6 +81,8 @@ pub enum EmbedErrorKind { OpenAiUnexpectedDimension(usize, usize), #[error("no embedding was produced")] MissingEmbedding, + #[error(transparent)] + PanicInThreadPool(#[from] PanicCatched), } impl EmbedError { diff --git a/milli/src/vector/mod.rs b/milli/src/vector/mod.rs index 58f7ba5e1..306c1c1e9 100644 --- a/milli/src/vector/mod.rs +++ b/milli/src/vector/mod.rs @@ -7,6 +7,7 @@ use serde::{Deserialize, Serialize}; use self::error::{EmbedError, NewEmbedderError}; use crate::prompt::{Prompt, PromptData}; +use crate::ThreadPoolNoAbort; pub mod error; pub mod hf; @@ -254,7 +255,7 @@ impl Embedder { pub fn embed_chunks( &self, text_chunks: Vec>, - threads: &rayon::ThreadPool, + threads: &ThreadPoolNoAbort, ) -> std::result::Result>>, EmbedError> { match self { Embedder::HuggingFace(embedder) => embedder.embed_chunks(text_chunks), diff --git a/milli/src/vector/ollama.rs b/milli/src/vector/ollama.rs index cf5030fb4..2c29cc816 100644 --- a/milli/src/vector/ollama.rs +++ b/milli/src/vector/ollama.rs @@ -3,6 +3,8 @@ use rayon::iter::{IntoParallelIterator as _, ParallelIterator as _}; use super::error::{EmbedError, EmbedErrorKind, NewEmbedderError, NewEmbedderErrorKind}; use super::rest::{Embedder as RestEmbedder, EmbedderOptions as RestEmbedderOptions}; use super::{DistributionShift, Embeddings}; +use crate::error::FaultSource; +use crate::ThreadPoolNoAbort; #[derive(Debug)] pub struct Embedder { @@ -71,11 +73,16 @@ impl Embedder { pub fn embed_chunks( &self, text_chunks: Vec>, - threads: &rayon::ThreadPool, + threads: &ThreadPoolNoAbort, ) -> Result>>, EmbedError> { - threads.install(move || { - text_chunks.into_par_iter().map(move |chunk| self.embed(chunk)).collect() - }) + threads + .install(move || { + text_chunks.into_par_iter().map(move |chunk| self.embed(chunk)).collect() + }) + .map_err(|error| EmbedError { + kind: EmbedErrorKind::PanicInThreadPool(error), 
+ fault: FaultSource::Bug, + })? } pub fn chunk_count_hint(&self) -> usize { diff --git a/milli/src/vector/openai.rs b/milli/src/vector/openai.rs index 141de486b..e180aedaa 100644 --- a/milli/src/vector/openai.rs +++ b/milli/src/vector/openai.rs @@ -4,7 +4,9 @@ use rayon::iter::{IntoParallelIterator, ParallelIterator as _}; use super::error::{EmbedError, NewEmbedderError}; use super::rest::{Embedder as RestEmbedder, EmbedderOptions as RestEmbedderOptions}; use super::{DistributionShift, Embeddings}; +use crate::error::FaultSource; use crate::vector::error::EmbedErrorKind; +use crate::ThreadPoolNoAbort; #[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)] pub struct EmbedderOptions { @@ -241,11 +243,16 @@ impl Embedder { pub fn embed_chunks( &self, text_chunks: Vec>, - threads: &rayon::ThreadPool, + threads: &ThreadPoolNoAbort, ) -> Result>>, EmbedError> { - threads.install(move || { - text_chunks.into_par_iter().map(move |chunk| self.embed(chunk)).collect() - }) + threads + .install(move || { + text_chunks.into_par_iter().map(move |chunk| self.embed(chunk)).collect() + }) + .map_err(|error| EmbedError { + kind: EmbedErrorKind::PanicInThreadPool(error), + fault: FaultSource::Bug, + })? } pub fn chunk_count_hint(&self) -> usize { diff --git a/milli/src/vector/rest.rs b/milli/src/vector/rest.rs index b0ea07f82..60f54782e 100644 --- a/milli/src/vector/rest.rs +++ b/milli/src/vector/rest.rs @@ -2,9 +2,12 @@ use deserr::Deserr; use rayon::iter::{IntoParallelIterator as _, ParallelIterator as _}; use serde::{Deserialize, Serialize}; +use super::error::EmbedErrorKind; use super::{ DistributionShift, EmbedError, Embedding, Embeddings, NewEmbedderError, REQUEST_PARALLELISM, }; +use crate::error::FaultSource; +use crate::ThreadPoolNoAbort; // retrying in case of failure @@ -158,11 +161,16 @@ impl Embedder { pub fn embed_chunks( &self, text_chunks: Vec>, - threads: &rayon::ThreadPool, + threads: &ThreadPoolNoAbort, ) -> Result>>, EmbedError> { - threads.install(move || { - text_chunks.into_par_iter().map(move |chunk| self.embed(chunk)).collect() - }) + threads + .install(move || { + text_chunks.into_par_iter().map(move |chunk| self.embed(chunk)).collect() + }) + .map_err(|error| EmbedError { + kind: EmbedErrorKind::PanicInThreadPool(error), + fault: FaultSource::Bug, + })? 
} pub fn chunk_count_hint(&self) -> usize { diff --git a/tracing-trace/src/processor/firefox_profiler.rs b/tracing-trace/src/processor/firefox_profiler.rs index bae8ea44a..da3380e5c 100644 --- a/tracing-trace/src/processor/firefox_profiler.rs +++ b/tracing-trace/src/processor/firefox_profiler.rs @@ -217,9 +217,7 @@ fn add_memory_samples( memory_counters: &mut Option, last_memory: &mut MemoryStats, ) -> Option { - let Some(stats) = memory else { - return None; - }; + let stats = memory?; let memory_counters = memory_counters.get_or_insert_with(|| MemoryCounterHandles::new(profile, main)); From 7468c1cf8d899045e70710e366ddcca046170d08 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 24 Apr 2024 18:15:03 +0200 Subject: [PATCH 34/40] Introduce WildcardSetting that are serialized as wildcards by default --- dump/src/lib.rs | 4 +- dump/src/reader/compat/v5_to_v6.rs | 4 +- .../after_registering_settings_task.snap | 2 +- .../settings_update_processed.snap | 2 +- meilisearch-types/src/settings.rs | 90 ++++++++++++------- meilisearch/src/routes/indexes/settings.rs | 2 +- 6 files changed, 67 insertions(+), 37 deletions(-) diff --git a/dump/src/lib.rs b/dump/src/lib.rs index a7af2d5d0..42cb0e444 100644 --- a/dump/src/lib.rs +++ b/dump/src/lib.rs @@ -256,8 +256,8 @@ pub(crate) mod test { pub fn create_test_settings() -> Settings { let settings = Settings { - displayed_attributes: Setting::Set(vec![S("race"), S("name")]), - searchable_attributes: Setting::Set(vec![S("name"), S("race")]), + displayed_attributes: Setting::Set(vec![S("race"), S("name")]).into(), + searchable_attributes: Setting::Set(vec![S("name"), S("race")]).into(), filterable_attributes: Setting::Set(btreeset! { S("race"), S("age") }), sortable_attributes: Setting::Set(btreeset! { S("age") }), ranking_rules: Setting::NotSet, diff --git a/dump/src/reader/compat/v5_to_v6.rs b/dump/src/reader/compat/v5_to_v6.rs index a883f0ba0..c022507ca 100644 --- a/dump/src/reader/compat/v5_to_v6.rs +++ b/dump/src/reader/compat/v5_to_v6.rs @@ -315,8 +315,8 @@ impl From for v6::ResponseError { impl From> for v6::Settings { fn from(settings: v5::Settings) -> Self { v6::Settings { - displayed_attributes: settings.displayed_attributes.into(), - searchable_attributes: settings.searchable_attributes.into(), + displayed_attributes: v6::Setting::from(settings.displayed_attributes).into(), + searchable_attributes: v6::Setting::from(settings.searchable_attributes).into(), filterable_attributes: settings.filterable_attributes.into(), sortable_attributes: settings.sortable_attributes.into(), ranking_rules: { diff --git a/index-scheduler/src/snapshots/lib.rs/test_settings_update/after_registering_settings_task.snap b/index-scheduler/src/snapshots/lib.rs/test_settings_update/after_registering_settings_task.snap index f3b94fb3c..dad082667 100644 --- a/index-scheduler/src/snapshots/lib.rs/test_settings_update/after_registering_settings_task.snap +++ b/index-scheduler/src/snapshots/lib.rs/test_settings_update/after_registering_settings_task.snap @@ -6,7 +6,7 @@ source: index-scheduler/src/lib.rs [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: NotSet, searchable_attributes: NotSet, filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, 
typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: NotSet, searchable_attributes: NotSet, filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: enqueued [0,] diff --git a/index-scheduler/src/snapshots/lib.rs/test_settings_update/settings_update_processed.snap b/index-scheduler/src/snapshots/lib.rs/test_settings_update/settings_update_processed.snap index 830331f61..271db8765 100644 --- 
a/index-scheduler/src/snapshots/lib.rs/test_settings_update/settings_update_processed.snap +++ b/index-scheduler/src/snapshots/lib.rs/test_settings_update/settings_update_processed.snap @@ -6,7 +6,7 @@ source: index-scheduler/src/lib.rs [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: NotSet, searchable_attributes: NotSet, filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: NotSet, searchable_attributes: NotSet, filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super 
secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: enqueued [] diff --git a/meilisearch-types/src/settings.rs b/meilisearch-types/src/settings.rs index ce3a74d69..10995df42 100644 --- a/meilisearch-types/src/settings.rs +++ b/meilisearch-types/src/settings.rs @@ -3,7 +3,7 @@ use std::convert::Infallible; use std::fmt; use std::marker::PhantomData; use std::num::NonZeroUsize; -use std::ops::ControlFlow; +use std::ops::{ControlFlow, Deref}; use std::str::FromStr; use deserr::{DeserializeError, Deserr, ErrorKind, MergeWithError, ValuePointerRef}; @@ -143,21 +143,13 @@ impl MergeWithError for DeserrJsonError { - #[serde( - default, - serialize_with = "serialize_with_wildcard", - skip_serializing_if = "Setting::is_not_set" - )] + #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default, error = DeserrJsonError)] - pub displayed_attributes: Setting>, + pub displayed_attributes: WildcardSetting, - #[serde( - default, - serialize_with = "serialize_with_wildcard", - skip_serializing_if = "Setting::is_not_set" - )] + #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default, error = DeserrJsonError)] - pub searchable_attributes: Setting>, + pub searchable_attributes: WildcardSetting, #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default, error = DeserrJsonError)] @@ -251,8 +243,8 @@ impl Settings { impl Settings { pub fn cleared() -> Settings { Settings { - displayed_attributes: Setting::Reset, - searchable_attributes: Setting::Reset, + displayed_attributes: Setting::Reset.into(), + searchable_attributes: Setting::Reset.into(), filterable_attributes: Setting::Reset, sortable_attributes: Setting::Reset, ranking_rules: Setting::Reset, @@ -319,7 +311,7 @@ impl Settings { impl Settings { pub fn check(self) -> Settings { - let displayed_attributes = match self.displayed_attributes { + let displayed_attributes = match self.displayed_attributes.0 { Setting::Set(fields) => { if fields.iter().any(|f| f == "*") { Setting::Reset @@ -330,7 +322,7 @@ impl Settings { otherwise => otherwise, }; - let searchable_attributes = match self.searchable_attributes { + let searchable_attributes = match self.searchable_attributes.0 { Setting::Set(fields) => { if fields.iter().any(|f| f == "*") { Setting::Reset @@ -342,8 +334,8 @@ impl Settings { }; Settings { - displayed_attributes, - searchable_attributes, + displayed_attributes: displayed_attributes.into(), + searchable_attributes: searchable_attributes.into(), filterable_attributes: self.filterable_attributes, sortable_attributes: self.sortable_attributes, ranking_rules: self.ranking_rules, @@ -412,13 +404,13 @@ pub fn apply_settings_to_builder( _kind, } = settings; - match searchable_attributes { + match searchable_attributes.deref() { Setting::Set(ref names) => builder.set_searchable_fields(names.clone()), Setting::Reset => builder.reset_searchable_fields(), Setting::NotSet => (), } - match displayed_attributes { + match displayed_attributes.deref() { Setting::Set(ref names) => builder.set_displayed_fields(names.clone()), Setting::Reset => builder.reset_displayed_fields(), Setting::NotSet => (), @@ -690,11 +682,13 @@ pub fn settings( 
displayed_attributes: match displayed_attributes { Some(attrs) => Setting::Set(attrs), None => Setting::Reset, - }, + } + .into(), searchable_attributes: match searchable_attributes { - Some(attrs) => Setting::Set(attrs), + Some(attrs) => Setting::Set(attrs).into(), None => Setting::Reset, - }, + } + .into(), filterable_attributes: Setting::Set(filterable_attributes), sortable_attributes: Setting::Set(sortable_attributes), ranking_rules: Setting::Set(criteria.iter().map(|c| c.clone().into()).collect()), @@ -848,6 +842,41 @@ impl From for ProximityPrecision { } } +#[derive(Debug, Clone, Default, Deserialize, PartialEq, Eq)] +pub struct WildcardSetting(Setting>); + +impl From>> for WildcardSetting { + fn from(setting: Setting>) -> Self { + Self(setting) + } +} + +impl Serialize for WildcardSetting { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + serialize_with_wildcard(&self.0, serializer) + } +} + +impl Deserr for WildcardSetting { + fn deserialize_from_value( + value: deserr::Value, + location: ValuePointerRef<'_>, + ) -> Result { + Ok(Self(Setting::deserialize_from_value(value, location)?)) + } +} + +impl std::ops::Deref for WildcardSetting { + type Target = Setting>; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + #[cfg(test)] pub(crate) mod test { use super::*; @@ -856,8 +885,8 @@ pub(crate) mod test { fn test_setting_check() { // test no changes let settings = Settings { - displayed_attributes: Setting::Set(vec![String::from("hello")]), - searchable_attributes: Setting::Set(vec![String::from("hello")]), + displayed_attributes: Setting::Set(vec![String::from("hello")]).into(), + searchable_attributes: Setting::Set(vec![String::from("hello")]).into(), filterable_attributes: Setting::NotSet, sortable_attributes: Setting::NotSet, ranking_rules: Setting::NotSet, @@ -883,8 +912,9 @@ pub(crate) mod test { // test wildcard // test no changes let settings = Settings { - displayed_attributes: Setting::Set(vec![String::from("*")]), - searchable_attributes: Setting::Set(vec![String::from("hello"), String::from("*")]), + displayed_attributes: Setting::Set(vec![String::from("*")]).into(), + searchable_attributes: Setting::Set(vec![String::from("hello"), String::from("*")]) + .into(), filterable_attributes: Setting::NotSet, sortable_attributes: Setting::NotSet, ranking_rules: Setting::NotSet, @@ -904,7 +934,7 @@ pub(crate) mod test { }; let checked = settings.check(); - assert_eq!(checked.displayed_attributes, Setting::Reset); - assert_eq!(checked.searchable_attributes, Setting::Reset); + assert_eq!(checked.displayed_attributes, Setting::Reset.into()); + assert_eq!(checked.searchable_attributes, Setting::Reset.into()); } } diff --git a/meilisearch/src/routes/indexes/settings.rs b/meilisearch/src/routes/indexes/settings.rs index e35ebc930..4fcd16902 100644 --- a/meilisearch/src/routes/indexes/settings.rs +++ b/meilisearch/src/routes/indexes/settings.rs @@ -138,7 +138,7 @@ macro_rules! 
make_setting_route { debug!(returns = ?settings, "Update settings"); - Ok(HttpResponse::Ok().json(settings.$attr)) + Ok(HttpResponse::Ok().json(dbg!(settings.$attr))) } pub fn resources() -> Resource { From dbcf50589b5997aac3895af93cfe15f96647c911 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 25 Apr 2024 10:36:10 +0200 Subject: [PATCH 35/40] Fix clippy --- meilisearch-types/src/settings.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/meilisearch-types/src/settings.rs b/meilisearch-types/src/settings.rs index 10995df42..223d71658 100644 --- a/meilisearch-types/src/settings.rs +++ b/meilisearch-types/src/settings.rs @@ -685,7 +685,7 @@ pub fn settings( } .into(), searchable_attributes: match searchable_attributes { - Some(attrs) => Setting::Set(attrs).into(), + Some(attrs) => Setting::Set(attrs), None => Setting::Reset, } .into(), From cbbfff3594c14fcfe4265123d49f2ec5bad7730a Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 25 Apr 2024 10:37:18 +0200 Subject: [PATCH 36/40] Remove debuging prints --- meilisearch/src/routes/indexes/settings.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/meilisearch/src/routes/indexes/settings.rs b/meilisearch/src/routes/indexes/settings.rs index 4fcd16902..e35ebc930 100644 --- a/meilisearch/src/routes/indexes/settings.rs +++ b/meilisearch/src/routes/indexes/settings.rs @@ -138,7 +138,7 @@ macro_rules! make_setting_route { debug!(returns = ?settings, "Update settings"); - Ok(HttpResponse::Ok().json(dbg!(settings.$attr))) + Ok(HttpResponse::Ok().json(settings.$attr)) } pub fn resources() -> Resource { From 88174b8ae40fba726b0039ff407e3b048d9bd878 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 30 Apr 2024 14:30:23 +0200 Subject: [PATCH 37/40] Update charabia v0.8.10 --- meilisearch-types/Cargo.toml | 2 ++ meilisearch/Cargo.toml | 1 + milli/Cargo.toml | 6 +++++- 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/meilisearch-types/Cargo.toml b/meilisearch-types/Cargo.toml index 1973a2034..6d23f144a 100644 --- a/meilisearch-types/Cargo.toml +++ b/meilisearch-types/Cargo.toml @@ -57,3 +57,5 @@ greek = ["milli/greek"] khmer = ["milli/khmer"] # allow vietnamese specialized tokenization vietnamese = ["milli/vietnamese"] +# force swedish character recomposition +swedish-recomposition = ["milli/swedish-recomposition"] diff --git a/meilisearch/Cargo.toml b/meilisearch/Cargo.toml index 6b8db4144..4a2b11b21 100644 --- a/meilisearch/Cargo.toml +++ b/meilisearch/Cargo.toml @@ -156,6 +156,7 @@ thai = ["meilisearch-types/thai"] greek = ["meilisearch-types/greek"] khmer = ["meilisearch-types/khmer"] vietnamese = ["meilisearch-types/vietnamese"] +swedish-recomposition = ["meilisearch-types/swedish-recomposition"] [package.metadata.mini-dashboard] assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.13/build.zip" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 9423a854e..082cd0812 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -17,7 +17,7 @@ bincode = "1.3.3" bstr = "1.9.0" bytemuck = { version = "1.14.0", features = ["extern_crate_alloc"] } byteorder = "1.5.0" -charabia = { version = "0.8.9", default-features = false } +charabia = { version = "0.8.10", default-features = false } concat-arrays = "0.1.2" crossbeam-channel = "0.5.11" deserr = "0.6.1" @@ -136,7 +136,11 @@ greek = ["charabia/greek"] # allow khmer specialized tokenization khmer = ["charabia/khmer"] +# allow vietnamese specialized tokenization vietnamese = ["charabia/vietnamese"] +# force swedish 
character recomposition +swedish-recomposition = ["charabia/swedish-recomposition"] + # allow CUDA support, see cuda = ["candle-core/cuda"] From fe51ceca6dea55ff3eea89660afa51a76ca8b893 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 30 Apr 2024 14:33:37 +0200 Subject: [PATCH 38/40] Update lock file --- Cargo.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e2ddfd2ed..fad60e8da 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -889,9 +889,9 @@ dependencies = [ [[package]] name = "charabia" -version = "0.8.9" +version = "0.8.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6a65052f308636e5d5e1777f0dbc07919f5fbac24b6c8ad3e140472e5520de9" +checksum = "933f20f2269b24d32fd5503e7b3c268af902190daf8d9d2b73ed2e75d77c00b4" dependencies = [ "aho-corasick", "cow-utils", From f4dd73ec8ce6e29d23e3c86ca40de57fb935dcbd Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 2 May 2024 15:39:36 +0200 Subject: [PATCH 39/40] Destructure EmbedderOptions so we don't miss some options --- milli/src/vector/settings.rs | 51 ++++++++++++++++++++++++------------ 1 file changed, 34 insertions(+), 17 deletions(-) diff --git a/milli/src/vector/settings.rs b/milli/src/vector/settings.rs index 5707f9533..e786a7164 100644 --- a/milli/src/vector/settings.rs +++ b/milli/src/vector/settings.rs @@ -301,10 +301,14 @@ impl From for EmbeddingSettings { fn from(value: EmbeddingConfig) -> Self { let EmbeddingConfig { embedder_options, prompt } = value; match embedder_options { - super::EmbedderOptions::HuggingFace(options) => Self { + super::EmbedderOptions::HuggingFace(super::hf::EmbedderOptions { + model, + revision, + distribution, + }) => Self { source: Setting::Set(EmbedderSource::HuggingFace), - model: Setting::Set(options.model), - revision: options.revision.map(Setting::Set).unwrap_or_default(), + model: Setting::Set(model), + revision: revision.map(Setting::Set).unwrap_or_default(), api_key: Setting::NotSet, dimensions: Setting::NotSet, document_template: Setting::Set(prompt.template), @@ -314,14 +318,19 @@ impl From for EmbeddingSettings { path_to_embeddings: Setting::NotSet, embedding_object: Setting::NotSet, input_type: Setting::NotSet, - distribution: options.distribution.map(Setting::Set).unwrap_or_default(), + distribution: distribution.map(Setting::Set).unwrap_or_default(), }, - super::EmbedderOptions::OpenAi(options) => Self { + super::EmbedderOptions::OpenAi(super::openai::EmbedderOptions { + api_key, + embedding_model, + dimensions, + distribution, + }) => Self { source: Setting::Set(EmbedderSource::OpenAi), - model: Setting::Set(options.embedding_model.name().to_owned()), + model: Setting::Set(embedding_model.name().to_owned()), revision: Setting::NotSet, - api_key: options.api_key.map(Setting::Set).unwrap_or_default(), - dimensions: options.dimensions.map(Setting::Set).unwrap_or_default(), + api_key: api_key.map(Setting::Set).unwrap_or_default(), + dimensions: dimensions.map(Setting::Set).unwrap_or_default(), document_template: Setting::Set(prompt.template), url: Setting::NotSet, query: Setting::NotSet, @@ -329,29 +338,37 @@ impl From for EmbeddingSettings { path_to_embeddings: Setting::NotSet, embedding_object: Setting::NotSet, input_type: Setting::NotSet, - distribution: options.distribution.map(Setting::Set).unwrap_or_default(), + distribution: distribution.map(Setting::Set).unwrap_or_default(), }, - super::EmbedderOptions::Ollama(options) => Self { + super::EmbedderOptions::Ollama(super::ollama::EmbedderOptions { + 
embedding_model, + url, + api_key, + distribution, + }) => Self { source: Setting::Set(EmbedderSource::Ollama), - model: Setting::Set(options.embedding_model.to_owned()), + model: Setting::Set(embedding_model), revision: Setting::NotSet, - api_key: options.api_key.map(Setting::Set).unwrap_or_default(), + api_key: api_key.map(Setting::Set).unwrap_or_default(), dimensions: Setting::NotSet, document_template: Setting::Set(prompt.template), - url: Setting::NotSet, + url: url.map(Setting::Set).unwrap_or_default(), query: Setting::NotSet, input_field: Setting::NotSet, path_to_embeddings: Setting::NotSet, embedding_object: Setting::NotSet, input_type: Setting::NotSet, - distribution: options.distribution.map(Setting::Set).unwrap_or_default(), + distribution: distribution.map(Setting::Set).unwrap_or_default(), }, - super::EmbedderOptions::UserProvided(options) => Self { + super::EmbedderOptions::UserProvided(super::manual::EmbedderOptions { + dimensions, + distribution, + }) => Self { source: Setting::Set(EmbedderSource::UserProvided), model: Setting::NotSet, revision: Setting::NotSet, api_key: Setting::NotSet, - dimensions: Setting::Set(options.dimensions), + dimensions: Setting::Set(dimensions), document_template: Setting::NotSet, url: Setting::NotSet, query: Setting::NotSet, @@ -359,7 +376,7 @@ impl From for EmbeddingSettings { path_to_embeddings: Setting::NotSet, embedding_object: Setting::NotSet, input_type: Setting::NotSet, - distribution: options.distribution.map(Setting::Set).unwrap_or_default(), + distribution: distribution.map(Setting::Set).unwrap_or_default(), }, super::EmbedderOptions::Rest(super::rest::EmbedderOptions { api_key, From 5a305bfdea0ec1e47e406797c0d4ca81da377060 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 2 May 2024 16:14:37 +0200 Subject: [PATCH 40/40] Remove unused struct --- meilisearch/src/routes/mod.rs | 6 ------ 1 file changed, 6 deletions(-) diff --git a/meilisearch/src/routes/mod.rs b/meilisearch/src/routes/mod.rs index 0e6ab7765..c25aeee70 100644 --- a/meilisearch/src/routes/mod.rs +++ b/meilisearch/src/routes/mod.rs @@ -367,12 +367,6 @@ async fn get_version( }) } -#[derive(Serialize)] -struct KeysResponse { - private: Option, - public: Option, -} - pub async fn get_health( index_scheduler: Data, auth_controller: Data,