meilisearch/meilisearch-lib/src/index/updates.rs

517 lines
18 KiB
Rust
Raw Normal View History

use std::collections::{BTreeMap, BTreeSet};
2021-05-10 23:30:09 +08:00
use std::marker::PhantomData;
2021-05-12 23:04:24 +08:00
use std::num::NonZeroUsize;
2021-03-04 18:56:32 +08:00
2021-06-23 16:41:55 +08:00
use log::{debug, info, trace};
2021-09-15 00:39:02 +08:00
use milli::documents::DocumentBatchReader;
use milli::update::{
2022-01-19 18:21:19 +08:00
DocumentAdditionResult, DocumentDeletionResult, IndexDocumentsConfig, IndexDocumentsMethod,
Setting,
};
2021-05-12 23:04:24 +08:00
use serde::{Deserialize, Serialize, Serializer};
2021-09-15 00:39:02 +08:00
use uuid::Uuid;
2021-03-04 18:56:32 +08:00
2021-10-27 01:36:48 +08:00
use super::error::Result;
2021-10-04 18:15:21 +08:00
use super::index::{Index, IndexMeta};
use crate::update_file_store::UpdateFileStore;
fn serialize_with_wildcard<S>(
2021-08-25 02:55:29 +08:00
field: &Setting<Vec<String>>,
s: S,
) -> std::result::Result<S::Ok, S::Error>
2021-05-12 23:04:24 +08:00
where
S: Serializer,
{
let wildcard = vec!["*".to_string()];
2021-08-25 02:55:29 +08:00
match field {
Setting::Set(value) => Some(value),
Setting::Reset => Some(&wildcard),
Setting::NotSet => None,
}
.serialize(s)
2021-05-12 23:04:24 +08:00
}
2021-03-04 18:56:32 +08:00
#[derive(Clone, Default, Debug, Serialize, PartialEq)]
2021-05-10 23:30:09 +08:00
pub struct Checked;
2021-08-25 02:55:29 +08:00
#[derive(Clone, Default, Debug, Serialize, Deserialize, PartialEq)]
2021-05-10 23:30:09 +08:00
pub struct Unchecked;
#[cfg_attr(test, derive(proptest_derive::Arbitrary))]
2022-03-17 18:59:35 +08:00
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq)]
#[serde(deny_unknown_fields)]
#[serde(rename_all = "camelCase")]
pub struct MinWordSizeTyposSetting {
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
pub one_typo: Setting<u8>,
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
pub two_typos: Setting<u8>,
}
2022-03-17 18:59:35 +08:00
#[cfg_attr(test, derive(proptest_derive::Arbitrary))]
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq)]
#[serde(deny_unknown_fields)]
#[serde(rename_all = "camelCase")]
2022-03-17 18:59:35 +08:00
pub struct TypoSettings {
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
pub enabled: Setting<bool>,
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
pub min_word_size_for_typos: Setting<MinWordSizeTyposSetting>,
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
pub disable_on_words: Setting<BTreeSet<String>>,
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
pub disable_on_attributes: Setting<BTreeSet<String>>,
2022-03-17 18:59:35 +08:00
}
/// Faceting settings of an index.
///
/// Serialized with camelCase keys; unknown keys are rejected on deserialization and fields that
/// are `Setting::NotSet` are omitted on serialization.
#[cfg_attr(test, derive(proptest_derive::Arbitrary))]
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq)]
#[serde(deny_unknown_fields, rename_all = "camelCase")]
pub struct FacetingSettings {
    /// Value forwarded to the builder's `set_max_values_per_facet` when applied.
    #[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
    #[serde(default, skip_serializing_if = "Setting::is_not_set")]
    pub max_values_per_facet: Setting<usize>,
}
2021-09-22 21:07:04 +08:00
/// Holds all the settings for an index. `T` can either be `Checked` if they represents settings
/// whose validity is guaranteed, or `Unchecked` if they need to be validated. In the later case, a
/// call to `check` will return a `Settings<Checked>` from a `Settings<Unchecked>`.
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq)]
2021-03-04 18:56:32 +08:00
#[serde(deny_unknown_fields)]
#[serde(rename_all = "camelCase")]
#[serde(bound(serialize = "T: Serialize", deserialize = "T: Deserialize<'static>"))]
#[cfg_attr(test, derive(proptest_derive::Arbitrary))]
2021-05-10 23:30:09 +08:00
pub struct Settings<T> {
2021-03-04 18:56:32 +08:00
#[serde(
default,
2021-05-12 23:04:24 +08:00
serialize_with = "serialize_with_wildcard",
2021-08-25 02:55:29 +08:00
skip_serializing_if = "Setting::is_not_set"
2021-03-04 18:56:32 +08:00
)]
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
2021-08-25 02:55:29 +08:00
pub displayed_attributes: Setting<Vec<String>>,
2021-03-04 18:56:32 +08:00
#[serde(
default,
2021-05-12 23:04:24 +08:00
serialize_with = "serialize_with_wildcard",
2021-08-25 02:55:29 +08:00
skip_serializing_if = "Setting::is_not_set"
2021-06-03 20:19:56 +08:00
)]
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
2021-08-25 02:55:29 +08:00
pub searchable_attributes: Setting<Vec<String>>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
pub filterable_attributes: Setting<BTreeSet<String>>,
2021-08-25 02:55:29 +08:00
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
pub sortable_attributes: Setting<BTreeSet<String>>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
2021-08-25 02:55:29 +08:00
pub ranking_rules: Setting<Vec<String>>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
2021-08-25 02:55:29 +08:00
pub stop_words: Setting<BTreeSet<String>>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
2021-08-25 02:55:29 +08:00
pub synonyms: Setting<BTreeMap<String, Vec<String>>>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
2021-08-25 02:55:29 +08:00
pub distinct_attribute: Setting<String>,
2022-03-17 18:59:35 +08:00
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
pub typo_tolerance: Setting<TypoSettings>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
pub faceting: Setting<FacetingSettings>,
2021-05-10 23:30:09 +08:00
#[serde(skip)]
pub _kind: PhantomData<T>,
2021-03-04 18:56:32 +08:00
}
2021-05-10 23:30:09 +08:00
impl Settings<Checked> {
pub fn cleared() -> Settings<Checked> {
Settings {
2021-08-25 02:55:29 +08:00
displayed_attributes: Setting::Reset,
searchable_attributes: Setting::Reset,
filterable_attributes: Setting::Reset,
sortable_attributes: Setting::Reset,
2021-08-25 02:55:29 +08:00
ranking_rules: Setting::Reset,
stop_words: Setting::Reset,
synonyms: Setting::Reset,
distinct_attribute: Setting::Reset,
typo_tolerance: Setting::Reset,
faceting: Setting::Reset,
2021-05-10 23:30:09 +08:00
_kind: PhantomData,
2021-03-04 18:56:32 +08:00
}
}
2021-05-27 20:30:20 +08:00
pub fn into_unchecked(self) -> Settings<Unchecked> {
let Self {
displayed_attributes,
searchable_attributes,
filterable_attributes,
sortable_attributes,
2021-05-27 20:30:20 +08:00
ranking_rules,
stop_words,
2021-06-03 20:19:56 +08:00
synonyms,
2021-05-27 20:30:20 +08:00
distinct_attribute,
typo_tolerance,
faceting,
2021-05-27 20:30:20 +08:00
..
} = self;
Settings {
displayed_attributes,
searchable_attributes,
filterable_attributes,
sortable_attributes,
2021-05-27 20:30:20 +08:00
ranking_rules,
stop_words,
2021-06-03 20:19:56 +08:00
synonyms,
2021-05-27 20:30:20 +08:00
distinct_attribute,
typo_tolerance,
faceting,
2021-05-27 20:30:20 +08:00
_kind: PhantomData,
}
}
2021-03-04 18:56:32 +08:00
}
2021-05-10 23:30:09 +08:00
impl Settings<Unchecked> {
2021-08-25 02:55:29 +08:00
pub fn check(self) -> Settings<Checked> {
let displayed_attributes = match self.displayed_attributes {
Setting::Set(fields) => {
2021-05-11 00:22:41 +08:00
if fields.iter().any(|f| f == "*") {
2021-08-25 02:55:29 +08:00
Setting::Reset
2021-05-11 00:22:41 +08:00
} else {
2021-08-25 02:55:29 +08:00
Setting::Set(fields)
2021-05-11 00:22:41 +08:00
}
}
otherwise => otherwise,
};
2021-08-25 02:55:29 +08:00
let searchable_attributes = match self.searchable_attributes {
Setting::Set(fields) => {
2021-05-11 00:22:41 +08:00
if fields.iter().any(|f| f == "*") {
2021-08-25 02:55:29 +08:00
Setting::Reset
2021-05-11 00:22:41 +08:00
} else {
2021-08-25 02:55:29 +08:00
Setting::Set(fields)
2021-05-11 00:22:41 +08:00
}
}
otherwise => otherwise,
};
Settings {
displayed_attributes,
searchable_attributes,
filterable_attributes: self.filterable_attributes,
sortable_attributes: self.sortable_attributes,
2021-05-11 00:22:41 +08:00
ranking_rules: self.ranking_rules,
stop_words: self.stop_words,
2021-06-03 20:19:56 +08:00
synonyms: self.synonyms,
2021-05-11 00:22:41 +08:00
distinct_attribute: self.distinct_attribute,
typo_tolerance: self.typo_tolerance,
faceting: self.faceting,
2021-05-11 00:22:41 +08:00
_kind: PhantomData,
}
2021-05-10 23:30:09 +08:00
}
}
2021-03-04 18:56:32 +08:00
/// Facet level parameters, (de)serialized with camelCase keys; unknown keys are rejected.
///
/// NOTE(review): the exact meaning of both values is defined by whoever consumes this struct;
/// nothing else in this part of the file reads them.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(deny_unknown_fields, rename_all = "camelCase")]
pub struct Facets {
    /// Optional, non-zero level group size.
    pub level_group_size: Option<NonZeroUsize>,
    /// Optional, non-zero minimum level size.
    pub min_level_size: Option<NonZeroUsize>,
}
impl Index {
fn update_primary_key_txn<'a, 'b>(
&'a self,
txn: &mut milli::heed::RwTxn<'a, 'b>,
primary_key: String,
) -> Result<IndexMeta> {
2022-01-19 18:21:19 +08:00
let mut builder = milli::update::Settings::new(txn, self, self.indexer_config.as_ref());
builder.set_primary_key(primary_key);
builder.execute(|_| ())?;
let meta = IndexMeta::new_txn(self, txn)?;
Ok(meta)
}
2021-09-24 21:21:07 +08:00
pub fn update_primary_key(&self, primary_key: String) -> Result<IndexMeta> {
let mut txn = self.write_txn()?;
let res = self.update_primary_key_txn(&mut txn, primary_key)?;
txn.commit()?;
2021-09-24 21:21:07 +08:00
Ok(res)
}
2021-09-24 20:55:57 +08:00
/// Deletes `ids` from the index, and returns how many documents were deleted.
pub fn delete_documents(&self, ids: &[String]) -> Result<DocumentDeletionResult> {
let mut txn = self.write_txn()?;
2022-01-19 18:21:19 +08:00
let mut builder = milli::update::DeleteDocuments::new(&mut txn, self)?;
2021-09-29 18:02:27 +08:00
// We ignore unexisting document ids
ids.iter().for_each(|id| {
builder.delete_external_id(id);
});
let deleted = builder.execute()?;
txn.commit()?;
Ok(deleted)
2021-09-24 17:53:11 +08:00
}
pub fn clear_documents(&self) -> Result<()> {
let mut txn = self.write_txn()?;
2022-01-19 18:21:19 +08:00
milli::update::ClearDocuments::new(&mut txn, self).execute()?;
txn.commit()?;
Ok(())
2021-09-24 17:53:11 +08:00
}
pub fn update_documents(
&self,
2021-05-12 22:21:37 +08:00
method: IndexDocumentsMethod,
primary_key: Option<String>,
file_store: UpdateFileStore,
2022-01-19 18:21:19 +08:00
contents: impl IntoIterator<Item = Uuid>,
) -> Result<DocumentAdditionResult> {
2021-06-23 16:41:55 +08:00
trace!("performing document addition");
let mut txn = self.write_txn()?;
2021-03-04 18:56:32 +08:00
if let Some(primary_key) = primary_key {
if self.primary_key(&txn)?.is_none() {
self.update_primary_key_txn(&mut txn, primary_key)?;
}
2021-03-04 18:56:32 +08:00
}
2022-01-19 18:21:19 +08:00
let config = IndexDocumentsConfig {
update_method: method,
..Default::default()
};
2021-05-25 22:33:09 +08:00
2022-01-19 18:21:19 +08:00
let indexing_callback = |indexing_step| debug!("update: {:?}", indexing_step);
let mut builder = milli::update::IndexDocuments::new(
&mut txn,
self,
self.indexer_config.as_ref(),
config,
indexing_callback,
)?;
2022-01-19 18:21:19 +08:00
for content_uuid in contents.into_iter() {
let content_file = file_store.get_update(content_uuid)?;
let reader = DocumentBatchReader::from_reader(content_file)?;
builder.add_documents(reader)?;
}
2021-09-15 00:39:02 +08:00
2022-01-19 18:21:19 +08:00
let addition = builder.execute()?;
2021-03-04 18:56:32 +08:00
txn.commit()?;
2021-05-12 22:21:37 +08:00
info!("document addition done: {:?}", addition);
2021-03-04 18:56:32 +08:00
Ok(addition)
2021-03-04 18:56:32 +08:00
}
pub fn update_settings(&self, settings: &Settings<Checked>) -> Result<()> {
2021-09-24 20:55:57 +08:00
// We must use the write transaction of the update here.
let mut txn = self.write_txn()?;
2022-01-19 18:21:19 +08:00
let mut builder =
milli::update::Settings::new(&mut txn, self, self.indexer_config.as_ref());
2021-09-15 00:39:02 +08:00
2021-09-28 17:59:55 +08:00
apply_settings_to_builder(settings, &mut builder);
2021-09-24 20:55:57 +08:00
builder.execute(|indexing_step| debug!("update: {:?}", indexing_step))?;
txn.commit()?;
2021-09-24 20:55:57 +08:00
Ok(())
2021-09-28 17:59:55 +08:00
}
}
2021-09-24 20:55:57 +08:00
2021-09-29 04:22:59 +08:00
pub fn apply_settings_to_builder(
settings: &Settings<Checked>,
builder: &mut milli::update::Settings,
) {
2021-09-28 17:59:55 +08:00
match settings.searchable_attributes {
Setting::Set(ref names) => builder.set_searchable_fields(names.clone()),
Setting::Reset => builder.reset_searchable_fields(),
Setting::NotSet => (),
}
2021-09-24 20:55:57 +08:00
2021-09-28 17:59:55 +08:00
match settings.displayed_attributes {
Setting::Set(ref names) => builder.set_displayed_fields(names.clone()),
Setting::Reset => builder.reset_displayed_fields(),
Setting::NotSet => (),
}
2021-09-24 20:55:57 +08:00
2021-09-28 17:59:55 +08:00
match settings.filterable_attributes {
Setting::Set(ref facets) => {
builder.set_filterable_fields(facets.clone().into_iter().collect())
2021-09-24 20:55:57 +08:00
}
2021-09-28 17:59:55 +08:00
Setting::Reset => builder.reset_filterable_fields(),
Setting::NotSet => (),
}
2021-09-24 20:55:57 +08:00
2021-09-28 17:59:55 +08:00
match settings.sortable_attributes {
2021-09-29 04:22:59 +08:00
Setting::Set(ref fields) => builder.set_sortable_fields(fields.iter().cloned().collect()),
2021-09-28 17:59:55 +08:00
Setting::Reset => builder.reset_sortable_fields(),
Setting::NotSet => (),
}
2021-09-24 20:55:57 +08:00
2021-09-28 17:59:55 +08:00
match settings.ranking_rules {
Setting::Set(ref criteria) => builder.set_criteria(criteria.clone()),
Setting::Reset => builder.reset_criteria(),
Setting::NotSet => (),
}
2021-09-24 20:55:57 +08:00
2021-09-28 17:59:55 +08:00
match settings.stop_words {
Setting::Set(ref stop_words) => builder.set_stop_words(stop_words.clone()),
Setting::Reset => builder.reset_stop_words(),
Setting::NotSet => (),
}
2021-09-24 20:55:57 +08:00
2021-09-28 17:59:55 +08:00
match settings.synonyms {
2021-09-29 04:22:59 +08:00
Setting::Set(ref synonyms) => builder.set_synonyms(synonyms.clone().into_iter().collect()),
2021-09-28 17:59:55 +08:00
Setting::Reset => builder.reset_synonyms(),
Setting::NotSet => (),
}
match settings.distinct_attribute {
Setting::Set(ref attr) => builder.set_distinct_field(attr.clone()),
Setting::Reset => builder.reset_distinct_field(),
Setting::NotSet => (),
2021-09-24 20:55:57 +08:00
}
2022-03-17 18:59:35 +08:00
match settings.typo_tolerance {
Setting::Set(ref value) => {
match value.enabled {
Setting::Set(val) => builder.set_autorize_typos(val),
Setting::Reset => builder.reset_authorize_typos(),
Setting::NotSet => (),
}
match value.min_word_size_for_typos {
Setting::Set(ref setting) => {
match setting.one_typo {
Setting::Set(val) => builder.set_min_word_len_one_typo(val),
Setting::Reset => builder.reset_min_word_len_one_typo(),
Setting::NotSet => (),
}
match setting.two_typos {
Setting::Set(val) => builder.set_min_word_len_two_typos(val),
Setting::Reset => builder.reset_min_word_len_two_typos(),
Setting::NotSet => (),
}
}
Setting::Reset => {
builder.reset_min_word_len_one_typo();
builder.reset_min_word_len_two_typos();
}
Setting::NotSet => (),
}
match value.disable_on_words {
Setting::Set(ref words) => {
builder.set_exact_words(words.clone());
}
Setting::Reset => builder.reset_exact_words(),
Setting::NotSet => (),
}
match value.disable_on_attributes {
Setting::Set(ref words) => {
builder.set_exact_attributes(words.iter().cloned().collect())
}
Setting::Reset => builder.reset_exact_attributes(),
Setting::NotSet => (),
}
}
2022-03-17 18:59:35 +08:00
Setting::Reset => {
// all typo settings need to be reset here.
builder.reset_authorize_typos();
2022-03-17 18:59:35 +08:00
builder.reset_min_word_len_one_typo();
builder.reset_min_word_len_two_typos();
builder.reset_exact_words();
builder.reset_exact_attributes();
2022-03-17 18:59:35 +08:00
}
Setting::NotSet => (),
}
match settings.faceting {
Setting::Set(ref value) => match value.max_values_per_facet {
Setting::Set(val) => builder.set_max_values_per_facet(val),
Setting::Reset => builder.reset_max_values_per_facet(),
Setting::NotSet => (),
},
Setting::Reset => builder.reset_max_values_per_facet(),
Setting::NotSet => (),
2022-03-17 18:59:35 +08:00
}
2021-03-04 18:56:32 +08:00
}
2021-05-11 00:34:25 +08:00
#[cfg(test)]
pub(crate) mod test {
    use proptest::prelude::*;

    use super::*;

    /// Proptest strategy producing `Setting::NotSet`, `Setting::Reset`, or `Setting::Set`
    /// wrapping an arbitrary `T`.
    pub(super) fn setting_strategy<T: Arbitrary + Clone>() -> impl Strategy<Value = Setting<T>> {
        prop_oneof![
            Just(Setting::NotSet),
            Just(Setting::Reset),
            any::<T>().prop_map(Setting::Set)
        ]
    }

    /// `check` must leave wildcard-free lists untouched and turn any list containing `"*"` into
    /// `Setting::Reset`.
    #[test]
    fn test_setting_check() {
        // test no changes
        let settings = Settings {
            displayed_attributes: Setting::Set(vec![String::from("hello")]),
            searchable_attributes: Setting::Set(vec![String::from("hello")]),
            filterable_attributes: Setting::NotSet,
            sortable_attributes: Setting::NotSet,
            ranking_rules: Setting::NotSet,
            stop_words: Setting::NotSet,
            synonyms: Setting::NotSet,
            distinct_attribute: Setting::NotSet,
            typo_tolerance: Setting::NotSet,
            // Every field of `Settings` must be listed, otherwise the literal does not compile.
            faceting: Setting::NotSet,
            _kind: PhantomData::<Unchecked>,
        };

        let checked = settings.clone().check();
        assert_eq!(settings.displayed_attributes, checked.displayed_attributes);
        assert_eq!(
            settings.searchable_attributes,
            checked.searchable_attributes
        );

        // test wildcard: a list containing "*" (alone or among other names) becomes `Reset`
        let settings = Settings {
            displayed_attributes: Setting::Set(vec![String::from("*")]),
            searchable_attributes: Setting::Set(vec![String::from("hello"), String::from("*")]),
            filterable_attributes: Setting::NotSet,
            sortable_attributes: Setting::NotSet,
            ranking_rules: Setting::NotSet,
            stop_words: Setting::NotSet,
            synonyms: Setting::NotSet,
            distinct_attribute: Setting::NotSet,
            typo_tolerance: Setting::NotSet,
            faceting: Setting::NotSet,
            _kind: PhantomData::<Unchecked>,
        };

        let checked = settings.check();
        assert_eq!(checked.displayed_attributes, Setting::Reset);
        assert_eq!(checked.searchable_attributes, Setting::Reset);
    }
}