2259: disable typos on words r=MarinPostma a=MarinPostma

Introduce the setting that disables typo tolerance on specific words, as per https://github.com/meilisearch/specifications/pull/117.

Waiting for https://github.com/meilisearch/milli/pull/474.


Co-authored-by: ad hoc <postma.marin@protonmail.com>
This commit is contained in:
bors[bot] 2022-04-06 17:40:08 +00:00 committed by GitHub
commit c321ac61b5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 77 additions and 7 deletions

View File

@ -120,6 +120,7 @@ pub enum Code {
IndexAlreadyExists, IndexAlreadyExists,
IndexNotFound, IndexNotFound,
InvalidIndexUid, InvalidIndexUid,
InvalidMinWordLengthForTypo,
// invalid state error // invalid state error
InvalidState, InvalidState,
@ -271,6 +272,9 @@ impl Code {
InvalidApiKeyDescription => { InvalidApiKeyDescription => {
ErrCode::invalid("invalid_api_key_description", StatusCode::BAD_REQUEST) ErrCode::invalid("invalid_api_key_description", StatusCode::BAD_REQUEST)
} }
InvalidMinWordLengthForTypo => {
ErrCode::invalid("invalid_min_word_length_for_typo", StatusCode::BAD_REQUEST)
}
} }
} }

View File

@ -41,7 +41,9 @@ impl ErrorCode for MilliError<'_> {
UserError::CriterionError(_) => Code::InvalidRankingRule, UserError::CriterionError(_) => Code::InvalidRankingRule,
UserError::InvalidGeoField { .. } => Code::InvalidGeoField, UserError::InvalidGeoField { .. } => Code::InvalidGeoField,
UserError::SortError(_) => Code::Sort, UserError::SortError(_) => Code::Sort,
UserError::InvalidMinTypoWordLenSetting(_, _) => unreachable!(), UserError::InvalidMinTypoWordLenSetting(_, _) => {
Code::InvalidMinWordLengthForTypo
}
} }
} }
} }

View File

@ -5,6 +5,7 @@ use std::ops::Deref;
use std::path::Path; use std::path::Path;
use std::sync::Arc; use std::sync::Arc;
use fst::IntoStreamer;
use milli::heed::{EnvOpenOptions, RoTxn}; use milli::heed::{EnvOpenOptions, RoTxn};
use milli::update::{IndexerConfig, Setting}; use milli::update::{IndexerConfig, Setting};
use milli::{obkv_to_json, FieldDistribution, FieldId}; use milli::{obkv_to_json, FieldDistribution, FieldId};
@ -17,7 +18,7 @@ use crate::EnvSizer;
use super::error::IndexError; use super::error::IndexError;
use super::error::Result; use super::error::Result;
use super::updates::TypoSettings; use super::updates::{MinWordLengthTypoSetting, TypoSettings};
use super::{Checked, Settings}; use super::{Checked, Settings};
pub type Document = Map<String, Value>; pub type Document = Map<String, Value>;
@ -169,8 +170,22 @@ impl Index {
}) })
.collect(); .collect();
let min_typo_word_len = MinWordLengthTypoSetting {
one_typo: Setting::Set(self.min_word_len_one_typo(txn)?),
two_typos: Setting::Set(self.min_word_len_two_typos(txn)?),
};
let disabled_words = self
.exact_words(txn)?
.into_stream()
.into_strs()?
.into_iter()
.collect();
let typo_tolerance = TypoSettings { let typo_tolerance = TypoSettings {
enabled: Setting::Set(self.authorize_typos(txn)?), enabled: Setting::Set(self.authorize_typos(txn)?),
min_word_length_for_typo: Setting::Set(min_typo_word_len),
disable_on_words: Setting::Set(disabled_words),
}; };
Ok(Settings { Ok(Settings {

View File

@ -37,14 +37,33 @@ pub struct Checked;
#[derive(Clone, Default, Debug, Serialize, Deserialize, PartialEq)] #[derive(Clone, Default, Debug, Serialize, Deserialize, PartialEq)]
pub struct Unchecked; pub struct Unchecked;
/// Typo-tolerance sub-setting grouping the `oneTypo` and `twoTypos` values.
///
/// Field names are (de)serialized in camelCase, unknown fields are rejected when deserializing,
/// and a field that is not set is omitted from the serialized output.
// NOTE(review): presumably each value is the minimum word length from which that many typos are
// tolerated — confirm against the milli settings documentation.
#[cfg_attr(test, derive(proptest_derive::Arbitrary))]
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq)] #[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq)]
#[serde(deny_unknown_fields)] #[serde(deny_unknown_fields)]
#[serde(rename_all = "camelCase")] #[serde(rename_all = "camelCase")]
pub struct MinWordLengthTypoSetting {
/// Read from the index with `min_word_len_one_typo` and applied with
/// `set_min_word_len_one_typo` / `reset_min_word_len_one_typo`.
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
pub one_typo: Setting<u8>,
/// Read from the index with `min_word_len_two_typos` and applied with
/// `set_min_word_len_two_typos` / `reset_min_word_len_two_typos`.
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
pub two_typos: Setting<u8>,
}
/// Typo-tolerance settings of an index.
///
/// Field names are (de)serialized in camelCase, unknown fields are rejected when deserializing,
/// and a field that is not set is omitted from the serialized output.
#[cfg_attr(test, derive(proptest_derive::Arbitrary))] #[cfg_attr(test, derive(proptest_derive::Arbitrary))]
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq)]
#[serde(deny_unknown_fields)]
#[serde(rename_all = "camelCase")]
pub struct TypoSettings { pub struct TypoSettings {
/// Read from the index with `authorize_typos` and applied with `set_autorize_typos` /
/// `reset_authorize_typos`.
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))] #[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
#[serde(default, skip_serializing_if = "Setting::is_not_set")] #[serde(default, skip_serializing_if = "Setting::is_not_set")]
pub enabled: Setting<bool>, pub enabled: Setting<bool>,
/// The `oneTypo` / `twoTypos` values; resetting this field resets both of them.
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
pub min_word_length_for_typo: Setting<MinWordLengthTypoSetting>,
/// Set of words read from the index's `exact_words` and applied with `set_exact_words` /
/// `reset_exact_words`.
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
pub disable_on_words: Setting<BTreeSet<String>>,
}
/// Holds all the settings for an index. `T` can either be `Checked` if they represents settings /// Holds all the settings for an index. `T` can either be `Checked` if they represents settings
/// whose validity is guaranteed, or `Unchecked` if they need to be validated. In the later case, a /// whose validity is guaranteed, or `Unchecked` if they need to be validated. In the later case, a
@ -352,14 +371,44 @@ pub fn apply_settings_to_builder(
} }
match settings.typo { match settings.typo {
Setting::Set(ref value) => match value.enabled { Setting::Set(ref value) => {
Setting::Set(val) => builder.set_autorize_typos(val), match value.enabled {
Setting::Reset => builder.reset_authorize_typos(), Setting::Set(val) => builder.set_autorize_typos(val),
Setting::NotSet => (), Setting::Reset => builder.reset_authorize_typos(),
}, Setting::NotSet => (),
}
match value.min_word_length_for_typo {
Setting::Set(ref setting) => {
match setting.one_typo {
Setting::Set(val) => builder.set_min_word_len_one_typo(val),
Setting::Reset => builder.reset_min_word_len_one_typo(),
Setting::NotSet => (),
}
match setting.two_typos {
Setting::Set(val) => builder.set_min_word_len_two_typos(val),
Setting::Reset => builder.reset_min_word_len_two_typos(),
Setting::NotSet => (),
}
}
Setting::Reset => {
builder.reset_min_word_len_one_typo();
builder.reset_min_word_len_two_typos();
}
Setting::NotSet => (),
}
match value.disable_on_words {
Setting::Set(ref words) => {
builder.set_exact_words(words.clone());
}
Setting::Reset => builder.reset_exact_words(),
Setting::NotSet => (),
}
}
Setting::Reset => { Setting::Reset => {
// all typo settings need to be reset here. // all typo settings need to be reset here.
builder.reset_authorize_typos(); builder.reset_authorize_typos();
builder.reset_min_word_len_one_typo();
builder.reset_min_word_len_two_typos();
// `disable_on_words` is applied through the exact-words setting, so a reset of the whole
// typo object must clear it too; otherwise previously disabled words survive the reset.
builder.reset_exact_words();
} }
Setting::NotSet => (), Setting::NotSet => (),
} }