mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-11-23 02:27:40 +08:00
Merge #473
473: set minimum word len for typos r=MarinPostma a=MarinPostma this PR allows the configuration on the minimum word length for typos. The default values are the same as previously. ## steps - [x] introduce settings for the minimum word length for 1 and 2 typos - [x] update the settings update flow to set this setting - [x] create a structure `TypoConfig` to configure typo tolerance in the query builder - [x] in `typo`, use the configuration to create the appropriate query tree node. - [x] extend `Context` to return the setting for minimum word length for typos - [x] return correct error message for wrong settings. - [x] merge #469 Co-authored-by: ad hoc <postma.marin@protonmail.com>
This commit is contained in:
commit
48a5ce7434
@ -72,6 +72,7 @@ pub enum UserError {
|
||||
SerdeJson(serde_json::Error),
|
||||
SortError(SortError),
|
||||
UnknownInternalDocumentId { document_id: DocumentId },
|
||||
InvalidMinTypoWordLenSetting(u8, u8),
|
||||
}
|
||||
|
||||
impl From<io::Error> for Error {
|
||||
@ -291,6 +292,7 @@ ranking rules settings to use the sort parameter at search time.",
|
||||
Self::UnknownInternalDocumentId { document_id } => {
|
||||
write!(f, "An unknown internal document id have been used: `{}`.", document_id)
|
||||
}
|
||||
Self::InvalidMinTypoWordLenSetting(one, two) => write!(f, "`minWordSizeForTypos` setting is invalid. `oneTypo` and `twoTypos` fields should be between `0` and `255`, and `twoTypos` should be greater or equals to `oneTypo` but found `oneTypo: {}` and twoTypos: {}`.", one, two),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -23,6 +23,9 @@ use crate::{
|
||||
Search, StrBEU32Codec, StrStrU8Codec, BEU32,
|
||||
};
|
||||
|
||||
pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5;
|
||||
pub const DEFAULT_MIN_WORD_LEN_TWO_TYPOS: u8 = 9;
|
||||
|
||||
pub mod main_key {
|
||||
pub const CRITERIA_KEY: &str = "criteria";
|
||||
pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields";
|
||||
@ -47,6 +50,8 @@ pub mod main_key {
|
||||
pub const CREATED_AT_KEY: &str = "created-at";
|
||||
pub const UPDATED_AT_KEY: &str = "updated-at";
|
||||
pub const AUTHORIZE_TYPOS: &str = "authorize-typos";
|
||||
pub const ONE_TYPO_WORD_LEN: &str = "one-typo-word-len";
|
||||
pub const TWO_TYPOS_WORD_LEN: &str = "two-typos-word-len";
|
||||
}
|
||||
|
||||
pub mod db_name {
|
||||
@ -886,6 +891,42 @@ impl Index {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn min_word_len_one_typo(&self, txn: &RoTxn) -> heed::Result<u8> {
|
||||
// It is not possible to put a bool in heed with OwnedType, so we put a u8 instead. We
|
||||
// identify 0 as being false, and anything else as true. The absence of a value is true,
|
||||
// because by default, we authorize typos.
|
||||
Ok(self
|
||||
.main
|
||||
.get::<_, Str, OwnedType<u8>>(txn, main_key::ONE_TYPO_WORD_LEN)?
|
||||
.unwrap_or(DEFAULT_MIN_WORD_LEN_ONE_TYPO))
|
||||
}
|
||||
|
||||
pub(crate) fn put_min_word_len_one_typo(&self, txn: &mut RwTxn, val: u8) -> heed::Result<()> {
|
||||
// It is not possible to put a bool in heed with OwnedType, so we put a u8 instead. We
|
||||
// identify 0 as being false, and anything else as true. The absence of a value is true,
|
||||
// because by default, we authorize typos.
|
||||
self.main.put::<_, Str, OwnedType<u8>>(txn, main_key::ONE_TYPO_WORD_LEN, &val)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn min_word_len_two_typos(&self, txn: &RoTxn) -> heed::Result<u8> {
|
||||
// It is not possible to put a bool in heed with OwnedType, so we put a u8 instead. We
|
||||
// identify 0 as being false, and anything else as true. The absence of a value is true,
|
||||
// because by default, we authorize typos.
|
||||
Ok(self
|
||||
.main
|
||||
.get::<_, Str, OwnedType<u8>>(txn, main_key::TWO_TYPOS_WORD_LEN)?
|
||||
.unwrap_or(DEFAULT_MIN_WORD_LEN_TWO_TYPOS))
|
||||
}
|
||||
|
||||
pub(crate) fn put_min_word_len_two_typos(&self, txn: &mut RwTxn, val: u8) -> heed::Result<()> {
|
||||
// It is not possible to put a bool in heed with OwnedType, so we put a u8 instead. We
|
||||
// identify 0 as being false, and anything else as true. The absence of a value is true,
|
||||
// because by default, we authorize typos.
|
||||
self.main.put::<_, Str, OwnedType<u8>>(txn, main_key::TWO_TYPOS_WORD_LEN, &val)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@ -896,6 +937,7 @@ pub(crate) mod tests {
|
||||
use maplit::btreemap;
|
||||
use tempfile::TempDir;
|
||||
|
||||
use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS};
|
||||
use crate::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig};
|
||||
use crate::Index;
|
||||
|
||||
@ -1023,4 +1065,22 @@ pub(crate) mod tests {
|
||||
let txn = index.read_txn().unwrap();
|
||||
assert!(!index.authorize_typos(&txn).unwrap());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn set_min_word_len_for_typos() {
|
||||
let index = TempIndex::new();
|
||||
let mut txn = index.write_txn().unwrap();
|
||||
|
||||
assert_eq!(index.min_word_len_one_typo(&txn).unwrap(), DEFAULT_MIN_WORD_LEN_ONE_TYPO);
|
||||
assert_eq!(index.min_word_len_two_typos(&txn).unwrap(), DEFAULT_MIN_WORD_LEN_TWO_TYPOS);
|
||||
|
||||
index.put_min_word_len_one_typo(&mut txn, 3).unwrap();
|
||||
index.put_min_word_len_two_typos(&mut txn, 15).unwrap();
|
||||
|
||||
txn.commit().unwrap();
|
||||
|
||||
let txn = index.read_txn().unwrap();
|
||||
assert_eq!(index.min_word_len_one_typo(&txn).unwrap(), 3);
|
||||
assert_eq!(index.min_word_len_two_typos(&txn).unwrap(), 15);
|
||||
}
|
||||
}
|
||||
|
@ -155,6 +155,8 @@ trait Context {
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
/// Returns the minimum word len for 1 and 2 typos.
|
||||
fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)>;
|
||||
}
|
||||
|
||||
/// The query tree builder is the interface to build a query tree.
|
||||
@ -178,6 +180,12 @@ impl<'a> Context for QueryTreeBuilder<'a> {
|
||||
fn word_documents_count(&self, word: &str) -> heed::Result<Option<u64>> {
|
||||
self.index.word_documents_count(self.rtxn, word)
|
||||
}
|
||||
|
||||
fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)> {
|
||||
let one = self.index.min_word_len_one_typo(&self.rtxn)?;
|
||||
let two = self.index.min_word_len_two_typos(&self.rtxn)?;
|
||||
Ok((one, two))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> QueryTreeBuilder<'a> {
|
||||
@ -256,14 +264,24 @@ fn split_best_frequency(ctx: &impl Context, word: &str) -> heed::Result<Option<O
|
||||
Ok(best.map(|(_, left, right)| Operation::Phrase(vec![left.to_string(), right.to_string()])))
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct TypoConfig {
|
||||
pub max_typos: u8,
|
||||
pub word_len_one_typo: u8,
|
||||
pub word_len_two_typo: u8,
|
||||
}
|
||||
|
||||
/// Return the `QueryKind` of a word depending on `authorize_typos`
|
||||
/// and the provided word length.
|
||||
fn typos(word: String, authorize_typos: bool, max_typos: u8) -> QueryKind {
|
||||
fn typos(word: String, authorize_typos: bool, config: TypoConfig) -> QueryKind {
|
||||
if authorize_typos {
|
||||
match word.chars().count() {
|
||||
0..=4 => QueryKind::exact(word),
|
||||
5..=8 => QueryKind::tolerant(1.min(max_typos), word),
|
||||
_ => QueryKind::tolerant(2.min(max_typos), word),
|
||||
let count = word.chars().count().min(u8::MAX as usize) as u8;
|
||||
if count < config.word_len_one_typo {
|
||||
QueryKind::exact(word)
|
||||
} else if count < config.word_len_two_typo {
|
||||
QueryKind::tolerant(1.min(config.max_typos), word)
|
||||
} else {
|
||||
QueryKind::tolerant(2.min(config.max_typos), word)
|
||||
}
|
||||
} else {
|
||||
QueryKind::exact(word)
|
||||
@ -314,9 +332,11 @@ fn create_query_tree(
|
||||
if let Some(child) = split_best_frequency(ctx, &word)? {
|
||||
children.push(child);
|
||||
}
|
||||
let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?;
|
||||
let config = TypoConfig { max_typos: 2, word_len_one_typo, word_len_two_typo };
|
||||
children.push(Operation::Query(Query {
|
||||
prefix,
|
||||
kind: typos(word, authorize_typos, 2),
|
||||
kind: typos(word, authorize_typos, config),
|
||||
}));
|
||||
Ok(Operation::or(false, children))
|
||||
}
|
||||
@ -363,9 +383,13 @@ fn create_query_tree(
|
||||
.collect();
|
||||
let mut operations = synonyms(ctx, &words)?.unwrap_or_default();
|
||||
let concat = words.concat();
|
||||
let (word_len_one_typo, word_len_two_typo) =
|
||||
ctx.min_word_len_for_typo()?;
|
||||
let config =
|
||||
TypoConfig { max_typos: 1, word_len_one_typo, word_len_two_typo };
|
||||
let query = Query {
|
||||
prefix: is_prefix,
|
||||
kind: typos(concat, authorize_typos, 1),
|
||||
kind: typos(concat, authorize_typos, config),
|
||||
};
|
||||
operations.push(Operation::Query(query));
|
||||
and_op_children.push(Operation::or(false, operations));
|
||||
@ -541,6 +565,7 @@ mod test {
|
||||
use rand::{Rng, SeedableRng};
|
||||
|
||||
use super::*;
|
||||
use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS};
|
||||
|
||||
#[derive(Debug)]
|
||||
struct TestContext {
|
||||
@ -576,6 +601,10 @@ mod test {
|
||||
let words: Vec<_> = words.iter().map(|s| s.as_ref().to_owned()).collect();
|
||||
Ok(self.synonyms.get(&words).cloned())
|
||||
}
|
||||
|
||||
fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)> {
|
||||
Ok((DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS))
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for TestContext {
|
||||
@ -1193,4 +1222,24 @@ mod test {
|
||||
|
||||
assert_eq!(expected, query_tree);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_min_word_len_typo() {
|
||||
let config = TypoConfig { max_typos: 2, word_len_one_typo: 5, word_len_two_typo: 7 };
|
||||
|
||||
assert_eq!(
|
||||
typos("hello".to_string(), true, config.clone()),
|
||||
QueryKind::Tolerant { typo: 1, word: "hello".to_string() }
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
typos("hell".to_string(), true, config.clone()),
|
||||
QueryKind::exact("hell".to_string())
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
typos("verylongword".to_string(), true, config.clone()),
|
||||
QueryKind::Tolerant { typo: 2, word: "verylongword".to_string() }
|
||||
);
|
||||
}
|
||||
}
|
||||
|
@ -14,7 +14,7 @@ use crate::update::index_documents::IndexDocumentsMethod;
|
||||
use crate::update::{ClearDocuments, IndexDocuments, UpdateIndexingStep};
|
||||
use crate::{FieldsIdsMap, Index, Result};
|
||||
|
||||
#[derive(Debug, Clone, PartialEq)]
|
||||
#[derive(Debug, Clone, PartialEq, Copy)]
|
||||
pub enum Setting<T> {
|
||||
Set(T),
|
||||
Reset,
|
||||
@ -90,6 +90,8 @@ pub struct Settings<'a, 't, 'u, 'i> {
|
||||
synonyms: Setting<HashMap<String, Vec<String>>>,
|
||||
primary_key: Setting<String>,
|
||||
authorize_typos: Setting<bool>,
|
||||
min_word_len_two_typos: Setting<u8>,
|
||||
min_word_len_one_typo: Setting<u8>,
|
||||
}
|
||||
|
||||
impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
||||
@ -112,6 +114,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
||||
primary_key: Setting::NotSet,
|
||||
authorize_typos: Setting::NotSet,
|
||||
indexer_config,
|
||||
min_word_len_two_typos: Setting::Reset,
|
||||
min_word_len_one_typo: Setting::Reset,
|
||||
}
|
||||
}
|
||||
|
||||
@ -196,6 +200,22 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
||||
self.authorize_typos = Setting::Reset;
|
||||
}
|
||||
|
||||
pub fn set_min_word_len_two_typos(&mut self, val: u8) {
|
||||
self.min_word_len_two_typos = Setting::Set(val);
|
||||
}
|
||||
|
||||
pub fn reset_min_word_len_two_typos(&mut self) {
|
||||
self.min_word_len_two_typos = Setting::Reset;
|
||||
}
|
||||
|
||||
pub fn set_min_word_len_one_typo(&mut self, val: u8) {
|
||||
self.min_word_len_one_typo = Setting::Set(val);
|
||||
}
|
||||
|
||||
pub fn reset_min_word_len_one_typo(&mut self) {
|
||||
self.min_word_len_one_typo = Setting::Reset;
|
||||
}
|
||||
|
||||
fn reindex<F>(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> Result<()>
|
||||
where
|
||||
F: Fn(UpdateIndexingStep) + Sync,
|
||||
@ -474,6 +494,38 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
||||
}
|
||||
}
|
||||
|
||||
fn update_min_typo_word_len(&mut self) -> Result<()> {
|
||||
match (self.min_word_len_one_typo, self.min_word_len_two_typos) {
|
||||
(Setting::Set(one), Setting::Set(two)) => {
|
||||
if one > two {
|
||||
return Err(UserError::InvalidMinTypoWordLenSetting(one, two).into());
|
||||
} else {
|
||||
self.index.put_min_word_len_one_typo(&mut self.wtxn, one)?;
|
||||
self.index.put_min_word_len_two_typos(&mut self.wtxn, two)?;
|
||||
}
|
||||
}
|
||||
(Setting::Set(one), _) => {
|
||||
let two = self.index.min_word_len_two_typos(&self.wtxn)?;
|
||||
if one > two {
|
||||
return Err(UserError::InvalidMinTypoWordLenSetting(one, two).into());
|
||||
} else {
|
||||
self.index.put_min_word_len_one_typo(&mut self.wtxn, one)?;
|
||||
}
|
||||
}
|
||||
(_, Setting::Set(two)) => {
|
||||
let one = self.index.min_word_len_one_typo(&self.wtxn)?;
|
||||
if one > two {
|
||||
return Err(UserError::InvalidMinTypoWordLenSetting(one, two).into());
|
||||
} else {
|
||||
self.index.put_min_word_len_two_typos(&mut self.wtxn, two)?;
|
||||
}
|
||||
}
|
||||
_ => (),
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn execute<F>(mut self, progress_callback: F) -> Result<()>
|
||||
where
|
||||
F: Fn(UpdateIndexingStep) + Sync,
|
||||
@ -490,6 +542,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
||||
self.update_criteria()?;
|
||||
self.update_primary_key()?;
|
||||
self.update_authorize_typos()?;
|
||||
self.update_min_typo_word_len()?;
|
||||
|
||||
// If there is new faceted fields we indicate that we must reindex as we must
|
||||
// index new fields as facets. It means that the distinct attribute,
|
||||
@ -1233,4 +1286,37 @@ mod tests {
|
||||
builder.execute(|_| ()).unwrap();
|
||||
assert!(!index.authorize_typos(&txn).unwrap());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn update_min_word_len_for_typo() {
|
||||
let index = TempIndex::new();
|
||||
let config = IndexerConfig::default();
|
||||
|
||||
// Set the genres setting
|
||||
let mut txn = index.write_txn().unwrap();
|
||||
let mut builder = Settings::new(&mut txn, &index, &config);
|
||||
builder.set_min_word_len_one_typo(8);
|
||||
builder.set_min_word_len_two_typos(8);
|
||||
builder.execute(|_| ()).unwrap();
|
||||
|
||||
txn.commit().unwrap();
|
||||
|
||||
let txn = index.read_txn().unwrap();
|
||||
|
||||
assert_eq!(index.min_word_len_one_typo(&txn).unwrap(), 8);
|
||||
assert_eq!(index.min_word_len_two_typos(&txn).unwrap(), 8);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn update_invalid_min_word_len_for_typo() {
|
||||
let index = TempIndex::new();
|
||||
let config = IndexerConfig::default();
|
||||
|
||||
// Set the genres setting
|
||||
let mut txn = index.write_txn().unwrap();
|
||||
let mut builder = Settings::new(&mut txn, &index, &config);
|
||||
builder.set_min_word_len_one_typo(10);
|
||||
builder.set_min_word_len_two_typos(7);
|
||||
assert!(builder.execute(|_| ()).is_err());
|
||||
}
|
||||
}
|
||||
|
@ -16,6 +16,7 @@ mod distinct;
|
||||
mod filters;
|
||||
mod query_criteria;
|
||||
mod sort;
|
||||
mod typo_tolerance;
|
||||
|
||||
pub const TEST_QUERY: &'static str = "hello world america";
|
||||
|
||||
|
95
milli/tests/search/typo_tolerance.rs
Normal file
95
milli/tests/search/typo_tolerance.rs
Normal file
@ -0,0 +1,95 @@
|
||||
use milli::update::{IndexerConfig, Settings};
|
||||
use milli::{Criterion, Search};
|
||||
use Criterion::*;
|
||||
|
||||
#[test]
|
||||
fn test_typo_tolerance_one_typo() {
|
||||
let criteria = [Typo];
|
||||
let index = super::setup_search_index_with_criteria(&criteria);
|
||||
|
||||
// basic typo search with default typo settings
|
||||
{
|
||||
let txn = index.read_txn().unwrap();
|
||||
|
||||
let mut search = Search::new(&txn, &index);
|
||||
search.query("zeal");
|
||||
search.limit(10);
|
||||
search.authorize_typos(true);
|
||||
search.optional_words(true);
|
||||
|
||||
let result = search.execute().unwrap();
|
||||
assert_eq!(result.documents_ids.len(), 1);
|
||||
|
||||
let mut search = Search::new(&txn, &index);
|
||||
search.query("zean");
|
||||
search.limit(10);
|
||||
search.authorize_typos(true);
|
||||
search.optional_words(true);
|
||||
|
||||
let result = search.execute().unwrap();
|
||||
assert_eq!(result.documents_ids.len(), 0);
|
||||
}
|
||||
|
||||
let mut txn = index.write_txn().unwrap();
|
||||
|
||||
let config = IndexerConfig::default();
|
||||
let mut builder = Settings::new(&mut txn, &index, &config);
|
||||
builder.set_min_word_len_one_typo(4);
|
||||
builder.execute(|_| ()).unwrap();
|
||||
|
||||
// typo is now supported for 4 letters words
|
||||
let mut search = Search::new(&txn, &index);
|
||||
search.query("zean");
|
||||
search.limit(10);
|
||||
search.authorize_typos(true);
|
||||
search.optional_words(true);
|
||||
|
||||
let result = search.execute().unwrap();
|
||||
assert_eq!(result.documents_ids.len(), 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_typo_tolerance_two_typo() {
|
||||
let criteria = [Typo];
|
||||
let index = super::setup_search_index_with_criteria(&criteria);
|
||||
|
||||
// basic typo search with default typo settings
|
||||
{
|
||||
let txn = index.read_txn().unwrap();
|
||||
|
||||
let mut search = Search::new(&txn, &index);
|
||||
search.query("zealand");
|
||||
search.limit(10);
|
||||
search.authorize_typos(true);
|
||||
search.optional_words(true);
|
||||
|
||||
let result = search.execute().unwrap();
|
||||
assert_eq!(result.documents_ids.len(), 1);
|
||||
|
||||
let mut search = Search::new(&txn, &index);
|
||||
search.query("zealemd");
|
||||
search.limit(10);
|
||||
search.authorize_typos(true);
|
||||
search.optional_words(true);
|
||||
|
||||
let result = search.execute().unwrap();
|
||||
assert_eq!(result.documents_ids.len(), 0);
|
||||
}
|
||||
|
||||
let mut txn = index.write_txn().unwrap();
|
||||
|
||||
let config = IndexerConfig::default();
|
||||
let mut builder = Settings::new(&mut txn, &index, &config);
|
||||
builder.set_min_word_len_two_typos(7);
|
||||
builder.execute(|_| ()).unwrap();
|
||||
|
||||
// typo is now supported for 4 letters words
|
||||
let mut search = Search::new(&txn, &index);
|
||||
search.query("zealemd");
|
||||
search.limit(10);
|
||||
search.authorize_typos(true);
|
||||
search.optional_words(true);
|
||||
|
||||
let result = search.execute().unwrap();
|
||||
assert_eq!(result.documents_ids.len(), 1);
|
||||
}
|
Loading…
Reference in New Issue
Block a user