diff --git a/milli/src/error.rs b/milli/src/error.rs index e6fbc0605..688977741 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -72,6 +72,7 @@ pub enum UserError { SerdeJson(serde_json::Error), SortError(SortError), UnknownInternalDocumentId { document_id: DocumentId }, + InvalidMinTypoWordLenSetting(u8, u8), } impl From for Error { @@ -291,6 +292,7 @@ ranking rules settings to use the sort parameter at search time.", Self::UnknownInternalDocumentId { document_id } => { write!(f, "An unknown internal document id have been used: `{}`.", document_id) } + Self::InvalidMinTypoWordLenSetting(one, two) => write!(f, "`minWordSizeForTypos` setting is invalid. `oneTypo` and `twoTypos` fields should be between `0` and `255`, and `twoTypos` should be greater or equals to `oneTypo` but found `oneTypo: {}` and twoTypos: {}`.", one, two), } } } diff --git a/milli/src/index.rs b/milli/src/index.rs index badcac0e5..853e7537d 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -23,6 +23,9 @@ use crate::{ Search, StrBEU32Codec, StrStrU8Codec, BEU32, }; +pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5; +pub const DEFAULT_MIN_WORD_LEN_TWO_TYPOS: u8 = 9; + pub mod main_key { pub const CRITERIA_KEY: &str = "criteria"; pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields"; @@ -47,6 +50,8 @@ pub mod main_key { pub const CREATED_AT_KEY: &str = "created-at"; pub const UPDATED_AT_KEY: &str = "updated-at"; pub const AUTHORIZE_TYPOS: &str = "authorize-typos"; + pub const ONE_TYPO_WORD_LEN: &str = "one-typo-word-len"; + pub const TWO_TYPOS_WORD_LEN: &str = "two-typos-word-len"; } pub mod db_name { @@ -886,6 +891,42 @@ impl Index { Ok(()) } + + pub fn min_word_len_one_typo(&self, txn: &RoTxn) -> heed::Result { + // It is not possible to put a bool in heed with OwnedType, so we put a u8 instead. We + // identify 0 as being false, and anything else as true. The absence of a value is true, + // because by default, we authorize typos. + Ok(self + .main + .get::<_, Str, OwnedType>(txn, main_key::ONE_TYPO_WORD_LEN)? + .unwrap_or(DEFAULT_MIN_WORD_LEN_ONE_TYPO)) + } + + pub(crate) fn put_min_word_len_one_typo(&self, txn: &mut RwTxn, val: u8) -> heed::Result<()> { + // It is not possible to put a bool in heed with OwnedType, so we put a u8 instead. We + // identify 0 as being false, and anything else as true. The absence of a value is true, + // because by default, we authorize typos. + self.main.put::<_, Str, OwnedType>(txn, main_key::ONE_TYPO_WORD_LEN, &val)?; + Ok(()) + } + + pub fn min_word_len_two_typos(&self, txn: &RoTxn) -> heed::Result { + // It is not possible to put a bool in heed with OwnedType, so we put a u8 instead. We + // identify 0 as being false, and anything else as true. The absence of a value is true, + // because by default, we authorize typos. + Ok(self + .main + .get::<_, Str, OwnedType>(txn, main_key::TWO_TYPOS_WORD_LEN)? + .unwrap_or(DEFAULT_MIN_WORD_LEN_TWO_TYPOS)) + } + + pub(crate) fn put_min_word_len_two_typos(&self, txn: &mut RwTxn, val: u8) -> heed::Result<()> { + // It is not possible to put a bool in heed with OwnedType, so we put a u8 instead. We + // identify 0 as being false, and anything else as true. The absence of a value is true, + // because by default, we authorize typos. + self.main.put::<_, Str, OwnedType>(txn, main_key::TWO_TYPOS_WORD_LEN, &val)?; + Ok(()) + } } #[cfg(test)] @@ -896,6 +937,7 @@ pub(crate) mod tests { use maplit::btreemap; use tempfile::TempDir; + use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS}; use crate::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig}; use crate::Index; @@ -1023,4 +1065,22 @@ pub(crate) mod tests { let txn = index.read_txn().unwrap(); assert!(!index.authorize_typos(&txn).unwrap()); } + + #[test] + fn set_min_word_len_for_typos() { + let index = TempIndex::new(); + let mut txn = index.write_txn().unwrap(); + + assert_eq!(index.min_word_len_one_typo(&txn).unwrap(), DEFAULT_MIN_WORD_LEN_ONE_TYPO); + assert_eq!(index.min_word_len_two_typos(&txn).unwrap(), DEFAULT_MIN_WORD_LEN_TWO_TYPOS); + + index.put_min_word_len_one_typo(&mut txn, 3).unwrap(); + index.put_min_word_len_two_typos(&mut txn, 15).unwrap(); + + txn.commit().unwrap(); + + let txn = index.read_txn().unwrap(); + assert_eq!(index.min_word_len_one_typo(&txn).unwrap(), 3); + assert_eq!(index.min_word_len_two_typos(&txn).unwrap(), 15); + } } diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 5437199e1..934d2fd9b 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -155,6 +155,8 @@ trait Context { None => Ok(None), } } + /// Returns the minimum word len for 1 and 2 typos. + fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)>; } /// The query tree builder is the interface to build a query tree. @@ -178,6 +180,12 @@ impl<'a> Context for QueryTreeBuilder<'a> { fn word_documents_count(&self, word: &str) -> heed::Result> { self.index.word_documents_count(self.rtxn, word) } + + fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)> { + let one = self.index.min_word_len_one_typo(&self.rtxn)?; + let two = self.index.min_word_len_two_typos(&self.rtxn)?; + Ok((one, two)) + } } impl<'a> QueryTreeBuilder<'a> { @@ -256,14 +264,24 @@ fn split_best_frequency(ctx: &impl Context, word: &str) -> heed::Result QueryKind { +fn typos(word: String, authorize_typos: bool, config: TypoConfig) -> QueryKind { if authorize_typos { - match word.chars().count() { - 0..=4 => QueryKind::exact(word), - 5..=8 => QueryKind::tolerant(1.min(max_typos), word), - _ => QueryKind::tolerant(2.min(max_typos), word), + let count = word.chars().count().min(u8::MAX as usize) as u8; + if count < config.word_len_one_typo { + QueryKind::exact(word) + } else if count < config.word_len_two_typo { + QueryKind::tolerant(1.min(config.max_typos), word) + } else { + QueryKind::tolerant(2.min(config.max_typos), word) } } else { QueryKind::exact(word) @@ -314,9 +332,11 @@ fn create_query_tree( if let Some(child) = split_best_frequency(ctx, &word)? { children.push(child); } + let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?; + let config = TypoConfig { max_typos: 2, word_len_one_typo, word_len_two_typo }; children.push(Operation::Query(Query { prefix, - kind: typos(word, authorize_typos, 2), + kind: typos(word, authorize_typos, config), })); Ok(Operation::or(false, children)) } @@ -363,9 +383,13 @@ fn create_query_tree( .collect(); let mut operations = synonyms(ctx, &words)?.unwrap_or_default(); let concat = words.concat(); + let (word_len_one_typo, word_len_two_typo) = + ctx.min_word_len_for_typo()?; + let config = + TypoConfig { max_typos: 1, word_len_one_typo, word_len_two_typo }; let query = Query { prefix: is_prefix, - kind: typos(concat, authorize_typos, 1), + kind: typos(concat, authorize_typos, config), }; operations.push(Operation::Query(query)); and_op_children.push(Operation::or(false, operations)); @@ -541,6 +565,7 @@ mod test { use rand::{Rng, SeedableRng}; use super::*; + use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS}; #[derive(Debug)] struct TestContext { @@ -576,6 +601,10 @@ mod test { let words: Vec<_> = words.iter().map(|s| s.as_ref().to_owned()).collect(); Ok(self.synonyms.get(&words).cloned()) } + + fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)> { + Ok((DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS)) + } } impl Default for TestContext { @@ -1193,4 +1222,24 @@ mod test { assert_eq!(expected, query_tree); } + + #[test] + fn test_min_word_len_typo() { + let config = TypoConfig { max_typos: 2, word_len_one_typo: 5, word_len_two_typo: 7 }; + + assert_eq!( + typos("hello".to_string(), true, config.clone()), + QueryKind::Tolerant { typo: 1, word: "hello".to_string() } + ); + + assert_eq!( + typos("hell".to_string(), true, config.clone()), + QueryKind::exact("hell".to_string()) + ); + + assert_eq!( + typos("verylongword".to_string(), true, config.clone()), + QueryKind::Tolerant { typo: 2, word: "verylongword".to_string() } + ); + } } diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 17924da8a..c03d6e0ae 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -14,7 +14,7 @@ use crate::update::index_documents::IndexDocumentsMethod; use crate::update::{ClearDocuments, IndexDocuments, UpdateIndexingStep}; use crate::{FieldsIdsMap, Index, Result}; -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, PartialEq, Copy)] pub enum Setting { Set(T), Reset, @@ -90,6 +90,8 @@ pub struct Settings<'a, 't, 'u, 'i> { synonyms: Setting>>, primary_key: Setting, authorize_typos: Setting, + min_word_len_two_typos: Setting, + min_word_len_one_typo: Setting, } impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { @@ -112,6 +114,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { primary_key: Setting::NotSet, authorize_typos: Setting::NotSet, indexer_config, + min_word_len_two_typos: Setting::Reset, + min_word_len_one_typo: Setting::Reset, } } @@ -196,6 +200,22 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.authorize_typos = Setting::Reset; } + pub fn set_min_word_len_two_typos(&mut self, val: u8) { + self.min_word_len_two_typos = Setting::Set(val); + } + + pub fn reset_min_word_len_two_typos(&mut self) { + self.min_word_len_two_typos = Setting::Reset; + } + + pub fn set_min_word_len_one_typo(&mut self, val: u8) { + self.min_word_len_one_typo = Setting::Set(val); + } + + pub fn reset_min_word_len_one_typo(&mut self) { + self.min_word_len_one_typo = Setting::Reset; + } + fn reindex(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> Result<()> where F: Fn(UpdateIndexingStep) + Sync, @@ -474,6 +494,38 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { } } + fn update_min_typo_word_len(&mut self) -> Result<()> { + match (self.min_word_len_one_typo, self.min_word_len_two_typos) { + (Setting::Set(one), Setting::Set(two)) => { + if one > two { + return Err(UserError::InvalidMinTypoWordLenSetting(one, two).into()); + } else { + self.index.put_min_word_len_one_typo(&mut self.wtxn, one)?; + self.index.put_min_word_len_two_typos(&mut self.wtxn, two)?; + } + } + (Setting::Set(one), _) => { + let two = self.index.min_word_len_two_typos(&self.wtxn)?; + if one > two { + return Err(UserError::InvalidMinTypoWordLenSetting(one, two).into()); + } else { + self.index.put_min_word_len_one_typo(&mut self.wtxn, one)?; + } + } + (_, Setting::Set(two)) => { + let one = self.index.min_word_len_one_typo(&self.wtxn)?; + if one > two { + return Err(UserError::InvalidMinTypoWordLenSetting(one, two).into()); + } else { + self.index.put_min_word_len_two_typos(&mut self.wtxn, two)?; + } + } + _ => (), + } + + Ok(()) + } + pub fn execute(mut self, progress_callback: F) -> Result<()> where F: Fn(UpdateIndexingStep) + Sync, @@ -490,6 +542,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.update_criteria()?; self.update_primary_key()?; self.update_authorize_typos()?; + self.update_min_typo_word_len()?; // If there is new faceted fields we indicate that we must reindex as we must // index new fields as facets. It means that the distinct attribute, @@ -1233,4 +1286,37 @@ mod tests { builder.execute(|_| ()).unwrap(); assert!(!index.authorize_typos(&txn).unwrap()); } + + #[test] + fn update_min_word_len_for_typo() { + let index = TempIndex::new(); + let config = IndexerConfig::default(); + + // Set the genres setting + let mut txn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut txn, &index, &config); + builder.set_min_word_len_one_typo(8); + builder.set_min_word_len_two_typos(8); + builder.execute(|_| ()).unwrap(); + + txn.commit().unwrap(); + + let txn = index.read_txn().unwrap(); + + assert_eq!(index.min_word_len_one_typo(&txn).unwrap(), 8); + assert_eq!(index.min_word_len_two_typos(&txn).unwrap(), 8); + } + + #[test] + fn update_invalid_min_word_len_for_typo() { + let index = TempIndex::new(); + let config = IndexerConfig::default(); + + // Set the genres setting + let mut txn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut txn, &index, &config); + builder.set_min_word_len_one_typo(10); + builder.set_min_word_len_two_typos(7); + assert!(builder.execute(|_| ()).is_err()); + } } diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index 31d53b666..52b4c7114 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -16,6 +16,7 @@ mod distinct; mod filters; mod query_criteria; mod sort; +mod typo_tolerance; pub const TEST_QUERY: &'static str = "hello world america"; diff --git a/milli/tests/search/typo_tolerance.rs b/milli/tests/search/typo_tolerance.rs new file mode 100644 index 000000000..00e6853cc --- /dev/null +++ b/milli/tests/search/typo_tolerance.rs @@ -0,0 +1,95 @@ +use milli::update::{IndexerConfig, Settings}; +use milli::{Criterion, Search}; +use Criterion::*; + +#[test] +fn test_typo_tolerance_one_typo() { + let criteria = [Typo]; + let index = super::setup_search_index_with_criteria(&criteria); + + // basic typo search with default typo settings + { + let txn = index.read_txn().unwrap(); + + let mut search = Search::new(&txn, &index); + search.query("zeal"); + search.limit(10); + search.authorize_typos(true); + search.optional_words(true); + + let result = search.execute().unwrap(); + assert_eq!(result.documents_ids.len(), 1); + + let mut search = Search::new(&txn, &index); + search.query("zean"); + search.limit(10); + search.authorize_typos(true); + search.optional_words(true); + + let result = search.execute().unwrap(); + assert_eq!(result.documents_ids.len(), 0); + } + + let mut txn = index.write_txn().unwrap(); + + let config = IndexerConfig::default(); + let mut builder = Settings::new(&mut txn, &index, &config); + builder.set_min_word_len_one_typo(4); + builder.execute(|_| ()).unwrap(); + + // typo is now supported for 4 letters words + let mut search = Search::new(&txn, &index); + search.query("zean"); + search.limit(10); + search.authorize_typos(true); + search.optional_words(true); + + let result = search.execute().unwrap(); + assert_eq!(result.documents_ids.len(), 1); +} + +#[test] +fn test_typo_tolerance_two_typo() { + let criteria = [Typo]; + let index = super::setup_search_index_with_criteria(&criteria); + + // basic typo search with default typo settings + { + let txn = index.read_txn().unwrap(); + + let mut search = Search::new(&txn, &index); + search.query("zealand"); + search.limit(10); + search.authorize_typos(true); + search.optional_words(true); + + let result = search.execute().unwrap(); + assert_eq!(result.documents_ids.len(), 1); + + let mut search = Search::new(&txn, &index); + search.query("zealemd"); + search.limit(10); + search.authorize_typos(true); + search.optional_words(true); + + let result = search.execute().unwrap(); + assert_eq!(result.documents_ids.len(), 0); + } + + let mut txn = index.write_txn().unwrap(); + + let config = IndexerConfig::default(); + let mut builder = Settings::new(&mut txn, &index, &config); + builder.set_min_word_len_two_typos(7); + builder.execute(|_| ()).unwrap(); + + // typo is now supported for 4 letters words + let mut search = Search::new(&txn, &index); + search.query("zealemd"); + search.limit(10); + search.authorize_typos(true); + search.optional_words(true); + + let result = search.execute().unwrap(); + assert_eq!(result.documents_ids.len(), 1); +}