diff --git a/milli/Cargo.toml b/milli/Cargo.toml index f5277a8fa..4e4fdc483 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -9,7 +9,7 @@ bimap = { version = "0.6.2", features = ["serde"] } bincode = "1.3.3" bstr = "1.0.1" byteorder = "1.4.3" -charabia = { version = "0.6.0", default-features = false } +charabia = { version = "0.7.0", default-features = false } concat-arrays = "0.1.2" crossbeam-channel = "0.5.6" either = "1.8.0" @@ -70,6 +70,10 @@ hebrew = ["charabia/hebrew"] # allow japanese specialized tokenization japanese = ["charabia/japanese"] +japanese-transliteration = ["charabia/japanese-transliteration"] + +# allow korean specialized tokenization +korean = ["charabia/korean"] # allow thai specialized tokenization thai = ["charabia/thai"] diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs index 25ee52ab1..6ac5123a8 100644 --- a/milli/src/search/matches/mod.rs +++ b/milli/src/search/matches/mod.rs @@ -14,14 +14,14 @@ const DEFAULT_HIGHLIGHT_SUFFIX: &str = ""; /// Structure used to build a Matcher allowing to customize formating tags. pub struct MatcherBuilder<'a, A> { matching_words: MatchingWords, - tokenizer: Tokenizer<'a, A>, + tokenizer: Tokenizer<'a, 'a, A>, crop_marker: Option, highlight_prefix: Option, highlight_suffix: Option, } impl<'a, A> MatcherBuilder<'a, A> { - pub fn new(matching_words: MatchingWords, tokenizer: Tokenizer<'a, A>) -> Self { + pub fn new(matching_words: MatchingWords, tokenizer: Tokenizer<'a, 'a, A>) -> Self { Self { matching_words, tokenizer, @@ -106,7 +106,7 @@ pub struct MatchBounds { pub struct Matcher<'t, 'm, A> { text: &'t str, matching_words: &'m MatchingWords, - tokenizer: &'m Tokenizer<'m, A>, + tokenizer: &'m Tokenizer<'m, 'm, A>, crop_marker: &'m str, highlight_prefix: &'m str, highlight_suffix: &'m str, diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index e689ae440..b5399f6e6 100755 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -6,7 +6,7 @@ use std::hash::Hash; use std::rc::Rc; use std::{fmt, mem}; -use charabia::classifier::ClassifiedTokenIter; +use charabia::normalizer::NormalizedTokenIter; use charabia::{SeparatorKind, TokenKind}; use roaring::RoaringBitmap; use slice_group_by::GroupBy; @@ -270,7 +270,7 @@ impl<'a> QueryTreeBuilder<'a> { /// (the criterion `typo` will be ignored) pub fn build>( &self, - query: ClassifiedTokenIter, + query: NormalizedTokenIter, ) -> Result> { let primitive_query = create_primitive_query(query, self.words_limit); if !primitive_query.is_empty() { @@ -778,7 +778,7 @@ impl PrimitiveQueryPart { /// Create primitive query from tokenized query string, /// the primitive query is an intermediate state to build the query tree. fn create_primitive_query( - query: ClassifiedTokenIter, + query: NormalizedTokenIter, words_limit: Option, ) -> PrimitiveQuery where @@ -892,7 +892,7 @@ mod test { terms_matching_strategy: TermsMatchingStrategy, authorize_typos: bool, words_limit: Option, - query: ClassifiedTokenIter, + query: NormalizedTokenIter, ) -> Result> { let primitive_query = create_primitive_query(query, words_limit); if !primitive_query.is_empty() { diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 9e55318ca..f912a756a 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -1575,11 +1575,11 @@ mod tests { let rtxn = index.read_txn().unwrap(); // Only the first document should match. - let count = index.word_docids.get(&rtxn, "化妆包").unwrap().unwrap().len(); + let count = index.word_docids.get(&rtxn, "huàzhuāngbāo").unwrap().unwrap().len(); assert_eq!(count, 1); // Only the second document should match. - let count = index.word_docids.get(&rtxn, "包").unwrap().unwrap().len(); + let count = index.word_docids.get(&rtxn, "bāo").unwrap().unwrap().len(); assert_eq!(count, 1); let mut search = crate::Search::new(&rtxn, &index);