From b5503989f912ef4141ce2ed4bda503802d8b16b6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Mon, 2 Sep 2019 16:54:54 +0200
Subject: [PATCH] feat: Improve the tokenizer by split after deunicode

---
 meilidb-data/src/indexer.rs | 112 ++++++++++++++++++++++++++++--------
 1 file changed, 89 insertions(+), 23 deletions(-)

diff --git a/meilidb-data/src/indexer.rs b/meilidb-data/src/indexer.rs
index a4dc52f8a..591ddd705 100644
--- a/meilidb-data/src/indexer.rs
+++ b/meilidb-data/src/indexer.rs
@@ -34,7 +34,43 @@ impl Indexer {
     }
 
     pub fn index_text(&mut self, id: DocumentId, attr: SchemaAttr, text: &str) {
-        for token in Tokenizer::new(text) {
+        let lowercase_text = text.to_lowercase();
+        let deunicoded = deunicode_with_tofu(&lowercase_text, "");
+
+        // TODO compute the deunicoded version after the cjk check
+        let next = if !lowercase_text.contains(is_cjk) && lowercase_text != deunicoded {
+            Some(deunicoded)
+        } else {
+            None
+        };
+        let iter = Some(lowercase_text).into_iter().chain(next);
+
+        for text in iter {
+            for token in Tokenizer::new(&text) {
+                let must_continue = index_token(
+                    token,
+                    id,
+                    attr,
+                    self.word_limit,
+                    &mut self.words_doc_indexes,
+                    &mut self.docs_words,
+                );
+
+                if !must_continue { break }
+            }
+        }
+    }
+
+    pub fn index_text_seq<'a, I, IT>(&mut self, id: DocumentId, attr: SchemaAttr, iter: I)
+    where I: IntoIterator,
+          IT: Iterator + Clone,
+    {
+        // TODO serialize this to one call to the SeqTokenizer loop
+
+        let lowercased: Vec<_> = iter.into_iter().map(str::to_lowercase).collect();
+        let iter = lowercased.iter().map(|t| t.as_str());
+
+        for token in SeqTokenizer::new(iter) {
             let must_continue = index_token(
                 token,
                 id,
@@ -46,12 +82,14 @@ impl Indexer {
 
             if !must_continue { break }
         }
-    }
 
-    pub fn index_text_seq<'a, I>(&mut self, id: DocumentId, attr: SchemaAttr, iter: I)
-    where I: IntoIterator,
-    {
-        let iter = iter.into_iter();
+        let deunicoded: Vec<_> = lowercased.into_iter().map(|lowercase_text| {
+            if lowercase_text.contains(is_cjk) { return lowercase_text }
+            let deunicoded = deunicode_with_tofu(&lowercase_text, "");
+            if lowercase_text != deunicoded { deunicoded } else { lowercase_text }
+        }).collect();
+        let iter = deunicoded.iter().map(|t| t.as_str());
+
         for token in SeqTokenizer::new(iter) {
             let must_continue = index_token(
                 token,
@@ -96,8 +134,6 @@ fn index_token(
 {
     if token.word_index >= word_limit { return false }
 
-    let lower = token.word.to_lowercase();
-    let token = Token { word: &lower, ..token };
     match token_to_docindex(id, attr, token) {
         Some(docindex) => {
             let word = Vec::from(token.word);
@@ -107,21 +143,6 @@ fn index_token(
         None => return false,
     }
 
-    if !lower.contains(is_cjk) {
-        let unidecoded = deunicode_with_tofu(&lower, "");
-        if unidecoded != lower {
-            let token = Token { word: &unidecoded, ..token };
-            match token_to_docindex(id, attr, token) {
-                Some(docindex) => {
-                    let word = Vec::from(token.word);
-                    words_doc_indexes.entry(word.clone()).or_insert_with(Vec::new).push(docindex);
-                    docs_words.entry(id).or_insert_with(Vec::new).push(word);
-                },
-                None => return false,
-            }
-        }
-    }
-
     true
 }
 
@@ -140,3 +161,48 @@ fn token_to_docindex(id: DocumentId, attr: SchemaAttr, token: Token) -> Option
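
Note: the sketch below illustrates the split-after-deunicode strategy this patch introduces: lowercase the text first, transliterate the whole lowercased string with deunicode_with_tofu, and tokenize both variants when they differ and the text is not CJK. It is a minimal stand-alone example, not meilidb code: is_cjk and the whitespace split are simplified stand-ins for meilidb's own helpers, and only deunicode_with_tofu (from the deunicode crate) is the real API the patch relies on.

// A minimal sketch of the indexing strategy introduced by this patch.
use deunicode::deunicode_with_tofu;

// Hypothetical, simplified stand-in for meilidb's `is_cjk` character check.
fn is_cjk(c: char) -> bool {
    matches!(c as u32, 0x3040..=0x30FF | 0x4E00..=0x9FFF | 0xAC00..=0xD7AF)
}

// Returns the text variants that would each be run through the tokenizer:
// the lowercased text, plus its deunicoded form when that form differs.
fn text_variants(text: &str) -> Vec<String> {
    let lowercase_text = text.to_lowercase();
    let deunicoded = deunicode_with_tofu(&lowercase_text, "");

    // Only keep the deunicoded variant when it actually differs and the text
    // contains no CJK characters (transliterating CJK would distort the words).
    let next = if !lowercase_text.contains(is_cjk) && lowercase_text != deunicoded {
        Some(deunicoded)
    } else {
        None
    };

    Some(lowercase_text).into_iter().chain(next).collect()
}

fn main() {
    // "Télé" is indexed both as "télé" and as its ASCII form "tele",
    // so a query typed without accents still matches.
    for variant in text_variants("Télé du matin") {
        for word in variant.split_whitespace() {
            println!("index word: {word}");
        }
    }
}

Because the whole string is deunicoded before tokenization (rather than deunicoding each token inside index_token, as the removed code did), characters that transliterate to several ASCII letters can change where word boundaries fall, which is what the commit title refers to.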