From 3b1cbed2385edcc776a2b84b6c484a9fe25114a0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Mon, 4 Nov 2019 16:58:02 +0100
Subject: [PATCH] Check that the unidecoded words are not empty

---
 meilidb-core/src/raw_indexer.rs | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/meilidb-core/src/raw_indexer.rs b/meilidb-core/src/raw_indexer.rs
index 988a5182e..3e0f212f7 100644
--- a/meilidb-core/src/raw_indexer.rs
+++ b/meilidb-core/src/raw_indexer.rs
@@ -139,11 +139,12 @@ fn index_token(
 
     if !lower.contains(is_cjk) {
         let unidecoded = deunicode_with_tofu(&lower, "");
-        if unidecoded != lower {
+        if unidecoded != lower && !unidecoded.is_empty() {
             let token = Token {
                 word: &unidecoded,
                 ..token
             };
+
             match token_to_docindex(id, attr, token) {
                 Some(docindex) => {
                     let word = Vec::from(token.word);
@@ -252,4 +253,22 @@ mod tests {
             .get(&"éteindre".to_owned().into_bytes())
             .is_some());
     }
+
+    #[test]
+    fn no_empty_unidecode() {
+        let mut indexer = RawIndexer::new(fst::Set::default());
+
+        let docid = DocumentId(0);
+        let attr = SchemaAttr(0);
+        let text = "🇯🇵";
+        indexer.index_text(docid, attr, text);
+
+        let Indexed {
+            words_doc_indexes, ..
+        } = indexer.build();
+
+        assert!(words_doc_indexes
+            .get(&"🇯🇵".to_owned().into_bytes())
+            .is_some());
+    }
 }