From de2b8672d48e4d75444cd34e321ba8d14f7e6465 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Mon, 4 Nov 2019 16:10:13 +0100
Subject: [PATCH] Make the tokenizer understand strange whitespaces/quotes

---
 meilidb-tokenizer/Cargo.toml | 1 +
 meilidb-tokenizer/src/lib.rs | 6 +++++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/meilidb-tokenizer/Cargo.toml b/meilidb-tokenizer/Cargo.toml
index 06a3be9ad..97220ef24 100644
--- a/meilidb-tokenizer/Cargo.toml
+++ b/meilidb-tokenizer/Cargo.toml
@@ -5,4 +5,5 @@ authors = ["Kerollmops "]
 edition = "2018"
 
 [dependencies]
+deunicode = "1.0.0"
 slice-group-by = "0.2.4"
diff --git a/meilidb-tokenizer/src/lib.rs b/meilidb-tokenizer/src/lib.rs
index 106d0f91f..6bbcbea3d 100644
--- a/meilidb-tokenizer/src/lib.rs
+++ b/meilidb-tokenizer/src/lib.rs
@@ -1,4 +1,5 @@
 use self::SeparatorCategory::*;
+use deunicode::deunicode_char;
 use slice_group_by::StrGroupBy;
 use std::iter::Peekable;
 
@@ -43,7 +44,10 @@ fn is_separator(c: char) -> bool {
 
 fn classify_separator(c: char) -> Option<SeparatorCategory> {
     match c {
-        ' ' | '-' | '_' | '\'' | ':' | '"' => Some(Soft),
+        c if c.is_whitespace() => Some(Soft), // whitespaces
+        c if deunicode_char(c) == Some("'") => Some(Soft), // quotes
+        c if deunicode_char(c) == Some("\"") => Some(Soft), // double quotes
+        '-' | '_' | '\'' | ':' => Some(Soft),
         '.' | ';' | ',' | '!' | '?' | '(' | ')' => Some(Hard),
         _ => None,
     }