mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-11-23 10:37:41 +08:00
Make the tokenizer understand strange whitespaces/quotes
This commit is contained in:
parent
ccded7b429
commit
de2b8672d4
@ -5,4 +5,5 @@ authors = ["Kerollmops <renault.cle@gmail.com>"]
|
||||
edition = "2018"
|
||||
|
||||
[dependencies]
|
||||
deunicode = "1.0.0"
|
||||
slice-group-by = "0.2.4"
|
||||
|
@ -1,4 +1,5 @@
|
||||
use self::SeparatorCategory::*;
|
||||
use deunicode::deunicode_char;
|
||||
use slice_group_by::StrGroupBy;
|
||||
use std::iter::Peekable;
|
||||
|
||||
@ -43,7 +44,10 @@ fn is_separator(c: char) -> bool {
|
||||
|
||||
fn classify_separator(c: char) -> Option<SeparatorCategory> {
|
||||
match c {
|
||||
' ' | '-' | '_' | '\'' | ':' | '"' => Some(Soft),
|
||||
c if c.is_whitespace() => Some(Soft), // whitespaces
|
||||
c if deunicode_char(c) == Some("'") => Some(Soft), // quotes
|
||||
c if deunicode_char(c) == Some("\"") => Some(Soft), // double quotes
|
||||
'-' | '_' | '\'' | ':' => Some(Soft),
|
||||
'.' | ';' | ',' | '!' | '?' | '(' | ')' => Some(Hard),
|
||||
_ => None,
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user