mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-11-23 10:37:41 +08:00
Make the tokenizer understand strange whitespaces/quotes
This commit is contained in:
parent
ccded7b429
commit
de2b8672d4
@@ -5,4 +5,5 @@ authors = ["Kerollmops <renault.cle@gmail.com>"]
|
|||||||
edition = "2018"
|
edition = "2018"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
|
deunicode = "1.0.0"
|
||||||
slice-group-by = "0.2.4"
|
slice-group-by = "0.2.4"
|
||||||
|
@@ -1,4 +1,5 @@
|
|||||||
use self::SeparatorCategory::*;
|
use self::SeparatorCategory::*;
|
||||||
|
use deunicode::deunicode_char;
|
||||||
use slice_group_by::StrGroupBy;
|
use slice_group_by::StrGroupBy;
|
||||||
use std::iter::Peekable;
|
use std::iter::Peekable;
|
||||||
|
|
||||||
@@ -43,7 +44,10 @@ fn is_separator(c: char) -> bool {
|
|||||||
|
|
||||||
fn classify_separator(c: char) -> Option<SeparatorCategory> {
|
fn classify_separator(c: char) -> Option<SeparatorCategory> {
|
||||||
match c {
|
match c {
|
||||||
' ' | '-' | '_' | '\'' | ':' | '"' => Some(Soft),
|
c if c.is_whitespace() => Some(Soft), // whitespaces
|
||||||
|
c if deunicode_char(c) == Some("'") => Some(Soft), // quotes
|
||||||
|
c if deunicode_char(c) == Some("\"") => Some(Soft), // double quotes
|
||||||
|
'-' | '_' | '\'' | ':' => Some(Soft),
|
||||||
'.' | ';' | ',' | '!' | '?' | '(' | ')' => Some(Hard),
|
'.' | ';' | ',' | '!' | '?' | '(' | ')' => Some(Hard),
|
||||||
_ => None,
|
_ => None,
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user