Make the tokenizer understand unusual whitespace and quote characters

This commit is contained in:
Clément Renault 2019-11-04 16:10:13 +01:00
parent ccded7b429
commit de2b8672d4
2 changed files with 6 additions and 1 deletion

View File

@ -5,4 +5,5 @@ authors = ["Kerollmops <renault.cle@gmail.com>"]
edition = "2018" edition = "2018"
[dependencies] [dependencies]
deunicode = "1.0.0"
slice-group-by = "0.2.4" slice-group-by = "0.2.4"

View File

@ -1,4 +1,5 @@
use self::SeparatorCategory::*; use self::SeparatorCategory::*;
use deunicode::deunicode_char;
use slice_group_by::StrGroupBy; use slice_group_by::StrGroupBy;
use std::iter::Peekable; use std::iter::Peekable;
@ -43,7 +44,10 @@ fn is_separator(c: char) -> bool {
fn classify_separator(c: char) -> Option<SeparatorCategory> { fn classify_separator(c: char) -> Option<SeparatorCategory> {
match c { match c {
' ' | '-' | '_' | '\'' | ':' | '"' => Some(Soft), c if c.is_whitespace() => Some(Soft), // whitespaces
c if deunicode_char(c) == Some("'") => Some(Soft), // quotes
c if deunicode_char(c) == Some("\"") => Some(Soft), // double quotes
'-' | '_' | '\'' | ':' => Some(Soft),
'.' | ';' | ',' | '!' | '?' | '(' | ')' => Some(Hard), '.' | ';' | ',' | '!' | '?' | '(' | ')' => Some(Hard),
_ => None, _ => None,
} }