mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-11-23 02:27:40 +08:00
Merge pull request #117 from Kerollmops/tokenizer-support-parentheses
Make the tokenizer support parentheses
This commit is contained in:
commit
12a352ae2f
@ -75,9 +75,9 @@ impl Separator {
|
|||||||
|
|
||||||
fn detect_separator(c: char) -> Option<Separator> {
|
fn detect_separator(c: char) -> Option<Separator> {
|
||||||
match c {
|
match c {
|
||||||
'.' | ';' | ',' | '!' | '?' | '-' => Some(Long),
|
'.' | ';' | ',' | '!' | '?' | '-' | '(' | ')' => Some(Long),
|
||||||
' ' | '\'' | '"' => Some(Short),
|
' ' | '\'' | '"' => Some(Short),
|
||||||
_ => None,
|
_ => None,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -150,11 +150,12 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn hard() {
|
fn hard() {
|
||||||
let mut tokenizer = Tokenizer::new(" .? yo lolo. aïe");
|
let mut tokenizer = Tokenizer::new(" .? yo lolo. aïe (ouch)");
|
||||||
|
|
||||||
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 }));
|
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 }));
|
||||||
assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 1, char_index: 7 }));
|
assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 1, char_index: 7 }));
|
||||||
assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 13 }));
|
assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 13 }));
|
||||||
|
assert_eq!(tokenizer.next(), Some(Token { word: "ouch", word_index: 17, char_index: 18 }));
|
||||||
assert_eq!(tokenizer.next(), None);
|
assert_eq!(tokenizer.next(), None);
|
||||||
|
|
||||||
let mut tokenizer = Tokenizer::new("yo ! lolo ? wtf - lol . aïe ,");
|
let mut tokenizer = Tokenizer::new("yo ! lolo ? wtf - lol . aïe ,");
|
||||||
|
Loading…
Reference in New Issue
Block a user