2018-09-27 22:32:17 +08:00
|
|
|
use std::mem;
|
|
|
|
use self::Separator::*;
|
|
|
|
|
2018-11-16 00:55:20 +08:00
|
|
|
/// Factory for token streams over a borrowed piece of text.
///
/// The returned iterator is boxed so that different implementors can hand
/// back different concrete iterator types behind the same interface.
pub trait TokenizerBuilder {
    /// Builds an iterator of `(usize, &str)` pairs over `text`.
    ///
    /// The yielded string slices borrow from `text`, and the iterator itself
    /// may not outlive it. What the `usize` component means is up to the
    /// implementor.
    // `dyn` is spelled out: the bare `Box<Iterator<..>>` form is deprecated
    // and rejected by newer editions, while naming the exact same type.
    fn build<'a>(&self, text: &'a str) -> Box<dyn Iterator<Item = (usize, &'a str)> + 'a>;
}
|
|
|
|
|
2018-11-16 00:55:20 +08:00
|
|
|
/// Zero-sized tokenizer builder.
///
/// Its `TokenizerBuilder` implementation in this file wraps the input text
/// in a `Tokenizer`.
#[derive(Debug, Default, Clone, Copy)]
pub struct DefaultBuilder;

impl DefaultBuilder {
    /// Creates a new `DefaultBuilder`.
    ///
    /// Equivalent to `DefaultBuilder::default()`; the type carries no state.
    pub fn new() -> DefaultBuilder {
        DefaultBuilder
    }
}
|
2018-09-27 22:32:17 +08:00
|
|
|
|
2018-11-16 00:55:20 +08:00
|
|
|
impl TokenizerBuilder for DefaultBuilder {
|
|
|
|
fn build<'a>(&self, text: &'a str) -> Box<Iterator<Item=(usize, &'a str)> + 'a> {
|
|
|
|
Box::new(Tokenizer::new(text))
|
2018-09-27 22:32:17 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-11-16 00:55:20 +08:00
|
|
|
/// Iterator that splits a borrowed string into words.
///
/// Words are yielded together with a running position; see the `Iterator`
/// implementation for how that position advances.
pub struct Tokenizer<'a> {
    /// Position attached to the most recently yielded word; starts at 0.
    index: usize,
    /// Portion of the input that has not been consumed yet.
    inner: &'a str,
}

impl<'a> Tokenizer<'a> {
    /// Creates a tokenizer over `string`.
    ///
    /// Leading and trailing separator characters (space, `.`, `;`, `,`, `!`,
    /// `?`, `-`, `'` and `"`) are stripped up front, so the remaining text
    /// is either empty or begins and ends with a non-separator character.
    pub fn new(string: &str) -> Tokenizer {
        // Same character set the original passed to `trim_matches` as a slice.
        let is_edge_separator = |c: char| match c {
            ' ' | '.' | ';' | ',' | '!' | '?' | '-' | '\'' | '"' => true,
            _ => false,
        };

        let inner = string.trim_matches(is_edge_separator);

        Tokenizer { index: 0, inner }
    }
}
|
|
|
|
|
|
|
|
/// Kind of gap found between two words.
#[derive(Debug, Clone, Copy)]
enum Separator {
    /// Weak separator; worth 1 in `to_usize`.
    Short,
    /// Strong separator; worth 8 in `to_usize`.
    Long,
}

impl Separator {
    /// Combines two separators found in the same gap.
    ///
    /// The result is `Short` only when both sides are `Short`; a `Long` on
    /// either side wins.
    fn add(self, add: Separator) -> Separator {
        if let (Separator::Short, Separator::Short) = (self, add) {
            Separator::Short
        } else {
            Separator::Long
        }
    }

    /// Returns the numeric weight of this separator: 1 for `Short`, 8 for `Long`.
    fn to_usize(self) -> usize {
        match self {
            Separator::Long => 8,
            Separator::Short => 1,
        }
    }
}
|
|
|
|
|
2018-11-16 00:55:20 +08:00
|
|
|
impl<'a> Iterator for Tokenizer<'a> {
|
2018-09-27 22:32:17 +08:00
|
|
|
type Item = (usize, &'a str);
|
|
|
|
|
|
|
|
fn next(&mut self) -> Option<Self::Item> {
|
|
|
|
let mut start_word = None;
|
|
|
|
let mut distance = None;
|
|
|
|
|
|
|
|
for (i, c) in self.inner.char_indices() {
|
|
|
|
let separator = match c {
|
|
|
|
'.' | ';' | ',' | '!' | '?' | '-' => Some(Long),
|
2018-10-17 23:00:49 +08:00
|
|
|
' ' | '\'' | '"' => Some(Short),
|
2018-09-27 22:32:17 +08:00
|
|
|
_ => None,
|
|
|
|
};
|
|
|
|
|
|
|
|
match separator {
|
|
|
|
Some(dist) => {
|
|
|
|
if let Some(start_word) = start_word {
|
|
|
|
let (word, tail) = self.inner.split_at(i);
|
|
|
|
|
|
|
|
self.inner = tail;
|
|
|
|
self.index += distance.map(Separator::to_usize).unwrap_or(0);
|
|
|
|
|
|
|
|
let word = &word[start_word..];
|
|
|
|
return Some((self.index, word))
|
|
|
|
}
|
|
|
|
distance = Some(distance.map(|s| s.add(dist)).unwrap_or(dist));
|
|
|
|
},
|
|
|
|
None => { start_word.get_or_insert(i); },
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if let Some(start_word) = start_word {
|
|
|
|
let word = mem::replace(&mut self.inner, "");
|
|
|
|
self.index += distance.map(Separator::to_usize).unwrap_or(0);
|
|
|
|
|
|
|
|
let word = &word[start_word..];
|
|
|
|
return Some((self.index, word))
|
|
|
|
}
|
|
|
|
|
|
|
|
None
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs a `Tokenizer` over `text` to exhaustion and gathers every
    /// `(position, word)` pair it yields.
    fn tokenize(text: &str) -> Vec<(usize, &str)> {
        Tokenizer::new(text).collect()
    }

    /// Single words, with and without a trailing separator.
    #[test]
    fn easy() {
        assert_eq!(tokenize("salut"), vec![(0, "salut")]);
        assert_eq!(tokenize("yo "), vec![(0, "yo")]);
    }

    /// Mixed short and long separators, leading and trailing noise, and
    /// multi-byte characters.
    #[test]
    fn hard() {
        assert_eq!(
            tokenize(" .? yo lolo. aïe"),
            vec![(0, "yo"), (1, "lolo"), (9, "aïe")]
        );

        assert_eq!(
            tokenize("yo ! lolo ? wtf - lol . aïe ,"),
            vec![
                (0, "yo"),
                (8, "lolo"),
                (16, "wtf"),
                (24, "lol"),
                (32, "aïe"),
            ]
        );
    }
}
|