meilisearch/meilidb-tokenizer/src/lib.rs

use std::mem;
use slice_group_by::LinearStrGroupBy;
use self::Separator::*;

/// Returns `true` when `c` falls in one of the CJK-related Unicode blocks the
/// tokenizer treats as "one character = one word".
///
/// Hangul syllables are not part of any of the listed ranges.
pub fn is_cjk(c: char) -> bool {
    match c {
        '\u{2e80}'..='\u{2eff}'     // CJK Radicals Supplement
        | '\u{2f00}'..='\u{2fdf}'   // Kangxi Radicals
        | '\u{3040}'..='\u{309f}'   // Hiragana
        | '\u{30a0}'..='\u{30ff}'   // Katakana
        | '\u{3100}'..='\u{312f}'   // Bopomofo
        | '\u{3200}'..='\u{32ff}'   // Enclosed CJK Letters and Months
        | '\u{3400}'..='\u{4dbf}'   // CJK Unified Ideographs Extension A
        | '\u{4e00}'..='\u{9fff}'   // CJK Unified Ideographs
        | '\u{f900}'..='\u{faff}'   // CJK Compatibility Ideographs
        => true,
        _ => false,
    }
}

/// Coarse character class used to decide where a query string is split.
#[derive(Debug, PartialEq, Eq)]
enum CharCategory {
    /// A character for which `char::is_whitespace` returns `true`.
    Space,
    /// A non-whitespace character accepted by `is_cjk`.
    Cjk,
    /// Anything that is neither whitespace nor CJK.
    Other,
}

/// Maps a character to its [`CharCategory`].
///
/// Whitespace is tested before CJK, so a character matching both predicates
/// is reported as `Space`.
fn classify_char(c: char) -> CharCategory {
    match c {
        _ if c.is_whitespace() => CharCategory::Space,
        _ if is_cjk(c) => CharCategory::Cjk,
        _ => CharCategory::Other,
    }
}

/// Tells whether a group is a word, i.e. contains no whitespace at all.
///
/// Takes `&&str` so it can be handed directly to `Iterator::filter` over an
/// iterator of `&str`. An empty string counts as a word.
fn is_word(s: &&str) -> bool {
    s.chars().all(|c| !c.is_whitespace())
}

/// Grouping predicate: may `a` and `b` (two neighbouring characters) stay in
/// the same group?
///
/// A CJK character never shares a group with its neighbour, not even with
/// another CJK character; otherwise the two must have the same category.
fn same_group_category(a: char, b: char) -> bool {
    match (classify_char(a), classify_char(b)) {
        (CharCategory::Cjk, _) | (_, CharCategory::Cjk) => false,
        (left, right) => left == right,
    }
}

/// Splits a query string into the words it contains.
///
/// Neighbouring characters are grouped with `same_group_category`, then every
/// group that contains whitespace is discarded by `is_word`. The yielded
/// slices borrow from `query`.
pub fn split_query_string(query: &str) -> impl Iterator<Item=&str> {
    let groups = LinearStrGroupBy::new(query, same_group_category);
    groups.filter(is_word)
}

pub trait TokenizerBuilder {
    fn build<'a>(&self, text: &'a str) -> Box<Iterator<Item=Token<'a>> + 'a>;
}

/// Stateless builder type.
///
/// It is a unit struct, so `Default`, `Clone`, `Copy` and `Debug` are derived:
/// every value is identical and `DefaultBuilder::default()` is the same as
/// `DefaultBuilder::new()`.
#[derive(Debug, Default, Clone, Copy)]
pub struct DefaultBuilder;

impl DefaultBuilder {
    /// Creates a builder. Equivalent to `DefaultBuilder::default()`.
    pub fn new() -> DefaultBuilder {
        DefaultBuilder
    }
}

/// One word produced by the tokenizer, together with its position.
#[derive(Debug, PartialEq, Eq)]
pub struct Token<'a> {
    /// The word itself, borrowed from the tokenized text.
    pub word: &'a str,
    /// Logical position of the word. It starts at 0 for the first word and
    /// grows with the separators met between words (1 for a short separator
    /// run, 8 when the run contains a long separator).
    pub word_index: usize,
    /// Offset of the word's first character in the tokenized text, counted in
    /// `char`s (not bytes).
    pub char_index: usize,
}

impl TokenizerBuilder for DefaultBuilder {
    /// Tokenizes `text` with a [`Tokenizer`], boxed as a trait object.
    ///
    /// The return type uses the explicit `dyn` keyword; it is the same type as
    /// the deprecated bare trait-object spelling.
    fn build<'a>(&self, text: &'a str) -> Box<dyn Iterator<Item = Token<'a>> + 'a> {
        Box::new(Tokenizer::new(text))
    }
}

/// Iterator that cuts a text into [`Token`]s.
pub struct Tokenizer<'a> {
    /// Running word position, advanced as tokens are emitted.
    word_index: usize,
    /// Number of `char`s of the original text that precede `inner`.
    char_index: usize,
    /// The part of the text that has not been tokenized yet.
    inner: &'a str,
}

impl<'a> Tokenizer<'a> {
    /// Creates a tokenizer positioned on the first non-separator character of
    /// `string`.
    ///
    /// Leading separators are skipped here so that they add nothing to the
    /// first token's `word_index`, while `char_index` still accounts for them.
    pub fn new(string: &str) -> Tokenizer {
        // (char offset, byte offset) of the first character that is not a
        // separator. When every character is a separator the last character is
        // used instead, and an empty string yields (0, 0).
        let (char_advance, index_advance) = string
            .char_indices()
            .enumerate()
            .find(|&(_, (_, c))| detect_separator(c).is_none())
            .or_else(|| string.char_indices().enumerate().last())
            .map_or((0, 0), |(n, (i, _))| (n, i));

        Tokenizer {
            word_index: 0,
            char_index: char_advance,
            inner: &string[index_advance..],
        }
    }
}

/// Strength of a separator found between two words.
#[derive(Debug, Clone, Copy)]
enum Separator {
    /// Soft separator (space or quote); worth a word distance of 1.
    Short,
    /// Hard separator (punctuation); worth a word distance of 8.
    Long,
}

impl Separator {
    fn add(self, add: Separator) -> Separator {
        match (self, add) {
            (_,     Long)  => Long,
            (Short, Short) => Short,
            (Long,  Short) => Long,
        }
    }

    fn to_usize(self) -> usize {
        match self {
            Short => 1,
            Long => 8,
        }
    }
}

fn detect_separator(c: char) -> Option<Separator> {
    match c {
        '.' | ';' | ',' | '!' | '?' | '-' | '(' | ')' => Some(Long),
        ' ' | '\'' | '"' => Some(Short),
        _                => None,
    }
}

impl<'a> Iterator for Tokenizer<'a> {
    type Item = Token<'a>;

    fn next(&mut self) -> Option<Self::Item> {
        let mut start_word = None;
        let mut distance = None;

        for (i, c) in self.inner.char_indices() {
            match detect_separator(c) {
                Some(sep) => {
                    if let Some(start_word) = start_word {
                        let (prefix, tail) = self.inner.split_at(i);
                        let (spaces, word) = prefix.split_at(start_word);

                        self.inner = tail;
                        self.char_index += spaces.chars().count();
                        self.word_index += distance.map(Separator::to_usize).unwrap_or(0);

                        let token = Token {
                            word: word,
                            word_index: self.word_index,
                            char_index: self.char_index,
                        };

                        self.char_index += word.chars().count();
                        return Some(token)
                    }

                    distance = Some(distance.map_or(sep, |s| s.add(sep)));
                },
                None => {
                    // if this is a Chinese, a Japanese or a Korean character
                    // See <http://unicode-table.com>
                    if is_cjk(c) {
                        match start_word {
                            Some(start_word) => {
                                let (prefix, tail) = self.inner.split_at(i);
                                let (spaces, word) = prefix.split_at(start_word);

                                self.inner = tail;
                                self.char_index += spaces.chars().count();
                                self.word_index += distance.map(Separator::to_usize).unwrap_or(0);

                                let token = Token {
                                    word: word,
                                    word_index: self.word_index,
                                    char_index: self.char_index,
                                };

                                self.word_index += 1;
                                self.char_index += word.chars().count();

                                return Some(token)
                            },
                            None => {
                                let (prefix, tail) = self.inner.split_at(i + c.len_utf8());
                                let (spaces, word) = prefix.split_at(i);

                                self.inner = tail;
                                self.char_index += spaces.chars().count();
                                self.word_index += distance.map(Separator::to_usize).unwrap_or(0);

                                let token = Token {
                                    word: word,
                                    word_index: self.word_index,
                                    char_index: self.char_index,
                                };

                                if tail.chars().next().and_then(detect_separator).is_none() {
                                    self.word_index += 1;
                                }
                                self.char_index += 1;

                                return Some(token)
                            }
                        }
                    }

                    if start_word.is_none() { start_word = Some(i) }
                },
            }
        }

        if let Some(start_word) = start_word {
            let prefix = mem::replace(&mut self.inner, "");
            let (spaces, word) = prefix.split_at(start_word);

            let token = Token {
                word: word,
                word_index: self.word_index + distance.map(Separator::to_usize).unwrap_or(0),
                char_index: self.char_index + spaces.chars().count(),
            };
            return Some(token)
        }

        None
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    // A lone word, with or without trailing spaces, yields a single token at
    // word and char position 0.
    #[test]
    fn easy() {
        let mut tokenizer = Tokenizer::new("salut");

        assert_eq!(tokenizer.next(), Some(Token { word: "salut", word_index: 0, char_index: 0 }));
        assert_eq!(tokenizer.next(), None);

        let mut tokenizer = Tokenizer::new("yo    ");

        assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
        assert_eq!(tokenizer.next(), None);
    }

    // Leading separators do not shift the first word_index; a short separator
    // adds 1 and a run containing a long separator adds 8. char_index counts
    // chars, so "ï" counts once.
    #[test]
    fn hard() {
        let mut tokenizer = Tokenizer::new(" .? yo lolo. aïe (ouch)");

        assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 }));
        assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 1, char_index: 7 }));
        assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 13 }));
        assert_eq!(tokenizer.next(), Some(Token { word: "ouch", word_index: 17, char_index: 18 }));
        assert_eq!(tokenizer.next(), None);

        let mut tokenizer = Tokenizer::new("yo ! lolo ? wtf - lol . aïe ,");

        assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
        assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 }));
        assert_eq!(tokenizer.next(), Some(Token { word: "wtf", word_index: 16, char_index: 12 }));
        assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 24, char_index: 18 }));
        assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 32, char_index: 24 }));
        assert_eq!(tokenizer.next(), None);
    }

    // Same scenarios with emoji: a multi-byte character still advances
    // char_index by one.
    #[test]
    fn hard_long_chars() {
        let mut tokenizer = Tokenizer::new(" .? yo 😂. aïe");

        assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 }));
        assert_eq!(tokenizer.next(), Some(Token { word: "😂", word_index: 1, char_index: 7 }));
        assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 10 }));
        assert_eq!(tokenizer.next(), None);

        let mut tokenizer = Tokenizer::new("yo ! lolo ? 😱 - lol . 😣 ,");

        assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
        assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 }));
        assert_eq!(tokenizer.next(), Some(Token { word: "😱", word_index: 16, char_index: 12 }));
        assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 24, char_index: 16 }));
        assert_eq!(tokenizer.next(), Some(Token { word: "😣", word_index: 32, char_index: 22 }));
        assert_eq!(tokenizer.next(), None);
    }

    // Every CJK character is its own token; tokens glued together without a
    // separator get consecutive word indices.
    #[test]
    fn hard_kanjis() {
        let mut tokenizer = Tokenizer::new("\u{2ec4}lolilol\u{2ec7}");

        assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec4}", word_index: 0, char_index: 0 }));
        assert_eq!(tokenizer.next(), Some(Token { word: "lolilol", word_index: 1, char_index: 1 }));
        assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec7}", word_index: 2, char_index: 8 }));
        assert_eq!(tokenizer.next(), None);

        let mut tokenizer = Tokenizer::new("\u{2ec4}\u{2ed3}\u{2ef2} lolilol - hello    \u{2ec7}");

        assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec4}", word_index: 0, char_index: 0 }));
        assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ed3}", word_index: 1, char_index: 1 }));
        assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ef2}", word_index: 2, char_index: 2 }));
        assert_eq!(tokenizer.next(), Some(Token { word: "lolilol", word_index: 3, char_index: 4 }));
        assert_eq!(tokenizer.next(), Some(Token { word: "hello", word_index: 11, char_index: 14 }));
        assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec7}", word_index: 12, char_index: 23 }));
        assert_eq!(tokenizer.next(), None);
    }
}
feat: introduce a better simple word lexer 2018-09-27 22:32:17 +08:00			`use std::mem;`
feat: Move query splitting into the tokenizer workspace 2019-02-26 01:34:51 +08:00			`use slice_group_by::LinearStrGroupBy;`
feat: introduce a better simple word lexer 2018-09-27 22:32:17 +08:00			`use self::Separator::*;`

feat: Move tokenizer things into the meilidb-tokenizer workspace 2019-02-26 01:24:46 +08:00			`pub fn is_cjk(c: char) -> bool {`
			`(c >= '\u{2e80}' && c <= '\u{2eff}') \|\|`
			`(c >= '\u{2f00}' && c <= '\u{2fdf}') \|\|`
			`(c >= '\u{3040}' && c <= '\u{309f}') \|\|`
			`(c >= '\u{30a0}' && c <= '\u{30ff}') \|\|`
			`(c >= '\u{3100}' && c <= '\u{312f}') \|\|`
			`(c >= '\u{3200}' && c <= '\u{32ff}') \|\|`
			`(c >= '\u{3400}' && c <= '\u{4dbf}') \|\|`
			`(c >= '\u{4e00}' && c <= '\u{9fff}') \|\|`
			`(c >= '\u{f900}' && c <= '\u{faff}')`
			`}`

feat: Move query splitting into the tokenizer workspace 2019-02-26 01:34:51 +08:00			`#[derive(Debug, PartialEq, Eq)]`
			`enum CharCategory {`
			`Space,`
			`Cjk,`
			`Other,`
			`}`

			`fn classify_char(c: char) -> CharCategory {`
			`if c.is_whitespace() { CharCategory::Space }`
			`else if is_cjk(c) { CharCategory::Cjk }`
			`else { CharCategory::Other }`
			`}`

			`fn is_word(s: &&str) -> bool {`
			`!s.chars().any(char::is_whitespace)`
			`}`

			`fn same_group_category(a: char, b: char) -> bool {`
			`let ca = classify_char(a);`
			`let cb = classify_char(b);`
			`if ca == CharCategory::Cjk \|\| cb == CharCategory::Cjk { false } else { ca == cb }`
			`}`

			`pub fn split_query_string(query: &str) -> impl Iterator<Item=&str> {`
			`LinearStrGroupBy::new(query, same_group_category).filter(is_word)`
			`}`

feat: Introduce an Index system based on RocksDB 2018-11-16 00:55:20 +08:00			`pub trait TokenizerBuilder {`
feat: Introduce a WordArea struct Useful to highlight matching areas in the original text. 2018-12-23 23:46:49 +08:00			`fn build<'a>(&self, text: &'a str) -> Box<Iterator<Item=Token<'a>> + 'a>;`
feat: introduce a better simple word lexer 2018-09-27 22:32:17 +08:00			`}`

feat: Introduce an Index system based on RocksDB 2018-11-16 00:55:20 +08:00			`pub struct DefaultBuilder;`

			`impl DefaultBuilder {`
			`pub fn new() -> DefaultBuilder {`
			`DefaultBuilder`
feat: introduce a better simple word lexer 2018-09-27 22:32:17 +08:00			`}`
feat: Introduce an Index system based on RocksDB 2018-11-16 00:55:20 +08:00			`}`
feat: introduce a better simple word lexer 2018-09-27 22:32:17 +08:00
feat: Introduce a WordArea struct Useful to highlight matching areas in the original text. 2018-12-23 23:46:49 +08:00			`#[derive(Debug, PartialEq, Eq)]`
			`pub struct Token<'a> {`
			`pub word: &'a str,`
			`pub word_index: usize,`
			`pub char_index: usize,`
			`}`

feat: Introduce an Index system based on RocksDB 2018-11-16 00:55:20 +08:00			`impl TokenizerBuilder for DefaultBuilder {`
feat: Introduce a WordArea struct Useful to highlight matching areas in the original text. 2018-12-23 23:46:49 +08:00			`fn build<'a>(&self, text: &'a str) -> Box<Iterator<Item=Token<'a>> + 'a> {`
feat: Introduce an Index system based on RocksDB 2018-11-16 00:55:20 +08:00			`Box::new(Tokenizer::new(text))`
feat: introduce a better simple word lexer 2018-09-27 22:32:17 +08:00			`}`
			`}`

feat: Introduce an Index system based on RocksDB 2018-11-16 00:55:20 +08:00			`pub struct Tokenizer<'a> {`
feat: Introduce a WordArea struct Useful to highlight matching areas in the original text. 2018-12-23 23:46:49 +08:00			`word_index: usize,`
			`char_index: usize,`
feat: introduce a better simple word lexer 2018-09-27 22:32:17 +08:00			`inner: &'a str,`
			`}`

feat: Introduce an Index system based on RocksDB 2018-11-16 00:55:20 +08:00			`impl<'a> Tokenizer<'a> {`
			`pub fn new(string: &str) -> Tokenizer {`
feat: Introduce a WordArea struct Useful to highlight matching areas in the original text. 2018-12-23 23:46:49 +08:00			`let mut char_advance = 0;`
			`let mut index_advance = 0;`
			`for (n, (i, c)) in string.char_indices().enumerate() {`
			`char_advance = n;`
			`index_advance = i;`
			`if detect_separator(c).is_none() { break }`
			`}`

feat: Introduce an Index system based on RocksDB 2018-11-16 00:55:20 +08:00			`Tokenizer {`
feat: Introduce a WordArea struct Useful to highlight matching areas in the original text. 2018-12-23 23:46:49 +08:00			`word_index: 0,`
			`char_index: char_advance,`
			`inner: &string[index_advance..],`
feat: introduce a better simple word lexer 2018-09-27 22:32:17 +08:00			`}`
			`}`
			`}`

			`#[derive(Debug, Clone, Copy)]`
			`enum Separator {`
			`Short,`
			`Long,`
			`}`

			`impl Separator {`
			`fn add(self, add: Separator) -> Separator {`
			`match (self, add) {`
			`(_, Long) => Long,`
			`(Short, Short) => Short,`
			`(Long, Short) => Long,`
			`}`
			`}`

			`fn to_usize(self) -> usize {`
			`match self {`
			`Short => 1,`
			`Long => 8,`
			`}`
			`}`
			`}`

feat: Introduce a WordArea struct Useful to highlight matching areas in the original text. 2018-12-23 23:46:49 +08:00			`fn detect_separator(c: char) -> Option<Separator> {`
			`match c {`
feat: Make the tokenizer support parentheses Interpreting them as hard ponctuation (like a dot). 2019-02-22 22:40:39 +08:00			`'.' \| ';' \| ',' \| '!' \| '?' \| '-' \| '(' \| ')' => Some(Long),`
			`' ' \| '\'' \| '"' => Some(Short),`
			`_ => None,`
feat: Introduce a WordArea struct Useful to highlight matching areas in the original text. 2018-12-23 23:46:49 +08:00			`}`
			`}`

feat: Introduce an Index system based on RocksDB 2018-11-16 00:55:20 +08:00			`impl<'a> Iterator for Tokenizer<'a> {`
feat: Introduce a WordArea struct Useful to highlight matching areas in the original text. 2018-12-23 23:46:49 +08:00			`type Item = Token<'a>;`
feat: introduce a better simple word lexer 2018-09-27 22:32:17 +08:00
			`fn next(&mut self) -> Option<Self::Item> {`
			`let mut start_word = None;`
			`let mut distance = None;`

			`for (i, c) in self.inner.char_indices() {`
feat: Introduce a WordArea struct Useful to highlight matching areas in the original text. 2018-12-23 23:46:49 +08:00			`match detect_separator(c) {`
			`Some(sep) => {`
feat: introduce a better simple word lexer 2018-09-27 22:32:17 +08:00			`if let Some(start_word) = start_word {`
feat: Introduce a WordArea struct Useful to highlight matching areas in the original text. 2018-12-23 23:46:49 +08:00			`let (prefix, tail) = self.inner.split_at(i);`
			`let (spaces, word) = prefix.split_at(start_word);`
feat: introduce a better simple word lexer 2018-09-27 22:32:17 +08:00
			`self.inner = tail;`
feat: Make WordArea be based on char index and length 2019-01-10 03:14:08 +08:00			`self.char_index += spaces.chars().count();`
feat: Introduce a WordArea struct Useful to highlight matching areas in the original text. 2018-12-23 23:46:49 +08:00			`self.word_index += distance.map(Separator::to_usize).unwrap_or(0);`

			`let token = Token {`
			`word: word,`
			`word_index: self.word_index,`
			`char_index: self.char_index,`
			`};`
feat: introduce a better simple word lexer 2018-09-27 22:32:17 +08:00
feat: Make WordArea be based on char index and length 2019-01-10 03:14:08 +08:00			`self.char_index += word.chars().count();`
feat: Introduce a WordArea struct Useful to highlight matching areas in the original text. 2018-12-23 23:46:49 +08:00			`return Some(token)`
feat: introduce a better simple word lexer 2018-09-27 22:32:17 +08:00			`}`
feat: Introduce a WordArea struct Useful to highlight matching areas in the original text. 2018-12-23 23:46:49 +08:00
feat: Make the Tokenizer support Kanjis 2019-02-23 01:17:43 +08:00			`distance = Some(distance.map_or(sep, \|s\| s.add(sep)));`
			`},`
			`None => {`
			`// if this is a Chinese, a Japanese or a Korean character`
			`// See <http://unicode-table.com>`
feat: Make query strings support cjk kanjis 2019-02-23 21:57:13 +08:00			`if is_cjk(c) {`
feat: Make the Tokenizer support Kanjis 2019-02-23 01:17:43 +08:00			`match start_word {`
			`Some(start_word) => {`
			`let (prefix, tail) = self.inner.split_at(i);`
			`let (spaces, word) = prefix.split_at(start_word);`

			`self.inner = tail;`
			`self.char_index += spaces.chars().count();`
			`self.word_index += distance.map(Separator::to_usize).unwrap_or(0);`

			`let token = Token {`
			`word: word,`
			`word_index: self.word_index,`
			`char_index: self.char_index,`
			`};`

			`self.word_index += 1;`
			`self.char_index += word.chars().count();`

			`return Some(token)`
			`},`
			`None => {`
fix: Change the tokenizer to mesure cjk chars positions 2019-02-23 06:06:42 +08:00			`let (prefix, tail) = self.inner.split_at(i + c.len_utf8());`
feat: Make the Tokenizer support Kanjis 2019-02-23 01:17:43 +08:00			`let (spaces, word) = prefix.split_at(i);`

			`self.inner = tail;`
			`self.char_index += spaces.chars().count();`
			`self.word_index += distance.map(Separator::to_usize).unwrap_or(0);`

			`let token = Token {`
			`word: word,`
			`word_index: self.word_index,`
			`char_index: self.char_index,`
			`};`

			`if tail.chars().next().and_then(detect_separator).is_none() {`
			`self.word_index += 1;`
			`}`
fix: Change the tokenizer to mesure cjk chars positions 2019-02-23 06:06:42 +08:00			`self.char_index += 1;`
feat: Make the Tokenizer support Kanjis 2019-02-23 01:17:43 +08:00
			`return Some(token)`
			`}`
			`}`
			`}`

			`if start_word.is_none() { start_word = Some(i) }`
feat: introduce a better simple word lexer 2018-09-27 22:32:17 +08:00			`},`
			`}`
			`}`

			`if let Some(start_word) = start_word {`
feat: Introduce a WordArea struct Useful to highlight matching areas in the original text. 2018-12-23 23:46:49 +08:00			`let prefix = mem::replace(&mut self.inner, "");`
			`let (spaces, word) = prefix.split_at(start_word);`
feat: introduce a better simple word lexer 2018-09-27 22:32:17 +08:00
feat: Introduce a WordArea struct Useful to highlight matching areas in the original text. 2018-12-23 23:46:49 +08:00			`let token = Token {`
			`word: word,`
			`word_index: self.word_index + distance.map(Separator::to_usize).unwrap_or(0),`
feat: Make WordArea be based on char index and length 2019-01-10 03:14:08 +08:00			`char_index: self.char_index + spaces.chars().count(),`
feat: Introduce a WordArea struct Useful to highlight matching areas in the original text. 2018-12-23 23:46:49 +08:00			`};`
			`return Some(token)`
feat: introduce a better simple word lexer 2018-09-27 22:32:17 +08:00			`}`

			`None`
			`}`
			`}`

			`#[cfg(test)]`
			`mod tests {`
			`use super::*;`

			`#[test]`
			`fn easy() {`
feat: Introduce an Index system based on RocksDB 2018-11-16 00:55:20 +08:00			`let mut tokenizer = Tokenizer::new("salut");`
feat: introduce a better simple word lexer 2018-09-27 22:32:17 +08:00
feat: Introduce a WordArea struct Useful to highlight matching areas in the original text. 2018-12-23 23:46:49 +08:00			`assert_eq!(tokenizer.next(), Some(Token { word: "salut", word_index: 0, char_index: 0 }));`
feat: Introduce an Index system based on RocksDB 2018-11-16 00:55:20 +08:00			`assert_eq!(tokenizer.next(), None);`
feat: introduce a better simple word lexer 2018-09-27 22:32:17 +08:00
feat: Introduce an Index system based on RocksDB 2018-11-16 00:55:20 +08:00			`let mut tokenizer = Tokenizer::new("yo ");`
feat: introduce a better simple word lexer 2018-09-27 22:32:17 +08:00
feat: Introduce a WordArea struct Useful to highlight matching areas in the original text. 2018-12-23 23:46:49 +08:00			`assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));`
feat: Introduce an Index system based on RocksDB 2018-11-16 00:55:20 +08:00			`assert_eq!(tokenizer.next(), None);`
feat: introduce a better simple word lexer 2018-09-27 22:32:17 +08:00			`}`

			`#[test]`
			`fn hard() {`
feat: Make the tokenizer support parentheses Interpreting them as hard ponctuation (like a dot). 2019-02-22 22:40:39 +08:00			`let mut tokenizer = Tokenizer::new(" .? yo lolo. aïe (ouch)");`
feat: Introduce an Index system based on RocksDB 2018-11-16 00:55:20 +08:00
feat: Introduce a WordArea struct Useful to highlight matching areas in the original text. 2018-12-23 23:46:49 +08:00			`assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 }));`
			`assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 1, char_index: 7 }));`
			`assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 13 }));`
feat: Make the tokenizer support parentheses Interpreting them as hard ponctuation (like a dot). 2019-02-22 22:40:39 +08:00			`assert_eq!(tokenizer.next(), Some(Token { word: "ouch", word_index: 17, char_index: 18 }));`
feat: Introduce an Index system based on RocksDB 2018-11-16 00:55:20 +08:00			`assert_eq!(tokenizer.next(), None);`

			`let mut tokenizer = Tokenizer::new("yo ! lolo ? wtf - lol . aïe ,");`

feat: Introduce a WordArea struct Useful to highlight matching areas in the original text. 2018-12-23 23:46:49 +08:00			`assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));`
			`assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 }));`
			`assert_eq!(tokenizer.next(), Some(Token { word: "wtf", word_index: 16, char_index: 12 }));`
			`assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 24, char_index: 18 }));`
			`assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 32, char_index: 24 }));`
			`assert_eq!(tokenizer.next(), None);`
			`}`

			`#[test]`
			`fn hard_long_chars() {`
			`let mut tokenizer = Tokenizer::new(" .? yo 😂. aïe");`

			`assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 }));`
			`assert_eq!(tokenizer.next(), Some(Token { word: "😂", word_index: 1, char_index: 7 }));`
feat: Make WordArea be based on char index and length 2019-01-10 03:14:08 +08:00			`assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 10 }));`
feat: Introduce a WordArea struct Useful to highlight matching areas in the original text. 2018-12-23 23:46:49 +08:00			`assert_eq!(tokenizer.next(), None);`

			`let mut tokenizer = Tokenizer::new("yo ! lolo ? 😱 - lol . 😣 ,");`

			`assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));`
			`assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 }));`
			`assert_eq!(tokenizer.next(), Some(Token { word: "😱", word_index: 16, char_index: 12 }));`
feat: Make WordArea be based on char index and length 2019-01-10 03:14:08 +08:00			`assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 24, char_index: 16 }));`
			`assert_eq!(tokenizer.next(), Some(Token { word: "😣", word_index: 32, char_index: 22 }));`
feat: Introduce an Index system based on RocksDB 2018-11-16 00:55:20 +08:00			`assert_eq!(tokenizer.next(), None);`
feat: introduce a better simple word lexer 2018-09-27 22:32:17 +08:00			`}`
feat: Make the Tokenizer support Kanjis 2019-02-23 01:17:43 +08:00
			`#[test]`
			`fn hard_kanjis() {`
			`let mut tokenizer = Tokenizer::new("\u{2ec4}lolilol\u{2ec7}");`

			`assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec4}", word_index: 0, char_index: 0 }));`
fix: Change the tokenizer to mesure cjk chars positions 2019-02-23 06:06:42 +08:00			`assert_eq!(tokenizer.next(), Some(Token { word: "lolilol", word_index: 1, char_index: 1 }));`
			`assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec7}", word_index: 2, char_index: 8 }));`
feat: Make the Tokenizer support Kanjis 2019-02-23 01:17:43 +08:00			`assert_eq!(tokenizer.next(), None);`

			`let mut tokenizer = Tokenizer::new("\u{2ec4}\u{2ed3}\u{2ef2} lolilol - hello \u{2ec7}");`

			`assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec4}", word_index: 0, char_index: 0 }));`
fix: Change the tokenizer to mesure cjk chars positions 2019-02-23 06:06:42 +08:00			`assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ed3}", word_index: 1, char_index: 1 }));`
			`assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ef2}", word_index: 2, char_index: 2 }));`
			`assert_eq!(tokenizer.next(), Some(Token { word: "lolilol", word_index: 3, char_index: 4 }));`
			`assert_eq!(tokenizer.next(), Some(Token { word: "hello", word_index: 11, char_index: 14 }));`
			`assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec7}", word_index: 12, char_index: 23 }));`
feat: Make the Tokenizer support Kanjis 2019-02-23 01:17:43 +08:00			`assert_eq!(tokenizer.next(), None);`
			`}`
feat: introduce a better simple word lexer 2018-09-27 22:32:17 +08:00			`}`