mirror of https://github.com/meilisearch/meilisearch.git (synced 2025-02-20 01:27:52 +08:00)
feat: Make the Tokenizer able to support tokenizing sequences
This commit is contained in: parent c6bb2b6f9c, commit abf7191eec
@@ -1,3 +1,4 @@
+use std::iter::Peekable;
 use slice_group_by::StrGroupBy;
 use self::SeparatorCategory::*;
 
@@ -151,6 +152,71 @@ impl<'a> Iterator for Tokenizer<'a> {
     }
 }
 
+pub struct SeqTokenizer<'a, I>
+where I: Iterator<Item=&'a str>,
+{
+    inner: I,
+    current: Option<Peekable<Tokenizer<'a>>>,
+    word_offset: usize,
+    char_offset: usize,
+}
+
+impl<'a, I> SeqTokenizer<'a, I>
+where I: Iterator<Item=&'a str>,
+{
+    pub fn new(mut iter: I) -> SeqTokenizer<'a, I> {
+        let current = iter.next().map(|s| Tokenizer::new(s).peekable());
+        SeqTokenizer {
+            inner: iter,
+            current: current,
+            word_offset: 0,
+            char_offset: 0,
+        }
+    }
+}
+
+impl<'a, I> Iterator for SeqTokenizer<'a, I>
+where I: Iterator<Item=&'a str>,
+{
+    type Item = Token<'a>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        match &mut self.current {
+            Some(current) => {
+                match current.next() {
+                    Some(token) => {
+                        // we must apply the word and char offsets
+                        // to the token before returning it
+                        let token = Token {
+                            word: token.word,
+                            word_index: token.word_index + self.word_offset,
+                            char_index: token.char_index + self.char_offset,
+                        };
+
+                        // if this is the last iteration on this text
+                        // we must save the offsets for next texts
+                        if current.peek().is_none() {
+                            let hard_space = SeparatorCategory::Hard.to_usize();
+                            self.word_offset = token.word_index + hard_space;
+                            self.char_offset = token.char_index + hard_space;
+                        }
+
+                        Some(token)
+                    },
+                    None => {
+                        // no more words in this text we must
+                        // start tokenizing the next text
+                        self.current = self.inner.next().map(|s| Tokenizer::new(s).peekable());
+                        self.next()
+                    },
+                }
+            },
+            // no more texts available
+            None => None,
+        }
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
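For context, the added SeqTokenizer drives the existing single-text Tokenizer over a whole sequence of texts: it re-bases each token's word_index and char_index with the running offsets, and whenever a text is exhausted it bumps those offsets by the hard-separator distance, presumably so that downstream consumers never see words from two different texts as adjacent. Below is a minimal usage sketch, not part of the commit; it assumes it lives inside this crate's tokenizer module so that SeqTokenizer is in scope, and the input strings are purely illustrative.

// Minimal usage sketch (not part of the commit). Assumes `SeqTokenizer`
// from this module is in scope; the input texts are illustrative.
fn tokenize_fields() {
    let texts = ["new york subway", "broadway musicals"];

    // Tokenize both texts as one logical sequence: word/char indices keep
    // increasing across the text boundary, padded by the hard-separator gap.
    for token in SeqTokenizer::new(texts.iter().copied()) {
        println!("{:?} (word_index: {}, char_index: {})",
                 token.word, token.word_index, token.char_index);
    }
}

Using the hard-separator distance as the inter-text gap reuses the separator weighting the Tokenizer already applies within a single text, so a text boundary counts at least as much as a hard punctuation break.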