mirror of https://github.com/meilisearch/meilisearch.git (synced 2025-02-20 01:27:52 +08:00)
feat: Make the Tokenizer able to support tokenizing sequences
This commit is contained in: parent c6bb2b6f9c, commit abf7191eec
@@ -1,3 +1,4 @@
+use std::iter::Peekable;
 use slice_group_by::StrGroupBy;
 use self::SeparatorCategory::*;
 
@@ -151,6 +152,71 @@ impl<'a> Iterator for Tokenizer<'a> {
     }
 }
 
+pub struct SeqTokenizer<'a, I>
+where I: Iterator<Item=&'a str>,
+{
+    inner: I,
+    current: Option<Peekable<Tokenizer<'a>>>,
+    word_offset: usize,
+    char_offset: usize,
+}
+
+impl<'a, I> SeqTokenizer<'a, I>
+where I: Iterator<Item=&'a str>,
+{
+    pub fn new(mut iter: I) -> SeqTokenizer<'a, I> {
+        let current = iter.next().map(|s| Tokenizer::new(s).peekable());
+        SeqTokenizer {
+            inner: iter,
+            current: current,
+            word_offset: 0,
+            char_offset: 0,
+        }
+    }
+}
+
+impl<'a, I> Iterator for SeqTokenizer<'a, I>
+where I: Iterator<Item=&'a str>,
+{
+    type Item = Token<'a>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        match &mut self.current {
+            Some(current) => {
+                match current.next() {
+                    Some(token) => {
+                        // we must apply the word and char offsets
+                        // to the token before returning it
+                        let token = Token {
+                            word: token.word,
+                            word_index: token.word_index + self.word_offset,
+                            char_index: token.char_index + self.char_offset,
+                        };
+
+                        // if this is the last iteration on this text
+                        // we must save the offsets for next texts
+                        if current.peek().is_none() {
+                            let hard_space = SeparatorCategory::Hard.to_usize();
+                            self.word_offset = token.word_index + hard_space;
+                            self.char_offset = token.char_index + hard_space;
+                        }
+
+                        Some(token)
+                    },
+                    None => {
+                        // no more words in this text we must
+                        // start tokenizing the next text
+                        self.current = self.inner.next().map(|s| Tokenizer::new(s).peekable());
+                        self.next()
+                    },
+                }
+            },
+            // no more texts available
+            None => None,
+        }
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
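For context, the added SeqTokenizer drives the existing single-text Tokenizer over a whole sequence of texts: it re-bases each token's word_index and char_index with the running offsets, and whenever a text is exhausted it bumps those offsets by the hard-separator distance, presumably so that downstream consumers never see words from two different texts as adjacent. Below is a minimal usage sketch, not part of the commit; it assumes it lives inside this crate's tokenizer module so that SeqTokenizer is in scope, and the input strings are purely illustrative.

// Minimal usage sketch (not part of the commit). Assumes `SeqTokenizer`
// from this module is in scope; the input texts are illustrative.
fn tokenize_fields() {
    let texts = ["new york subway", "broadway musicals"];

    // Tokenize both texts as one logical sequence: word/char indices keep
    // increasing across the text boundary, padded by the hard-separator gap.
    for token in SeqTokenizer::new(texts.iter().copied()) {
        println!("{:?} (word_index: {}, char_index: {})",
                 token.word, token.word_index, token.char_index);
    }
}

Using the hard-separator distance as the inter-text gap reuses the separator weighting the Tokenizer already applies within a single text, so a text boundary counts at least as much as a hard punctuation break.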