mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-11-25 19:45:05 +08:00
feat: introduce a better simple word lexer
This commit is contained in:
parent
10af32bf4b
commit
b84be67aa2
@ -4,6 +4,7 @@ pub mod rank;
|
||||
pub mod metadata;
|
||||
pub mod vec_read_only;
|
||||
pub mod automaton;
|
||||
pub mod tokenizer;
|
||||
|
||||
pub use self::metadata::{Metadata, MetadataBuilder};
|
||||
pub use self::rank::RankedStream;
|
||||
|
137
raptor/src/tokenizer/mod.rs
Normal file
137
raptor/src/tokenizer/mod.rs
Normal file
@ -0,0 +1,137 @@
|
||||
use std::mem;
|
||||
use self::Separator::*;
|
||||
|
||||
pub struct Tokenizer<'a> {
|
||||
inner: &'a str,
|
||||
}
|
||||
|
||||
impl<'a> Tokenizer<'a> {
|
||||
pub fn new(string: &str) -> Tokenizer {
|
||||
Tokenizer { inner: string }
|
||||
}
|
||||
|
||||
pub fn iter(&self) -> Tokens {
|
||||
Tokens::new(self.inner)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct Tokens<'a> {
|
||||
index: usize,
|
||||
inner: &'a str,
|
||||
}
|
||||
|
||||
impl<'a> Tokens<'a> {
|
||||
fn new(string: &str) -> Tokens {
|
||||
Tokens {
|
||||
index: 0,
|
||||
inner: string.trim_matches(&[' ', '.', ';', ',', '!', '?', '-'][..]),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
enum Separator {
|
||||
Short,
|
||||
Long,
|
||||
}
|
||||
|
||||
impl Separator {
|
||||
fn add(self, add: Separator) -> Separator {
|
||||
match (self, add) {
|
||||
(_, Long) => Long,
|
||||
(Short, Short) => Short,
|
||||
(Long, Short) => Long,
|
||||
}
|
||||
}
|
||||
|
||||
fn to_usize(self) -> usize {
|
||||
match self {
|
||||
Short => 1,
|
||||
Long => 8,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Iterator for Tokens<'a> {
|
||||
type Item = (usize, &'a str);
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
let mut start_word = None;
|
||||
let mut distance = None;
|
||||
|
||||
for (i, c) in self.inner.char_indices() {
|
||||
let separator = match c {
|
||||
'.' | ';' | ',' | '!' | '?' | '-' => Some(Long),
|
||||
' ' => Some(Short),
|
||||
_ => None,
|
||||
};
|
||||
|
||||
match separator {
|
||||
Some(dist) => {
|
||||
if let Some(start_word) = start_word {
|
||||
let (word, tail) = self.inner.split_at(i);
|
||||
|
||||
self.inner = tail;
|
||||
self.index += distance.map(Separator::to_usize).unwrap_or(0);
|
||||
|
||||
let word = &word[start_word..];
|
||||
return Some((self.index, word))
|
||||
}
|
||||
distance = Some(distance.map(|s| s.add(dist)).unwrap_or(dist));
|
||||
},
|
||||
None => { start_word.get_or_insert(i); },
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(start_word) = start_word {
|
||||
let word = mem::replace(&mut self.inner, "");
|
||||
self.index += distance.map(Separator::to_usize).unwrap_or(0);
|
||||
|
||||
let word = &word[start_word..];
|
||||
return Some((self.index, word))
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn easy() {
|
||||
let tokenizer = Tokenizer::new("salut");
|
||||
let mut tokens = tokenizer.iter();
|
||||
|
||||
assert_eq!(tokens.next(), Some((0, "salut")));
|
||||
assert_eq!(tokens.next(), None);
|
||||
|
||||
let tokenizer = Tokenizer::new("yo ");
|
||||
let mut tokens = tokenizer.iter();
|
||||
|
||||
assert_eq!(tokens.next(), Some((0, "yo")));
|
||||
assert_eq!(tokens.next(), None);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn hard() {
|
||||
let tokenizer = Tokenizer::new(" .? yo lolo. aïe");
|
||||
let mut tokens = tokenizer.iter();
|
||||
|
||||
assert_eq!(tokens.next(), Some((0, "yo")));
|
||||
assert_eq!(tokens.next(), Some((1, "lolo")));
|
||||
assert_eq!(tokens.next(), Some((9, "aïe")));
|
||||
assert_eq!(tokens.next(), None);
|
||||
|
||||
let tokenizer = Tokenizer::new("yo ! lolo ? wtf - lol . aïe ,");
|
||||
let mut tokens = tokenizer.iter();
|
||||
|
||||
assert_eq!(tokens.next(), Some((0, "yo")));
|
||||
assert_eq!(tokens.next(), Some((8, "lolo")));
|
||||
assert_eq!(tokens.next(), Some((16, "wtf")));
|
||||
assert_eq!(tokens.next(), Some((24, "lol")));
|
||||
assert_eq!(tokens.next(), Some((32, "aïe")));
|
||||
assert_eq!(tokens.next(), None);
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user