mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-01-18 08:48:32 +08:00
Make the query tokenizer a real Iterator
This commit is contained in:
parent
f55f4cb02a
commit
ce86a43779
10
src/lib.rs
10
src/lib.rs
@ -1,4 +1,4 @@
|
||||
mod query;
|
||||
mod query_tokens;
|
||||
|
||||
use std::borrow::Cow;
|
||||
use std::collections::HashMap;
|
||||
@ -14,7 +14,7 @@ use levenshtein_automata::LevenshteinAutomatonBuilder as LevBuilder;
|
||||
use once_cell::sync::OnceCell;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use self::query::{QueryWord, alphanumeric_quoted_tokens};
|
||||
use self::query_tokens::{QueryTokens, QueryToken};
|
||||
|
||||
static LEVDIST0: OnceCell<LevBuilder> = OnceCell::new();
|
||||
static LEVDIST1: OnceCell<LevBuilder> = OnceCell::new();
|
||||
@ -59,13 +59,13 @@ impl Index {
|
||||
let lev1 = LEVDIST1.get_or_init(|| LevBuilder::new(1, true));
|
||||
let lev2 = LEVDIST2.get_or_init(|| LevBuilder::new(2, true));
|
||||
|
||||
let words: Vec<_> = alphanumeric_quoted_tokens(query).collect();
|
||||
let words: Vec<_> = QueryTokens::new(query).collect();
|
||||
let ends_with_whitespace = query.chars().last().map_or(false, char::is_whitespace);
|
||||
let number_of_words = words.len();
|
||||
let dfas = words.into_iter().enumerate().map(|(i, word)| {
|
||||
let (word, quoted) = match word {
|
||||
QueryWord::Free(word) => (word.cow_to_lowercase(), false),
|
||||
QueryWord::Quoted(word) => (Cow::Borrowed(word), true),
|
||||
QueryToken::Free(word) => (word.cow_to_lowercase(), false),
|
||||
QueryToken::Quoted(word) => (Cow::Borrowed(word), true),
|
||||
};
|
||||
let is_last = i + 1 == number_of_words;
|
||||
let is_prefix = is_last && !ends_with_whitespace && !quoted;
|
||||
|
145
src/query.rs
145
src/query.rs
@ -1,145 +0,0 @@
|
||||
/// A single word cut out of a search query.
///
/// Both variants borrow their text from the query string, no copy is made.
#[derive(Debug, Eq, PartialEq)]
pub enum QueryWord<'a> {
    /// A word found outside of double quotes.
    Free(&'a str),
    /// The raw text found after an opening double quote, up to the closing
    /// quote or the end of the query when the quote is never closed.
    Quoted(&'a str),
}
|
||||
|
||||
pub fn alphanumeric_quoted_tokens(string: &str) -> impl Iterator<Item = QueryWord> {
|
||||
use QueryWord::{Quoted, Free};
|
||||
|
||||
enum State {
|
||||
Free(usize),
|
||||
Quoted(usize),
|
||||
Fused,
|
||||
}
|
||||
|
||||
impl State {
|
||||
fn is_quoted(&self) -> bool {
|
||||
match self { State::Quoted(_) => true, _ => false }
|
||||
}
|
||||
|
||||
fn replace_by(&mut self, state: State) -> State {
|
||||
std::mem::replace(self, state)
|
||||
}
|
||||
}
|
||||
|
||||
let mut state = State::Free(0);
|
||||
let mut string_chars = string.char_indices();
|
||||
std::iter::from_fn(move || {
|
||||
loop {
|
||||
let (i, afteri, c) = match string_chars.next() {
|
||||
Some((i, c)) => (i, i + c.len_utf8(), c),
|
||||
None => return match state.replace_by(State::Fused) {
|
||||
State::Free(s) => if !string[s..].is_empty() {
|
||||
Some(Free(&string[s..]))
|
||||
} else {
|
||||
None
|
||||
},
|
||||
State::Quoted(s) => Some(Quoted(&string[s..])),
|
||||
State::Fused => None,
|
||||
},
|
||||
};
|
||||
|
||||
if c == '"' {
|
||||
match state.replace_by(State::Free(afteri)) {
|
||||
State::Quoted(s) => return Some(Quoted(&string[s..i])),
|
||||
State::Free(s) => {
|
||||
state = State::Quoted(afteri);
|
||||
if i > s { return Some(Free(&string[s..i])) }
|
||||
},
|
||||
State::Fused => return None,
|
||||
}
|
||||
}
|
||||
else if !state.is_quoted() && !c.is_alphanumeric() {
|
||||
match state.replace_by(State::Free(afteri)) {
|
||||
State::Free(s) if i > s => return Some(Free(&string[s..i])),
|
||||
_ => state = State::Free(afteri),
|
||||
}
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use super::QueryWord::{Free, Quoted};

    /// Runs the tokenizer until it is exhausted and gathers every word.
    fn words(query: &str) -> Vec<QueryWord<'_>> {
        alphanumeric_quoted_tokens(query).collect()
    }

    #[test]
    fn one_quoted_string() {
        assert_eq!(words("\"hello\""), vec![Quoted("hello")]);
    }

    #[test]
    fn one_pending_quoted_string() {
        // The closing quote is missing: the rest of the input is the quoted word.
        assert_eq!(words("\"hello"), vec![Quoted("hello")]);
    }

    #[test]
    fn one_non_quoted_string() {
        assert_eq!(words("hello"), vec![Free("hello")]);
    }

    #[test]
    fn quoted_directly_followed_by_free_strings() {
        assert_eq!(words("\"hello\"world"), vec![Quoted("hello"), Free("world")]);
    }

    #[test]
    fn free_directly_followed_by_quoted_strings() {
        assert_eq!(words("hello\"world\""), vec![Free("hello"), Quoted("world")]);
    }

    #[test]
    fn free_followed_by_quoted_strings() {
        assert_eq!(words("hello \"world\""), vec![Free("hello"), Quoted("world")]);
    }

    #[test]
    fn multiple_spaces_separated_strings() {
        assert_eq!(words("hello world "), vec![Free("hello"), Free("world")]);
    }

    #[test]
    fn multi_interleaved_quoted_free_strings() {
        let expected = vec![Free("hello"), Quoted("world"), Free("coucou"), Quoted("monde")];
        assert_eq!(words("hello \"world\" coucou \"monde\""), expected);
    }
}
|
163
src/query_tokens.rs
Normal file
163
src/query_tokens.rs
Normal file
@ -0,0 +1,163 @@
|
||||
use std::{mem, str};
|
||||
|
||||
use QueryToken::{Quoted, Free};
|
||||
|
||||
/// A single token cut out of a search query.
///
/// Both variants borrow their text from the query string, no copy is made.
#[derive(Debug, Eq, PartialEq)]
pub enum QueryToken<'a> {
    /// A word found outside of double quotes.
    Free(&'a str),
    /// The raw text found after an opening double quote, up to the closing
    /// quote or the end of the query when the quote is never closed.
    Quoted(&'a str),
}
|
||||
|
||||
/// Scanning state of the query tokenizer; offsets are byte offsets in the query.
enum State {
    /// Outside of quotes, the pending word starts at this offset.
    Free(usize),
    /// Inside quotes, the quoted text starts at this offset.
    Quoted(usize),
    /// The input is exhausted and the trailing token was already returned.
    Fused,
}

impl State {
    /// Tells whether the tokenizer is currently inside a quoted section.
    fn is_quoted(&self) -> bool {
        matches!(self, Self::Quoted(_))
    }

    /// Installs `state` as the current state and returns the one it replaces.
    fn replace_by(&mut self, state: State) -> State {
        mem::replace(self, state)
    }
}
|
||||
|
||||
pub struct QueryTokens<'a> {
|
||||
state: State,
|
||||
string: &'a str,
|
||||
string_chars: str::CharIndices<'a>,
|
||||
}
|
||||
|
||||
impl<'a> QueryTokens<'a> {
|
||||
pub fn new(query: &'a str) -> QueryTokens<'a> {
|
||||
QueryTokens {
|
||||
state: State::Free(0),
|
||||
string: query,
|
||||
string_chars: query.char_indices(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Iterator for QueryTokens<'a> {
|
||||
type Item = QueryToken<'a>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
loop {
|
||||
let (i, afteri, c) = match self.string_chars.next() {
|
||||
Some((i, c)) => (i, i + c.len_utf8(), c),
|
||||
None => return match self.state.replace_by(State::Fused) {
|
||||
State::Free(s) => if !self.string[s..].is_empty() {
|
||||
Some(Free(&self.string[s..]))
|
||||
} else {
|
||||
None
|
||||
},
|
||||
State::Quoted(s) => Some(Quoted(&self.string[s..])),
|
||||
State::Fused => None,
|
||||
},
|
||||
};
|
||||
|
||||
if c == '"' {
|
||||
match self.state.replace_by(State::Free(afteri)) {
|
||||
State::Quoted(s) => return Some(Quoted(&self.string[s..i])),
|
||||
State::Free(s) => {
|
||||
self.state = State::Quoted(afteri);
|
||||
if i > s { return Some(Free(&self.string[s..i])) }
|
||||
},
|
||||
State::Fused => return None,
|
||||
}
|
||||
}
|
||||
else if !self.state.is_quoted() && !c.is_alphanumeric() {
|
||||
match self.state.replace_by(State::Free(afteri)) {
|
||||
State::Free(s) if i > s => return Some(Free(&self.string[s..i])),
|
||||
_ => self.state = State::Free(afteri),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use super::QueryToken::{Free, Quoted};

    /// Runs the tokenizer until it is exhausted and gathers every token.
    fn tokens(query: &str) -> Vec<QueryToken<'_>> {
        QueryTokens::new(query).collect()
    }

    #[test]
    fn one_quoted_string() {
        assert_eq!(tokens("\"hello\""), vec![Quoted("hello")]);
    }

    #[test]
    fn one_pending_quoted_string() {
        // The closing quote is missing: the rest of the input is the quoted token.
        assert_eq!(tokens("\"hello"), vec![Quoted("hello")]);
    }

    #[test]
    fn one_non_quoted_string() {
        assert_eq!(tokens("hello"), vec![Free("hello")]);
    }

    #[test]
    fn quoted_directly_followed_by_free_strings() {
        assert_eq!(tokens("\"hello\"world"), vec![Quoted("hello"), Free("world")]);
    }

    #[test]
    fn free_directly_followed_by_quoted_strings() {
        assert_eq!(tokens("hello\"world\""), vec![Free("hello"), Quoted("world")]);
    }

    #[test]
    fn free_followed_by_quoted_strings() {
        assert_eq!(tokens("hello \"world\""), vec![Free("hello"), Quoted("world")]);
    }

    #[test]
    fn multiple_spaces_separated_strings() {
        assert_eq!(tokens("hello world "), vec![Free("hello"), Free("world")]);
    }

    #[test]
    fn multi_interleaved_quoted_free_strings() {
        let expected = vec![Free("hello"), Quoted("world"), Free("coucou"), Quoted("monde")];
        assert_eq!(tokens("hello \"world\" coucou \"monde\""), expected);
    }
}
|
Loading…
Reference in New Issue
Block a user