From ce86a43779f7376b152f0dcb2a6d535c1d4c291e Mon Sep 17 00:00:00 2001
From: Kerollmops
Date: Fri, 5 Jun 2020 09:48:46 +0200
Subject: [PATCH] Make the query tokenizer a real Iterator

---
 src/lib.rs          |  10 +--
 src/query.rs        | 145 ---------------------------------------
 src/query_tokens.rs | 163 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 168 insertions(+), 150 deletions(-)
 delete mode 100644 src/query.rs
 create mode 100644 src/query_tokens.rs

diff --git a/src/lib.rs b/src/lib.rs
index dab363d64..8ea767b28 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,4 +1,4 @@
-mod query;
+mod query_tokens;
 
 use std::borrow::Cow;
 use std::collections::HashMap;
@@ -14,7 +14,7 @@ use levenshtein_automata::LevenshteinAutomatonBuilder as LevBuilder;
 use once_cell::sync::OnceCell;
 use roaring::RoaringBitmap;
 
-use self::query::{QueryWord, alphanumeric_quoted_tokens};
+use self::query_tokens::{QueryTokens, QueryToken};
 
 static LEVDIST0: OnceCell<LevBuilder> = OnceCell::new();
 static LEVDIST1: OnceCell<LevBuilder> = OnceCell::new();
@@ -59,13 +59,13 @@ impl Index {
         let lev1 = LEVDIST1.get_or_init(|| LevBuilder::new(1, true));
         let lev2 = LEVDIST2.get_or_init(|| LevBuilder::new(2, true));
 
-        let words: Vec<_> = alphanumeric_quoted_tokens(query).collect();
+        let words: Vec<_> = QueryTokens::new(query).collect();
         let ends_with_whitespace = query.chars().last().map_or(false, char::is_whitespace);
         let number_of_words = words.len();
         let dfas = words.into_iter().enumerate().map(|(i, word)| {
             let (word, quoted) = match word {
-                QueryWord::Free(word) => (word.cow_to_lowercase(), false),
-                QueryWord::Quoted(word) => (Cow::Borrowed(word), true),
+                QueryToken::Free(word) => (word.cow_to_lowercase(), false),
+                QueryToken::Quoted(word) => (Cow::Borrowed(word), true),
             };
             let is_last = i + 1 == number_of_words;
             let is_prefix = is_last && !ends_with_whitespace && !quoted;
diff --git a/src/query.rs b/src/query.rs
deleted file mode 100644
index c3049590c..000000000
--- a/src/query.rs
+++ /dev/null
@@ -1,145 +0,0 @@
-#[derive(Debug, PartialEq, Eq)]
-pub enum QueryWord<'a> {
-    Free(&'a str),
-    Quoted(&'a str),
-}
-
-pub fn alphanumeric_quoted_tokens(string: &str) -> impl Iterator<Item = QueryWord> {
-    use QueryWord::{Quoted, Free};
-
-    enum State {
-        Free(usize),
-        Quoted(usize),
-        Fused,
-    }
-
-    impl State {
-        fn is_quoted(&self) -> bool {
-            match self { State::Quoted(_) => true, _ => false }
-        }
-
-        fn replace_by(&mut self, state: State) -> State {
-            std::mem::replace(self, state)
-        }
-    }
-
-    let mut state = State::Free(0);
-    let mut string_chars = string.char_indices();
-    std::iter::from_fn(move || {
-        loop {
-            let (i, afteri, c) = match string_chars.next() {
-                Some((i, c)) => (i, i + c.len_utf8(), c),
-                None => return match state.replace_by(State::Fused) {
-                    State::Free(s) => if !string[s..].is_empty() {
-                        Some(Free(&string[s..]))
-                    } else {
-                        None
-                    },
-                    State::Quoted(s) => Some(Quoted(&string[s..])),
-                    State::Fused => None,
-                },
-            };
-
-            if c == '"' {
-                match state.replace_by(State::Free(afteri)) {
-                    State::Quoted(s) => return Some(Quoted(&string[s..i])),
-                    State::Free(s) => {
-                        state = State::Quoted(afteri);
-                        if i > s { return Some(Free(&string[s..i])) }
-                    },
-                    State::Fused => return None,
-                }
-            }
-            else if !state.is_quoted() && !c.is_alphanumeric() {
-                match state.replace_by(State::Free(afteri)) {
-                    State::Free(s) if i > s => return Some(Free(&string[s..i])),
-                    _ => state = State::Free(afteri),
-                }
-            }
-        }
-    })
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn one_quoted_string() {
-        use QueryWord::Quoted;
-
-        let mut iter = alphanumeric_quoted_tokens("\"hello\"");
-        assert_eq!(iter.next(), Some(Quoted("hello")));
-        assert_eq!(iter.next(), None);
-    }
-
-    #[test]
-    fn one_pending_quoted_string() {
-        use QueryWord::Quoted;
-
-        let mut iter = alphanumeric_quoted_tokens("\"hello");
-        assert_eq!(iter.next(), Some(Quoted("hello")));
-        assert_eq!(iter.next(), None);
-    }
-
-    #[test]
-    fn one_non_quoted_string() {
-        use QueryWord::Free;
-
-        let mut iter = alphanumeric_quoted_tokens("hello");
-        assert_eq!(iter.next(), Some(Free("hello")));
-        assert_eq!(iter.next(), None);
-    }
-
-    #[test]
-    fn quoted_directly_followed_by_free_strings() {
-        use QueryWord::{Quoted, Free};
-
-        let mut iter = alphanumeric_quoted_tokens("\"hello\"world");
-        assert_eq!(iter.next(), Some(Quoted("hello")));
-        assert_eq!(iter.next(), Some(Free("world")));
-        assert_eq!(iter.next(), None);
-    }
-
-    #[test]
-    fn free_directly_followed_by_quoted_strings() {
-        use QueryWord::{Quoted, Free};
-
-        let mut iter = alphanumeric_quoted_tokens("hello\"world\"");
-        assert_eq!(iter.next(), Some(Free("hello")));
-        assert_eq!(iter.next(), Some(Quoted("world")));
-        assert_eq!(iter.next(), None);
-    }
-
-    #[test]
-    fn free_followed_by_quoted_strings() {
-        use QueryWord::{Quoted, Free};
-
-        let mut iter = alphanumeric_quoted_tokens("hello \"world\"");
-        assert_eq!(iter.next(), Some(Free("hello")));
-        assert_eq!(iter.next(), Some(Quoted("world")));
-        assert_eq!(iter.next(), None);
-    }
-
-    #[test]
-    fn multiple_spaces_separated_strings() {
-        use QueryWord::Free;
-
-        let mut iter = alphanumeric_quoted_tokens("hello world ");
-        assert_eq!(iter.next(), Some(Free("hello")));
-        assert_eq!(iter.next(), Some(Free("world")));
-        assert_eq!(iter.next(), None);
-    }
-
-    #[test]
-    fn multi_interleaved_quoted_free_strings() {
-        use QueryWord::{Quoted, Free};
-
-        let mut iter = alphanumeric_quoted_tokens("hello \"world\" coucou \"monde\"");
-        assert_eq!(iter.next(), Some(Free("hello")));
-        assert_eq!(iter.next(), Some(Quoted("world")));
-        assert_eq!(iter.next(), Some(Free("coucou")));
-        assert_eq!(iter.next(), Some(Quoted("monde")));
-        assert_eq!(iter.next(), None);
-    }
-}
diff --git a/src/query_tokens.rs b/src/query_tokens.rs
new file mode 100644
index 000000000..f23ad7d74
--- /dev/null
+++ b/src/query_tokens.rs
@@ -0,0 +1,163 @@
+use std::{mem, str};
+
+use QueryToken::{Quoted, Free};
+
+#[derive(Debug, PartialEq, Eq)]
+pub enum QueryToken<'a> {
+    Free(&'a str),
+    Quoted(&'a str),
+}
+
+enum State {
+    Free(usize),
+    Quoted(usize),
+    Fused,
+}
+
+impl State {
+    fn is_quoted(&self) -> bool {
+        match self { State::Quoted(_) => true, _ => false }
+    }
+
+    fn replace_by(&mut self, state: State) -> State {
+        mem::replace(self, state)
+    }
+}
+
+pub struct QueryTokens<'a> {
+    state: State,
+    string: &'a str,
+    string_chars: str::CharIndices<'a>,
+}
+
+impl<'a> QueryTokens<'a> {
+    pub fn new(query: &'a str) -> QueryTokens<'a> {
+        QueryTokens {
+            state: State::Free(0),
+            string: query,
+            string_chars: query.char_indices(),
+        }
+    }
+}
+
+impl<'a> Iterator for QueryTokens<'a> {
+    type Item = QueryToken<'a>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        loop {
+            let (i, afteri, c) = match self.string_chars.next() {
+                Some((i, c)) => (i, i + c.len_utf8(), c),
+                None => return match self.state.replace_by(State::Fused) {
+                    State::Free(s) => if !self.string[s..].is_empty() {
+                        Some(Free(&self.string[s..]))
+                    } else {
+                        None
+                    },
+                    State::Quoted(s) => Some(Quoted(&self.string[s..])),
+                    State::Fused => None,
+                },
+            };
+
+            if c == '"' {
+                match self.state.replace_by(State::Free(afteri)) {
+                    State::Quoted(s) => return Some(Quoted(&self.string[s..i])),
+                    State::Free(s) => {
+                        self.state = State::Quoted(afteri);
+                        if i > s { return Some(Free(&self.string[s..i])) }
+                    },
+                    State::Fused => return None,
+                }
+            }
+            else if !self.state.is_quoted() && !c.is_alphanumeric() {
+                match self.state.replace_by(State::Free(afteri)) {
+                    State::Free(s) if i > s => return Some(Free(&self.string[s..i])),
+                    _ => self.state = State::Free(afteri),
+                }
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn one_quoted_string() {
+        use QueryToken::Quoted;
+
+        let mut iter = QueryTokens::new("\"hello\"");
+        assert_eq!(iter.next(), Some(Quoted("hello")));
+        assert_eq!(iter.next(), None);
+    }
+
+    #[test]
+    fn one_pending_quoted_string() {
+        use QueryToken::Quoted;
+
+        let mut iter = QueryTokens::new("\"hello");
+        assert_eq!(iter.next(), Some(Quoted("hello")));
+        assert_eq!(iter.next(), None);
+    }
+
+    #[test]
+    fn one_non_quoted_string() {
+        use QueryToken::Free;
+
+        let mut iter = QueryTokens::new("hello");
+        assert_eq!(iter.next(), Some(Free("hello")));
+        assert_eq!(iter.next(), None);
+    }
+
+    #[test]
+    fn quoted_directly_followed_by_free_strings() {
+        use QueryToken::{Quoted, Free};
+
+        let mut iter = QueryTokens::new("\"hello\"world");
+        assert_eq!(iter.next(), Some(Quoted("hello")));
+        assert_eq!(iter.next(), Some(Free("world")));
+        assert_eq!(iter.next(), None);
+    }
+
+    #[test]
+    fn free_directly_followed_by_quoted_strings() {
+        use QueryToken::{Quoted, Free};
+
+        let mut iter = QueryTokens::new("hello\"world\"");
+        assert_eq!(iter.next(), Some(Free("hello")));
+        assert_eq!(iter.next(), Some(Quoted("world")));
+        assert_eq!(iter.next(), None);
+    }
+
+    #[test]
+    fn free_followed_by_quoted_strings() {
+        use QueryToken::{Quoted, Free};
+
+        let mut iter = QueryTokens::new("hello \"world\"");
+        assert_eq!(iter.next(), Some(Free("hello")));
+        assert_eq!(iter.next(), Some(Quoted("world")));
+        assert_eq!(iter.next(), None);
+    }
+
+    #[test]
+    fn multiple_spaces_separated_strings() {
+        use QueryToken::Free;
+
+        let mut iter = QueryTokens::new("hello world ");
+        assert_eq!(iter.next(), Some(Free("hello")));
+        assert_eq!(iter.next(), Some(Free("world")));
+        assert_eq!(iter.next(), None);
+    }
+
+    #[test]
+    fn multi_interleaved_quoted_free_strings() {
+        use QueryToken::{Quoted, Free};
+
+        let mut iter = QueryTokens::new("hello \"world\" coucou \"monde\"");
+        assert_eq!(iter.next(), Some(Free("hello")));
+        assert_eq!(iter.next(), Some(Quoted("world")));
+        assert_eq!(iter.next(), Some(Free("coucou")));
+        assert_eq!(iter.next(), Some(Quoted("monde")));
+        assert_eq!(iter.next(), None);
+    }
+}