diff --git a/src/bin/indexer.rs b/src/bin/indexer.rs
index 25b483421..f7d68ef08 100644
--- a/src/bin/indexer.rs
+++ b/src/bin/indexer.rs
@@ -12,9 +12,9 @@ use heed::types::*;
 use oxidized_mtbl::{Reader, ReaderOptions, Writer, Merger, MergerOptions};
 use rayon::prelude::*;
 use roaring::RoaringBitmap;
+use slice_group_by::StrGroupBy;
 use structopt::StructOpt;
 
-use mega_mini_indexer::alphanumeric_tokens;
 use mega_mini_indexer::{FastMap4, SmallVec32, Index, DocumentId};
 
 #[cfg(target_os = "linux")]
@@ -23,6 +23,11 @@ static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
 
 static ID_GENERATOR: AtomicUsize = AtomicUsize::new(0); // AtomicU32 ?
 
+pub fn simple_alphanumeric_tokens(string: &str) -> impl Iterator<Item = &str> {
+    let is_alphanumeric = |s: &&str| s.chars().next().map_or(false, char::is_alphanumeric);
+    string.linear_group_by_key(|c| c.is_alphanumeric()).filter(is_alphanumeric)
+}
+
 #[derive(Debug, StructOpt)]
 #[structopt(name = "mm-indexer", about = "The indexer side of the MMI project.")]
 struct Opt {
@@ -186,7 +191,7 @@ fn index_csv(mut rdr: csv::Reader) -> anyhow::Result {
         let document_id = DocumentId::try_from(document_id).context("Generated id is too big")?;
 
         for (_attr, content) in document.iter().enumerate().take(MAX_ATTRIBUTES) {
-            for (_pos, word) in alphanumeric_tokens(&content).enumerate().take(MAX_POSITION) {
+            for (_pos, word) in simple_alphanumeric_tokens(&content).enumerate().take(MAX_POSITION) {
                 if !word.is_empty() && word.len() < 500 { // LMDB limits
                     let word = word.cow_to_lowercase();
                     postings_ids.entry(SmallVec32::from(word.as_bytes()))
diff --git a/src/lib.rs b/src/lib.rs
index c8e076b58..d4ea13f3d 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,3 +1,6 @@
+mod query;
+
+use std::borrow::Cow;
 use std::collections::HashMap;
 use std::hash::BuildHasherDefault;
 use std::time::Instant;
@@ -10,7 +13,8 @@ use heed::{PolyDatabase, Database};
 use levenshtein_automata::LevenshteinAutomatonBuilder as LevBuilder;
 use once_cell::sync::OnceCell;
 use roaring::RoaringBitmap;
-use slice_group_by::StrGroupBy;
+
+use self::query::{QueryWord, alphanumeric_quoted_tokens};
 
 static LEVDIST0: OnceCell<LevBuilder> = OnceCell::new();
 static LEVDIST1: OnceCell<LevBuilder> = OnceCell::new();
@@ -22,11 +26,6 @@ pub type SmallVec32 = smallvec::SmallVec<[u8; 32]>;
 pub type BEU32 = heed::zerocopy::U32;
 pub type DocumentId = u32;
 
-pub fn alphanumeric_tokens(string: &str) -> impl Iterator<Item = &str> {
-    let is_alphanumeric = |s: &&str| s.chars().next().map_or(false, char::is_alphanumeric);
-    string.linear_group_by_key(|c| c.is_alphanumeric()).filter(is_alphanumeric)
-}
-
 #[derive(Clone)]
 pub struct Index {
     pub main: PolyDatabase,
@@ -60,17 +59,20 @@ impl Index {
         let lev1 = LEVDIST1.get_or_init(|| LevBuilder::new(1, true));
         let lev2 = LEVDIST2.get_or_init(|| LevBuilder::new(2, true));
 
-        let words: Vec<_> = alphanumeric_tokens(query).collect();
+        let words: Vec<_> = alphanumeric_quoted_tokens(query).collect();
         let ends_with_whitespace = query.chars().last().map_or(false, char::is_whitespace);
         let number_of_words = words.len();
         let dfas = words.into_iter().enumerate().map(|(i, word)| {
-            let word = word.cow_to_lowercase();
+            let (word, quoted) = match word {
+                QueryWord::Free(word) => (word.cow_to_lowercase(), false),
+                QueryWord::Quoted(word) => (Cow::Borrowed(word), true),
+            };
             let is_last = i + 1 == number_of_words;
-            let is_prefix = is_last && !ends_with_whitespace;
+            let is_prefix = is_last && !ends_with_whitespace && !quoted;
             let dfa = match word.len() {
-                0..=4 => if is_prefix { lev0.build_prefix_dfa(&word) } else { lev0.build_dfa(&word) },
-                5..=8 => if is_prefix { lev1.build_prefix_dfa(&word) } else { lev1.build_dfa(&word) },
-                _ => if is_prefix { lev2.build_prefix_dfa(&word) } else { lev2.build_dfa(&word) },
+                0..=4 => if is_prefix { lev0.build_prefix_dfa(&word) } else if quoted { lev0.build_dfa(&word) } else { lev0.build_dfa(&word) },
+                5..=8 => if is_prefix { lev1.build_prefix_dfa(&word) } else if quoted { lev0.build_dfa(&word) } else { lev1.build_dfa(&word) },
+                _ => if is_prefix { lev2.build_prefix_dfa(&word) } else if quoted { lev0.build_dfa(&word) } else { lev2.build_dfa(&word) },
             };
             (word, dfa)
         });
diff --git a/src/query.rs b/src/query.rs
new file mode 100644
index 000000000..c3049590c
--- /dev/null
+++ b/src/query.rs
@@ -0,0 +1,145 @@
+#[derive(Debug, PartialEq, Eq)]
+pub enum QueryWord<'a> {
+    Free(&'a str),
+    Quoted(&'a str),
+}
+
+pub fn alphanumeric_quoted_tokens(string: &str) -> impl Iterator<Item = QueryWord> {
+    use QueryWord::{Quoted, Free};
+
+    enum State {
+        Free(usize),
+        Quoted(usize),
+        Fused,
+    }
+
+    impl State {
+        fn is_quoted(&self) -> bool {
+            match self { State::Quoted(_) => true, _ => false }
+        }
+
+        fn replace_by(&mut self, state: State) -> State {
+            std::mem::replace(self, state)
+        }
+    }
+
+    let mut state = State::Free(0);
+    let mut string_chars = string.char_indices();
+    std::iter::from_fn(move || {
+        loop {
+            let (i, afteri, c) = match string_chars.next() {
+                Some((i, c)) => (i, i + c.len_utf8(), c),
+                None => return match state.replace_by(State::Fused) {
+                    State::Free(s) => if !string[s..].is_empty() {
+                        Some(Free(&string[s..]))
+                    } else {
+                        None
+                    },
+                    State::Quoted(s) => Some(Quoted(&string[s..])),
+                    State::Fused => None,
+                },
+            };
+
+            if c == '"' {
+                match state.replace_by(State::Free(afteri)) {
+                    State::Quoted(s) => return Some(Quoted(&string[s..i])),
+                    State::Free(s) => {
+                        state = State::Quoted(afteri);
+                        if i > s { return Some(Free(&string[s..i])) }
+                    },
+                    State::Fused => return None,
+                }
+            }
+            else if !state.is_quoted() && !c.is_alphanumeric() {
+                match state.replace_by(State::Free(afteri)) {
+                    State::Free(s) if i > s => return Some(Free(&string[s..i])),
+                    _ => state = State::Free(afteri),
+                }
+            }
+        }
+    })
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn one_quoted_string() {
+        use QueryWord::Quoted;
+
+        let mut iter = alphanumeric_quoted_tokens("\"hello\"");
+        assert_eq!(iter.next(), Some(Quoted("hello")));
+        assert_eq!(iter.next(), None);
+    }
+
+    #[test]
+    fn one_pending_quoted_string() {
+        use QueryWord::Quoted;
+
+        let mut iter = alphanumeric_quoted_tokens("\"hello");
+        assert_eq!(iter.next(), Some(Quoted("hello")));
+        assert_eq!(iter.next(), None);
+    }
+
+    #[test]
+    fn one_non_quoted_string() {
+        use QueryWord::Free;
+
+        let mut iter = alphanumeric_quoted_tokens("hello");
+        assert_eq!(iter.next(), Some(Free("hello")));
+        assert_eq!(iter.next(), None);
+    }
+
+    #[test]
+    fn quoted_directly_followed_by_free_strings() {
+        use QueryWord::{Quoted, Free};
+
+        let mut iter = alphanumeric_quoted_tokens("\"hello\"world");
+        assert_eq!(iter.next(), Some(Quoted("hello")));
+        assert_eq!(iter.next(), Some(Free("world")));
+        assert_eq!(iter.next(), None);
+    }
+
+    #[test]
+    fn free_directly_followed_by_quoted_strings() {
+        use QueryWord::{Quoted, Free};
+
+        let mut iter = alphanumeric_quoted_tokens("hello\"world\"");
+        assert_eq!(iter.next(), Some(Free("hello")));
+        assert_eq!(iter.next(), Some(Quoted("world")));
+        assert_eq!(iter.next(), None);
+    }
+
+    #[test]
+    fn free_followed_by_quoted_strings() {
+        use QueryWord::{Quoted, Free};
+
+        let mut iter = alphanumeric_quoted_tokens("hello \"world\"");
+        assert_eq!(iter.next(), Some(Free("hello")));
+        assert_eq!(iter.next(), Some(Quoted("world")));
+        assert_eq!(iter.next(), None);
+    }
+
+    #[test]
+    fn multiple_spaces_separated_strings() {
+        use QueryWord::Free;
+
+        let mut iter = alphanumeric_quoted_tokens("hello world ");
+        assert_eq!(iter.next(), Some(Free("hello")));
+        assert_eq!(iter.next(), Some(Free("world")));
+        assert_eq!(iter.next(), None);
+    }
+
+    #[test]
+    fn multi_interleaved_quoted_free_strings() {
+        use QueryWord::{Quoted, Free};
+
+        let mut iter = alphanumeric_quoted_tokens("hello \"world\" coucou \"monde\"");
+        assert_eq!(iter.next(), Some(Free("hello")));
+        assert_eq!(iter.next(), Some(Quoted("world")));
+        assert_eq!(iter.next(), Some(Free("coucou")));
+        assert_eq!(iter.next(), Some(Quoted("monde")));
+        assert_eq!(iter.next(), None);
+    }
+}