diff --git a/meilidb-core/Cargo.toml b/meilidb-core/Cargo.toml index 037a7788c..29d2e61ef 100644 --- a/meilidb-core/Cargo.toml +++ b/meilidb-core/Cargo.toml @@ -6,6 +6,7 @@ edition = "2018" [dependencies] byteorder = "1.3.1" +crossbeam-channel = "0.3.9" deunicode = "1.0.0" hashbrown = "0.2.2" lazy_static = "1.2.0" @@ -14,7 +15,7 @@ meilidb-tokenizer = { path = "../meilidb-tokenizer", version = "0.1.0" } rayon = "1.0.3" sdset = "0.3.2" serde = { version = "1.0.88", features = ["derive"] } -slice-group-by = "0.2.4" +slice-group-by = "0.2.6" zerocopy = "0.2.2" [dependencies.fst] diff --git a/meilidb-core/src/criterion/sum_of_typos.rs b/meilidb-core/src/criterion/sum_of_typos.rs index d5cd75f08..6736e6caa 100644 --- a/meilidb-core/src/criterion/sum_of_typos.rs +++ b/meilidb-core/src/criterion/sum_of_typos.rs @@ -21,7 +21,7 @@ fn custom_log10(n: u8) -> f32 { #[inline] fn sum_matches_typos(query_index: &[u32], distance: &[u8]) -> usize { - let mut number_words = 0; + let mut number_words: usize = 0; let mut sum_typos = 0.0; let mut index = 0; diff --git a/meilidb-core/src/lib.rs b/meilidb-core/src/lib.rs index 0976fbde8..0a7844292 100644 --- a/meilidb-core/src/lib.rs +++ b/meilidb-core/src/lib.rs @@ -1,22 +1,24 @@ +#![feature(checked_duration_since)] + #[cfg(test)] #[macro_use] extern crate assert_matches; mod automaton; mod distinct_map; mod query_builder; +mod query_enhancer; +mod raw_document; mod reordered_attrs; mod store; pub mod criterion; -use std::fmt; -use std::sync::Arc; - -use sdset::SetBuf; use serde::{Serialize, Deserialize}; -use slice_group_by::GroupBy; use zerocopy::{AsBytes, FromBytes}; +use self::raw_document::raw_documents_from; + pub use self::query_builder::{QueryBuilder, DistinctQueryBuilder, normalize_str}; +pub use self::raw_document::RawDocument; pub use self::store::Store; /// Represent an internally generated document unique identifier. 
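Both new dependencies serve the reworked fetch path later in this patch: crossbeam-channel carries per-automaton results from the rayon workers back to the query thread, and checked_duration_since (still feature-gated on the nightly this crate targets) turns a fixed deadline into the shrinking timeout handed to recv_timeout. Below is a minimal sketch of that drain-until-deadline pattern, separate from the patch, assuming crossbeam-channel 0.3 in scope; on Rust 1.38+ the feature gate is unnecessary:

use std::thread;
use std::time::{Duration, Instant};

fn main() {
    let (sender, receiver) = crossbeam_channel::unbounded();

    // producers send partial results as soon as they are ready
    for i in 0u64..4 {
        let sender = sender.clone();
        thread::spawn(move || {
            thread::sleep(Duration::from_millis(10 * i));
            let _ = sender.send(i);
        });
    }
    drop(sender);

    // the consumer drains the channel until a fixed deadline, keeping
    // whatever arrived in time and dropping the rest
    let deadline = Instant::now() + Duration::from_millis(25);
    let mut received = Vec::new();
    while let Some(timeout) = deadline.checked_duration_since(Instant::now()) {
        match receiver.recv_timeout(timeout) {
            Ok(value) => received.push(value),
            Err(_) => break, // timed out or all senders disconnected
        }
    }
    println!("kept {:?}", received);
}

The query_all rewrite further down applies the same idea, with the deadline set to 75% of the fetch budget.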
@@ -130,132 +132,6 @@ impl Document {
     }
 }
 
-#[derive(Clone)]
-pub struct RawDocument {
-    pub id: DocumentId,
-    pub matches: SharedMatches,
-    pub highlights: Vec<Highlight>,
-}
-
-impl RawDocument {
-    fn new(id: DocumentId, matches: SharedMatches, highlights: Vec<Highlight>) -> RawDocument {
-        RawDocument { id, matches, highlights }
-    }
-
-    pub fn query_index(&self) -> &[u32] {
-        let r = self.matches.range;
-        // it is safe because construction/modifications
-        // can only be done in this module
-        unsafe { &self.matches.matches.query_index.get_unchecked(r.start..r.end) }
-    }
-
-    pub fn distance(&self) -> &[u8] {
-        let r = self.matches.range;
-        // it is safe because construction/modifications
-        // can only be done in this module
-        unsafe { &self.matches.matches.distance.get_unchecked(r.start..r.end) }
-    }
-
-    pub fn attribute(&self) -> &[u16] {
-        let r = self.matches.range;
-        // it is safe because construction/modifications
-        // can only be done in this module
-        unsafe { &self.matches.matches.attribute.get_unchecked(r.start..r.end) }
-    }
-
-    pub fn word_index(&self) -> &[u16] {
-        let r = self.matches.range;
-        // it is safe because construction/modifications
-        // can only be done in this module
-        unsafe { &self.matches.matches.word_index.get_unchecked(r.start..r.end) }
-    }
-
-    pub fn is_exact(&self) -> &[bool] {
-        let r = self.matches.range;
-        // it is safe because construction/modifications
-        // can only be done in this module
-        unsafe { &self.matches.matches.is_exact.get_unchecked(r.start..r.end) }
-    }
-}
-
-impl fmt::Debug for RawDocument {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        f.debug_struct("RawDocument")
-            .field("id", &self.id)
-            .field("query_index", &self.query_index())
-            .field("distance", &self.distance())
-            .field("attribute", &self.attribute())
-            .field("word_index", &self.word_index())
-            .field("is_exact", &self.is_exact())
-            .finish()
-    }
-}
-
-fn raw_documents_from_matches(matches: SetBuf<(DocumentId, TmpMatch, Highlight)>) -> Vec<RawDocument> {
-    let mut docs_ranges: Vec<(_, Range, _)> = Vec::new();
-    let mut matches2 = Matches::with_capacity(matches.len());
-
-    for group in matches.linear_group_by(|(a, _, _), (b, _, _)| a == b) {
-        let document_id = group[0].0;
-        let start = docs_ranges.last().map(|(_, r, _)| r.end).unwrap_or(0);
-        let end = start + group.len();
-
-        let highlights = group.iter().map(|(_, _, h)| *h).collect();
-        docs_ranges.push((document_id, Range { start, end }, highlights));
-
-        matches2.extend_from_slice(group);
-    }
-
-    let matches = Arc::new(matches2);
-    docs_ranges.into_iter().map(|(i, range, highlights)| {
-        let matches = SharedMatches { range, matches: matches.clone() };
-        RawDocument::new(i, matches, highlights)
-    }).collect()
-}
-
-#[derive(Debug, Copy, Clone)]
-struct Range {
-    start: usize,
-    end: usize,
-}
-
-#[derive(Clone)]
-pub struct SharedMatches {
-    range: Range,
-    matches: Arc<Matches>,
-}
-
-#[derive(Clone)]
-struct Matches {
-    query_index: Vec<u32>,
-    distance: Vec<u8>,
-    attribute: Vec<u16>,
-    word_index: Vec<u16>,
-    is_exact: Vec<bool>,
-}
-
-impl Matches {
-    fn with_capacity(cap: usize) -> Matches {
-        Matches {
-            query_index: Vec::with_capacity(cap),
-            distance: Vec::with_capacity(cap),
-            attribute: Vec::with_capacity(cap),
-            word_index: Vec::with_capacity(cap),
-            is_exact: Vec::with_capacity(cap),
-        }
-    }
-
-    fn extend_from_slice(&mut self, matches: &[(DocumentId, TmpMatch, Highlight)]) {
-        for (_, match_, _) in matches {
-            self.query_index.push(match_.query_index);
-            self.distance.push(match_.distance);
-            self.attribute.push(match_.attribute);
-            self.word_index.push(match_.word_index);
-            self.is_exact.push(match_.is_exact);
-        }
-    }
-}
-
 #[cfg(test)]
 mod tests {
     use super::*;
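The block deleted above is moved, not dropped: the earlier lib.rs hunk adds a raw_document module and re-exports RawDocument from it. The design worth keeping in mind for the rest of the patch is the structure-of-arrays layout: one shared Matches buffer holds every column, and each RawDocument only stores a range into it. A condensed, safe sketch of that layout (field names follow the deleted code; everything else is illustrative):

use std::ops::Range;
use std::sync::Arc;

struct Matches {
    query_index: Vec<u32>,
    distance: Vec<u8>,
}

struct RawDocument {
    matches: Arc<Matches>,
    range: Range<usize>,
}

impl RawDocument {
    // slicing a column is cheap: no per-document allocation, and each
    // criterion reads exactly the rows belonging to this document
    fn query_index(&self) -> &[u32] {
        &self.matches.query_index[self.range.clone()]
    }

    fn distance(&self) -> &[u8] {
        &self.matches.distance[self.range.clone()]
    }
}

fn main() {
    let matches = Arc::new(Matches {
        query_index: vec![0, 1, 0],
        distance: vec![0, 1, 2],
    });

    // two documents sharing the same backing columns
    let a = RawDocument { matches: matches.clone(), range: 0..2 };
    let b = RawDocument { matches, range: 2..3 };

    assert_eq!(a.query_index(), &[0, 1][..]);
    assert_eq!(b.distance(), &[2][..]);
}

Plain indexing here trades a bounds check for safety; the deleted code can use get_unchecked because the ranges are only ever constructed inside the module.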
diff --git a/meilidb-core/src/query_builder.rs b/meilidb-core/src/query_builder.rs
index 43da389a8..5847f9f1b 100644
--- a/meilidb-core/src/query_builder.rs
+++ b/meilidb-core/src/query_builder.rs
@@ -1,59 +1,77 @@
 use std::hash::Hash;
 use std::ops::Range;
 use std::rc::Rc;
-use std::time::Instant;
-use std::{cmp, mem};
+use std::time::{Instant, Duration};
+use std::{iter, mem, cmp, cmp::Reverse};
 
 use fst::{Streamer, IntoStreamer};
 use hashbrown::HashMap;
-use log::info;
+use levenshtein_automata::DFA;
+use log::{info, error};
 use meilidb_tokenizer::{is_cjk, split_query_string};
 use rayon::slice::ParallelSliceMut;
+use rayon::iter::{ParallelIterator, ParallelBridge};
 use sdset::SetBuf;
-use slice_group_by::GroupByMut;
-use levenshtein_automata::DFA;
+use slice_group_by::{GroupBy, GroupByMut};
 
 use crate::automaton::{build_dfa, build_prefix_dfa};
-use crate::distinct_map::{DistinctMap, BufferedDistinctMap};
 use crate::criterion::Criteria;
-use crate::raw_documents_from_matches;
+use crate::distinct_map::{DistinctMap, BufferedDistinctMap};
+use crate::query_enhancer::{QueryEnhancerBuilder, QueryEnhancer};
+use crate::raw_documents_from;
 use crate::reordered_attrs::ReorderedAttrs;
 use crate::{TmpMatch, Highlight, DocumentId, Store, RawDocument, Document};
 
 const NGRAMS: usize = 3;
 
 struct Automaton {
-    query_index: usize,
+    index: usize,
+    ngram: usize,
     query_len: usize,
     is_exact: bool,
-    dfa: DFA,
+    is_prefix: bool,
+    query: String,
 }
 
 impl Automaton {
-    fn exact(query_index: usize, query: &str) -> Automaton {
-        Automaton {
-            query_index,
-            query_len: query.len(),
-            is_exact: true,
-            dfa: build_dfa(query),
+    fn dfa(&self) -> DFA {
+        if self.is_prefix {
+            build_prefix_dfa(&self.query)
+        } else {
+            build_dfa(&self.query)
         }
     }
 
-    fn prefix_exact(query_index: usize, query: &str) -> Automaton {
+    fn exact(index: usize, ngram: usize, query: &str) -> Automaton {
         Automaton {
-            query_index,
+            index,
+            ngram,
             query_len: query.len(),
             is_exact: true,
-            dfa: build_prefix_dfa(query),
+            is_prefix: false,
+            query: query.to_string(),
         }
     }
 
-    fn non_exact(query_index: usize, query: &str) -> Automaton {
+    fn prefix_exact(index: usize, ngram: usize, query: &str) -> Automaton {
         Automaton {
-            query_index,
+            index,
+            ngram,
+            query_len: query.len(),
+            is_exact: true,
+            is_prefix: true,
+            query: query.to_string(),
+        }
+    }
+
+    fn non_exact(index: usize, ngram: usize, query: &str) -> Automaton {
+        Automaton {
+            index,
+            ngram,
             query_len: query.len(),
             is_exact: false,
-            dfa: build_dfa(query),
+            is_prefix: false,
+            query: query.to_string(),
         }
     }
 }
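Automaton no longer caches a dfa: DFA field; it keeps the raw query string plus an is_prefix flag and rebuilds the DFA on demand through dfa(). That keeps the struct small, cheap to sort, and easy to hand to the rayon workers introduced later, at the cost of rebuilding the automaton once per search. A standalone sketch of the same idea using the levenshtein_automata crate directly; the build_dfa/build_prefix_dfa helpers live in automaton.rs, which is not shown in this diff, so the fixed edit distance of 1 here is an assumption for illustration:

use levenshtein_automata::{LevenshteinAutomatonBuilder, DFA};

struct LazyAutomaton {
    query: String,
    is_prefix: bool,
}

impl LazyAutomaton {
    // build the (comparatively large) DFA only when a worker needs it
    fn dfa(&self) -> DFA {
        // one typo allowed, no transpositions; the real helpers presumably
        // scale the allowed distance with the word length
        let builder = LevenshteinAutomatonBuilder::new(1, false);
        if self.is_prefix {
            builder.build_prefix_dfa(&self.query)
        } else {
            builder.build_dfa(&self.query)
        }
    }
}

fn main() {
    let automaton = LazyAutomaton { query: "subway".to_string(), is_prefix: false };
    let dfa = automaton.dfa();
    assert_eq!(dfa.eval("subwey").to_u8(), 1); // one substitution away
}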
@@ -68,41 +86,36 @@ pub fn normalize_str(string: &str) -> String {
     string
 }
 
-fn split_best_frequency<'a, S: Store>(
-    word: &'a str,
-    store: &S,
-) -> Result<Option<(&'a str, &'a str)>, S::Error>
-{
-    let chars = word.char_indices().skip(1);
-    let mut best = None;
-
-    for (i, _) in chars {
-        let (left, right) = word.split_at(i);
-
-        let left_freq = store.word_indexes(left.as_bytes())?.map_or(0, |i| i.len());
-        let right_freq = store.word_indexes(right.as_bytes())?.map_or(0, |i| i.len());
-        let min_freq = cmp::min(left_freq, right_freq);
-
-        if min_freq != 0 && best.map_or(true, |(old, _, _)| min_freq > old) {
-            best = Some((min_freq, left, right));
-        }
-    }
-
-    Ok(best.map(|(_, l, r)| (l, r)))
-}
-
-fn generate_automatons<S: Store>(query: &str, store: &S) -> Result<Vec<Automaton>, S::Error> {
+fn generate_automatons<S: Store>(query: &str, store: &S) -> Result<(Vec<Automaton>, QueryEnhancer), S::Error> {
     let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
     let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect();
-    let mut automatons = Vec::new();
-
     let synonyms = store.synonyms()?;
 
-    for n in 1..=NGRAMS {
-        let mut query_index = 0;
-        let mut ngrams = query_words.windows(n).peekable();
+    let mut automatons = Vec::new();
+    let mut enhancer_builder = QueryEnhancerBuilder::new(&query_words);
 
-        while let Some(ngram_slice) = ngrams.next() {
+    // we must not declare the original words to the query enhancer
+    // *but* we need to push them into the automatons list first
+    let mut original_words = query_words.iter().peekable();
+    while let Some(word) = original_words.next() {
+
+        let has_following_word = original_words.peek().is_some();
+        let not_prefix_dfa = has_following_word || has_end_whitespace || word.chars().all(is_cjk);
+
+        let automaton = if not_prefix_dfa {
+            Automaton::exact(automatons.len(), 1, word)
+        } else {
+            Automaton::prefix_exact(automatons.len(), 1, word)
+        };
+        automatons.push(automaton);
+    }
+
+    for n in 1..=NGRAMS {
+
+        let mut ngrams = query_words.windows(n).enumerate().peekable();
+        while let Some((query_index, ngram_slice)) = ngrams.next() {
+
+            let query_range = query_index..query_index + n;
             let ngram_nb_words = ngram_slice.len();
             let ngram = ngram_slice.join(" ");
@@ -127,68 +140,44 @@ fn generate_automatons<S: Store>(query: &str, store: &S) -> Result<(Vec<Automaton>, QueryEnhancer), S::Error>
+                        let synonyms_words: Vec<_> = split_query_string(synonyms).collect();
+                        let nb_synonym_words = synonyms_words.len();
 
-                    for synonym in split_query_string(synonyms) {
+                        let real_query_index = automatons.len();
+                        enhancer_builder.declare(query_range.clone(), real_query_index, &synonyms_words);
+
+                        for synonym in synonyms_words {
                             let automaton = if nb_synonym_words == 1 {
-                            Automaton::exact(query_index, synonym)
+                                Automaton::exact(automatons.len(), n, synonym)
                             } else {
-                            Automaton::non_exact(query_index, synonym)
+                                Automaton::non_exact(automatons.len(), n, synonym)
                             };
-                        automatons.push((automaton, synonym.to_owned()));
+                            automatons.push(automaton);
                         }
                     }
                 }
            }
 
-            if n == 1 {
-                // TODO we do not support "phrase query" in other words:
-                // first term *must* follow the second term
-                if let Some((left, right)) = split_best_frequency(&ngram, store)? {
-                    let automaton = Automaton::exact(query_index, left);
-                    automatons.push((automaton, left.to_owned()));
-
-                    let automaton = Automaton::exact(query_index, right);
-                    automatons.push((automaton, right.to_owned()));
-                }
-
-                let automaton = if not_prefix_dfa {
-                    Automaton::exact(query_index, &ngram)
-                } else {
-                    Automaton::prefix_exact(query_index, &ngram)
-                };
-                automatons.push((automaton, ngram));
-
-            } else {
+            if n != 1 {
                 // automaton of concatenation of query words
                 let concat = ngram_slice.concat();
                 let normalized = normalize_str(&concat);
-                let automaton = Automaton::exact(query_index, &normalized);
-                automatons.push((automaton, normalized));
-            }
 
-            query_index += 1;
+                let real_query_index = automatons.len();
+                enhancer_builder.declare(query_range.clone(), real_query_index, &[&normalized]);
+
+                let automaton = Automaton::exact(automatons.len(), n, &normalized);
+                automatons.push(automaton);
+            }
         }
     }
 
-    automatons.sort_unstable_by(|a, b| (a.0.query_index, &a.1).cmp(&(b.0.query_index, &b.1)));
-    automatons.dedup_by(|a, b| (a.0.query_index, &a.1) == (b.0.query_index, &b.1));
-    let automatons = automatons.into_iter().map(|(a, _)| a).collect();
+    // order the automatons, the most important first;
+    // we keep the original automatons at the front
+    let original_len = query_words.len();
+    automatons[original_len..].sort_unstable_by_key(|a| (Reverse(a.is_exact), Reverse(a.ngram)));
 
-    Ok(automatons)
-}
-
-fn rewrite_matched_positions(matches: &mut [(DocumentId, TmpMatch, Highlight)]) {
-    for document_matches in matches.linear_group_by_mut(|(a, _, _), (b, _, _)| a == b) {
-        let mut offset = 0;
-        for query_indexes in document_matches.linear_group_by_mut(|(_, a, _), (_, b, _)| a.query_index == b.query_index) {
-            let word_index = query_indexes[0].1.word_index - offset as u16;
-            for (_, match_, _) in query_indexes.iter_mut() {
-                match_.word_index = word_index;
-            }
-            offset += query_indexes.len() - 1;
-        }
-    }
+    Ok((automatons, enhancer_builder.build()))
 }
 
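query_enhancer.rs is not part of this excerpt, so the QueryEnhancer API has to be inferred from its call sites: declare(range, real_query_index, words) records that the automaton at real_query_index replaces the original words covered by range, and (as used by multiword_rewrite_matches below) replacement(query_index) later yields the rewritten query indices for a match. Here is a deliberately naive, hypothetical stand-in showing that contract only; the real type certainly handles overlapping declarations and keys off a match's query index rather than an automaton index:

use std::collections::HashMap;
use std::ops::Range;

#[derive(Default)]
struct ToyQueryEnhancer {
    // automaton index -> query indices it expands to (hypothetical layout)
    replacements: HashMap<usize, Vec<u32>>,
}

impl ToyQueryEnhancer {
    fn declare(&mut self, range: Range<usize>, automaton_index: usize, words: &[&str]) {
        // a one-word replacement for an n-word range still accounts for all
        // n positions; a multi-word replacement spreads over them
        let indices = (0..words.len().max(range.len()))
            .map(|i| (range.start + i) as u32)
            .collect();
        self.replacements.insert(automaton_index, indices);
    }

    fn replacement(&self, automaton_index: usize) -> impl Iterator<Item = u32> + '_ {
        self.replacements[&automaton_index].iter().copied()
    }
}

fn main() {
    // "NY" registered at automaton index 7 as a replacement
    // for the three original words "new york city" (range 0..3)
    let mut enhancer = ToyQueryEnhancer::default();
    enhancer.declare(0..3, 7, &["ny"]);
    let indices: Vec<_> = enhancer.replacement(7).collect();
    assert_eq!(indices, vec![0, 1, 2]);
}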
 pub struct QueryBuilder<'c, S, FI = fn(DocumentId) -> bool> {
@@ -196,6 +185,7 @@ pub struct QueryBuilder<'c, S, FI = fn(DocumentId) -> bool> {
     criteria: Criteria<'c>,
     searchable_attrs: Option<ReorderedAttrs>,
     filter: Option<FI>,
+    fetch_timeout: Option<Duration>,
 }
 
 impl<'c, S> QueryBuilder<'c, S, fn(DocumentId) -> bool> {
@@ -204,7 +194,7 @@ impl<'c, S> QueryBuilder<'c, S, fn(DocumentId) -> bool> {
     }
 
     pub fn with_criteria(store: S, criteria: Criteria<'c>) -> Self {
-        QueryBuilder { store, criteria, searchable_attrs: None, filter: None }
+        QueryBuilder { store, criteria, searchable_attrs: None, filter: None, fetch_timeout: None }
     }
 }
 
@@ -218,9 +208,14 @@ impl<'c, S, FI> QueryBuilder<'c, S, FI>
             criteria: self.criteria,
             searchable_attrs: self.searchable_attrs,
             filter: Some(function),
+            fetch_timeout: self.fetch_timeout,
         }
     }
 
+    pub fn with_fetch_timeout(self, timeout: Duration) -> QueryBuilder<'c, S, FI> {
+        QueryBuilder { fetch_timeout: Some(timeout), ..self }
+    }
+
     pub fn with_distinct<F, K>(self, function: F, size: usize) -> DistinctQueryBuilder<'c, S, FI, F>
     where F: Fn(DocumentId) -> Option<K>,
           K: Hash + Eq,
@@ -234,79 +229,251 @@ impl<'c, S, FI> QueryBuilder<'c, S, FI>
     }
 }
 
-impl<'c, S, FI> QueryBuilder<'c, S, FI>
-where S: Store,
+fn multiword_rewrite_matches(
+    mut matches: Vec<(DocumentId, TmpMatch)>,
+    query_enhancer: &QueryEnhancer,
+    timeout: Option<Duration>,
+) -> SetBuf<(DocumentId, TmpMatch)>
 {
-    fn query_all(&self, query: &str) -> Result<Vec<RawDocument>, S::Error> {
-        let automatons = generate_automatons(query, &self.store)?;
-        let words = self.store.words()?.as_fst();
-        let searchables = self.searchable_attrs.as_ref();
+    let mut padded_matches = Vec::with_capacity(matches.len());
 
-        let mut stream = {
-            let mut op_builder = fst::raw::OpBuilder::new();
-            for Automaton { dfa, ..
} in &automatons { - let stream = words.search(dfa); - op_builder.push(stream); - } - op_builder.r#union() - }; + // we sort the matches by word index to make them rewritable + let start = Instant::now(); + matches.par_sort_unstable_by_key(|(id, match_)| (*id, match_.attribute, match_.word_index)); + info!("rewrite sort by word_index took {:.2?}", start.elapsed()); - let mut matches = Vec::new(); + let start = Instant::now(); + // for each attribute of each document + for same_document_attribute in matches.linear_group_by_key(|(id, m)| (*id, m.attribute)) { - while let Some((input, indexed_values)) = stream.next() { - for iv in indexed_values { - let Automaton { query_index, is_exact, query_len, ref dfa } = automatons[iv.index]; - let distance = dfa.eval(input).to_u8(); - let is_exact = is_exact && distance == 0 && input.len() == query_len; - - let doc_indexes = self.store.word_indexes(input)?; - let doc_indexes = match doc_indexes { - Some(doc_indexes) => doc_indexes, - None => continue, - }; - - for di in doc_indexes.as_slice() { - let attribute = searchables.map_or(Some(di.attribute), |r| r.get(di.attribute)); - if let Some(attribute) = attribute { - let match_ = TmpMatch { - query_index: query_index as u32, - distance, - attribute, - word_index: di.word_index, - is_exact, - }; - let highlight = Highlight { - attribute: di.attribute, - char_index: di.char_index, - char_length: di.char_length, - }; - matches.push((di.document_id, match_, highlight)); - } - } - } + let elapsed = start.elapsed(); + if timeout.map_or(false, |timeout| elapsed > timeout) { + info!("abort multiword rewrite after {:.2?}", elapsed); + break; } - // rewrite the matched positions for next criteria evaluations - matches.par_sort_unstable(); - rewrite_matched_positions(&mut matches); + // padding will only be applied + // to word indices in the same attribute + let mut padding = 0; + let mut iter = same_document_attribute.linear_group_by_key(|(_, m)| m.word_index); - let total_matches = matches.len(); - let padded_matches = { - matches.par_sort_unstable(); - matches.dedup(); - SetBuf::new_unchecked(matches) - }; - let raw_documents = raw_documents_from_matches(padded_matches); + // for each match at the same position + // in this document attribute + while let Some(same_word_index) = iter.next() { - info!("{} total documents to classify", raw_documents.len()); - info!("{} total matches to classify", total_matches); + // find the biggest padding + let mut biggest = 0; + for (id, match_) in same_word_index { - Ok(raw_documents) + let mut replacement = query_enhancer.replacement(match_.query_index); + let replacement_len = replacement.len(); + let nexts = iter.remainder().linear_group_by_key(|(_, m)| m.word_index); + + if let Some(query_index) = replacement.next() { + let word_index = match_.word_index + padding as u16; + let match_ = TmpMatch { query_index, word_index, ..match_.clone() }; + padded_matches.push((*id, match_)); + } + + let mut found = false; + + // look ahead and if there already is a match + // corresponding to this padding word, abort the padding + 'padding: for (x, next_group) in nexts.enumerate() { + + for (i, query_index) in replacement.clone().enumerate().skip(x) { + let word_index = match_.word_index + padding as u16 + (i + 1) as u16; + let padmatch = TmpMatch { query_index, word_index, ..match_.clone() }; + + for (_, nmatch_) in next_group { + let mut rep = query_enhancer.replacement(nmatch_.query_index); + let query_index = rep.next().unwrap(); + if query_index == padmatch.query_index { + + if 
!found {
+                                            // if we find a corresponding padding for the
+                                            // first time we must push preceding paddings
+                                            for (i, query_index) in replacement.clone().enumerate().take(i) {
+                                                let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
+                                                let match_ = TmpMatch { query_index, word_index, ..match_.clone() };
+                                                padded_matches.push((*id, match_));
+                                                biggest = biggest.max(i + 1);
+                                            }
+                                        }
+
+                                        padded_matches.push((*id, padmatch));
+                                        found = true;
+                                        continue 'padding;
+                                    }
+                                }
+                            }
+
+                            // if we do not find a corresponding padding in the
+                            // next groups, stop here and pad what was found
+                            break
+                        }
+
+                        if !found {
+                            // if no padding was found in the following matches
+                            // we must insert the entire padding
+                            for (i, query_index) in replacement.enumerate() {
+                                let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
+                                let match_ = TmpMatch { query_index, word_index, ..match_.clone() };
+                                padded_matches.push((*id, match_));
+                            }
+
+                            biggest = biggest.max(replacement_len - 1);
+                        }
+                    }
+
+                    padding += biggest;
+                }
+            }
+    info!("main multiword rewrite took {:.2?}", start.elapsed());
+
+    let start = Instant::now();
+    for document_matches in padded_matches.linear_group_by_key_mut(|(id, _)| *id) {
+        document_matches.sort_unstable();
+    }
+    info!("final rewrite sort took {:.2?}", start.elapsed());
+
+    SetBuf::new_unchecked(padded_matches)
+}
+
+impl<'c, S, FI> QueryBuilder<'c, S, FI>
+where S: Store + Sync,
+      S::Error: Send,
+{
+    fn query_all(&self, query: &str) -> Result<Vec<RawDocument>, S::Error> {
+        let (automatons, query_enhancer) = generate_automatons(query, &self.store)?;
+        let searchables = self.searchable_attrs.as_ref();
+        let store = &self.store;
+        let fetch_timeout = &self.fetch_timeout;
+
+        rayon::scope(move |s| {
+            enum Error<E> {
+                SendError,
+                StoreError(E),
+            }
+
+            let mut matches = Vec::new();
+            let mut highlights = Vec::new();
+
+            let recv_end_time = fetch_timeout.map(|d| Instant::now() + d * 75 / 100);
+            let start = Instant::now();
+
+            let (sender, receiver) = crossbeam_channel::unbounded();
+
+            s.spawn(move |_| {
+                let result = automatons
+                    .into_iter()
+                    .par_bridge()
+                    .try_for_each_with((sender, store, searchables), |data, automaton| {
+                        let (sender, store, searchables) = data;
+                        let Automaton { index, is_exact, query_len, ..
} = automaton;
+                        let dfa = automaton.dfa();
+
+                        let words = store.words().map_err(Error::StoreError)?;
+                        let mut stream = words.search(&dfa).into_stream();
+
+                        let mut matches = Vec::new();
+                        let mut highlights = Vec::new();
+
+                        while let Some(input) = stream.next() {
+                            let distance = dfa.eval(input).to_u8();
+                            let is_exact = is_exact && distance == 0 && input.len() == query_len;
+
+                            let doc_indexes = store.word_indexes(input).map_err(Error::StoreError)?;
+                            let doc_indexes = match doc_indexes {
+                                Some(doc_indexes) => doc_indexes,
+                                None => continue,
+                            };
+
+                            matches.reserve(doc_indexes.len());
+                            highlights.reserve(doc_indexes.len());
+
+                            for di in doc_indexes.as_slice() {
+
+                                let attribute = searchables.map_or(Some(di.attribute), |r| r.get(di.attribute));
+                                if let Some(attribute) = attribute {
+                                    let match_ = TmpMatch {
+                                        query_index: index as u32,
+                                        distance,
+                                        attribute,
+                                        word_index: di.word_index,
+                                        is_exact,
+                                    };
+
+                                    let highlight = Highlight {
+                                        attribute: di.attribute,
+                                        char_index: di.char_index,
+                                        char_length: di.char_length,
+                                    };
+
+                                    matches.push((di.document_id, match_));
+                                    highlights.push((di.document_id, highlight));
+                                }
+                            }
+                        }
+
+                        sender.send((matches, highlights)).map_err(|_| Error::SendError)
+                    });
+
+                if let Err(Error::StoreError(e)) = result {
+                    error!("{}", e);
+                }
+            });
+
+            let iter = receiver.recv().into_iter().chain(iter::from_fn(|| {
+                let recv_end_time = match recv_end_time {
+                    Some(time) => time,
+                    None => return receiver.recv().ok(),
+                };
+
+                match recv_end_time.checked_duration_since(Instant::now()) {
+                    Some(timeout) => receiver.recv_timeout(timeout).ok(),
+                    None => None,
+                }
+            }));
+
+            for (mut rcv_matches, mut rcv_highlights) in iter {
+                matches.append(&mut rcv_matches);
+                highlights.append(&mut rcv_highlights);
+            }
+
+            drop(receiver);
+
+            info!("main query all took {:.2?}", start.elapsed());
+            info!("{} total matches to rewrite", matches.len());
+
+            let start = Instant::now();
+            let timeout = fetch_timeout.map(|d| d * 25 / 100);
+            let matches = multiword_rewrite_matches(matches, &query_enhancer, timeout);
+            info!("multiword rewrite took {:.2?}", start.elapsed());
+
+            let start = Instant::now();
+            let highlights = {
+                highlights.par_sort_unstable_by_key(|(id, _)| *id);
+                SetBuf::new_unchecked(highlights)
+            };
+            info!("sorting highlights took {:.2?}", start.elapsed());
+
+            info!("{} total matches to classify", matches.len());
+
+            let start = Instant::now();
+            let raw_documents = raw_documents_from(matches, highlights);
+            info!("making raw documents took {:.2?}", start.elapsed());
+
+            info!("{} total documents to classify", raw_documents.len());
+
+            Ok(raw_documents)
+        })
     }
 }
 
 impl<'c, S, FI> QueryBuilder<'c, S, FI>
-where S: Store,
+where S: Store + Sync,
+      S::Error: Send,
       FI: Fn(DocumentId) -> bool,
 {
     pub fn query(self, query: &str, range: Range<usize>) -> Result<Vec<Document>, S::Error> {
@@ -373,7 +540,15 @@ impl<'c, I, FI, FD> DistinctQueryBuilder<'c, I, FI, FD>
         DistinctQueryBuilder {
             inner: self.inner.with_filter(function),
             function: self.function,
-            size: self.size
+            size: self.size,
+        }
+    }
+
+    pub fn with_fetch_timeout(self, timeout: Duration) -> DistinctQueryBuilder<'c, I, FI, FD> {
+        DistinctQueryBuilder {
+            inner: self.inner.with_fetch_timeout(timeout),
+            function: self.function,
+            size: self.size,
         }
     }
 
@@ -383,7 +558,8 @@ impl<'c, I, FI, FD> DistinctQueryBuilder<'c, I, FI, FD>
 }
 
 impl<'c, S, FI, FD, K> DistinctQueryBuilder<'c, S, FI, FD>
-where S: Store,
+where S: Store + Sync,
+      S::Error: Send,
       FI: Fn(DocumentId) -> bool,
       FD: Fn(DocumentId) -> Option<K>,
       K: Hash + Eq,
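For reference, a hypothetical test sketch (not part of the patch) showing the call shape of the new budgeted search, reusing the InMemorySetStore and doc_index helpers from the test module below; the 75%/25% split of the budget between fetching and rewriting is handled internally:

#[test]
fn fetch_timeout_usage() {
    use std::time::Duration;

    let store = InMemorySetStore::from_iter(vec![
        ("subway", &[doc_index(0, 0)][..]),
    ]);

    let builder = QueryBuilder::new(&store)
        .with_fetch_timeout(Duration::from_millis(40));

    // with a generous budget the single match is fetched and classified;
    // under pressure the builder returns whatever arrived before the deadline
    let results = builder.query("subway", 0..20).unwrap();
    assert!(!results.is_empty());
}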
@@ -839,17 +1015,22 @@ mod tests {
 
         assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => {
             let mut iter = matches.into_iter();
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // NY
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, .. })); // subway
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NY ± new
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NY ± york
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // NY ± city
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway
             assert_matches!(iter.next(), None);
         });
         assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
             let mut iter = matches.into_iter();
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // new = NY
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // york = NY
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // city = NY
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, .. })); // subway
-            assert_matches!(iter.next(), None);        // position rewritten ^
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NY
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NY
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NY
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NY
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NY
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NY
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway
+            assert_matches!(iter.next(), None);              // position rewritten ^
         });
 
         assert_matches!(iter.next(), None);
@@ -859,24 +1040,141 @@ mod tests {
 
         assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => {
             let mut iter = matches.into_iter();
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // NY
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, .. })); // subway
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NYC ± new
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NYC ± york
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // NYC ± city
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway
             assert_matches!(iter.next(), None);
         });
         assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
             let mut iter = matches.into_iter();
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // new = NY
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, ..
})); // york = NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // city = NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, .. })); // subway - assert_matches!(iter.next(), None); // position rewritten ^ + assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NYC + assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NYC + assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NYC + assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NYC + assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NYC + assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NYC + assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway + assert_matches!(iter.next(), None); // position rewritten ^ + }); + assert_matches!(iter.next(), None); + } + + #[test] + fn unique_to_multiword_synonyms_words_proximity() { + let mut store = InMemorySetStore::from_iter(vec![ + ("new", &[doc_char_index(0, 0, 0)][..]), + ("york", &[doc_char_index(0, 1, 1)][..]), + ("city", &[doc_char_index(0, 2, 2)][..]), + ("subway", &[doc_char_index(0, 3, 3)][..]), + + ("york", &[doc_char_index(1, 0, 0)][..]), + ("new", &[doc_char_index(1, 1, 1)][..]), + ("subway", &[doc_char_index(1, 2, 2)][..]), + + ("NY", &[doc_char_index(2, 0, 0)][..]), + ("subway", &[doc_char_index(2, 1, 1)][..]), + ]); + + store.add_synonym("NY", SetBuf::from_dirty(vec!["york new"])); + + let builder = QueryBuilder::new(&store); + let results = builder.query("NY", 0..20).unwrap(); + let mut iter = results.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => { + let mut matches = matches.into_iter(); + assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // NY ± york + assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, .. })); // NY ± new + assert_matches!(matches.next(), None); + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { + let mut matches = matches.into_iter(); + assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // york = NY + assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, .. })); // new = NY + assert_matches!(matches.next(), None); + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + let mut matches = matches.into_iter(); + assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 1, .. })); // york = NY + assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 0, .. })); // new = NY + assert_matches!(matches.next(), None); + }); + assert_matches!(iter.next(), None); + + let builder = QueryBuilder::new(&store); + let results = builder.query("new york", 0..20).unwrap(); + let mut iter = results.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + let mut matches = matches.into_iter(); + assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // new + assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, .. 
})); // york + assert_matches!(matches.next(), None); // position rewritten ^ + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { + let mut matches = matches.into_iter(); + assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 1, .. })); // york + assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 0, .. })); // new + assert_matches!(matches.next(), None); + }); + assert_matches!(iter.next(), None); + } + + #[test] + fn unique_to_multiword_synonyms_cumulative_word_index() { + let mut store = InMemorySetStore::from_iter(vec![ + ("NY", &[doc_char_index(0, 0, 0)][..]), + ("subway", &[doc_char_index(0, 1, 1)][..]), + + ("new", &[doc_char_index(1, 0, 0)][..]), + ("york", &[doc_char_index(1, 1, 1)][..]), + ("subway", &[doc_char_index(1, 2, 2)][..]), + ]); + + store.add_synonym("new york", SetBuf::from_dirty(vec!["NY"])); + + let builder = QueryBuilder::new(&store); + let results = builder.query("NY subway", 0..20).unwrap(); + let mut iter = results.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + let mut matches = matches.into_iter(); + assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NY + assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // subway + assert_matches!(matches.next(), None); + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { + let mut matches = matches.into_iter(); + assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 2, is_exact: true, .. })); // subway + assert_matches!(matches.next(), None); + }); + assert_matches!(iter.next(), None); + + let builder = QueryBuilder::new(&store); + let results = builder.query("new york subway", 0..20).unwrap(); + let mut iter = results.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + let mut matches = matches.into_iter(); + assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NY + assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NY + assert_matches!(matches.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // subway + assert_matches!(matches.next(), None); + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { + let mut matches = matches.into_iter(); + assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new + assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york + assert_matches!(matches.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // subway + assert_matches!(matches.next(), None); }); assert_matches!(iter.next(), None); } #[test] /// Unique word has multi-word synonyms - fn harder_unique_to_multiword_synonyms() { + fn harder_unique_to_multiword_synonyms_one() { let mut store = InMemorySetStore::from_iter(vec![ ("new", &[doc_char_index(0, 0, 0)][..]), ("york", &[doc_char_index(0, 1, 1)][..]), @@ -899,17 +1197,22 @@ mod tests { assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. 
})); // NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 2, .. })); // subway - assert_matches!(iter.next(), None); + assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NY + assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NY + assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city = NY + assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway + assert_matches!(iter.next(), None); // position rewritten ^ }); assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // new = NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // york = NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // city = NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 2, .. })); // subway - assert_matches!(iter.next(), None); // position rewritten ^ + assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NY + assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NY + assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NY + assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NY + assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NY + assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NY + assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway + assert_matches!(iter.next(), None); // position rewritten ^ }); assert_matches!(iter.next(), None); @@ -919,16 +1222,22 @@ mod tests { assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 2, .. })); // subway + assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NYC + assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NYC + assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // NYC + // because one-word to one-word ^^^^ + assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway assert_matches!(iter.next(), None); }); assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // new = NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // york = NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // city = NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 2, .. 
})); // subway + assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NYC + assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NYC + assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NYC + assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NYC + assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NYC + assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NYC + assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway assert_matches!(iter.next(), None); // position rewritten ^ }); assert_matches!(iter.next(), None); @@ -961,19 +1270,25 @@ mod tests { assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // new = NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // york = NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // city = NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 2, .. })); // underground = subway - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 2, .. })); // train = subway - assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 3, .. })); // broken - assert_matches!(iter.next(), None); // position rewritten ^ + assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NY + assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NY + assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NY + assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NY + assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NY + assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NY + assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 4, is_exact: false, .. })); // underground = subway + assert_matches!(iter.next(), Some(TmpMatch { query_index: 4, word_index: 5, is_exact: false, .. })); // train = subway + assert_matches!(iter.next(), Some(TmpMatch { query_index: 5, word_index: 6, is_exact: true, .. })); // broken + assert_matches!(iter.next(), None); // position rewritten ^ }); assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 2, .. })); // subway - assert_matches!(iter.next(), None); + assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NY + assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NY + assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. 
})); // city = NY + assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // underground = subway + assert_matches!(iter.next(), Some(TmpMatch { query_index: 4, word_index: 5, is_exact: true, .. })); // train = subway + assert_matches!(iter.next(), None); // position rewritten ^ }); assert_matches!(iter.next(), None); @@ -983,18 +1298,25 @@ mod tests { assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 2, .. })); // subway + assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NYC + assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NYC + assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city = NYC + // because one-word to one-word ^^^^ + assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // underground = subway + assert_matches!(iter.next(), Some(TmpMatch { query_index: 4, word_index: 5, is_exact: true, .. })); // train = subway assert_matches!(iter.next(), None); }); assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // new = NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // york = NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // city = NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 2, .. })); // underground = subway - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 2, .. })); // train = subway - assert_matches!(iter.next(), None); // position rewritten ^ + assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NYC + assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NYC + assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NYC + assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NYC + assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NYC + assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NYC + assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 4, is_exact: false, .. })); // underground = subway + assert_matches!(iter.next(), Some(TmpMatch { query_index: 4, word_index: 5, is_exact: false, .. 
})); // train = subway + assert_matches!(iter.next(), None); // position rewritten ^ }); assert_matches!(iter.next(), None); } @@ -1018,49 +1340,43 @@ mod tests { ("broken", &[doc_char_index(2, 4, 4)][..]), ]); - store.add_synonym("new york", SetBuf::from_dirty(vec!["NYC", "NY", "new york city"])); - store.add_synonym("new york city", SetBuf::from_dirty(vec!["NYC", "NY", "new york"])); - store.add_synonym("underground train", SetBuf::from_dirty(vec!["subway"])); + store.add_synonym("new york", SetBuf::from_dirty(vec![ "NYC", "NY", "new york city" ])); + store.add_synonym("new york city", SetBuf::from_dirty(vec![ "NYC", "NY", "new york" ])); + store.add_synonym("underground train", SetBuf::from_dirty(vec![ "subway" ])); let builder = QueryBuilder::new(&store); let results = builder.query("new york underground train broken", 0..20).unwrap(); let mut iter = results.into_iter(); - assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, highlights }) => { + assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => { let mut matches = matches.into_iter(); - let mut highlights = highlights.into_iter(); - - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // york - assert_matches!(highlights.next(), Some(Highlight { char_index: 1, .. })); - - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // new - assert_matches!(highlights.next(), Some(Highlight { char_index: 0, .. })); - - assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 0, .. })); // york - assert_matches!(highlights.next(), Some(Highlight { char_index: 1, .. })); - - assert_matches!(matches.next(), Some(TmpMatch { query_index: 2, word_index: 1, .. })); // underground - assert_matches!(highlights.next(), Some(Highlight { char_index: 2, .. })); - - assert_matches!(matches.next(), Some(TmpMatch { query_index: 3, word_index: 2, .. })); // train - assert_matches!(highlights.next(), Some(Highlight { char_index: 3, .. })); - - assert_matches!(matches.next(), Some(TmpMatch { query_index: 4, word_index: 3, .. })); // broken - assert_matches!(highlights.next(), Some(Highlight { char_index: 4, .. })); - + assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new + assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new + assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york + assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york + assert_matches!(matches.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city + assert_matches!(matches.next(), Some(TmpMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // underground + assert_matches!(matches.next(), Some(TmpMatch { query_index: 4, word_index: 4, is_exact: true, .. })); // train + assert_matches!(matches.next(), Some(TmpMatch { query_index: 5, word_index: 5, is_exact: true, .. })); // broken assert_matches!(matches.next(), None); }); assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // NYC = new york - assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, .. 
})); // subway = underground train - assert_matches!(iter.next(), Some(TmpMatch { query_index: 4, word_index: 3, .. })); // broken + assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NYC = new + assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NYC = york + assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // NYC = city + assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway = underground + assert_matches!(iter.next(), Some(TmpMatch { query_index: 4, word_index: 5, is_exact: true, .. })); // subway = train + assert_matches!(iter.next(), Some(TmpMatch { query_index: 5, word_index: 6, is_exact: true, .. })); // broken assert_matches!(iter.next(), None); }); assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // NY = new york - assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 1, .. })); // subway = underground train + assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NY = new + assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NY = york + assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // NY = city + assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway = underground + assert_matches!(iter.next(), Some(TmpMatch { query_index: 4, word_index: 4, is_exact: true, .. })); // subway = train assert_matches!(iter.next(), None); }); assert_matches!(iter.next(), None); @@ -1069,55 +1385,169 @@ mod tests { let results = builder.query("new york city underground train broken", 0..20).unwrap(); let mut iter = results.into_iter(); - assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, highlights }) => { + assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => { let mut matches = matches.into_iter(); - let mut highlights = highlights.into_iter(); - - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // york - assert_matches!(highlights.next(), Some(Highlight { char_index: 1, .. })); - - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // new - assert_matches!(highlights.next(), Some(Highlight { char_index: 0, .. })); - - assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 0, .. })); // york - assert_matches!(highlights.next(), Some(Highlight { char_index: 1, .. })); - - assert_matches!(matches.next(), Some(TmpMatch { query_index: 3, word_index: 1, .. })); // underground - assert_matches!(highlights.next(), Some(Highlight { char_index: 2, .. })); - - assert_matches!(matches.next(), Some(TmpMatch { query_index: 4, word_index: 2, .. })); // train - assert_matches!(highlights.next(), Some(Highlight { char_index: 3, .. })); - - assert_matches!(matches.next(), Some(TmpMatch { query_index: 5, word_index: 3, .. })); // broken - assert_matches!(highlights.next(), Some(Highlight { char_index: 4, .. })); - + assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. 
})); // new + assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new + assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new + assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york + assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york + assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york + assert_matches!(matches.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city + assert_matches!(matches.next(), Some(TmpMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // underground + assert_matches!(matches.next(), Some(TmpMatch { query_index: 4, word_index: 4, is_exact: true, .. })); // train + assert_matches!(matches.next(), Some(TmpMatch { query_index: 5, word_index: 5, is_exact: true, .. })); // broken assert_matches!(matches.next(), None); }); assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // NYC = new york city - assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 2, .. })); // subway = underground train - assert_matches!(iter.next(), Some(TmpMatch { query_index: 5, word_index: 3, .. })); // broken + assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NYC = new + assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NYC = new + assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NYC = york + assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NYC = york + assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // NYC = city + assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway = underground + assert_matches!(iter.next(), Some(TmpMatch { query_index: 4, word_index: 5, is_exact: true, .. })); // subway = train + assert_matches!(iter.next(), Some(TmpMatch { query_index: 5, word_index: 6, is_exact: true, .. })); // broken assert_matches!(iter.next(), None); }); assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // NY = new york city - assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 1, .. })); // subway = underground train + assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NY = new + assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NY = new + assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NY = york + assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NY = york + assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // NY = city + assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 3, is_exact: true, .. 
})); // subway = underground + assert_matches!(iter.next(), Some(TmpMatch { query_index: 4, word_index: 4, is_exact: true, .. })); // subway = train assert_matches!(iter.next(), None); }); assert_matches!(iter.next(), None); } + #[test] + fn intercrossed_multiword_synonyms() { + let mut store = InMemorySetStore::from_iter(vec![ + ("new", &[doc_index(0, 0)][..]), + ("york", &[doc_index(0, 1)][..]), + ("big", &[doc_index(0, 2)][..]), + ("city", &[doc_index(0, 3)][..]), + ]); + + store.add_synonym("new york", SetBuf::from_dirty(vec![ "new york city" ])); + store.add_synonym("new york city", SetBuf::from_dirty(vec![ "new york" ])); + + let builder = QueryBuilder::new(&store); + let results = builder.query("new york big ", 0..20).unwrap(); + let mut iter = results.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + let mut matches = matches.into_iter(); + assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new + assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new + + assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york + assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york + + assert_matches!(matches.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city + assert_matches!(matches.next(), Some(TmpMatch { query_index: 2, word_index: 4, is_exact: false, .. })); // city + + assert_matches!(matches.next(), Some(TmpMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // big + assert_matches!(matches.next(), None); + }); + assert_matches!(iter.next(), None); + + let mut store = InMemorySetStore::from_iter(vec![ + ("NY", &[doc_index(0, 0)][..]), + ("city", &[doc_index(0, 1)][..]), + ("subway", &[doc_index(0, 2)][..]), + + ("NY", &[doc_index(1, 0)][..]), + ("subway", &[doc_index(1, 1)][..]), + + ("NY", &[doc_index(2, 0)][..]), + ("york", &[doc_index(2, 1)][..]), + ("city", &[doc_index(2, 2)][..]), + ("subway", &[doc_index(2, 3)][..]), + ]); + + store.add_synonym("NY", SetBuf::from_dirty(vec!["new york city story"])); + + let builder = QueryBuilder::new(&store); + let results = builder.query("NY subway ", 0..20).unwrap(); + let mut iter = results.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { + let mut matches = matches.into_iter(); + assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new + assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york + assert_matches!(matches.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city + assert_matches!(matches.next(), Some(TmpMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // story + assert_matches!(matches.next(), Some(TmpMatch { query_index: 4, word_index: 4, is_exact: true, .. })); // subway + assert_matches!(matches.next(), None); + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + let mut matches = matches.into_iter(); + assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new + assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. 
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 4, word_index: 3, is_exact: true, .. })); // subway
+            assert_matches!(matches.next(), None);
+        });
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => {
+            let mut matches = matches.into_iter();
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 4, word_index: 3, is_exact: true, .. })); // subway
+            assert_matches!(matches.next(), None);
+        });
+        assert_matches!(iter.next(), None);
+    }
+
+    #[test]
+    fn cumulative_word_indices() {
+        let mut store = InMemorySetStore::from_iter(vec![
+            ("NYC", &[doc_index(0, 0)][..]),
+            ("long", &[doc_index(0, 1)][..]),
+            ("subway", &[doc_index(0, 2)][..]),
+            ("cool", &[doc_index(0, 3)][..]),
+        ]);
+
+        store.add_synonym("new york city", SetBuf::from_dirty(vec!["NYC"]));
+        store.add_synonym("subway", SetBuf::from_dirty(vec!["underground train"]));
+
+        let builder = QueryBuilder::new(&store);
+        let results = builder.query("new york city long subway cool ", 0..20).unwrap();
+        let mut iter = results.into_iter();
+
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
+            let mut matches = matches.into_iter();
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new  = NYC
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NYC
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city = NYC
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // long
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 4, word_index: 4, is_exact: true, .. })); // subway = underground
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 5, word_index: 5, is_exact: true, .. })); // subway = train
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 6, word_index: 6, is_exact: true, .. })); // cool
+            assert_matches!(matches.next(), None);
+        });
+        assert_matches!(iter.next(), None);
+    }
+
     #[test]
     fn deunicoded_synonyms() {
         let mut store = InMemorySetStore::from_iter(vec![
-            ("iPhone", &[doc_index(0, 0)][..]),
-            ("telephone", &[doc_index(1, 0)][..]), // meilidb-data indexes the unidecoded
-            ("téléphone", &[doc_index(1, 0)][..]), // and the original words with the same DocIndex
+            ("telephone", &[doc_index(0, 0)][..]), // meilidb-data indexes the unidecoded
+            ("téléphone", &[doc_index(0, 0)][..]), // and the original words with the same DocIndex
+
+            ("iphone", &[doc_index(1, 0)][..]),
         ]);

-        store.add_synonym("téléphone", SetBuf::from_dirty(vec!["iPhone"]));
+        store.add_synonym("téléphone", SetBuf::from_dirty(vec!["iphone"]));

         let builder = QueryBuilder::new(&store);
         let results = builder.query("telephone", 0..20).unwrap();
@@ -1126,12 +1556,12 @@ mod tests {
         assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
             let mut iter = matches.into_iter();
             assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, .. }));
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, .. }));
             assert_matches!(iter.next(), None);
         });
         assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => {
             let mut iter = matches.into_iter();
             assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, .. }));
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, .. }));
             assert_matches!(iter.next(), None);
         });
         assert_matches!(iter.next(), None);
@@ -1143,12 +1573,12 @@ mod tests {
         assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
             let mut iter = matches.into_iter();
             assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, .. }));
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, .. }));
             assert_matches!(iter.next(), None);
         });
         assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => {
             let mut iter = matches.into_iter();
             assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, .. }));
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, .. }));
             assert_matches!(iter.next(), None);
         });
         assert_matches!(iter.next(), None);
@@ -1157,14 +1587,15 @@ mod tests {
         let results = builder.query("télephone", 0..20).unwrap();
         let mut iter = results.into_iter();

-        assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => {
             let mut iter = matches.into_iter();
             assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, .. }));
             assert_matches!(iter.next(), None);
         });
-        assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => {
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
             let mut iter = matches.into_iter();
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, distance: 1, .. })); // téléphone
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, distance: 1, word_index: 0, is_exact: false, .. })); // iphone
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, distance: 1, word_index: 0, is_exact: false, .. })); // téléphone
             assert_matches!(iter.next(), None);
         });
         assert_matches!(iter.next(), None);
@@ -1184,56 +1615,11 @@ mod tests {
         assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
             let mut iter = matches.into_iter();
             assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, distance: 0, .. })); // iphone
+        assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, distance: 0, .. })); // iphone
         assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 0, distance: 1, .. })); // phone
-        assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 1, distance: 0, .. })); // case
+        assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, distance: 0, .. })); // case
         assert_matches!(iter.next(), None);
     });
     assert_matches!(iter.next(), None);
 }
-
-    #[test]
-    fn simple_split() {
-        let store = InMemorySetStore::from_iter(vec![
-            ("porte", &[doc_char_index(0, 0, 0)][..]),
-            ("feuille", &[doc_char_index(0, 1, 1)][..]),
-            ("search", &[doc_char_index(1, 0, 0)][..]),
-            ("engine", &[doc_char_index(1, 1, 1)][..]),
-        ]);
-
-        let builder = QueryBuilder::new(&store);
-        let results = builder.query("portefeuille", 0..20).unwrap();
-        let mut iter = results.into_iter();
-
-        assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, highlights }) => {
-            let mut matches = matches.into_iter();
-            let mut highlights = highlights.into_iter();
-
-            assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // porte
-            assert_matches!(highlights.next(), Some(Highlight { char_index: 0, .. }));
-
-            assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // feuille
-            assert_matches!(highlights.next(), Some(Highlight { char_index: 1, .. }));
-
-            assert_matches!(matches.next(), None);
-        });
-        assert_matches!(iter.next(), None);
-
-        let builder = QueryBuilder::new(&store);
-        let results = builder.query("searchengine", 0..20).unwrap();
-        let mut iter = results.into_iter();
-
-        assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, highlights }) => {
-            let mut matches = matches.into_iter();
-            let mut highlights = highlights.into_iter();
-
-            assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // search
-            assert_matches!(highlights.next(), Some(Highlight { char_index: 0, .. }));
-
-            assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // engine
-            assert_matches!(highlights.next(), Some(Highlight { char_index: 1, .. }));
-
-            assert_matches!(matches.next(), None);
-        });
-        assert_matches!(iter.next(), None);
-    }
 }
diff --git a/meilidb-core/src/query_enhancer.rs b/meilidb-core/src/query_enhancer.rs
new file mode 100644
index 000000000..165c1b094
--- /dev/null
+++ b/meilidb-core/src/query_enhancer.rs
@@ -0,0 +1,398 @@
+use std::ops::Range;
+use std::cmp::Ordering::{Less, Greater, Equal};
+
+/// Returns `true` if the specified range can accept the given replacement words.
+/// Returns `false` if the replacement words are already present in the original query
+/// or if there are fewer replacement words than the range to replace.
+//
+//
+// ## Ignored because already present in original
+//
+//   new york city subway
+//   --------     ^^^^
+//      /    \
+//   [new york city]
+//
+//
+// ## Ignored because smaller than the original
+//
+//   new york city subway
+//   -------------
+//     \        /
+//     [new york]
+//
+//
+// ## Accepted because bigger than the original
+//
+//   NYC subway
+//   ---
+//    / \
+//   /   \
+//  /     \
+// /       \
+// [new york city]
+//
+fn rewrite_range_with<S, T>(query: &[S], range: Range<usize>, words: &[T]) -> bool
+where S: AsRef<str>,
+      T: AsRef<str>,
+{
+    if words.len() <= range.len() {
+        // there are fewer (or as many) replacement words
+        // than there already are in the replaced range
+        return false
+    }
+
+    // retrieve the part to rewrite but with the length
+    // of the replacement part
+    let original = query.iter().skip(range.start).take(words.len());
+
+    // check that the original query doesn't already contain
+    // the replacement words
+    !original.map(AsRef::as_ref).eq(words.iter().map(AsRef::as_ref))
+}
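To make the three cases pictured above concrete, here is a small sketch of how rewrite_range_with behaves when called directly (these calls are illustrative, not part of the patch):

    let query = ["new", "york", "city", "subway"];
    // replacement words already present in the original query: ignored
    assert!(!rewrite_range_with(&query, 0..2, &["new", "york", "city"]));
    // fewer replacement words than the replaced range: ignored
    assert!(!rewrite_range_with(&query, 0..3, &["new", "york"]));

    let query = ["NYC", "subway"];
    // more words than the replaced range: accepted
    assert!(rewrite_range_with(&query, 0..1, &["new", "york", "city"]));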
+
+type Origin = usize;
+type RealLength = usize;
+
+struct FakeIntervalTree {
+    intervals: Vec<(Range<usize>, (Origin, RealLength))>,
+}
+
+impl FakeIntervalTree {
+    fn new(mut intervals: Vec<(Range<usize>, (Origin, RealLength))>) -> FakeIntervalTree {
+        intervals.sort_unstable_by_key(|(r, _)| (r.start, r.end));
+        FakeIntervalTree { intervals }
+    }
+
+    fn query(&self, point: usize) -> Option<(Range<usize>, (Origin, RealLength))> {
+        let element = self.intervals.binary_search_by(|(r, _)| {
+            if point >= r.start {
+                if point < r.end { Equal } else { Less }
+            } else { Greater }
+        });
+
+        let n = match element { Ok(n) => n, Err(n) => n };
+
+        match self.intervals.get(n) {
+            Some((range, value)) if range.contains(&point) => Some((range.clone(), *value)),
+            _otherwise => None,
+        }
+    }
+}
+
+pub struct QueryEnhancerBuilder<'a, S> {
+    query: &'a [S],
+    origins: Vec<usize>,
+    real_to_origin: Vec<(Range<usize>, (Origin, RealLength))>,
+}
+
+impl<S: AsRef<str>> QueryEnhancerBuilder<'_, S> {
+    pub fn new(query: &[S]) -> QueryEnhancerBuilder<S> {
+        // we initialize origins query indices based on their positions
+        let origins: Vec<_> = (0..query.len() + 1).collect();
+        let real_to_origin = origins.iter().map(|&o| (o..o+1, (o, 1))).collect();
+
+        QueryEnhancerBuilder { query, origins, real_to_origin }
+    }
+
+    /// Update the final real-to-origin query indices mapping.
+    ///
+    /// `range` is the range of original words that these `replacement` words replace
+    /// and `real` is the first real query index of these replacement words.
+    pub fn declare<T>(&mut self, range: Range<usize>, real: usize, replacement: &[T])
+    where T: AsRef<str>,
+    {
+        // check if the range of original words
+        // can be rewritten with the replacement words
+        if rewrite_range_with(self.query, range.clone(), replacement) {
+
+            // this range can be replaced so we need to
+            // modify the origins accordingly
+            let offset = replacement.len() - range.len();
+
+            let previous_padding = self.origins[range.end - 1];
+            let current_offset = (self.origins[range.end] - 1) - previous_padding;
+            let diff = offset.saturating_sub(current_offset);
+            self.origins[range.end] += diff;
+
+            for r in &mut self.origins[range.end + 1..] {
+                *r += diff;
+            }
+        }
+
+        // we need to store the real number and origins relations
+        // this way it will be possible to know by how many
+        // we need to pad real query indices
+        let real_range = real..real + replacement.len().max(range.len());
+        let real_length = replacement.len();
+        self.real_to_origin.push((real_range, (range.start, real_length)));
+    }
+
+    pub fn build(self) -> QueryEnhancer {
+        QueryEnhancer {
+            origins: self.origins,
+            real_to_origin: FakeIntervalTree::new(self.real_to_origin),
+        }
+    }
+}
+
+pub struct QueryEnhancer {
+    origins: Vec<usize>,
+    real_to_origin: FakeIntervalTree,
+}
+
+impl QueryEnhancer {
+    /// Returns the query indices to use to replace this real query index.
+    pub fn replacement(&self, real: u32) -> Range<u32> {
+        let real = real as usize;
+
+        // query the fake interval tree with the real query index
+        let (range, (origin, real_length)) =
+            self.real_to_origin
+                .query(real)
+                .expect("real has never been declared");
+
+        // if `real` is the end bound of the range
+        if (range.start + real_length - 1) == real {
+            let mut count = range.len();
+            let mut new_origin = origin;
+            for (i, slice) in self.origins[new_origin..].windows(2).enumerate() {
+                let len = slice[1] - slice[0];
+                count = count.saturating_sub(len);
+                if count == 0 { new_origin = origin + i; break }
+            }
+
+            let n = real - range.start;
+            let start = self.origins[origin];
+            let end = self.origins[new_origin + 1];
+            let remaining = (end - start) - n;
+
+            Range { start: (start + n) as u32, end: (start + n + remaining) as u32 }
+
+        } else {
+            // just return the origin along with
+            // the real position of the word
+            let n = real as usize - range.start;
+            let origin = self.origins[origin];
+
+            Range { start: (origin + n) as u32, end: (origin + n + 1) as u32 }
+        }
+    }
+}
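The origins bookkeeping is easiest to follow on a small trace; this sketch walks through the simple_growing case tested below (the intermediate values are derived by hand from the code above):

    let query = ["new", "york", "subway"];
    let mut builder = QueryEnhancerBuilder::new(&query);
    // origins starts as [0, 1, 2, 3]: one entry per word boundary

    // "new york" (range 0..2) is replaced by three words whose first
    // real index is 3, so the boundary after "york" absorbs one extra slot
    builder.declare(0..2, 3, &["new", "york", "city"]);
    // offset = 3 - 2 = 1, and origins becomes [0, 1, 3, 4]

    let enhancer = builder.build();
    // "york" (real index 1) now spans the two padded positions:
    assert_eq!(enhancer.replacement(1), 1..3);
    // "subway" (real index 2) is shifted past the padding:
    assert_eq!(enhancer.replacement(2), 3..4);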
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn original_unmodified() {
+        let query = ["new", "york", "city", "subway"];
+        //           0      1       2       3
+        let mut builder = QueryEnhancerBuilder::new(&query);
+
+        // new york = new york city
+        builder.declare(0..2, 4, &["new", "york", "city"]);
+        //                    ^  4      5       6
+
+        let enhancer = builder.build();
+
+        assert_eq!(enhancer.replacement(0), 0..1); // new
+        assert_eq!(enhancer.replacement(1), 1..2); // york
+        assert_eq!(enhancer.replacement(2), 2..3); // city
+        assert_eq!(enhancer.replacement(3), 3..4); // subway
+        assert_eq!(enhancer.replacement(4), 0..1); // new
+        assert_eq!(enhancer.replacement(5), 1..2); // york
+        assert_eq!(enhancer.replacement(6), 2..3); // city
+    }
+
+    #[test]
+    fn simple_growing() {
+        let query = ["new", "york", "subway"];
+        //           0      1       2
+        let mut builder = QueryEnhancerBuilder::new(&query);
+
+        // new york = new york city
+        builder.declare(0..2, 3, &["new", "york", "city"]);
+        //                    ^  3      4       5
+
+        let enhancer = builder.build();
+
+        assert_eq!(enhancer.replacement(0), 0..1); // new
+        assert_eq!(enhancer.replacement(1), 1..3); // york
+        assert_eq!(enhancer.replacement(2), 3..4); // subway
+        assert_eq!(enhancer.replacement(3), 0..1); // new
+        assert_eq!(enhancer.replacement(4), 1..2); // york
+        assert_eq!(enhancer.replacement(5), 2..3); // city
+    }
+
+    #[test]
+    fn same_place_growings() {
+        let query = ["NY", "subway"];
+        //           0     1
+        let mut builder = QueryEnhancerBuilder::new(&query);
+
+        // NY = new york
+        builder.declare(0..1, 2, &["new", "york"]);
+        //                    ^  2      3
+
+        // NY = new york city
+        builder.declare(0..1, 4, &["new", "york", "city"]);
+        //                    ^  4      5       6
+
+        // NY = NYC
+        builder.declare(0..1, 7, &["NYC"]);
+        //                    ^  7
+
+        // NY = new york city
+        builder.declare(0..1, 8, &["new", "york", "city"]);
+        //                    ^  8      9       10
+
+        // subway = underground train
+        builder.declare(1..2, 11, &["underground", "train"]);
+        //                    ^   11             12
+
+        let enhancer = builder.build();
+
+        assert_eq!(enhancer.replacement(0), 0..3);   // NY
+        assert_eq!(enhancer.replacement(1), 3..5);   // subway
+        assert_eq!(enhancer.replacement(2), 0..1);   // new
+        assert_eq!(enhancer.replacement(3), 1..3);   // york
+        assert_eq!(enhancer.replacement(4), 0..1);   // new
+        assert_eq!(enhancer.replacement(5), 1..2);   // york
+        assert_eq!(enhancer.replacement(6), 2..3);   // city
+        assert_eq!(enhancer.replacement(7), 0..3);   // NYC
+        assert_eq!(enhancer.replacement(8), 0..1);   // new
+        assert_eq!(enhancer.replacement(9), 1..2);   // york
+        assert_eq!(enhancer.replacement(10), 2..3);  // city
+        assert_eq!(enhancer.replacement(11), 3..4);  // underground
+        assert_eq!(enhancer.replacement(12), 4..5);  // train
+    }
+
+    #[test]
+    fn bigger_growing() {
+        let query = ["NYC", "subway"];
+        //           0      1
+        let mut builder = QueryEnhancerBuilder::new(&query);
+
+        // NYC = new york city
+        builder.declare(0..1, 2, &["new", "york", "city"]);
+        //                    ^  2      3       4
+
+        let enhancer = builder.build();
+
+        assert_eq!(enhancer.replacement(0), 0..3); // NYC
+        assert_eq!(enhancer.replacement(1), 3..4); // subway
+        assert_eq!(enhancer.replacement(2), 0..1); // new
+        assert_eq!(enhancer.replacement(3), 1..2); // york
+        assert_eq!(enhancer.replacement(4), 2..3); // city
+    }
+
+    #[test]
+    fn middle_query_growing() {
+        let query = ["great", "awesome", "NYC", "subway"];
+        //           0        1          2      3
+        let mut builder = QueryEnhancerBuilder::new(&query);
+
+        // NYC = new york city
+        builder.declare(2..3, 4, &["new", "york", "city"]);
+        //                    ^  4      5       6
+
+        let enhancer = builder.build();
+
+        assert_eq!(enhancer.replacement(0), 0..1); // great
+        assert_eq!(enhancer.replacement(1), 1..2); // awesome
+        assert_eq!(enhancer.replacement(2), 2..5); // NYC
+        assert_eq!(enhancer.replacement(3), 5..6); // subway
+        assert_eq!(enhancer.replacement(4), 2..3); // new
+        assert_eq!(enhancer.replacement(5), 3..4); // york
+        assert_eq!(enhancer.replacement(6), 4..5); // city
+    }
+
+    #[test]
+    fn end_query_growing() {
+        let query = ["NYC", "subway"];
+        //           0      1
+        let mut builder = QueryEnhancerBuilder::new(&query);
+
+        // subway = underground train
+        builder.declare(1..2, 2, &["underground", "train"]);
+        //                    ^  2              3
+
+        let enhancer = builder.build();
+
+        assert_eq!(enhancer.replacement(0), 0..1); // NYC
+        assert_eq!(enhancer.replacement(1), 1..3); // subway
+        assert_eq!(enhancer.replacement(2), 1..2); // underground
+        assert_eq!(enhancer.replacement(3), 2..3); // train
+    }
+
+    #[test]
+    fn multiple_growings() {
+        let query = ["great", "awesome", "NYC", "subway"];
+        //           0        1          2      3
+        let mut builder = QueryEnhancerBuilder::new(&query);
+
+        // NYC = new york city
+        builder.declare(2..3, 4, &["new", "york", "city"]);
+        //                    ^  4      5       6
+
+        // subway = underground train
+        builder.declare(3..4, 7, &["underground", "train"]);
+        //                    ^  7              8
+
+        let enhancer = builder.build();
+
+        assert_eq!(enhancer.replacement(0), 0..1); // great
+        assert_eq!(enhancer.replacement(1), 1..2); // awesome
+        assert_eq!(enhancer.replacement(2), 2..5); // NYC
+        assert_eq!(enhancer.replacement(3), 5..7); // subway
+        assert_eq!(enhancer.replacement(4), 2..3); // new
+        assert_eq!(enhancer.replacement(5), 3..4); // york
+        assert_eq!(enhancer.replacement(6), 4..5); // city
+        assert_eq!(enhancer.replacement(7), 5..6); // underground
+        assert_eq!(enhancer.replacement(8), 6..7); // train
+    }
+
+    #[test]
+    fn multiple_probable_growings() {
+        let query = ["great", "awesome", "NYC", "subway"];
+        //           0        1          2      3
+        let mut builder = QueryEnhancerBuilder::new(&query);
+
+        // NYC = new york city
+        builder.declare(2..3, 4, &["new", "york", "city"]);
+        //                    ^  4      5       6
+
+        // subway = underground train
+        builder.declare(3..4, 7, &["underground", "train"]);
+        //                    ^  7              8
+
+        // great awesome = good
+        builder.declare(0..2, 9, &["good"]);
+        //                    ^  9
+
+        // awesome NYC = NY
+        builder.declare(1..3, 10, &["NY"]);
+        //                    ^^  10
+
+        // NYC subway = metro
+        builder.declare(2..4, 11, &["metro"]);
+        //                    ^^  11
+
+        let enhancer = builder.build();
+
+        assert_eq!(enhancer.replacement(0), 0..1);  // great
+        assert_eq!(enhancer.replacement(1), 1..2);  // awesome
+        assert_eq!(enhancer.replacement(2), 2..5);  // NYC
+        assert_eq!(enhancer.replacement(3), 5..7);  // subway
+        assert_eq!(enhancer.replacement(4), 2..3);  // new
+        assert_eq!(enhancer.replacement(5), 3..4);  // york
+        assert_eq!(enhancer.replacement(6), 4..5);  // city
+        assert_eq!(enhancer.replacement(7), 5..6);  // underground
+        assert_eq!(enhancer.replacement(8), 6..7);  // train
+        assert_eq!(enhancer.replacement(9), 0..2);  // good
+        assert_eq!(enhancer.replacement(10), 1..5); // NY
+        assert_eq!(enhancer.replacement(11), 2..5); // metro
+    }
+}
diff --git a/meilidb-core/src/raw_document.rs b/meilidb-core/src/raw_document.rs
new file mode 100644
index 000000000..3567c3fd1
--- /dev/null
+++ b/meilidb-core/src/raw_document.rs
@@ -0,0 +1,141 @@
+use std::sync::Arc;
+use std::fmt;
+use sdset::SetBuf;
+use slice_group_by::GroupBy;
+use crate::{TmpMatch, DocumentId, Highlight};
+
+#[derive(Clone)]
+pub struct RawDocument {
+    pub id: DocumentId,
+    pub matches: SharedMatches,
+    pub highlights: Vec<Highlight>,
+}
+
+impl RawDocument {
+    fn new(id: DocumentId, matches: SharedMatches, highlights: Vec<Highlight>) -> RawDocument {
+        RawDocument { id, matches, highlights }
+    }
+
+    pub fn query_index(&self) -> &[u32] {
+        let r = self.matches.range;
+        // it is safe because construction/modifications
+        // can only be done in this module
+        unsafe { &self.matches.matches.query_index.get_unchecked(r.start..r.end) }
+    }
+
+    pub fn distance(&self) -> &[u8] {
+        let r = self.matches.range;
+        // it is safe because construction/modifications
+        // can only be done in this module
+        unsafe { &self.matches.matches.distance.get_unchecked(r.start..r.end) }
+    }
+
+    pub fn attribute(&self) -> &[u16] {
+        let r = self.matches.range;
+        // it is safe because construction/modifications
+        // can only be done in this module
+        unsafe { &self.matches.matches.attribute.get_unchecked(r.start..r.end) }
+    }
+
+    pub fn word_index(&self) -> &[u16] {
+        let r = self.matches.range;
+        // it is safe because construction/modifications
+        // can only be done in this module
+        unsafe { &self.matches.matches.word_index.get_unchecked(r.start..r.end) }
+    }
+
+    pub fn is_exact(&self) -> &[bool] {
+        let r = self.matches.range;
+        // it is safe because construction/modifications
+        // can only be done in this module
+        unsafe { &self.matches.matches.is_exact.get_unchecked(r.start..r.end) }
+    }
+}
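The get_unchecked calls above skip bounds checks on the hot read path; they are sound because `range` is only ever produced by raw_documents_from below, which derives it from the very matches it stores. For comparison, a bounds-checked equivalent of one accessor would look like this (illustrative only, not part of the patch):

    pub fn query_index(&self) -> &[u32] {
        let r = self.matches.range;
        // same slice, but panics on an out-of-range window
        &self.matches.matches.query_index[r.start..r.end]
    }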
"is_exact", self.is_exact()))?; + f.write_str("}")?; + Ok(()) + } +} + +pub fn raw_documents_from( + matches: SetBuf<(DocumentId, TmpMatch)>, + highlights: SetBuf<(DocumentId, Highlight)>, +) -> Vec +{ + let mut docs_ranges: Vec<(_, Range, _)> = Vec::new(); + let mut matches2 = Matches::with_capacity(matches.len()); + + let matches = matches.linear_group_by_key(|(id, _)| *id); + let highlights = highlights.linear_group_by_key(|(id, _)| *id); + + for (mgroup, hgroup) in matches.zip(highlights) { + debug_assert_eq!(mgroup[0].0, hgroup[0].0); + + let document_id = mgroup[0].0; + let start = docs_ranges.last().map(|(_, r, _)| r.end).unwrap_or(0); + let end = start + mgroup.len(); + + let highlights = hgroup.iter().map(|(_, h)| *h).collect(); + docs_ranges.push((document_id, Range { start, end }, highlights)); + + matches2.extend_from_slice(mgroup); + } + + let matches = Arc::new(matches2); + docs_ranges.into_iter().map(|(id, range, highlights)| { + let matches = SharedMatches { range, matches: matches.clone() }; + RawDocument::new(id, matches, highlights) + }).collect() +} + +#[derive(Debug, Copy, Clone)] +struct Range { + start: usize, + end: usize, +} + +#[derive(Clone)] +pub struct SharedMatches { + range: Range, + matches: Arc, +} + +#[derive(Clone)] +struct Matches { + query_index: Vec, + distance: Vec, + attribute: Vec, + word_index: Vec, + is_exact: Vec, +} + +impl Matches { + fn with_capacity(cap: usize) -> Matches { + Matches { + query_index: Vec::with_capacity(cap), + distance: Vec::with_capacity(cap), + attribute: Vec::with_capacity(cap), + word_index: Vec::with_capacity(cap), + is_exact: Vec::with_capacity(cap), + } + } + + fn extend_from_slice(&mut self, matches: &[(DocumentId, TmpMatch)]) { + for (_, match_) in matches { + self.query_index.push(match_.query_index); + self.distance.push(match_.distance); + self.attribute.push(match_.attribute); + self.word_index.push(match_.word_index); + self.is_exact.push(match_.is_exact); + } + } +} diff --git a/meilidb-core/src/reordered_attrs.rs b/meilidb-core/src/reordered_attrs.rs index ad7b2c324..ed11045ab 100644 --- a/meilidb-core/src/reordered_attrs.rs +++ b/meilidb-core/src/reordered_attrs.rs @@ -1,4 +1,4 @@ -#[derive(Default)] +#[derive(Default, Clone)] pub struct ReorderedAttrs { count: usize, reorders: Vec>, diff --git a/meilidb-data/src/database/synonyms_addition.rs b/meilidb-data/src/database/synonyms_addition.rs index 6e16ab97b..563cb228f 100644 --- a/meilidb-data/src/database/synonyms_addition.rs +++ b/meilidb-data/src/database/synonyms_addition.rs @@ -21,10 +21,10 @@ impl<'a> SynonymsAddition<'a> { pub fn add_synonym(&mut self, synonym: S, alternatives: I) where S: AsRef, T: AsRef, - I: Iterator, + I: IntoIterator, { let synonym = normalize_str(synonym.as_ref()); - let alternatives = alternatives.map(|s| s.as_ref().to_lowercase()); + let alternatives = alternatives.into_iter().map(|s| s.as_ref().to_lowercase()); self.synonyms.entry(synonym).or_insert_with(Vec::new).extend(alternatives); } @@ -73,7 +73,7 @@ impl<'a> SynonymsAddition<'a> { // update the "consistent" view of the Index let words = main.words_set()?.unwrap_or_default(); - let ranked_map = lease_inner.ranked_map.clone();; + let ranked_map = lease_inner.ranked_map.clone(); let schema = lease_inner.schema.clone(); let raw = lease_inner.raw.clone(); lease_inner.raw.compact(); diff --git a/meilidb/Cargo.toml b/meilidb/Cargo.toml index 8ba89f212..0eecba0a1 100644 --- a/meilidb/Cargo.toml +++ b/meilidb/Cargo.toml @@ -14,10 +14,12 @@ csv = "1.0.7" diskus = "0.5.0" env_logger 
= "0.6.1" jemallocator = "0.1.9" +linked-hash-map = "0.5.2" meilidb-core = { path = "../meilidb-core", version = "0.1.0" } quickcheck = "0.8.2" rand = "0.6.5" rand_xorshift = "0.1.1" +rustyline = { version = "5.0.0", default-features = false } serde = { version = "1.0.91" , features = ["derive"] } serde_json = "1.0.39" structopt = "0.2.15" diff --git a/meilidb/examples/create-database.rs b/meilidb/examples/create-database.rs index ed07e3742..d8e553ed3 100644 --- a/meilidb/examples/create-database.rs +++ b/meilidb/examples/create-database.rs @@ -31,9 +31,13 @@ pub struct Opt { #[structopt(long = "schema", parse(from_os_str))] pub schema_path: PathBuf, + /// The file with the synonyms. + #[structopt(long = "synonyms", parse(from_os_str))] + pub synonyms: Option, + /// The path to the list of stop words (one by line). #[structopt(long = "stop-words", parse(from_os_str))] - pub stop_words_path: Option, + pub stop_words: Option, #[structopt(long = "update-group-size")] pub update_group_size: Option, @@ -45,12 +49,40 @@ struct Document<'a> ( HashMap, Cow<'a, str>> ); +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(untagged)] +pub enum Synonym { + OneWay(SynonymOneWay), + MultiWay { synonyms: Vec }, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct SynonymOneWay { + pub search_terms: String, + pub synonyms: Synonyms, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(untagged)] +pub enum Synonyms { + Multiple(Vec), + Single(String), +} + +fn read_synomys(path: &Path) -> Result, Box> { + let file = File::open(path)?; + let synonyms = serde_json::from_reader(file)?; + Ok(synonyms) +} + fn index( schema: Schema, database_path: &Path, csv_data_path: &Path, update_group_size: Option, stop_words: &HashSet, + synonyms: Vec, ) -> Result> { let database = Database::start_default(database_path)?; @@ -62,6 +94,28 @@ fn index( let index = database.create_index("test", schema.clone())?; + let mut synonyms_adder = index.synonyms_addition(); + for synonym in synonyms { + match synonym { + Synonym::OneWay(SynonymOneWay { search_terms, synonyms }) => { + let alternatives = match synonyms { + Synonyms::Multiple(alternatives) => alternatives, + Synonyms::Single(alternative) => vec![alternative], + }; + synonyms_adder.add_synonym(search_terms, alternatives); + }, + Synonym::MultiWay { mut synonyms } => { + for _ in 0..synonyms.len() { + if let Some((synonym, alternatives)) = synonyms.split_first() { + synonyms_adder.add_synonym(synonym, alternatives); + } + synonyms.rotate_left(1); + } + }, + } + } + synonyms_adder.finalize()?; + let mut rdr = csv::Reader::from_path(csv_data_path)?; let mut raw_record = csv::StringRecord::new(); let headers = rdr.headers()?.clone(); @@ -133,13 +187,25 @@ fn main() -> Result<(), Box> { Schema::from_toml(file)? 
     let mut rdr = csv::Reader::from_path(csv_data_path)?;
     let mut raw_record = csv::StringRecord::new();
     let headers = rdr.headers()?.clone();
@@ -133,13 +187,25 @@ fn main() -> Result<(), Box<dyn Error>> {
         Schema::from_toml(file)?
     };

-    let stop_words = match opt.stop_words_path {
+    let stop_words = match opt.stop_words {
         Some(ref path) => retrieve_stop_words(path)?,
         None => HashSet::new(),
     };

+    let synonyms = match opt.synonyms {
+        Some(ref path) => read_synonyms(path)?,
+        None => Vec::new(),
+    };
+
     let start = Instant::now();
-    let result = index(schema, &opt.database_path, &opt.csv_data_path, opt.update_group_size, &stop_words);
+    let result = index(
+        schema,
+        &opt.database_path,
+        &opt.csv_data_path,
+        opt.update_group_size,
+        &stop_words,
+        synonyms,
+    );

     if let Err(e) = result {
         return Err(e.into())
diff --git a/meilidb/examples/query-database.rs b/meilidb/examples/query-database.rs
index 72244d1b8..d939c0b70 100644
--- a/meilidb/examples/query-database.rs
+++ b/meilidb/examples/query-database.rs
@@ -2,17 +2,19 @@ static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;

 use std::collections::btree_map::{BTreeMap, Entry};
-use std::collections::{HashMap, HashSet};
-use std::iter::FromIterator;
-use std::io::{self, Write};
-use std::time::{Instant, Duration};
-use std::path::PathBuf;
 use std::error::Error;
+use std::collections::HashSet;
+use std::io::{self, Write};
+use std::iter::FromIterator;
+use std::path::PathBuf;
+use std::time::{Instant, Duration};

-use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor};
+use linked_hash_map::LinkedHashMap;
+use rustyline::{Editor, Config};
 use structopt::StructOpt;
-use meilidb_core::Highlight;
+use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor};

+use meilidb_core::Highlight;
 use meilidb_data::Database;
 use meilidb_schema::SchemaAttr;

@@ -22,6 +24,9 @@ pub struct Opt {
     #[structopt(parse(from_os_str))]
     pub database_path: PathBuf,

+    #[structopt(long = "fetch-timeout-ms")]
+    pub fetch_timeout_ms: Option<u64>,
+
     /// Fields that must be displayed.
     pub displayed_fields: Vec<String>,

@@ -34,7 +39,7 @@ pub struct Opt {
     pub char_context: usize,
 }

-type Document = HashMap<String, String>;
+type Document = LinkedHashMap<String, String>;

 fn display_highlights(text: &str, ranges: &[usize]) -> io::Result<()> {
     let mut stdout = StandardStream::stdout(ColorChoice::Always);
@@ -140,9 +145,6 @@ fn main() -> Result<(), Box<dyn Error>> {
     let start = Instant::now();
     let database = Database::start_default(&opt.database_path)?;

-    let mut buffer = String::new();
-    let input = io::stdin();
-
     let index = database.open_index("test")?.unwrap();
     let schema = index.schema();

@@ -151,65 +153,77 @@ fn main() -> Result<(), Box<dyn Error>> {
     let fields = opt.displayed_fields.iter().map(String::as_str);
     let fields = HashSet::from_iter(fields);

-    loop {
-        print!("Searching for: ");
-        io::stdout().flush()?;
+    let config = Config::builder().auto_add_history(true).build();
+    let mut readline = Editor::<()>::with_config(config);
+    let _ = readline.load_history("query-history.txt");
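Stripped of the search logic, the interactive loop that the rest of this hunk builds follows the usual rustyline pattern; a minimal standalone sketch, assuming rustyline 5.x as pinned in Cargo.toml above:

    use rustyline::{Config, Editor};

    let config = Config::builder().auto_add_history(true).build();
    let mut readline = Editor::<()>::with_config(config); // `()`: no completion helper
    let _ = readline.load_history("query-history.txt");
    for result in readline.iter("Searching for: ") {
        match result {
            Ok(line) => println!("read: {}", line),
            Err(err) => { println!("Error: {:?}", err); break },
        }
    }
    readline.save_history("query-history.txt").unwrap();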
-        if input.read_line(&mut buffer)? == 0 { break }
-        let query = buffer.trim_end_matches('\n');
+    for result in readline.iter("Searching for: ") {
+        match result {
+            Ok(query) => {
+                let start_total = Instant::now();

-        let start_total = Instant::now();
+                let builder = match opt.fetch_timeout_ms {
+                    Some(timeout_ms) => {
+                        let timeout = Duration::from_millis(timeout_ms);
+                        index.query_builder().with_fetch_timeout(timeout)
+                    },
+                    None => index.query_builder(),
+                };
+                let documents = builder.query(&query, 0..opt.number_results)?;

-        let builder = index.query_builder();
-        let documents = builder.query(query, 0..opt.number_results)?;
+                let mut retrieve_duration = Duration::default();

-        let mut retrieve_duration = Duration::default();
+                let number_of_documents = documents.len();
+                for mut doc in documents {

-        let number_of_documents = documents.len();
-        for mut doc in documents {
+                    doc.highlights.sort_unstable_by_key(|m| (m.char_index, m.char_length));

-            doc.highlights.sort_unstable_by_key(|m| (m.char_index, m.char_length));
+                    let start_retrieve = Instant::now();
+                    let result = index.document::<Document>(Some(&fields), doc.id);
+                    retrieve_duration += start_retrieve.elapsed();

-            let start_retrieve = Instant::now();
-            let result = index.document::<Document>(Some(&fields), doc.id);
-            retrieve_duration += start_retrieve.elapsed();
+                    match result {
+                        Ok(Some(document)) => {
+                            for (name, text) in document {
+                                print!("{}: ", name);

-            match result {
-                Ok(Some(document)) => {
-                    for (name, text) in document {
-                        print!("{}: ", name);
-
-                        let attr = schema.attribute(&name).unwrap();
-                        let highlights = doc.highlights.iter()
-                            .filter(|m| SchemaAttr::new(m.attribute) == attr)
-                            .cloned();
-                        let (text, highlights) = crop_text(&text, highlights, opt.char_context);
-                        let areas = create_highlight_areas(&text, &highlights);
-                        display_highlights(&text, &areas)?;
-                        println!();
+                                let attr = schema.attribute(&name).unwrap();
+                                let highlights = doc.highlights.iter()
+                                    .filter(|m| SchemaAttr::new(m.attribute) == attr)
+                                    .cloned();
+                                let (text, highlights) = crop_text(&text, highlights, opt.char_context);
+                                let areas = create_highlight_areas(&text, &highlights);
+                                display_highlights(&text, &areas)?;
+                                println!();
+                            }
+                        },
+                        Ok(None) => eprintln!("missing document"),
+                        Err(e) => eprintln!("{}", e),
                     }
-                },
-                Ok(None) => eprintln!("missing document"),
-                Err(e) => eprintln!("{}", e),
+
+                    let mut matching_attributes = HashSet::new();
+                    for highlight in doc.highlights {
+                        let attr = SchemaAttr::new(highlight.attribute);
+                        let name = schema.attribute_name(attr);
+                        matching_attributes.insert(name);
+                    }
+
+                    let matching_attributes = Vec::from_iter(matching_attributes);
+                    println!("matching in: {:?}", matching_attributes);
+
+                    println!();
+                }
+
+                eprintln!("document field retrieve took {:.2?}", retrieve_duration);
+                eprintln!("===== Found {} results in {:.2?} =====", number_of_documents, start_total.elapsed());
+            },
+            Err(err) => {
+                println!("Error: {:?}", err);
+                break
             }
-
-            let mut matching_attributes = HashSet::new();
-            for highlight in doc.highlights {
-                let attr = SchemaAttr::new(highlight.attribute);
-                let name = schema.attribute_name(attr);
-                matching_attributes.insert(name);
-            }
-
-            let matching_attributes = Vec::from_iter(matching_attributes);
-            println!("matching in: {:?}", matching_attributes);
-
-            println!();
         }
-
-        eprintln!("document field retrieve took {:.2?}", retrieve_duration);
-        eprintln!("===== Found {} results in {:.2?} =====", number_of_documents, start_total.elapsed());
-        buffer.clear();
     }
+    readline.save_history("query-history.txt").unwrap();

     Ok(())
 }