From 5e691c2140a630acfd89541f6052820112ee714b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Sun, 7 Jul 2019 12:41:20 +0200
Subject: [PATCH 01/19] feat: Introduce the QueryEnhancer type

---
 meilidb-core/src/lib.rs            |   1 +
 meilidb-core/src/query_builder.rs  |   2 +-
 meilidb-core/src/query_enhancer.rs | 395 +++++++++++++++++++++++++++++
 3 files changed, 397 insertions(+), 1 deletion(-)
 create mode 100644 meilidb-core/src/query_enhancer.rs

diff --git a/meilidb-core/src/lib.rs b/meilidb-core/src/lib.rs
index 0976fbde8..f5975e3b5 100644
--- a/meilidb-core/src/lib.rs
+++ b/meilidb-core/src/lib.rs
@@ -4,6 +4,7 @@
 mod automaton;
 mod distinct_map;
 mod query_builder;
+mod query_enhancer;
 mod reordered_attrs;
 mod store;
 pub mod criterion;
diff --git a/meilidb-core/src/query_builder.rs b/meilidb-core/src/query_builder.rs
index 43da389a8..175430554 100644
--- a/meilidb-core/src/query_builder.rs
+++ b/meilidb-core/src/query_builder.rs
@@ -6,12 +6,12 @@ use std::{cmp, mem};
 
 use fst::{Streamer, IntoStreamer};
 use hashbrown::HashMap;
+use levenshtein_automata::DFA;
 use log::info;
 use meilidb_tokenizer::{is_cjk, split_query_string};
 use rayon::slice::ParallelSliceMut;
 use sdset::SetBuf;
 use slice_group_by::GroupByMut;
-use levenshtein_automata::DFA;
 
 use crate::automaton::{build_dfa, build_prefix_dfa};
 use crate::distinct_map::{DistinctMap, BufferedDistinctMap};
diff --git a/meilidb-core/src/query_enhancer.rs b/meilidb-core/src/query_enhancer.rs
new file mode 100644
index 000000000..6280ae11e
--- /dev/null
+++ b/meilidb-core/src/query_enhancer.rs
@@ -0,0 +1,395 @@
+use std::ops::Range;
+use std::cmp::Ordering::{Less, Greater, Equal};
+
+/// Returns `true` if the specified range can accept the given replacement words.
+/// Returns `false` if the replacement words are already present in the original query
+/// or if there are fewer replacement words than there are in the range to replace.
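+///
+/// For example (an illustrative call following the rules above, not part of
+/// the original comment): `rewrite_range_with(&["NYC", "subway"], 0..1, &["new", "york", "city"])`
+/// returns `true` because the three replacement words cover more than the
+/// one-word range and are not already present in the query.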
+//
+//
+// ## Ignored because already present in original
+//
+// new york city subway
+// --------  ^^^^
+//     \      /
+//  [new york city]
+//
+//
+// ## Ignored because smaller than the original
+//
+// new york city subway
+// -------------
+//    \       /
+//    [new york]
+//
+//
+// ## Accepted because bigger than the original
+//
+// NYC subway
+// ---
+//  / \
+// /   \
+// [new york city]
+//
+fn rewrite_range_with<S, T>(query: &[S], range: Range<usize>, words: &[T]) -> bool
+where S: AsRef<str>,
+      T: AsRef<str>,
+{
+    if words.len() <= range.len() {
+        // there are fewer or as many replacement words
+        // as there are already in the replaced range
+        return false
+    }
+
+    // retrieve the part to rewrite but with the length
+    // of the replacement part
+    let original = query.iter().skip(range.start).take(words.len());
+
+    // check if the original query doesn't already contain
+    // the replacement words
+    !original.map(AsRef::as_ref).eq(words.iter().map(AsRef::as_ref))
+}
+
+struct FakeIntervalTree {
+    intervals: Vec<(Range<usize>, (usize, usize))>, // origin, real_length
+}
+
+impl FakeIntervalTree {
+    fn new(mut intervals: Vec<(Range<usize>, (usize, usize))>) -> FakeIntervalTree {
+        intervals.sort_unstable_by_key(|(r, _)| (r.start, r.end));
+        FakeIntervalTree { intervals }
+    }
+
+    fn query(&self, point: usize) -> Option<(Range<usize>, (usize, usize))> {
+        let element = self.intervals.binary_search_by(|(r, _)| {
+            if point >= r.start {
+                if point < r.end { Equal } else { Less }
+            } else { Greater }
+        });
+
+        let n = match element { Ok(n) => n, Err(n) => n };
+
+        match self.intervals.get(n) {
+            Some((range, value)) if range.contains(&point) => Some((range.clone(), *value)),
+            _otherwise => None,
+        }
+    }
+}
+
+pub struct QueryEnhancerBuilder<'a, S> {
+    query: &'a [S],
+    origins: Vec<usize>,
+    real_to_origin: Vec<(Range<usize>, (usize, usize))>,
+}
+
+impl<S: AsRef<str>> QueryEnhancerBuilder<'_, S> {
+    pub fn new(query: &[S]) -> QueryEnhancerBuilder<S> {
+        // we initialize origins query indices based on their positions
+        let origins: Vec<_> = (0..query.len() + 1).collect();
+        let real_to_origin = origins.iter().map(|&o| (o..o+1, (o, 1))).collect();
+
+        QueryEnhancerBuilder { query, origins, real_to_origin }
+    }
+
+    /// Update the final real to origin query indices mapping.
+    ///
+    /// `range` is the original words range that these `replacement` words replace
+    /// and `real` is the first real query index of these replacement words.
+    pub fn declare<T>(&mut self, range: Range<usize>, real: usize, replacement: &[T])
+    where T: AsRef<str>,
+    {
+        // check if the range of original words
+        // can be rewritten with the replacement words
+        if rewrite_range_with(self.query, range.clone(), replacement) {
+
+            // this range can be replaced so we need to
+            // modify the origins accordingly
+            let offset = replacement.len() - range.len();
+
+            let previous_padding = self.origins[range.end - 1];
+            let current_offset = (self.origins[range.end] - 1) - previous_padding;
+            let diff = offset.saturating_sub(current_offset);
+            self.origins[range.end] += diff;
+
+            for r in &mut self.origins[range.end + 1..] {
+                *r += diff;
+            }
+        }
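+
+        // For example (illustration only): for the two-word query ["NYC", "subway"],
+        // `origins` starts as [0, 1, 2]; after declare(0..1, 2, &["new", "york", "city"])
+        // it becomes [0, 3, 4], which pushes the original "subway" two positions
+        // to the right of the three replacement words.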
+
+        // we need to store the real number and origins relations
+        // this way it will be possible to know by how many
+        // we need to pad real query indices
+        let real_range = real..real + replacement.len().max(range.len());
+        let real_length = replacement.len();
+        self.real_to_origin.push((real_range, (range.start, real_length)));
+    }
+
+    pub fn build(self) -> QueryEnhancer {
+        QueryEnhancer {
+            origins: self.origins,
+            real_to_origin: FakeIntervalTree::new(self.real_to_origin),
+        }
+    }
+}
+
+pub struct QueryEnhancer {
+    origins: Vec<usize>,
+    real_to_origin: FakeIntervalTree,
+}
+
+impl QueryEnhancer {
+    /// Returns the query indices to use to replace this real query index.
+    pub fn replacement(&self, real: u32) -> Range<u32> {
+        let real = real as usize;
+
+        // query the fake interval tree with the real query index
+        let (range, (origin, real_length)) =
+            self.real_to_origin
+                .query(real)
+                .expect("real has never been declared");
+
+        // if `real` is the end bound of the range
+        if (range.start + real_length - 1) == real {
+            let mut count = range.len();
+            let mut new_origin = origin;
+            for (i, slice) in self.origins[new_origin..].windows(2).enumerate() {
+                let len = slice[1] - slice[0];
+                count = count.saturating_sub(len);
+                if count == 0 { new_origin = origin + i; break }
+            }
+
+            let n = real - range.start;
+            let start = self.origins[origin];
+            let end = self.origins[new_origin + 1];
+            let remaining = (end - start) - n;
+
+            Range { start: (start + n) as u32, end: (start + n + remaining) as u32 }
+
+        } else {
+            // just return the origin along with
+            // the real position of the word
+            let n = real as usize - range.start;
+            let origin = self.origins[origin];
+
+            Range { start: (origin + n) as u32, end: (origin + n + 1) as u32 }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn original_unmodified() {
+        let query = ["new", "york", "city", "subway"];
+        //            0       1       2        3
+        let mut builder = QueryEnhancerBuilder::new(&query);
+
+        // new york = new york city
+        builder.declare(0..2, 4, &["new", "york", "city"]);
+        //                    ^      4       5       6
+
+        let enhancer = builder.build();
+
+        assert_eq!(enhancer.replacement(0), 0..1); // new
+        assert_eq!(enhancer.replacement(1), 1..2); // york
+        assert_eq!(enhancer.replacement(2), 2..3); // city
+        assert_eq!(enhancer.replacement(3), 3..4); // subway
+        assert_eq!(enhancer.replacement(4), 0..1); // new
+        assert_eq!(enhancer.replacement(5), 1..2); // york
+        assert_eq!(enhancer.replacement(6), 2..3); // city
+    }
+
+    #[test]
+    fn simple_growing() {
+        let query = ["new", "york", "subway"];
+        //            0       1        2
+        let mut builder = QueryEnhancerBuilder::new(&query);
+
+        // new york = new york city
+        builder.declare(0..2, 3, &["new", "york", "city"]);
+        //                    ^      3       4       5
+
+        let enhancer = builder.build();
+
+        assert_eq!(enhancer.replacement(0), 0..1); // new
+        assert_eq!(enhancer.replacement(1), 1..3); // york
+        assert_eq!(enhancer.replacement(2), 3..4); // subway
+        assert_eq!(enhancer.replacement(3), 0..1); // new
+        assert_eq!(enhancer.replacement(4), 1..2); // york
+        assert_eq!(enhancer.replacement(5), 2..3); // city
+    }
+
+    #[test]
+    fn same_place_growings() {
+        let query = ["NY", "subway"];
+        //            0        1
+        let mut builder = QueryEnhancerBuilder::new(&query);
+
+        // NY = new york
+        builder.declare(0..1, 2, &["new", "york"]);
+        //                    ^      2       3
+
+        // NY = new york city
+        builder.declare(0..1, 4, &["new", "york", "city"]);
+        //                    ^      4       5       6
+
+        // NY = NYC
+        builder.declare(0..1, 7, &["NYC"]);
+        //                    ^      7
+
+        // NY = new york city
+        builder.declare(0..1, 8, &["new", "york", "city"]);
+        //                    ^      8       9      10
+
+        // subway = underground train
+        builder.declare(1..2, 11, &["underground", "train"]);
+        //                    ^          11          12
+
+        let enhancer = builder.build();
+
+        assert_eq!(enhancer.replacement(0), 0..3); // NY
+        assert_eq!(enhancer.replacement(1), 3..5); // subway
+        assert_eq!(enhancer.replacement(2), 0..1); // new
+        assert_eq!(enhancer.replacement(3), 1..3); // york
+        assert_eq!(enhancer.replacement(4), 0..1); // new
+        assert_eq!(enhancer.replacement(5), 1..2); // york
+        assert_eq!(enhancer.replacement(6), 2..3); // city
+        assert_eq!(enhancer.replacement(7), 0..3); // NYC
+        assert_eq!(enhancer.replacement(8), 0..1); // new
+        assert_eq!(enhancer.replacement(9), 1..2); // york
+        assert_eq!(enhancer.replacement(10), 2..3); // city
+        assert_eq!(enhancer.replacement(11), 3..4); // underground
+        assert_eq!(enhancer.replacement(12), 4..5); // train
+    }
+
+    #[test]
+    fn bigger_growing() {
+        let query = ["NYC", "subway"];
+        //             0        1
+        let mut builder = QueryEnhancerBuilder::new(&query);
+
+        // NYC = new york city
+        builder.declare(0..1, 2, &["new", "york", "city"]);
+        //                    ^      2       3       4
+
+        let enhancer = builder.build();
+
+        assert_eq!(enhancer.replacement(0), 0..3); // NYC
+        assert_eq!(enhancer.replacement(1), 3..4); // subway
+        assert_eq!(enhancer.replacement(2), 0..1); // new
+        assert_eq!(enhancer.replacement(3), 1..2); // york
+        assert_eq!(enhancer.replacement(4), 2..3); // city
+    }
+
+    #[test]
+    fn middle_query_growing() {
+        let query = ["great", "awesome", "NYC", "subway"];
+        //              0         1        2        3
+        let mut builder = QueryEnhancerBuilder::new(&query);
+
+        // NYC = new york city
+        builder.declare(2..3, 4, &["new", "york", "city"]);
+        //                    ^      4       5       6
+
+        let enhancer = builder.build();
+
+        assert_eq!(enhancer.replacement(0), 0..1); // great
+        assert_eq!(enhancer.replacement(1), 1..2); // awesome
+        assert_eq!(enhancer.replacement(2), 2..5); // NYC
+        assert_eq!(enhancer.replacement(3), 5..6); // subway
+        assert_eq!(enhancer.replacement(4), 2..3); // new
+        assert_eq!(enhancer.replacement(5), 3..4); // york
+        assert_eq!(enhancer.replacement(6), 4..5); // city
+    }
+
+    #[test]
+    fn end_query_growing() {
+        let query = ["NYC", "subway"];
+        //             0        1
+        let mut builder = QueryEnhancerBuilder::new(&query);
+
+        // subway = underground train
+        builder.declare(1..2, 2, &["underground", "train"]);
+        //                    ^          2           3
+
+        let enhancer = builder.build();
+
+        assert_eq!(enhancer.replacement(0), 0..1); // NYC
+        assert_eq!(enhancer.replacement(1), 1..3); // subway
+        assert_eq!(enhancer.replacement(2), 1..2); // underground
+        assert_eq!(enhancer.replacement(3), 2..3); // train
+    }
+
+    #[test]
+    fn multiple_growings() {
+        let query = ["great", "awesome", "NYC", "subway"];
+        //              0         1        2        3
+        let mut builder = QueryEnhancerBuilder::new(&query);
+
+        // NYC = new york city
+        builder.declare(2..3, 4, &["new", "york", "city"]);
+        //                    ^      4       5       6
+
+        // subway = underground train
+        builder.declare(3..4, 7, &["underground", "train"]);
+        //                    ^          7           8
+
+        let enhancer = builder.build();
+
+        assert_eq!(enhancer.replacement(0), 0..1); // great
+        assert_eq!(enhancer.replacement(1), 1..2); // awesome
+        assert_eq!(enhancer.replacement(2), 2..5); // NYC
+        assert_eq!(enhancer.replacement(3), 5..7); // subway
+        assert_eq!(enhancer.replacement(4), 2..3); // new
+        assert_eq!(enhancer.replacement(5), 3..4); // york
+        assert_eq!(enhancer.replacement(6), 4..5); // city
+        assert_eq!(enhancer.replacement(7), 5..6); // underground
+        assert_eq!(enhancer.replacement(8), 6..7); // train
+    }
+
+    #[test]
+    fn multiple_probable_growings() {
+        let query = ["great", "awesome", "NYC", "subway"];
+        //              0         1        2        3
+        let mut builder = QueryEnhancerBuilder::new(&query);
+
+        // NYC = new york city
+        builder.declare(2..3, 4, &["new", "york", "city"]);
+        //                    ^      4       5       6
+
+        // subway = underground train
+        builder.declare(3..4, 7, &["underground", "train"]);
+        //                    ^          7           8
+
+        // great awesome = good
+        builder.declare(0..2, 9, &["good"]);
+        //                    ^       9
+
+        // awesome NYC = NY
+        builder.declare(1..3, 10, &["NY"]);
+        //                    ^^     10
+
+        // NYC subway = metro
+        builder.declare(2..4, 11, &["metro"]);
+        //                    ^^      11
+
+        let enhancer = builder.build();
+
+        assert_eq!(enhancer.replacement(0), 0..1); // great
+        assert_eq!(enhancer.replacement(1), 1..2); // awesome
+        assert_eq!(enhancer.replacement(2), 2..5); // NYC
+        assert_eq!(enhancer.replacement(3), 5..7); // subway
+        assert_eq!(enhancer.replacement(4), 2..3); // new
+        assert_eq!(enhancer.replacement(5), 3..4); // york
+        assert_eq!(enhancer.replacement(6), 4..5); // city
+        assert_eq!(enhancer.replacement(7), 5..6); // underground
+        assert_eq!(enhancer.replacement(8), 6..7); // train
+        assert_eq!(enhancer.replacement(9), 0..2); // good
+        assert_eq!(enhancer.replacement(10), 1..5); // NY
+        assert_eq!(enhancer.replacement(11), 2..5); // metro
+    }
+}

From f478bbf826a50f31cb5e40cae6f0ea95eed0d504 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Sun, 7 Jul 2019 20:27:37 +0200
Subject: [PATCH 02/19] feat: Introduce the QueryEnhancer in the query synonym
 system

---
 meilidb-core/src/query_builder.rs | 88 +++++++++++++++++++------------
 1 file changed, 54 insertions(+), 34 deletions(-)

diff --git a/meilidb-core/src/query_builder.rs b/meilidb-core/src/query_builder.rs
index 175430554..7e79ac15e 100644
--- a/meilidb-core/src/query_builder.rs
+++ b/meilidb-core/src/query_builder.rs
@@ -14,8 +14,9 @@ use sdset::SetBuf;
 use slice_group_by::GroupByMut;
 
 use crate::automaton::{build_dfa, build_prefix_dfa};
-use crate::distinct_map::{DistinctMap, BufferedDistinctMap};
 use crate::criterion::Criteria;
+use crate::distinct_map::{DistinctMap, BufferedDistinctMap};
+use crate::query_enhancer::{QueryEnhancerBuilder, QueryEnhancer};
 use crate::raw_documents_from_matches;
 use crate::reordered_attrs::ReorderedAttrs;
 use crate::{TmpMatch, Highlight, DocumentId, Store, RawDocument, Document};
@@ -91,18 +92,36 @@ fn split_best_frequency<'a, S: Store>(
     Ok(best.map(|(_, l, r)| (l, r)))
 }
 
-fn generate_automatons<S: Store>(query: &str, store: &S) -> Result<Vec<(Automaton, String)>, S::Error> {
+fn generate_automatons<S: Store>(query: &str, store: &S) -> Result<(Vec<Automaton>, QueryEnhancer), S::Error> {
     let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
     let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect();
-    let mut automatons = Vec::new();
     let synonyms = store.synonyms()?;
 
-    for n in 1..=NGRAMS {
-        let mut query_index = 0;
-        let mut ngrams = query_words.windows(n).peekable();
+    let mut automatons = Vec::new();
+    let mut enhancer_builder = QueryEnhancerBuilder::new(&query_words);
 
-        while let Some(ngram_slice) = ngrams.next() {
+    // We must not declare the original words to the query enhancer
+    // *but* we need to push them in the automatons list first
+    let mut original_words = query_words.iter().enumerate().peekable();
+    while let Some((query_index, word)) = original_words.next() {
+
+        let has_following_word = original_words.peek().is_some();
+        let not_prefix_dfa = has_following_word || has_end_whitespace || word.chars().all(is_cjk);
+
+        let automaton = if not_prefix_dfa {
+            Automaton::exact(query_index, word)
+        } else {
+            Automaton::prefix_exact(query_index, word)
+        };
+        automatons.push(automaton);
+    }
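+
+    // the original words occupy the first real query indices (0..query_words.len()),
+    // so from here on `automatons.len()` is always the next free real query index,
+    // which the synonym words declared below are numbered from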
+
+    for n in 1..=NGRAMS {
+
+        let mut ngrams = query_words.windows(n).enumerate().peekable();
+        while let Some((query_index, ngram_slice)) = ngrams.next() {
+
+            let query_range = query_index..query_index + n;
             let ngram_nb_words = ngram_slice.len();
             let ngram = ngram_slice.join(" ");
 
@@ -127,15 +146,19 @@ fn generate_automatons<S: Store>(query: &str, store: &S) -> Result<(Vec<Automaton>, QueryEnhancer), S::Error> {
+                    let synonyms_words: Vec<_> = split_query_string(synonyms).collect();
+                    let nb_synonym_words = synonyms_words.len();
 
-                    for synonym in split_query_string(synonyms) {
+                    let real_query_index = automatons.len();
+                    enhancer_builder.declare(query_range.clone(), real_query_index, &synonyms_words);
+
+                    for (i, synonym) in synonyms_words.into_iter().enumerate() {
                         let automaton = if nb_synonym_words == 1 {
-                            Automaton::exact(query_index, synonym)
+                            Automaton::exact(real_query_index + i, synonym)
                         } else {
-                            Automaton::non_exact(query_index, synonym)
+                            Automaton::non_exact(real_query_index + i, synonym)
                         };
-                        automatons.push((automaton, synonym.to_owned()));
+                        automatons.push(automaton);
                     }
                 }
             }
@@ -145,37 +168,34 @@ fn generate_automatons<S: Store>(query: &str, store: &S) -> Result<(Vec<Automaton>, QueryEnhancer), S::Error> {
 
 impl<'c, S, FI> QueryBuilder<'c, S, FI>
 where S: Store,
 {
     fn query_all(&self, query: &str) -> Result<Vec<RawDocument>, S::Error> {
-        let automatons = generate_automatons(query, &self.store)?;
+        let (automatons, query_enhancer) = generate_automatons(query, &self.store)?;
         let words = self.store.words()?.as_fst();
         let searchables = self.searchable_attrs.as_ref();

From e65d7418b7d06024a1b9a0e3ce42358f8ef3fdaa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Fri, 12 Jul 2019 16:05:15 +0200
Subject: [PATCH 03/19] feat: Remove the query index from the Automaton type

---
 meilidb-core/src/query_builder.rs | 162 ++++++++++++++++++++++--------
 1 file changed, 120 insertions(+), 42 deletions(-)

diff --git a/meilidb-core/src/query_builder.rs b/meilidb-core/src/query_builder.rs
index 7e79ac15e..e88ff8deb 100644
--- a/meilidb-core/src/query_builder.rs
+++ b/meilidb-core/src/query_builder.rs
@@ -11,7 +11,7 @@ use log::info;
 use meilidb_tokenizer::{is_cjk, split_query_string};
 use rayon::slice::ParallelSliceMut;
 use sdset::SetBuf;
-use slice_group_by::GroupByMut;
+use slice_group_by::{GroupBy, GroupByMut};
 
 use crate::automaton::{build_dfa, build_prefix_dfa};
 use crate::criterion::Criteria;
@@ -24,34 +24,30 @@ use crate::{TmpMatch, Highlight, DocumentId, Store, RawDocument, Document};
 const NGRAMS: usize = 3;
 
 struct Automaton {
-    query_index: usize,
     query_len: usize,
     is_exact: bool,
     dfa: DFA,
 }
 
 impl Automaton {
-    fn exact(query_index: usize, query: &str) -> Automaton {
+    fn exact(query: &str) -> Automaton {
         Automaton {
-            query_index,
             query_len: query.len(),
             is_exact: true,
             dfa: build_dfa(query),
         }
     }
 
-    fn prefix_exact(query_index: usize, query: &str) -> Automaton {
+    fn prefix_exact(query: &str) -> Automaton {
         Automaton {
-            query_index,
             query_len: query.len(),
             is_exact: true,
             dfa: build_prefix_dfa(query),
         }
     }
 
-    fn non_exact(query_index: usize, query: &str) -> Automaton {
+    fn non_exact(query: &str) -> Automaton {
         Automaton {
-            query_index,
             query_len: query.len(),
             is_exact: false,
             dfa: build_dfa(query),
@@ -102,16 +98,16 @@ fn generate_automatons<S: Store>(query: &str, store: &S) -> Result<(Vec<Automaton>, QueryEnhancer), S::Error> {
@@ ... @@ fn generate_automatons<S: Store>(query: &str, store: &S) -> Result<(Vec<Automaton>, QueryEnhancer), S::Error> {
@@ ... @@ fn generate_automatons<S: Store>(query: &str, store: &S) -> Result<(Vec<Automaton>, QueryEnhancer), S::Error> {
@@ ... @@ fn generate_automatons<S: Store>(query: &str, store: &S) -> Result<(Vec<Automaton>, QueryEnhancer), S::Error> {
@@ ... @@ fn generate_automatons<S: Store>(query: &str, store: &S) -> Result<(Vec<Automaton>, QueryEnhancer), S::Error> {
 
 pub struct QueryBuilder<'c, S, FI = fn(DocumentId) -> bool> {
     store: S,
     criteria: Criteria<'c>,
@@ -275,7 +258,7 @@ where S: Store,
         while let Some((input, indexed_values)) = stream.next() {
             for iv in indexed_values {
-                let Automaton { query_index, is_exact, query_len, ref dfa } = automatons[iv.index];
+                let Automaton { is_exact, query_len, ref dfa } = automatons[iv.index];
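+                // the position of an automaton in `automatons` is now its real
+                // query index, so `iv.index` is used directly as the query index below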
                 let distance = dfa.eval(input).to_u8();
                 let is_exact = is_exact && distance == 0 && input.len() == query_len;
 
@@ -288,34 +271,129 @@ where S: Store,
                 for di in doc_indexes.as_slice() {
                     let attribute = searchables.map_or(Some(di.attribute), |r| r.get(di.attribute));
                     if let Some(attribute) = attribute {
+
                         let match_ = TmpMatch {
-                            query_index: query_index as u32,
+                            query_index: iv.index as u32,
                             distance,
                             attribute,
                             word_index: di.word_index,
                             is_exact,
                         };
+
+                        // TODO do not store in the same matches vec
                         let highlight = Highlight {
                             attribute: di.attribute,
                             char_index: di.char_index,
                             char_length: di.char_length,
                         };
+
                         matches.push((di.document_id, match_, highlight));
                     }
                 }
             }
         }
 
-        // rewrite the matched positions for next criteria evaluations
-        matches.par_sort_unstable();
-        rewrite_matched_positions(&mut matches);
+        // we sort the matches to make them rewritable
+        matches.par_sort_unstable_by_key(|(id, match_, _)| {
+            (*id, match_.attribute, match_.word_index) // query_id ???
+        });
+
+        let mut padded_matches = Vec::with_capacity(matches.len());
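+
+        // For instance (an illustrative note): with the synonym NY = "new york city",
+        // the query "NY subway" indexes "subway" at word_index 1; the padding loop
+        // below moves it to word_index 3 so that proximity-based criteria still see
+        // it right after the three replacement words.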
+        for same_document in matches.linear_group_by(|a, b| a.0 == b.0) {
+
+            for same_attribute in same_document.linear_group_by(|a, b| a.1.attribute == b.1.attribute) {
+
+                let mut padding = 0;
+                let mut iter = same_attribute.linear_group_by(|a, b| a.1.word_index == b.1.word_index);
+                while let Some(same_word_index) = iter.next() {
+
+                    let mut biggest = 0;
+                    for (id, match_, highlight) in same_word_index {
+
+                        let mut replacement = query_enhancer.replacement(match_.query_index);
+                        let replacement_len = replacement.len() - 1;
+                        let nexts = iter.remainder().linear_group_by(|a, b| a.1.word_index == b.1.word_index);
+
+                        if let Some(query_index) = replacement.next() {
+                            let match_ = TmpMatch {
+                                query_index,
+                                word_index: match_.word_index + padding as u16,
+                                ..match_.clone()
+                            };
+                            padded_matches.push((*id, match_, *highlight));
+                        }
+
+                        let mut found = false;
+
+                        // look ahead and if there already is a match
+                        // corresponding to this padding word, abort the padding
+                        'padding: for (x, next_group) in nexts.enumerate() {
+
+                            for (i, query_index) in replacement.clone().enumerate().skip(x) {
+                                let padmatch_ = TmpMatch {
+                                    query_index,
+                                    word_index: match_.word_index + padding as u16 + (i + 1) as u16,
+                                    ..match_.clone()
+                                };
+
+                                for (_, nmatch_, _) in next_group {
+                                    let mut rep = query_enhancer.replacement(nmatch_.query_index);
+                                    let query_index = rep.next().unwrap();
+                                    let nmatch_ = TmpMatch { query_index, ..nmatch_.clone() };
+                                    if nmatch_.query_index == padmatch_.query_index {
+
+                                        if !found {
+                                            // if we find a corresponding padding for the
+                                            // first time we must push preceding paddings
+                                            for (i, query_index) in replacement.clone().enumerate().take(i) {
+                                                let match_ = TmpMatch {
+                                                    query_index,
+                                                    word_index: match_.word_index + padding as u16 + (i + 1) as u16,
+                                                    ..match_.clone()
+                                                };
+                                                padded_matches.push((*id, match_, *highlight));
+                                                biggest = biggest.max(i + 1);
+                                            }
+                                        }
+
+                                        padded_matches.push((*id, padmatch_, *highlight));
+                                        found = true;
+                                        continue 'padding;
+                                    }
+                                }
+                            }
+
+                            // if we do not find a corresponding padding in the
+                            // next groups, stop here and pad what was found
+                            break
+                        }
+
+                        if !found {
+                            // if no padding was found in the following matches
+                            // we must insert the entire padding
+                            for (i, query_index) in replacement.enumerate() {
+                                let match_ = TmpMatch {
+                                    query_index,
+                                    word_index: match_.word_index + padding as u16 + (i + 1) as u16,
+                                    ..match_.clone()
+                                };
+                                padded_matches.push((*id, match_, *highlight));
+                            }
+
+                            biggest = biggest.max(replacement_len);
+                        }
+                    }
+
+                    padding += biggest;
+                }
+            }
+
+        }
+
+        let total_matches = padded_matches.len();
+        padded_matches.par_sort_unstable();
+        let padded_matches = SetBuf::new_unchecked(padded_matches);
 
-        let total_matches = matches.len();
-        let padded_matches = {
-            matches.par_sort_unstable();
-            matches.dedup();
-            SetBuf::new_unchecked(matches)
-        };
         let raw_documents = raw_documents_from_matches(padded_matches);
 
         info!("{} total documents to classify", raw_documents.len());

From 225a3bf184ae00697e18062b76e82be5eac92fb3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Sun, 7 Jul 2019 19:57:42 +0200
Subject: [PATCH 04/19] test: Produce tests that work with the new cumulative
 word index system

---
 meilidb-core/src/query_builder.rs | 484 +++++++++++++++++++++++-------
 1 file changed, 370 insertions(+), 114 deletions(-)

diff --git a/meilidb-core/src/query_builder.rs b/meilidb-core/src/query_builder.rs
index e88ff8deb..5268edd27 100644
--- a/meilidb-core/src/query_builder.rs
+++ b/meilidb-core/src/query_builder.rs
@@ -937,17 +937,22 @@ mod tests {
 
         assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => {
             let mut iter = matches.into_iter();
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // NY
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, .. })); // subway
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NY ± new
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NY ± york
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // NY ± city
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway
             assert_matches!(iter.next(), None);
         });
         assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
             let mut iter = matches.into_iter();
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // new = NY
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // york = NY
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // city = NY
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, .. })); // subway
-            assert_matches!(iter.next(), None); // position rewritten ^
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NY
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NY
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NY
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NY
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NY
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NY
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway
+            assert_matches!(iter.next(), None); // position rewritten ^
         });
         assert_matches!(iter.next(), None);
@@ -957,24 +962,141 @@ mod tests {
 
         assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => {
             let mut iter = matches.into_iter();
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // NY
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, .. })); // subway
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NYC ± new
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NYC ± york
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // NYC ± city
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway
             assert_matches!(iter.next(), None);
         });
         assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
             let mut iter = matches.into_iter();
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // new = NY
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // york = NY
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // city = NY
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, .. })); // subway
-            assert_matches!(iter.next(), None); // position rewritten ^
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NYC
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NYC
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NYC
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NYC
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NYC
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NYC
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway
+            assert_matches!(iter.next(), None); // position rewritten ^
         });
         assert_matches!(iter.next(), None);
     }
 
+    #[test]
+    fn unique_to_multiword_synonyms_words_proximity() {
+        let mut store = InMemorySetStore::from_iter(vec![
+            ("new",    &[doc_char_index(0, 0, 0)][..]),
+            ("york",   &[doc_char_index(0, 1, 1)][..]),
+            ("city",   &[doc_char_index(0, 2, 2)][..]),
+            ("subway", &[doc_char_index(0, 3, 3)][..]),
+
+            ("york",   &[doc_char_index(1, 0, 0)][..]),
+            ("new",    &[doc_char_index(1, 1, 1)][..]),
+            ("subway", &[doc_char_index(1, 2, 2)][..]),
+
+            ("NY",     &[doc_char_index(2, 0, 0)][..]),
+            ("subway", &[doc_char_index(2, 1, 1)][..]),
+        ]);
+
+        store.add_synonym("NY", SetBuf::from_dirty(vec!["york new"]));
+
+        let builder = QueryBuilder::new(&store);
+        let results = builder.query("NY", 0..20).unwrap();
+        let mut iter = results.into_iter();
+
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => {
+            let mut matches = matches.into_iter();
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // NY ± york
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, .. })); // NY ± new
+            assert_matches!(matches.next(), None);
+        });
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => {
+            let mut matches = matches.into_iter();
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // york = NY
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, .. })); // new = NY
+            assert_matches!(matches.next(), None);
+        });
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
+            let mut matches = matches.into_iter();
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 1, .. })); // york = NY
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 0, .. })); // new = NY
+            assert_matches!(matches.next(), None);
+        });
+        assert_matches!(iter.next(), None);
+
+        let builder = QueryBuilder::new(&store);
+        let results = builder.query("new york", 0..20).unwrap();
+        let mut iter = results.into_iter();
+
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
+            let mut matches = matches.into_iter();
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // new
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, .. })); // york
+            assert_matches!(matches.next(), None); // position rewritten ^
+        });
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => {
+            let mut matches = matches.into_iter();
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 1, .. })); // york
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 0, .. })); // new
+            assert_matches!(matches.next(), None);
+        });
+        assert_matches!(iter.next(), None);
+    }
+
+    #[test]
+    fn unique_to_multiword_synonyms_cumulative_word_index() {
+        let mut store = InMemorySetStore::from_iter(vec![
+            ("NY",     &[doc_char_index(0, 0, 0)][..]),
+            ("subway", &[doc_char_index(0, 1, 1)][..]),
+
+            ("new",    &[doc_char_index(1, 0, 0)][..]),
+            ("york",   &[doc_char_index(1, 1, 1)][..]),
+            ("subway", &[doc_char_index(1, 2, 2)][..]),
+        ]);
+
+        store.add_synonym("new york", SetBuf::from_dirty(vec!["NY"]));
+
+        let builder = QueryBuilder::new(&store);
+        let results = builder.query("NY subway", 0..20).unwrap();
+        let mut iter = results.into_iter();
+
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
+            let mut matches = matches.into_iter();
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NY
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // subway
+            assert_matches!(matches.next(), None);
+        });
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => {
+            let mut matches = matches.into_iter();
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 2, is_exact: true, .. })); // subway
+            assert_matches!(matches.next(), None);
+        });
+        assert_matches!(iter.next(), None);
+
+        let builder = QueryBuilder::new(&store);
+        let results = builder.query("new york subway", 0..20).unwrap();
+        let mut iter = results.into_iter();
+
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
+            let mut matches = matches.into_iter();
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NY
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NY
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // subway
+            assert_matches!(matches.next(), None);
+        });
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => {
+            let mut matches = matches.into_iter();
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // subway
+            assert_matches!(matches.next(), None);
+        });
+        assert_matches!(iter.next(), None);
+    }
+
     #[test]
     /// Unique word has multi-word synonyms
-    fn harder_unique_to_multiword_synonyms() {
+    fn harder_unique_to_multiword_synonyms_one() {
         let mut store = InMemorySetStore::from_iter(vec![
             ("new",  &[doc_char_index(0, 0, 0)][..]),
             ("york", &[doc_char_index(0, 1, 1)][..]),
@@ -997,17 +1119,22 @@ mod tests {
 
         assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => {
             let mut iter = matches.into_iter();
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // NY
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 2, .. })); // subway
-            assert_matches!(iter.next(), None);
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NY
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NY
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city = NY
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway
+            assert_matches!(iter.next(), None); // position rewritten ^
         });
         assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
             let mut iter = matches.into_iter();
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // new = NY
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // york = NY
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // city = NY
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 2, .. })); // subway
-            assert_matches!(iter.next(), None); // position rewritten ^
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NY
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NY
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NY
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NY
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NY
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NY
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway
+            assert_matches!(iter.next(), None); // position rewritten ^
        });
         assert_matches!(iter.next(), None);
@@ -1017,16 +1144,22 @@ mod tests {
 
         assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => {
             let mut iter = matches.into_iter();
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // NY
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 2, .. })); // subway
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NYC
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NYC
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // NYC
+            //                                  because one-word to one-word ^^^^
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway
             assert_matches!(iter.next(), None);
         });
         assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
             let mut iter = matches.into_iter();
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // new = NY
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // york = NY
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // city = NY
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 2, .. })); // subway
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NYC
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NYC
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NYC
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NYC
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NYC
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NYC
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway
             assert_matches!(iter.next(), None); // position rewritten ^
         });
         assert_matches!(iter.next(), None);
@@ -1059,19 +1192,25 @@ mod tests {
 
         assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
             let mut iter = matches.into_iter();
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // new = NY
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // york = NY
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // city = NY
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 2, .. })); // underground = subway
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 2, .. })); // train = subway
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 3, .. })); // broken
-            assert_matches!(iter.next(), None); // position rewritten ^
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NY
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NY
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NY
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NY
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NY
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NY
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 4, is_exact: false, .. })); // underground = subway
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 4, word_index: 5, is_exact: false, .. })); // train = subway
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 5, word_index: 6, is_exact: true, .. })); // broken
+            assert_matches!(iter.next(), None); // position rewritten ^
         });
         assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => {
             let mut iter = matches.into_iter();
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // NY
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 2, .. })); // subway
-            assert_matches!(iter.next(), None);
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NY
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NY
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city = NY
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // underground = subway
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 4, word_index: 5, is_exact: true, .. })); // train = subway
+            assert_matches!(iter.next(), None); // position rewritten ^
         });
         assert_matches!(iter.next(), None);
@@ -1081,18 +1220,25 @@ mod tests {
 
         assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => {
             let mut iter = matches.into_iter();
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // NY
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 2, .. })); // subway
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NYC
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NYC
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city = NYC
+            //                                  because one-word to one-word ^^^^
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // underground = subway
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 4, word_index: 5, is_exact: true, .. })); // train = subway
             assert_matches!(iter.next(), None);
         });
         assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
             let mut iter = matches.into_iter();
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // new = NY
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // york = NY
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // city = NY
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 2, .. })); // underground = subway
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 2, .. })); // train = subway
-            assert_matches!(iter.next(), None); // position rewritten ^
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NYC
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NYC
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NYC
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NYC
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NYC
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NYC
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 4, is_exact: false, .. })); // underground = subway
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 4, word_index: 5, is_exact: false, .. })); // train = subway
+            assert_matches!(iter.next(), None); // position rewritten ^
         });
         assert_matches!(iter.next(), None);
     }
@@ -1116,49 +1262,43 @@ mod tests {
             ("broken", &[doc_char_index(2, 4, 4)][..]),
         ]);
 
-        store.add_synonym("new york", SetBuf::from_dirty(vec!["NYC", "NY", "new york city"]));
-        store.add_synonym("new york city", SetBuf::from_dirty(vec!["NYC", "NY", "new york"]));
-        store.add_synonym("underground train", SetBuf::from_dirty(vec!["subway"]));
+        store.add_synonym("new york", SetBuf::from_dirty(vec![ "NYC", "NY", "new york city" ]));
+        store.add_synonym("new york city", SetBuf::from_dirty(vec![ "NYC", "NY", "new york" ]));
+        store.add_synonym("underground train", SetBuf::from_dirty(vec![ "subway" ]));
 
         let builder = QueryBuilder::new(&store);
         let results = builder.query("new york underground train broken", 0..20).unwrap();
         let mut iter = results.into_iter();
 
-        assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, highlights }) => {
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => {
             let mut matches = matches.into_iter();
-            let mut highlights = highlights.into_iter();
-
-            assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // york
-            assert_matches!(highlights.next(), Some(Highlight { char_index: 1, .. }));
-
-            assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // new
-            assert_matches!(highlights.next(), Some(Highlight { char_index: 0, .. }));
-
-            assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 0, .. })); // york
-            assert_matches!(highlights.next(), Some(Highlight { char_index: 1, .. }));
-
-            assert_matches!(matches.next(), Some(TmpMatch { query_index: 2, word_index: 1, .. })); // underground
-            assert_matches!(highlights.next(), Some(Highlight { char_index: 2, .. }));
-
-            assert_matches!(matches.next(), Some(TmpMatch { query_index: 3, word_index: 2, .. })); // train
-            assert_matches!(highlights.next(), Some(Highlight { char_index: 3, .. }));
-
-            assert_matches!(matches.next(), Some(TmpMatch { query_index: 4, word_index: 3, .. })); // broken
-            assert_matches!(highlights.next(), Some(Highlight { char_index: 4, .. }));
-
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // underground
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 4, word_index: 4, is_exact: true, .. })); // train
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 5, word_index: 5, is_exact: true, .. })); // broken
             assert_matches!(matches.next(), None);
         });
         assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => {
             let mut iter = matches.into_iter();
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // NYC = new york
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, .. })); // subway = underground train
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 4, word_index: 3, .. })); // broken
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NYC = new
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NYC = york
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // NYC = city
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway = underground
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 4, word_index: 5, is_exact: true, .. })); // subway = train
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 5, word_index: 6, is_exact: true, .. })); // broken
             assert_matches!(iter.next(), None);
         });
         assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
             let mut iter = matches.into_iter();
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // NY = new york
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 1, .. })); // subway = underground train
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NY = new
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NY = york
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // NY = city
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway = underground
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 4, word_index: 4, is_exact: true, .. })); // subway = train
            assert_matches!(iter.next(), None);
         });
         assert_matches!(iter.next(), None);
@@ -1167,55 +1307,169 @@ mod tests {
 
         let results = builder.query("new york city underground train broken", 0..20).unwrap();
         let mut iter = results.into_iter();
 
-        assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, highlights }) => {
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => {
             let mut matches = matches.into_iter();
-            let mut highlights = highlights.into_iter();
-
-            assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // york
-            assert_matches!(highlights.next(), Some(Highlight { char_index: 1, .. }));
-
-            assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // new
-            assert_matches!(highlights.next(), Some(Highlight { char_index: 0, .. }));
-
-            assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 0, .. })); // york
-            assert_matches!(highlights.next(), Some(Highlight { char_index: 1, .. }));
-
-            assert_matches!(matches.next(), Some(TmpMatch { query_index: 3, word_index: 1, .. })); // underground
-            assert_matches!(highlights.next(), Some(Highlight { char_index: 2, .. }));
-
-            assert_matches!(matches.next(), Some(TmpMatch { query_index: 4, word_index: 2, .. })); // train
-            assert_matches!(highlights.next(), Some(Highlight { char_index: 3, .. }));
-
-            assert_matches!(matches.next(), Some(TmpMatch { query_index: 5, word_index: 3, .. })); // broken
-            assert_matches!(highlights.next(), Some(Highlight { char_index: 4, .. }));
-
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // underground
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 4, word_index: 4, is_exact: true, .. })); // train
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 5, word_index: 5, is_exact: true, .. })); // broken
             assert_matches!(matches.next(), None);
         });
         assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => {
             let mut iter = matches.into_iter();
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // NYC = new york city
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 2, .. })); // subway = underground train
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 5, word_index: 3, .. })); // broken
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NYC = new
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NYC = new
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NYC = york
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NYC = york
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // NYC = city
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway = underground
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 4, word_index: 5, is_exact: true, .. })); // subway = train
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 5, word_index: 6, is_exact: true, .. })); // broken
             assert_matches!(iter.next(), None);
         });
         assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
             let mut iter = matches.into_iter();
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // NY = new york city
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 1, .. })); // subway = underground train
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NY = new
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NY = new
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NY = york
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NY = york
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // NY = city
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway = underground
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 4, word_index: 4, is_exact: true, .. })); // subway = train
            assert_matches!(iter.next(), None);
         });
         assert_matches!(iter.next(), None);
     }
 
+    #[test]
+    fn intercrossed_multiword_synonyms() {
+        let mut store = InMemorySetStore::from_iter(vec![
+            ("new",  &[doc_index(0, 0)][..]),
+            ("york", &[doc_index(0, 1)][..]),
+            ("big",  &[doc_index(0, 2)][..]),
+            ("city", &[doc_index(0, 3)][..]),
+        ]);
+
+        store.add_synonym("new york", SetBuf::from_dirty(vec![ "new york city" ]));
+        store.add_synonym("new york city", SetBuf::from_dirty(vec![ "new york" ]));
+
+        let builder = QueryBuilder::new(&store);
+        let results = builder.query("new york big ", 0..20).unwrap();
+        let mut iter = results.into_iter();
+
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
+            let mut matches = matches.into_iter();
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new
+
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york
+
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 2, word_index: 4, is_exact: false, .. })); // city
+
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // big
+            assert_matches!(matches.next(), None);
+        });
+        assert_matches!(iter.next(), None);
+
+        let mut store = InMemorySetStore::from_iter(vec![
+            ("NY",     &[doc_index(0, 0)][..]),
+            ("city",   &[doc_index(0, 1)][..]),
+            ("subway", &[doc_index(0, 2)][..]),
+
+            ("NY",     &[doc_index(1, 0)][..]),
+            ("subway", &[doc_index(1, 1)][..]),
+
+            ("NY",     &[doc_index(2, 0)][..]),
+            ("york",   &[doc_index(2, 1)][..]),
+            ("city",   &[doc_index(2, 2)][..]),
+            ("subway", &[doc_index(2, 3)][..]),
+        ]);
+
+        store.add_synonym("NY", SetBuf::from_dirty(vec!["new york city story"]));
+
+        let builder = QueryBuilder::new(&store);
+        let results = builder.query("NY subway ", 0..20).unwrap();
+        let mut iter = results.into_iter();
+
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => {
+            let mut matches = matches.into_iter();
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // story
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 4, word_index: 4, is_exact: true, .. })); // subway
+            assert_matches!(matches.next(), None);
+        });
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
+            let mut matches = matches.into_iter();
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 4, word_index: 3, is_exact: true, .. })); // subway
+            assert_matches!(matches.next(), None);
+        });
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => {
+            let mut matches = matches.into_iter();
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 4, word_index: 3, is_exact: true, .. })); // subway
+            assert_matches!(matches.next(), None);
+        });
+        assert_matches!(iter.next(), None);
+    }
+
+    #[test]
+    fn cumulative_word_indices() {
+        let mut store = InMemorySetStore::from_iter(vec![
+            ("NYC",    &[doc_index(0, 0)][..]),
+            ("long",   &[doc_index(0, 1)][..]),
+            ("subway", &[doc_index(0, 2)][..]),
+            ("cool",   &[doc_index(0, 3)][..]),
+        ]);
+
+        store.add_synonym("new york city", SetBuf::from_dirty(vec!["NYC"]));
+        store.add_synonym("subway", SetBuf::from_dirty(vec!["underground train"]));
+
+        let builder = QueryBuilder::new(&store);
+        let results = builder.query("new york city long subway cool ", 0..20).unwrap();
+        let mut iter = results.into_iter();
+
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
+            let mut matches = matches.into_iter();
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new  = NYC
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NYC
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city = NYC
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // long
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 4, word_index: 4, is_exact: true, .. })); // subway = underground
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 5, word_index: 5, is_exact: true, .. })); // subway = train
+            assert_matches!(matches.next(), Some(TmpMatch { query_index: 6, word_index: 6, is_exact: true, .. })); // cool
+            assert_matches!(matches.next(), None);
+        });
+        assert_matches!(iter.next(), None);
+    }
+
     #[test]
     fn deunicoded_synonyms() {
         let mut store = InMemorySetStore::from_iter(vec![
-            ("iPhone",    &[doc_index(0, 0)][..]),
-            ("telephone", &[doc_index(1, 0)][..]), // meilidb-data indexes the unidecoded
-            ("téléphone", &[doc_index(1, 0)][..]), // and the original words with the same DocIndex
+            ("telephone", &[doc_index(0, 0)][..]), // meilidb-data indexes the unidecoded
+            ("téléphone", &[doc_index(0, 0)][..]), // and the original words with the same DocIndex
+
+            ("iphone",    &[doc_index(1, 0)][..]),
         ]);
 
-        store.add_synonym("téléphone", SetBuf::from_dirty(vec!["iPhone"]));
+        store.add_synonym("téléphone", SetBuf::from_dirty(vec!["iphone"]));
 
         let builder = QueryBuilder::new(&store);
         let results = builder.query("telephone", 0..20).unwrap();
         let mut iter = results.into_iter();
 
         assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
             let mut iter = matches.into_iter();
             assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, .. }));
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, .. }));
             assert_matches!(iter.next(), None);
         });
         assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => {
             let mut iter = matches.into_iter();
             assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, .. }));
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, .. }));
             assert_matches!(iter.next(), None);
         });
         assert_matches!(iter.next(), None);
 
         let results = builder.query("téléphone", 0..20).unwrap();
         let mut iter = results.into_iter();
 
         assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
             let mut iter = matches.into_iter();
             assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, .. }));
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, .. }));
             assert_matches!(iter.next(), None);
         });
         assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => {
             let mut iter = matches.into_iter();
             assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, .. }));
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, .. }));
             assert_matches!(iter.next(), None);
         });
         assert_matches!(iter.next(), None);
 
         let results = builder.query("télephone", 0..20).unwrap();
         let mut iter = results.into_iter();
 
-        assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => {
             let mut iter = matches.into_iter();
             assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, .. }));
             assert_matches!(iter.next(), None);
         });
-        assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => {
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
             let mut iter = matches.into_iter();
-            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, distance: 1, .. })); // téléphone
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, distance: 1, word_index: 0, is_exact: false, .. })); // iphone
+            assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, distance: 1, word_index: 0, is_exact: false, ..
})); // téléphone assert_matches!(iter.next(), None); }); assert_matches!(iter.next(), None); @@ -1282,8 +1537,9 @@ mod tests { assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { let mut iter = matches.into_iter(); assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, distance: 0, .. })); // iphone + assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, distance: 0, .. })); // iphone assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 0, distance: 1, .. })); // phone - assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 1, distance: 0, .. })); // case + assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, distance: 0, .. })); // case assert_matches!(iter.next(), None); }); assert_matches!(iter.next(), None); @@ -1309,7 +1565,7 @@ mod tests { assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // porte assert_matches!(highlights.next(), Some(Highlight { char_index: 0, .. })); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // feuille + assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, .. })); // feuille assert_matches!(highlights.next(), Some(Highlight { char_index: 1, .. })); assert_matches!(matches.next(), None); @@ -1327,7 +1583,7 @@ mod tests { assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // search assert_matches!(highlights.next(), Some(Highlight { char_index: 0, .. })); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // engine + assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, .. })); // engine assert_matches!(highlights.next(), Some(Highlight { char_index: 1, .. 
})); assert_matches!(matches.next(), None); From 795557c046110873f636327ec926249346d0c093 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 15 Jul 2019 14:28:40 +0200 Subject: [PATCH 05/19] feat: Remove query splitting from the automaton generation --- meilidb-core/src/query_builder.rs | 88 +------------------------------ 1 file changed, 1 insertion(+), 87 deletions(-) diff --git a/meilidb-core/src/query_builder.rs b/meilidb-core/src/query_builder.rs index 5268edd27..1fb778094 100644 --- a/meilidb-core/src/query_builder.rs +++ b/meilidb-core/src/query_builder.rs @@ -65,29 +65,6 @@ pub fn normalize_str(string: &str) -> String { string } -fn split_best_frequency<'a, S: Store>( - word: &'a str, - store: &S, -) -> Result, S::Error> -{ - let chars = word.char_indices().skip(1); - let mut best = None; - - for (i, _) in chars { - let (left, right) = word.split_at(i); - - let left_freq = store.word_indexes(left.as_bytes())?.map_or(0, |i| i.len()); - let right_freq = store.word_indexes(right.as_bytes())?.map_or(0, |i| i.len()); - let min_freq = cmp::min(left_freq, right_freq); - - if min_freq != 0 && best.map_or(true, |(old, _, _)| min_freq > old) { - best = Some((min_freq, left, right)); - } - } - - Ok(best.map(|(_, l, r)| (l, r))) -} - fn generate_automatons(query: &str, store: &S) -> Result<(Vec, QueryEnhancer), S::Error> { let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace); let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect(); @@ -160,24 +137,7 @@ fn generate_automatons(query: &str, store: &S) -> Result<(Vec { - let mut matches = matches.into_iter(); - let mut highlights = highlights.into_iter(); - - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // porte - assert_matches!(highlights.next(), Some(Highlight { char_index: 0, .. })); - - assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, .. })); // feuille - assert_matches!(highlights.next(), Some(Highlight { char_index: 1, .. })); - - assert_matches!(matches.next(), None); - }); - assert_matches!(iter.next(), None); - - let builder = QueryBuilder::new(&store); - let results = builder.query("searchengine", 0..20).unwrap(); - let mut iter = results.into_iter(); - - assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, highlights }) => { - let mut matches = matches.into_iter(); - let mut highlights = highlights.into_iter(); - - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // search - assert_matches!(highlights.next(), Some(Highlight { char_index: 0, .. })); - - assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, .. })); // engine - assert_matches!(highlights.next(), Some(Highlight { char_index: 1, .. 
})); - - assert_matches!(matches.next(), None); - }); - assert_matches!(iter.next(), None); - } } From 9959f2e952bdef34566541628cfec734e5d7ece3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sun, 7 Jul 2019 19:55:15 +0200 Subject: [PATCH 06/19] feat: Move the RawDocument type to its own module --- meilidb-core/src/lib.rs | 135 +------------------------------ meilidb-core/src/raw_document.rs | 132 ++++++++++++++++++++++++++++++ 2 files changed, 136 insertions(+), 131 deletions(-) create mode 100644 meilidb-core/src/raw_document.rs diff --git a/meilidb-core/src/lib.rs b/meilidb-core/src/lib.rs index f5975e3b5..b1a682e40 100644 --- a/meilidb-core/src/lib.rs +++ b/meilidb-core/src/lib.rs @@ -5,19 +5,18 @@ mod automaton; mod distinct_map; mod query_builder; mod query_enhancer; +mod raw_document; mod reordered_attrs; mod store; pub mod criterion; -use std::fmt; -use std::sync::Arc; - -use sdset::SetBuf; use serde::{Serialize, Deserialize}; -use slice_group_by::GroupBy; use zerocopy::{AsBytes, FromBytes}; +use self::raw_document::raw_documents_from_matches; + pub use self::query_builder::{QueryBuilder, DistinctQueryBuilder, normalize_str}; +pub use self::raw_document::RawDocument; pub use self::store::Store; /// Represent an internally generated document unique identifier. @@ -131,132 +130,6 @@ impl Document { } } -#[derive(Clone)] -pub struct RawDocument { - pub id: DocumentId, - pub matches: SharedMatches, - pub highlights: Vec, -} - -impl RawDocument { - fn new(id: DocumentId, matches: SharedMatches, highlights: Vec) -> RawDocument { - RawDocument { id, matches, highlights } - } - - pub fn query_index(&self) -> &[u32] { - let r = self.matches.range; - // it is safe because construction/modifications - // can only be done in this module - unsafe { &self.matches.matches.query_index.get_unchecked(r.start..r.end) } - } - - pub fn distance(&self) -> &[u8] { - let r = self.matches.range; - // it is safe because construction/modifications - // can only be done in this module - unsafe { &self.matches.matches.distance.get_unchecked(r.start..r.end) } - } - - pub fn attribute(&self) -> &[u16] { - let r = self.matches.range; - // it is safe because construction/modifications - // can only be done in this module - unsafe { &self.matches.matches.attribute.get_unchecked(r.start..r.end) } - } - - pub fn word_index(&self) -> &[u16] { - let r = self.matches.range; - // it is safe because construction/modifications - // can only be done in this module - unsafe { &self.matches.matches.word_index.get_unchecked(r.start..r.end) } - } - - pub fn is_exact(&self) -> &[bool] { - let r = self.matches.range; - // it is safe because construction/modifications - // can only be done in this module - unsafe { &self.matches.matches.is_exact.get_unchecked(r.start..r.end) } - } -} - -impl fmt::Debug for RawDocument { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - f.debug_struct("RawDocument") - .field("id", &self.id) - .field("query_index", &self.query_index()) - .field("distance", &self.distance()) - .field("attribute", &self.attribute()) - .field("word_index", &self.word_index()) - .field("is_exact", &self.is_exact()) - .finish() - } -} - -fn raw_documents_from_matches(matches: SetBuf<(DocumentId, TmpMatch, Highlight)>) -> Vec { - let mut docs_ranges: Vec<(_, Range, _)> = Vec::new(); - let mut matches2 = Matches::with_capacity(matches.len()); - - for group in matches.linear_group_by(|(a, _, _), (b, _, _)| a == b) { - let document_id = group[0].0; - let start = docs_ranges.last().map(|(_, r, 
_)| r.end).unwrap_or(0); - let end = start + group.len(); - - let highlights = group.iter().map(|(_, _, h)| *h).collect(); - docs_ranges.push((document_id, Range { start, end }, highlights)); - - matches2.extend_from_slice(group); - } - - let matches = Arc::new(matches2); - docs_ranges.into_iter().map(|(i, range, highlights)| { - let matches = SharedMatches { range, matches: matches.clone() }; - RawDocument::new(i, matches, highlights) - }).collect() -} - -#[derive(Debug, Copy, Clone)] -struct Range { - start: usize, - end: usize, -} - -#[derive(Clone)] -pub struct SharedMatches { - range: Range, - matches: Arc, -} - -#[derive(Clone)] -struct Matches { - query_index: Vec, - distance: Vec, - attribute: Vec, - word_index: Vec, - is_exact: Vec, -} - -impl Matches { - fn with_capacity(cap: usize) -> Matches { - Matches { - query_index: Vec::with_capacity(cap), - distance: Vec::with_capacity(cap), - attribute: Vec::with_capacity(cap), - word_index: Vec::with_capacity(cap), - is_exact: Vec::with_capacity(cap), - } - } - - fn extend_from_slice(&mut self, matches: &[(DocumentId, TmpMatch, Highlight)]) { - for (_, match_, _) in matches { - self.query_index.push(match_.query_index); - self.distance.push(match_.distance); - self.attribute.push(match_.attribute); - self.word_index.push(match_.word_index); - self.is_exact.push(match_.is_exact); - } - } -} - #[cfg(test)] mod tests { use super::*; diff --git a/meilidb-core/src/raw_document.rs b/meilidb-core/src/raw_document.rs new file mode 100644 index 000000000..7a293439e --- /dev/null +++ b/meilidb-core/src/raw_document.rs @@ -0,0 +1,132 @@ +use std::sync::Arc; +use std::fmt; +use sdset::SetBuf; +use slice_group_by::GroupBy; +use crate::{TmpMatch, DocumentId, Highlight}; + +#[derive(Clone)] +pub struct RawDocument { + pub id: DocumentId, + pub matches: SharedMatches, + pub highlights: Vec, +} + +impl RawDocument { + fn new(id: DocumentId, matches: SharedMatches, highlights: Vec) -> RawDocument { + RawDocument { id, matches, highlights } + } + + pub fn query_index(&self) -> &[u32] { + let r = self.matches.range; + // it is safe because construction/modifications + // can only be done in this module + unsafe { &self.matches.matches.query_index.get_unchecked(r.start..r.end) } + } + + pub fn distance(&self) -> &[u8] { + let r = self.matches.range; + // it is safe because construction/modifications + // can only be done in this module + unsafe { &self.matches.matches.distance.get_unchecked(r.start..r.end) } + } + + pub fn attribute(&self) -> &[u16] { + let r = self.matches.range; + // it is safe because construction/modifications + // can only be done in this module + unsafe { &self.matches.matches.attribute.get_unchecked(r.start..r.end) } + } + + pub fn word_index(&self) -> &[u16] { + let r = self.matches.range; + // it is safe because construction/modifications + // can only be done in this module + unsafe { &self.matches.matches.word_index.get_unchecked(r.start..r.end) } + } + + pub fn is_exact(&self) -> &[bool] { + let r = self.matches.range; + // it is safe because construction/modifications + // can only be done in this module + unsafe { &self.matches.matches.is_exact.get_unchecked(r.start..r.end) } + } +} + +impl fmt::Debug for RawDocument { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.write_str("RawDocument {\r\n")?; + f.write_fmt(format_args!("{:>15}: {:?},\r\n", "id", self.id))?; + f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "query_index", self.query_index()))?; + f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "distance", 
self.distance()))?; + f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "attribute", self.attribute()))?; + f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "word_index", self.word_index()))?; + f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "is_exact", self.is_exact()))?; + f.write_str("}")?; + Ok(()) + } +} + +pub fn raw_documents_from_matches(matches: SetBuf<(DocumentId, TmpMatch, Highlight)>) -> Vec { + let mut docs_ranges: Vec<(_, Range, _)> = Vec::new(); + let mut matches2 = Matches::with_capacity(matches.len()); + + for group in matches.linear_group_by(|(a, _, _), (b, _, _)| a == b) { + let document_id = group[0].0; + let start = docs_ranges.last().map(|(_, r, _)| r.end).unwrap_or(0); + let end = start + group.len(); + + let highlights = group.iter().map(|(_, _, h)| *h).collect(); + docs_ranges.push((document_id, Range { start, end }, highlights)); + + matches2.extend_from_slice(group); + } + + let matches = Arc::new(matches2); + docs_ranges.into_iter().map(|(i, range, highlights)| { + let matches = SharedMatches { range, matches: matches.clone() }; + RawDocument::new(i, matches, highlights) + }).collect() +} + +#[derive(Debug, Copy, Clone)] +struct Range { + start: usize, + end: usize, +} + +#[derive(Clone)] +pub struct SharedMatches { + range: Range, + matches: Arc, +} + +#[derive(Clone)] +struct Matches { + query_index: Vec, + distance: Vec, + attribute: Vec, + word_index: Vec, + is_exact: Vec, +} + +impl Matches { + fn with_capacity(cap: usize) -> Matches { + Matches { + query_index: Vec::with_capacity(cap), + distance: Vec::with_capacity(cap), + attribute: Vec::with_capacity(cap), + word_index: Vec::with_capacity(cap), + is_exact: Vec::with_capacity(cap), + } + } + + fn extend_from_slice(&mut self, matches: &[(DocumentId, TmpMatch, Highlight)]) { + for (_, match_, _) in matches { + self.query_index.push(match_.query_index); + self.distance.push(match_.distance); + self.attribute.push(match_.attribute); + self.word_index.push(match_.word_index); + self.is_exact.push(match_.is_exact); + } + } +} From 89df496f0cd18a8322e7a7f9113536f3684fb3f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 15 Jul 2019 19:34:53 +0200 Subject: [PATCH 07/19] feat: Separate highlights from matches to make the code easier to follow --- meilidb-core/src/lib.rs | 2 +- meilidb-core/src/query_builder.rs | 39 ++++++++++++++++++------------- meilidb-core/src/raw_document.rs | 31 +++++++++++++++--------- 3 files changed, 44 insertions(+), 28 deletions(-) diff --git a/meilidb-core/src/lib.rs b/meilidb-core/src/lib.rs index b1a682e40..6f6e46359 100644 --- a/meilidb-core/src/lib.rs +++ b/meilidb-core/src/lib.rs @@ -13,7 +13,7 @@ pub mod criterion; use serde::{Serialize, Deserialize}; use zerocopy::{AsBytes, FromBytes}; -use self::raw_document::raw_documents_from_matches; +use self::raw_document::raw_documents_from; pub use self::query_builder::{QueryBuilder, DistinctQueryBuilder, normalize_str}; pub use self::raw_document::RawDocument; diff --git a/meilidb-core/src/query_builder.rs b/meilidb-core/src/query_builder.rs index 1fb778094..c5a0ac847 100644 --- a/meilidb-core/src/query_builder.rs +++ b/meilidb-core/src/query_builder.rs @@ -17,7 +17,7 @@ use crate::automaton::{build_dfa, build_prefix_dfa}; use crate::criterion::Criteria; use crate::distinct_map::{DistinctMap, BufferedDistinctMap}; use crate::query_enhancer::{QueryEnhancerBuilder, QueryEnhancer}; -use crate::raw_documents_from_matches; +use crate::raw_documents_from; use crate::reordered_attrs::ReorderedAttrs; use 
crate::{TmpMatch, Highlight, DocumentId, Store, RawDocument, Document}; @@ -215,6 +215,7 @@ where S: Store, }; let mut matches = Vec::new(); + let mut highlights = Vec::new(); while let Some((input, indexed_values)) = stream.next() { for iv in indexed_values { @@ -240,23 +241,21 @@ where S: Store, is_exact, }; - // TODO do not store in the same matches vec let highlight = Highlight { attribute: di.attribute, char_index: di.char_index, char_length: di.char_length, }; - matches.push((di.document_id, match_, highlight)); + matches.push((di.document_id, match_)); + highlights.push((di.document_id, highlight)); } } } } // we sort the matches to make them rewritable - matches.par_sort_unstable_by_key(|(id, match_, _)| { - (*id, match_.attribute, match_.word_index) // query_id ??? - }); + matches.par_sort_unstable_by_key(|(id, match_)| (*id, match_.attribute, match_.word_index)); let mut padded_matches = Vec::with_capacity(matches.len()); for same_document in matches.linear_group_by(|a, b| a.0 == b.0) { @@ -268,7 +267,7 @@ where S: Store, while let Some(same_word_index) = iter.next() { let mut biggest = 0; - for (id, match_, highlight) in same_word_index { + for (id, match_) in same_word_index { let mut replacement = query_enhancer.replacement(match_.query_index); let replacement_len = replacement.len() - 1; @@ -280,7 +279,7 @@ where S: Store, word_index: match_.word_index + padding as u16, ..match_.clone() }; - padded_matches.push((*id, match_, *highlight)); + padded_matches.push((*id, match_)); } let mut found = false; @@ -296,7 +295,7 @@ where S: Store, ..match_.clone() }; - for (_, nmatch_, _) in next_group { + for (_, nmatch_) in next_group { let mut rep = query_enhancer.replacement(nmatch_.query_index); let query_index = rep.next().unwrap(); let nmatch_ = TmpMatch { query_index, ..nmatch_.clone() }; @@ -311,12 +310,12 @@ where S: Store, word_index: match_.word_index + padding as u16 + (i + 1) as u16, ..match_.clone() }; - padded_matches.push((*id, match_, *highlight)); + padded_matches.push((*id, match_)); biggest = biggest.max(i + 1); } } - padded_matches.push((*id, padmatch_, *highlight)); + padded_matches.push((*id, padmatch_)); found = true; continue 'padding; } @@ -337,7 +336,7 @@ where S: Store, word_index: match_.word_index + padding as u16 + (i + 1) as u16, ..match_.clone() }; - padded_matches.push((*id, match_, *highlight)); + padded_matches.push((*id, match_)); } biggest = biggest.max(replacement_len); @@ -350,11 +349,19 @@ where S: Store, } - let total_matches = padded_matches.len(); - padded_matches.par_sort_unstable(); - let padded_matches = SetBuf::new_unchecked(padded_matches); - let raw_documents = raw_documents_from_matches(padded_matches); + let matches = { + padded_matches.par_sort_unstable(); + SetBuf::new_unchecked(padded_matches) + }; + + let highlights = { + highlights.par_sort_unstable_by_key(|(id, _)| *id); + SetBuf::new_unchecked(highlights) + }; + + let total_matches = matches.len(); + let raw_documents = raw_documents_from(matches, highlights); info!("{} total documents to classify", raw_documents.len()); info!("{} total matches to classify", total_matches); diff --git a/meilidb-core/src/raw_document.rs b/meilidb-core/src/raw_document.rs index 7a293439e..5d449a74a 100644 --- a/meilidb-core/src/raw_document.rs +++ b/meilidb-core/src/raw_document.rs @@ -66,25 +66,34 @@ impl fmt::Debug for RawDocument { } } -pub fn raw_documents_from_matches(matches: SetBuf<(DocumentId, TmpMatch, Highlight)>) -> Vec { +pub fn raw_documents_from( + matches: SetBuf<(DocumentId, 
TmpMatch)>, + highlights: SetBuf<(DocumentId, Highlight)>, +) -> Vec +{ let mut docs_ranges: Vec<(_, Range, _)> = Vec::new(); let mut matches2 = Matches::with_capacity(matches.len()); - for group in matches.linear_group_by(|(a, _, _), (b, _, _)| a == b) { - let document_id = group[0].0; - let start = docs_ranges.last().map(|(_, r, _)| r.end).unwrap_or(0); - let end = start + group.len(); + let matches = matches.linear_group_by(|(a, _), (b, _)| a == b); + let highlights = highlights.linear_group_by(|(a, _), (b, _)| a == b); - let highlights = group.iter().map(|(_, _, h)| *h).collect(); + for (mgroup, hgroup) in matches.zip(highlights) { + debug_assert_eq!(mgroup[0].0, hgroup[0].0); + + let document_id = mgroup[0].0; + let start = docs_ranges.last().map(|(_, r, _)| r.end).unwrap_or(0); + let end = start + mgroup.len(); + + let highlights = hgroup.iter().map(|(_, h)| *h).collect(); docs_ranges.push((document_id, Range { start, end }, highlights)); - matches2.extend_from_slice(group); + matches2.extend_from_slice(mgroup); } let matches = Arc::new(matches2); - docs_ranges.into_iter().map(|(i, range, highlights)| { + docs_ranges.into_iter().map(|(id, range, highlights)| { let matches = SharedMatches { range, matches: matches.clone() }; - RawDocument::new(i, matches, highlights) + RawDocument::new(id, matches, highlights) }).collect() } @@ -120,8 +129,8 @@ impl Matches { } } - fn extend_from_slice(&mut self, matches: &[(DocumentId, TmpMatch, Highlight)]) { - for (_, match_, _) in matches { + fn extend_from_slice(&mut self, matches: &[(DocumentId, TmpMatch)]) { + for (_, match_) in matches { self.query_index.push(match_.query_index); self.distance.push(match_.distance); self.attribute.push(match_.attribute); From bf3c2c372554829e0ef314cd55eb09fb0378c07d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 2 Aug 2019 12:07:23 +0200 Subject: [PATCH 08/19] feat: Move the multi-word rewriting algorithm into its own function --- meilidb-core/Cargo.toml | 2 +- meilidb-core/src/criterion/sum_of_typos.rs | 2 +- meilidb-core/src/query_builder.rs | 222 ++++++++++-------- meilidb-core/src/query_enhancer.rs | 15 +- meilidb-core/src/raw_document.rs | 4 +- .../src/database/synonyms_addition.rs | 4 +- meilidb/examples/create-database.rs | 72 +++++- 7 files changed, 204 insertions(+), 117 deletions(-) diff --git a/meilidb-core/Cargo.toml b/meilidb-core/Cargo.toml index 037a7788c..25fb57119 100644 --- a/meilidb-core/Cargo.toml +++ b/meilidb-core/Cargo.toml @@ -14,7 +14,7 @@ meilidb-tokenizer = { path = "../meilidb-tokenizer", version = "0.1.0" } rayon = "1.0.3" sdset = "0.3.2" serde = { version = "1.0.88", features = ["derive"] } -slice-group-by = "0.2.4" +slice-group-by = "0.2.6" zerocopy = "0.2.2" [dependencies.fst] diff --git a/meilidb-core/src/criterion/sum_of_typos.rs b/meilidb-core/src/criterion/sum_of_typos.rs index d5cd75f08..6736e6caa 100644 --- a/meilidb-core/src/criterion/sum_of_typos.rs +++ b/meilidb-core/src/criterion/sum_of_typos.rs @@ -21,7 +21,7 @@ fn custom_log10(n: u8) -> f32 { #[inline] fn sum_matches_typos(query_index: &[u32], distance: &[u8]) -> usize { - let mut number_words = 0; + let mut number_words: usize = 0; let mut sum_typos = 0.0; let mut index = 0; diff --git a/meilidb-core/src/query_builder.rs b/meilidb-core/src/query_builder.rs index c5a0ac847..7c3183ff4 100644 --- a/meilidb-core/src/query_builder.rs +++ b/meilidb-core/src/query_builder.rs @@ -197,6 +197,110 @@ impl<'c, S, FI> QueryBuilder<'c, S, FI> } } +fn multiword_rewrite_matches( + mut matches: 
Vec<(DocumentId, TmpMatch)>, + query_enhancer: &QueryEnhancer, +) -> SetBuf<(DocumentId, TmpMatch)> +{ + let mut padded_matches = Vec::with_capacity(matches.len()); + + // we sort the matches by word index to make them rewritable + let start = Instant::now(); + matches.par_sort_unstable_by_key(|(id, match_)| (*id, match_.attribute, match_.word_index)); + info!("rewrite sort by word_index took {:.2?}", start.elapsed()); + + let start = Instant::now(); + // for each attribute of each document + for same_document_attribute in matches.linear_group_by_key(|(id, m)| (*id, m.attribute)) { + + // padding will only be applied + // to word indices in the same attribute + let mut padding = 0; + let mut iter = same_document_attribute.linear_group_by_key(|(_, m)| m.word_index); + + // for each match at the same position + // in this document attribute + while let Some(same_word_index) = iter.next() { + + // find the biggest padding + let mut biggest = 0; + for (id, match_) in same_word_index { + + let mut replacement = query_enhancer.replacement(match_.query_index); + let replacement_len = replacement.len(); + let nexts = iter.remainder().linear_group_by_key(|(_, m)| m.word_index); + + if let Some(query_index) = replacement.next() { + let word_index = match_.word_index + padding as u16; + let match_ = TmpMatch { query_index, word_index, ..match_.clone() }; + padded_matches.push((*id, match_)); + } + + let mut found = false; + + // look ahead and if there already is a match + // corresponding to this padding word, abort the padding + 'padding: for (x, next_group) in nexts.enumerate() { + + for (i, query_index) in replacement.clone().enumerate().skip(x) { + let word_index = match_.word_index + padding as u16 + (i + 1) as u16; + let padmatch = TmpMatch { query_index, word_index, ..match_.clone() }; + + for (_, nmatch_) in next_group { + let mut rep = query_enhancer.replacement(nmatch_.query_index); + let query_index = rep.next().unwrap(); + if query_index == padmatch.query_index { + + if !found { + // if we find a corresponding padding for the + // first time we must push preceding paddings + for (i, query_index) in replacement.clone().enumerate().take(i) { + let word_index = match_.word_index + padding as u16 + (i + 1) as u16; + let match_ = TmpMatch { query_index, word_index, ..match_.clone() }; + padded_matches.push((*id, match_)); + biggest = biggest.max(i + 1); + } + } + + padded_matches.push((*id, padmatch)); + found = true; + continue 'padding; + } + } + } + + // if we do not find a corresponding padding in the + // next groups so stop here and pad what was found + break + } + + if !found { + // if no padding was found in the following matches + // we must insert the entire padding + for (i, query_index) in replacement.enumerate() { + let word_index = match_.word_index + padding as u16 + (i + 1) as u16; + let match_ = TmpMatch { query_index, word_index, ..match_.clone() }; + padded_matches.push((*id, match_)); + } + + biggest = biggest.max(replacement_len - 1); + } + } + + padding += biggest; + } + } + info!("main multiword rewrite took {:.2?}", start.elapsed()); + + let start = Instant::now(); + for document_matches in padded_matches.linear_group_by_key_mut(|(id, _)| *id) { + document_matches.sort_unstable(); + } + info!("final rewrite sort took {:.2?}", start.elapsed()); + + SetBuf::new_unchecked(padded_matches) +} + impl<'c, S, FI> QueryBuilder<'c, S, FI> where S: Store, { @@ -217,22 +321,26 @@ where S: Store, let mut matches = Vec::new(); let mut highlights = Vec::new(); + let mut query_db = 
std::time::Duration::default(); + + let start = Instant::now(); while let Some((input, indexed_values)) = stream.next() { for iv in indexed_values { let Automaton { is_exact, query_len, ref dfa } = automatons[iv.index]; let distance = dfa.eval(input).to_u8(); let is_exact = is_exact && distance == 0 && input.len() == query_len; + let start = Instant::now(); let doc_indexes = self.store.word_indexes(input)?; let doc_indexes = match doc_indexes { Some(doc_indexes) => doc_indexes, None => continue, }; + query_db += start.elapsed(); for di in doc_indexes.as_slice() { let attribute = searchables.map_or(Some(di.attribute), |r| r.get(di.attribute)); if let Some(attribute) = attribute { - let match_ = TmpMatch { query_index: iv.index as u32, distance, @@ -253,118 +361,28 @@ where S: Store, } } } + info!("main query all took {:.2?} (get indexes {:.2?})", start.elapsed(), query_db); - // we sort the matches to make them rewritable - matches.par_sort_unstable_by_key(|(id, match_)| (*id, match_.attribute, match_.word_index)); + info!("{} total matches to rewrite", matches.len()); - let mut padded_matches = Vec::with_capacity(matches.len()); - for same_document in matches.linear_group_by(|a, b| a.0 == b.0) { - - for same_attribute in same_document.linear_group_by(|a, b| a.1.attribute == b.1.attribute) { - - let mut padding = 0; - let mut iter = same_attribute.linear_group_by(|a, b| a.1.word_index == b.1.word_index); - while let Some(same_word_index) = iter.next() { - - let mut biggest = 0; - for (id, match_) in same_word_index { - - let mut replacement = query_enhancer.replacement(match_.query_index); - let replacement_len = replacement.len() - 1; - let nexts = iter.remainder().linear_group_by(|a, b| a.1.word_index == b.1.word_index); - - if let Some(query_index) = replacement.next() { - let match_ = TmpMatch { - query_index, - word_index: match_.word_index + padding as u16, - ..match_.clone() - }; - padded_matches.push((*id, match_)); - } - - let mut found = false; - - // look ahead and if there already is a match - // corresponding to this padding word, abort the padding - 'padding: for (x, next_group) in nexts.enumerate() { - - for (i, query_index) in replacement.clone().enumerate().skip(x) { - let padmatch_ = TmpMatch { - query_index, - word_index: match_.word_index + padding as u16 + (i + 1) as u16, - ..match_.clone() - }; - - for (_, nmatch_) in next_group { - let mut rep = query_enhancer.replacement(nmatch_.query_index); - let query_index = rep.next().unwrap(); - let nmatch_ = TmpMatch { query_index, ..nmatch_.clone() }; - if nmatch_.query_index == padmatch_.query_index { - - if !found { - // if we find a corresponding padding for the - // first time we must push preceding paddings - for (i, query_index) in replacement.clone().enumerate().take(i) { - let match_ = TmpMatch { - query_index, - word_index: match_.word_index + padding as u16 + (i + 1) as u16, - ..match_.clone() - }; - padded_matches.push((*id, match_)); - biggest = biggest.max(i + 1); - } - } - - padded_matches.push((*id, padmatch_)); - found = true; - continue 'padding; - } - } - } - - // if we do not find a corresponding padding in the - // next groups so stop here and pad what was found - break - } - - if !found { - // if no padding was found in the following matches - // we must insert the entire padding - for (i, query_index) in replacement.enumerate() { - let match_ = TmpMatch { - query_index, - word_index: match_.word_index + padding as u16 + (i + 1) as u16, - ..match_.clone() - }; - padded_matches.push((*id, match_)); - } - - 
biggest = biggest.max(replacement_len); - } - } - - padding += biggest; - } - } - - } - - - let matches = { - padded_matches.par_sort_unstable(); - SetBuf::new_unchecked(padded_matches) - }; + let start = Instant::now(); + let matches = multiword_rewrite_matches(matches, &query_enhancer); + info!("multiword rewrite took {:.2?}", start.elapsed()); + let start = Instant::now(); let highlights = { highlights.par_sort_unstable_by_key(|(id, _)| *id); SetBuf::new_unchecked(highlights) }; + info!("sorting highlights took {:.2?}", start.elapsed()); - let total_matches = matches.len(); + info!("{} total matches to classify", matches.len()); + + let start = Instant::now(); let raw_documents = raw_documents_from(matches, highlights); + info!("making raw documents took {:.2?}", start.elapsed()); info!("{} total documents to classify", raw_documents.len()); - info!("{} total matches to classify", total_matches); Ok(raw_documents) } diff --git a/meilidb-core/src/query_enhancer.rs b/meilidb-core/src/query_enhancer.rs index 6280ae11e..165c1b094 100644 --- a/meilidb-core/src/query_enhancer.rs +++ b/meilidb-core/src/query_enhancer.rs @@ -52,17 +52,20 @@ where S: AsRef, !original.map(AsRef::as_ref).eq(words.iter().map(AsRef::as_ref)) } +type Origin = usize; +type RealLength = usize; + struct FakeIntervalTree { - intervals: Vec<(Range, (usize, usize))>, // origin, real_length + intervals: Vec<(Range, (Origin, RealLength))>, } impl FakeIntervalTree { - fn new(mut intervals: Vec<(Range, (usize, usize))>) -> FakeIntervalTree { + fn new(mut intervals: Vec<(Range, (Origin, RealLength))>) -> FakeIntervalTree { intervals.sort_unstable_by_key(|(r, _)| (r.start, r.end)); FakeIntervalTree { intervals } } - fn query(&self, point: usize) -> Option<(Range, (usize, usize))> { + fn query(&self, point: usize) -> Option<(Range, (Origin, RealLength))> { let element = self.intervals.binary_search_by(|(r, _)| { if point >= r.start { if point < r.end { Equal } else { Less } @@ -81,7 +84,7 @@ impl FakeIntervalTree { pub struct QueryEnhancerBuilder<'a, S> { query: &'a [S], origins: Vec, - real_to_origin: Vec<(Range, (usize, usize))>, + real_to_origin: Vec<(Range, (Origin, RealLength))>, } impl> QueryEnhancerBuilder<'_, S> { @@ -147,8 +150,8 @@ impl QueryEnhancer { // query the fake interval tree with the real query index let (range, (origin, real_length)) = self.real_to_origin - .query(real) - .expect("real has never been declared"); + .query(real) + .expect("real has never been declared"); // if `real` is the end bound of the range if (range.start + real_length - 1) == real { diff --git a/meilidb-core/src/raw_document.rs b/meilidb-core/src/raw_document.rs index 5d449a74a..3567c3fd1 100644 --- a/meilidb-core/src/raw_document.rs +++ b/meilidb-core/src/raw_document.rs @@ -74,8 +74,8 @@ pub fn raw_documents_from( let mut docs_ranges: Vec<(_, Range, _)> = Vec::new(); let mut matches2 = Matches::with_capacity(matches.len()); - let matches = matches.linear_group_by(|(a, _), (b, _)| a == b); - let highlights = highlights.linear_group_by(|(a, _), (b, _)| a == b); + let matches = matches.linear_group_by_key(|(id, _)| *id); + let highlights = highlights.linear_group_by_key(|(id, _)| *id); for (mgroup, hgroup) in matches.zip(highlights) { debug_assert_eq!(mgroup[0].0, hgroup[0].0); diff --git a/meilidb-data/src/database/synonyms_addition.rs b/meilidb-data/src/database/synonyms_addition.rs index 6e16ab97b..c37f0475a 100644 --- a/meilidb-data/src/database/synonyms_addition.rs +++ b/meilidb-data/src/database/synonyms_addition.rs @@ -21,10 +21,10 
@@ impl<'a> SynonymsAddition<'a> {
     pub fn add_synonym<S, T, I>(&mut self, synonym: S, alternatives: I)
     where S: AsRef<str>,
           T: AsRef<str>,
-          I: Iterator<Item = T>,
+          I: IntoIterator<Item = T>,
     {
         let synonym = normalize_str(synonym.as_ref());
-        let alternatives = alternatives.map(|s| s.as_ref().to_lowercase());
+        let alternatives = alternatives.into_iter().map(|s| s.as_ref().to_lowercase());
         self.synonyms.entry(synonym).or_insert_with(Vec::new).extend(alternatives);
     }

diff --git a/meilidb/examples/create-database.rs b/meilidb/examples/create-database.rs
index ed07e3742..d8e553ed3 100644
--- a/meilidb/examples/create-database.rs
+++ b/meilidb/examples/create-database.rs
@@ -31,9 +31,13 @@ pub struct Opt {
     #[structopt(long = "schema", parse(from_os_str))]
     pub schema_path: PathBuf,

+    /// The file with the synonyms.
+    #[structopt(long = "synonyms", parse(from_os_str))]
+    pub synonyms: Option<PathBuf>,
+
     /// The path to the list of stop words (one by line).
     #[structopt(long = "stop-words", parse(from_os_str))]
-    pub stop_words_path: Option<PathBuf>,
+    pub stop_words: Option<PathBuf>,

     #[structopt(long = "update-group-size")]
     pub update_group_size: Option<usize>,
@@ -45,12 +49,40 @@ struct Document<'a> (
     HashMap<Cow<'a, str>, Cow<'a, str>>
 );

+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(untagged)]
+pub enum Synonym {
+    OneWay(SynonymOneWay),
+    MultiWay { synonyms: Vec<String> },
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(rename_all = "camelCase")]
+pub struct SynonymOneWay {
+    pub search_terms: String,
+    pub synonyms: Synonyms,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(untagged)]
+pub enum Synonyms {
+    Multiple(Vec<String>),
+    Single(String),
+}
+
+fn read_synomys(path: &Path) -> Result<Vec<Synonym>, Box<dyn Error>> {
+    let file = File::open(path)?;
+    let synonyms = serde_json::from_reader(file)?;
+    Ok(synonyms)
+}
+
 fn index(
     schema: Schema,
     database_path: &Path,
     csv_data_path: &Path,
     update_group_size: Option<usize>,
     stop_words: &HashSet<String>,
+    synonyms: Vec<Synonym>,
 ) -> Result<Database, Box<dyn Error>>
 {
     let database = Database::start_default(database_path)?;
@@ -62,6 +94,28 @@ fn index(
     let index = database.create_index("test", schema.clone())?;

+    let mut synonyms_adder = index.synonyms_addition();
+    for synonym in synonyms {
+        match synonym {
+            Synonym::OneWay(SynonymOneWay { search_terms, synonyms }) => {
+                let alternatives = match synonyms {
+                    Synonyms::Multiple(alternatives) => alternatives,
+                    Synonyms::Single(alternative) => vec![alternative],
+                };
+                synonyms_adder.add_synonym(search_terms, alternatives);
+            },
+            Synonym::MultiWay { mut synonyms } => {
+                for _ in 0..synonyms.len() {
+                    if let Some((synonym, alternatives)) = synonyms.split_first() {
+                        synonyms_adder.add_synonym(synonym, alternatives);
+                    }
+                    synonyms.rotate_left(1);
+                }
+            },
+        }
+    }
+    synonyms_adder.finalize()?;
+
     let mut rdr = csv::Reader::from_path(csv_data_path)?;
     let mut raw_record = csv::StringRecord::new();
     let headers = rdr.headers()?.clone();
@@ -133,13 +187,25 @@ fn main() -> Result<(), Box<dyn Error>> {
         Schema::from_toml(file)?
     };

-    let stop_words = match opt.stop_words_path {
+    let stop_words = match opt.stop_words {
         Some(ref path) => retrieve_stop_words(path)?,
         None => HashSet::new(),
     };

+    let synonyms = match opt.synonyms {
+        Some(ref path) => read_synomys(path)?,
+        None => Vec::new(),
+    };
+
     let start = Instant::now();
-    let result = index(schema, &opt.database_path, &opt.csv_data_path, opt.update_group_size, &stop_words);
+    let result = index(
+        schema,
+        &opt.database_path,
+        &opt.csv_data_path,
+        opt.update_group_size,
+        &stop_words,
+        synonyms,
+    );

     if let Err(e) = result {
         return Err(e.into())
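[Editor's note: the patch above deserializes the synonyms file through untagged serde enums. A minimal sketch of the file shape those types accept, mirroring the definitions from the patch with Deserialize only; the JSON data itself is invented for illustration.]

use serde::Deserialize; // assumes serde with the "derive" feature and serde_json

#[derive(Debug, Deserialize)]
#[serde(untagged)]
enum Synonym {
    OneWay(SynonymOneWay),
    MultiWay { synonyms: Vec<String> },
}

#[derive(Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
struct SynonymOneWay {
    search_terms: String,
    synonyms: Synonyms,
}

#[derive(Debug, Deserialize)]
#[serde(untagged)]
enum Synonyms {
    Multiple(Vec<String>),
    Single(String),
}

fn main() -> Result<(), serde_json::Error> {
    // one-way entries carry "searchTerms" (camelCase rename),
    // multi-way entries carry only a "synonyms" array
    let json = r#"[
        { "searchTerms": "NY", "synonyms": ["new york", "new york city"] },
        { "searchTerms": "subway", "synonyms": "underground train" },
        { "synonyms": ["street", "avenue", "boulevard"] }
    ]"#;

    let synonyms: Vec<Synonym> = serde_json::from_str(json)?;
    println!("parsed {} synonym rules", synonyms.len());
    Ok(())
}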
From a488c00a2e4daf0f9d4947b4e06fb66b92b7d733 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Fri, 26 Jul 2019 13:27:38 +0200
Subject: [PATCH 09/19] feat: Use RustyLine in the query-database example

---
 meilidb/Cargo.toml                 |   1 +
 meilidb/examples/query-database.rs | 114 +++++++++++++++--------------
 2 files changed, 60 insertions(+), 55 deletions(-)

diff --git a/meilidb/Cargo.toml b/meilidb/Cargo.toml
index 8ba89f212..7208067f0 100644
--- a/meilidb/Cargo.toml
+++ b/meilidb/Cargo.toml
@@ -18,6 +18,7 @@ meilidb-core = { path = "../meilidb-core", version = "0.1.0" }
 quickcheck = "0.8.2"
 rand = "0.6.5"
 rand_xorshift = "0.1.1"
+rustyline = { version = "5.0.0", default-features = false }
 serde = { version = "1.0.91" , features = ["derive"] }
 serde_json = "1.0.39"
 structopt = "0.2.15"

diff --git a/meilidb/examples/query-database.rs b/meilidb/examples/query-database.rs
index 72244d1b8..f9e2f8389 100644
--- a/meilidb/examples/query-database.rs
+++ b/meilidb/examples/query-database.rs
@@ -3,16 +3,17 @@ static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;

 use std::collections::btree_map::{BTreeMap, Entry};
 use std::collections::{HashMap, HashSet};
-use std::iter::FromIterator;
-use std::io::{self, Write};
-use std::time::{Instant, Duration};
-use std::path::PathBuf;
 use std::error::Error;
+use std::io::{self, Write};
+use std::iter::FromIterator;
+use std::path::PathBuf;
+use std::time::{Instant, Duration};

 use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor};
 use structopt::StructOpt;
-use meilidb_core::Highlight;
+use rustyline::{Editor, Config};

+use meilidb_core::Highlight;
 use meilidb_data::Database;
 use meilidb_schema::SchemaAttr;

@@ -140,9 +141,6 @@ fn main() -> Result<(), Box<dyn Error>> {
     let start = Instant::now();
     let database = Database::start_default(&opt.database_path)?;

-    let mut buffer = String::new();
-    let input = io::stdin();
-
     let index = database.open_index("test")?.unwrap();
     let schema = index.schema();

@@ -151,65 +149,71 @@ fn main() -> Result<(), Box<dyn Error>> {
     let fields = opt.displayed_fields.iter().map(String::as_str);
     let fields = HashSet::from_iter(fields);

-    loop {
-        print!("Searching for: ");
-        io::stdout().flush()?;
+    let config = Config::builder().auto_add_history(true).build();
+    let mut readline = Editor::<()>::with_config(config);
+    let _ = readline.load_history("query-history.txt");

-        if input.read_line(&mut buffer)?
== 0 { break } - let query = buffer.trim_end_matches('\n'); + for result in readline.iter("Searching for: ") { + match result { + Ok(query) => { + let start_total = Instant::now(); - let start_total = Instant::now(); + let builder = index.query_builder(); + let documents = builder.query(&query, 0..opt.number_results)?; - let builder = index.query_builder(); - let documents = builder.query(query, 0..opt.number_results)?; + let mut retrieve_duration = Duration::default(); - let mut retrieve_duration = Duration::default(); + let number_of_documents = documents.len(); + for mut doc in documents { - let number_of_documents = documents.len(); - for mut doc in documents { + doc.highlights.sort_unstable_by_key(|m| (m.char_index, m.char_length)); - doc.highlights.sort_unstable_by_key(|m| (m.char_index, m.char_length)); + let start_retrieve = Instant::now(); + let result = index.document::(Some(&fields), doc.id); + retrieve_duration += start_retrieve.elapsed(); - let start_retrieve = Instant::now(); - let result = index.document::(Some(&fields), doc.id); - retrieve_duration += start_retrieve.elapsed(); + match result { + Ok(Some(document)) => { + for (name, text) in document { + print!("{}: ", name); - match result { - Ok(Some(document)) => { - for (name, text) in document { - print!("{}: ", name); - - let attr = schema.attribute(&name).unwrap(); - let highlights = doc.highlights.iter() - .filter(|m| SchemaAttr::new(m.attribute) == attr) - .cloned(); - let (text, highlights) = crop_text(&text, highlights, opt.char_context); - let areas = create_highlight_areas(&text, &highlights); - display_highlights(&text, &areas)?; - println!(); + let attr = schema.attribute(&name).unwrap(); + let highlights = doc.highlights.iter() + .filter(|m| SchemaAttr::new(m.attribute) == attr) + .cloned(); + let (text, highlights) = crop_text(&text, highlights, opt.char_context); + let areas = create_highlight_areas(&text, &highlights); + display_highlights(&text, &areas)?; + println!(); + } + }, + Ok(None) => eprintln!("missing document"), + Err(e) => eprintln!("{}", e), } - }, - Ok(None) => eprintln!("missing document"), - Err(e) => eprintln!("{}", e), + + let mut matching_attributes = HashSet::new(); + for highlight in doc.highlights { + let attr = SchemaAttr::new(highlight.attribute); + let name = schema.attribute_name(attr); + matching_attributes.insert(name); + } + + let matching_attributes = Vec::from_iter(matching_attributes); + println!("matching in: {:?}", matching_attributes); + + println!(); + } + + eprintln!("document field retrieve took {:.2?}", retrieve_duration); + eprintln!("===== Found {} results in {:.2?} =====", number_of_documents, start_total.elapsed()); + }, + Err(err) => { + println!("Error: {:?}", err); + break } - - let mut matching_attributes = HashSet::new(); - for highlight in doc.highlights { - let attr = SchemaAttr::new(highlight.attribute); - let name = schema.attribute_name(attr); - matching_attributes.insert(name); - } - - let matching_attributes = Vec::from_iter(matching_attributes); - println!("matching in: {:?}", matching_attributes); - - println!(); } - - eprintln!("document field retrieve took {:.2?}", retrieve_duration); - eprintln!("===== Found {} results in {:.2?} =====", number_of_documents, start_total.elapsed()); - buffer.clear(); } + readline.save_history("query-history.txt").unwrap(); Ok(()) } From ebc95cb8f279ac286c40fc4821b8ce1900bbef5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 30 Jul 2019 15:15:47 +0200 Subject: [PATCH 10/19] feat: Display 
the documents fields in the order they were declared --- meilidb/Cargo.toml | 1 + meilidb/examples/query-database.rs | 9 +++++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/meilidb/Cargo.toml b/meilidb/Cargo.toml index 7208067f0..0eecba0a1 100644 --- a/meilidb/Cargo.toml +++ b/meilidb/Cargo.toml @@ -14,6 +14,7 @@ csv = "1.0.7" diskus = "0.5.0" env_logger = "0.6.1" jemallocator = "0.1.9" +linked-hash-map = "0.5.2" meilidb-core = { path = "../meilidb-core", version = "0.1.0" } quickcheck = "0.8.2" rand = "0.6.5" diff --git a/meilidb/examples/query-database.rs b/meilidb/examples/query-database.rs index f9e2f8389..e6368727a 100644 --- a/meilidb/examples/query-database.rs +++ b/meilidb/examples/query-database.rs @@ -2,16 +2,17 @@ static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; use std::collections::btree_map::{BTreeMap, Entry}; -use std::collections::{HashMap, HashSet}; +use std::collections::HashSet; use std::error::Error; use std::io::{self, Write}; use std::iter::FromIterator; use std::path::PathBuf; use std::time::{Instant, Duration}; -use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor}; -use structopt::StructOpt; +use linked_hash_map::LinkedHashMap; use rustyline::{Editor, Config}; +use structopt::StructOpt; +use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor}; use meilidb_core::Highlight; use meilidb_data::Database; @@ -35,7 +36,7 @@ pub struct Opt { pub char_context: usize, } -type Document = HashMap; +type Document = LinkedHashMap; fn display_highlights(text: &str, ranges: &[usize]) -> io::Result<()> { let mut stdout = StandardStream::stdout(ColorChoice::Always); From 81d44a0854f850cce15623e5afe7498ac86012c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 16 Aug 2019 12:17:23 +0200 Subject: [PATCH 11/19] feat: Order automatons by importance --- meilidb-core/src/query_builder.rs | 35 +++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/meilidb-core/src/query_builder.rs b/meilidb-core/src/query_builder.rs index 7c3183ff4..d88e293b7 100644 --- a/meilidb-core/src/query_builder.rs +++ b/meilidb-core/src/query_builder.rs @@ -2,7 +2,7 @@ use std::hash::Hash; use std::ops::Range; use std::rc::Rc; use std::time::Instant; -use std::{cmp, mem}; +use std::{mem, cmp, cmp::Reverse}; use fst::{Streamer, IntoStreamer}; use hashbrown::HashMap; @@ -24,30 +24,38 @@ use crate::{TmpMatch, Highlight, DocumentId, Store, RawDocument, Document}; const NGRAMS: usize = 3; struct Automaton { + index: usize, + ngram: usize, query_len: usize, is_exact: bool, dfa: DFA, } impl Automaton { - fn exact(query: &str) -> Automaton { + fn exact(index: usize, ngram: usize, query: &str) -> Automaton { Automaton { + index, + ngram, query_len: query.len(), is_exact: true, dfa: build_dfa(query), } } - fn prefix_exact(query: &str) -> Automaton { + fn prefix_exact(index: usize, ngram: usize, query: &str) -> Automaton { Automaton { + index, + ngram, query_len: query.len(), is_exact: true, dfa: build_prefix_dfa(query), } } - fn non_exact(query: &str) -> Automaton { + fn non_exact(index: usize, ngram: usize, query: &str) -> Automaton { Automaton { + index, + ngram, query_len: query.len(), is_exact: false, dfa: build_dfa(query), @@ -82,9 +90,9 @@ fn generate_automatons(query: &str, store: &S) -> Result<(Vec(query: &str, store: &S) -> Result<(Vec(query: &str, store: &S) -> Result<(Vec Date: Fri, 16 Aug 2019 12:25:35 +0200 Subject: [PATCH 12/19] feat: Process automatons in the order they 
were sorted
---
 meilidb-core/src/query_builder.rs | 23 ++++++++---------------
 1 file changed, 8 insertions(+), 15 deletions(-)

diff --git a/meilidb-core/src/query_builder.rs b/meilidb-core/src/query_builder.rs
index d88e293b7..636248c36 100644
--- a/meilidb-core/src/query_builder.rs
+++ b/meilidb-core/src/query_builder.rs
@@ -319,27 +319,20 @@ where S: Store,
 {
     fn query_all(&self, query: &str) -> Result<Vec<RawDocument>, S::Error> {
         let (automatons, query_enhancer) = generate_automatons(query, &self.store)?;
-        let words = self.store.words()?.as_fst();
+        let words = self.store.words()?;
         let searchables = self.searchable_attrs.as_ref();

-        let mut stream = {
-            let mut op_builder = fst::raw::OpBuilder::new();
-            for Automaton { dfa, .. } in &automatons {
-                let stream = words.search(dfa);
-                op_builder.push(stream);
-            }
-            op_builder.r#union()
-        };
-
         let mut matches = Vec::new();
         let mut highlights = Vec::new();

         let mut query_db = std::time::Duration::default();
-        let start = Instant::now();
-        while let Some((input, indexed_values)) = stream.next() {
-            for iv in indexed_values {
-                let Automaton { index, is_exact, query_len, ref dfa, .. } = automatons[iv.index];
+
+        for automaton in automatons {
+            let Automaton { index, is_exact, query_len, dfa, .. } = automaton;
+            let mut stream = words.search(&dfa).into_stream();
+
+            while let Some(input) = stream.next() {
                 let distance = dfa.eval(input).to_u8();
                 let is_exact = is_exact && distance == 0 && input.len() == query_len;
@@ -374,8 +367,8 @@ where S: Store,
             }
         }
     }
-    info!("main query all took {:.2?} (get indexes {:.2?})", start.elapsed(), query_db);
+    info!("main query all took {:.2?} (get indexes {:.2?})", start.elapsed(), query_db);

     info!("{} total matches to rewrite", matches.len());

     let start = Instant::now();
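[Editor's note: the patch above drops the single fst union stream in favor of one DFA search per automaton. A standalone sketch of that per-automaton streaming pattern, assuming fst 0.3 and levenshtein_automata 0.1 with its "fst_automaton" feature enabled; the dictionary and queries here are invented.]

use fst::{IntoStreamer, Set, Streamer};
use levenshtein_automata::LevenshteinAutomatonBuilder;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // fst::Set requires its input to be lexicographically sorted
    let words = Set::from_iter(vec!["new", "subway", "york"])?;

    // max distance 1, with transposition support
    let lev_builder = LevenshteinAutomatonBuilder::new(1, true);

    // one DFA per query word, streamed one after the other,
    // instead of a single union over every automaton at once
    for query in &["subwy", "yrok"] {
        let dfa = lev_builder.build_dfa(query);
        let mut stream = words.search(&dfa).into_stream();
        while let Some(input) = stream.next() {
            // re-evaluate the matched key to recover its edit distance
            let distance = dfa.eval(input).to_u8();
            println!("{:?} matches {:?} at distance {}",
                     query, std::str::from_utf8(input)?, distance);
        }
    }
    Ok(())
}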
From d9c9fafd78ed4a6754a6386975e8ebae84177777 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Fri, 16 Aug 2019 15:01:25 +0200
Subject: [PATCH 13/19] feat: Fetch doc indexes while there is time

---
 meilidb-core/src/query_builder.rs | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/meilidb-core/src/query_builder.rs b/meilidb-core/src/query_builder.rs
index 636248c36..76e47e1ab 100644
--- a/meilidb-core/src/query_builder.rs
+++ b/meilidb-core/src/query_builder.rs
@@ -1,7 +1,7 @@
 use std::hash::Hash;
 use std::ops::Range;
 use std::rc::Rc;
-use std::time::Instant;
+use std::time::{Instant, Duration};
 use std::{mem, cmp, cmp::Reverse};

 use fst::{Streamer, IntoStreamer};
@@ -325,10 +325,11 @@ where S: Store,
         let mut matches = Vec::new();
         let mut highlights = Vec::new();

-        let mut query_db = std::time::Duration::default();
+        let fetching_end_time = Instant::now() + Duration::from_millis(30);
+        let mut query_db = Duration::default();

         let start = Instant::now();
-        for automaton in automatons {
+        'automatons: for automaton in automatons {
             let Automaton { index, is_exact, query_len, dfa, .. } = automaton;
             let mut stream = words.search(&dfa).into_stream();
@@ -345,6 +346,11 @@ where S: Store,
             query_db += start.elapsed();

             for di in doc_indexes.as_slice() {
+
+                if Instant::now() > fetching_end_time {
+                    break 'automatons
+                }
+
                 let attribute = searchables.map_or(Some(di.attribute), |r| r.get(di.attribute));
                 if let Some(attribute) = attribute {
                     let match_ = TmpMatch {

From b7b60b5fe5775c689985cb88382d0905fb1f1a44 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Fri, 16 Aug 2019 16:35:19 +0200
Subject: [PATCH 14/19] feat: Introduce a new thread to avoid waiting on doc indexes fetches

---
 meilidb-core/Cargo.toml             |   1 +
 meilidb-core/src/lib.rs             |   2 +
 meilidb-core/src/query_builder.rs   | 140 ++++++++++++++++++----------
 meilidb-core/src/reordered_attrs.rs |   2 +-
 4 files changed, 93 insertions(+), 52 deletions(-)

diff --git a/meilidb-core/Cargo.toml b/meilidb-core/Cargo.toml
index 25fb57119..29d2e61ef 100644
--- a/meilidb-core/Cargo.toml
+++ b/meilidb-core/Cargo.toml
@@ -6,6 +6,7 @@ edition = "2018"

 [dependencies]
 byteorder = "1.3.1"
+crossbeam-channel = "0.3.9"
 deunicode = "1.0.0"
 hashbrown = "0.2.2"
 lazy_static = "1.2.0"

diff --git a/meilidb-core/src/lib.rs b/meilidb-core/src/lib.rs
index 6f6e46359..0a7844292 100644
--- a/meilidb-core/src/lib.rs
+++ b/meilidb-core/src/lib.rs
@@ -1,3 +1,5 @@
+#![feature(checked_duration_since)]
+
 #[cfg(test)]
 #[macro_use]
 extern crate assert_matches;

diff --git a/meilidb-core/src/query_builder.rs b/meilidb-core/src/query_builder.rs
index 76e47e1ab..97a750d18 100644
--- a/meilidb-core/src/query_builder.rs
+++ b/meilidb-core/src/query_builder.rs
@@ -2,14 +2,15 @@ use std::hash::Hash;
 use std::ops::Range;
 use std::rc::Rc;
 use std::time::{Instant, Duration};
-use std::{mem, cmp, cmp::Reverse};
+use std::{iter, mem, cmp, cmp::Reverse};

 use fst::{Streamer, IntoStreamer};
 use hashbrown::HashMap;
 use levenshtein_automata::DFA;
-use log::info;
+use log::{info, error};
 use meilidb_tokenizer::{is_cjk, split_query_string};
 use rayon::slice::ParallelSliceMut;
+use rayon::iter::{ParallelIterator, ParallelBridge};
 use sdset::SetBuf;
 use slice_group_by::{GroupBy, GroupByMut};
@@ -315,66 +316,101 @@ fn multiword_rewrite_matches(
 }

 impl<'c, S, FI> QueryBuilder<'c, S, FI>
-where S: Store,
+where S: 'static + Store + Send + Clone,
+      S::Error: Send,
 {
     fn query_all(&self, query: &str) -> Result<Vec<RawDocument>, S::Error> {
         let (automatons, query_enhancer) = generate_automatons(query, &self.store)?;
-        let words = self.store.words()?;
-        let searchables = self.searchable_attrs.as_ref();
+        let searchables = self.searchable_attrs.clone();
+        let store = self.store.clone();

         let mut matches = Vec::new();
         let mut highlights = Vec::new();

-        let fetching_end_time = Instant::now() + Duration::from_millis(30);
-        let mut query_db = Duration::default();
+        let recv_end_time = Instant::now() + Duration::from_millis(30);

         let start = Instant::now();
-        'automatons: for automaton in automatons {
-            let Automaton { index, is_exact, query_len, dfa, ..
-            let mut stream = words.search(&dfa).into_stream();
+        let (sender, receiver) = crossbeam_channel::bounded(10);
 
-            while let Some(input) = stream.next() {
-                let distance = dfa.eval(input).to_u8();
-                let is_exact = is_exact && distance == 0 && input.len() == query_len;
-
-                let start = Instant::now();
-                let doc_indexes = self.store.word_indexes(input)?;
-                let doc_indexes = match doc_indexes {
-                    Some(doc_indexes) => doc_indexes,
-                    None => continue,
-                };
-                query_db += start.elapsed();
-
-                for di in doc_indexes.as_slice() {
-
-                    if Instant::now() > fetching_end_time {
-                        break 'automatons
-                    }
-
-                    let attribute = searchables.map_or(Some(di.attribute), |r| r.get(di.attribute));
-                    if let Some(attribute) = attribute {
-                        let match_ = TmpMatch {
-                            query_index: index as u32,
-                            distance,
-                            attribute,
-                            word_index: di.word_index,
-                            is_exact,
-                        };
-
-                        let highlight = Highlight {
-                            attribute: di.attribute,
-                            char_index: di.char_index,
-                            char_length: di.char_length,
-                        };
-
-                        matches.push((di.document_id, match_));
-                        highlights.push((di.document_id, highlight));
-                    }
-                }
-            }
-        }
+        rayon::spawn(move || {
+            enum Error<E> {
+                SendError,
+                StoreError(E),
+            }
+
+            let result = automatons
+                .into_iter()
+                .par_bridge()
+                .try_for_each_with((sender, store, searchables.as_ref()), |data, automaton| {
+                    let (sender, store, searchables) = data;
+                    let Automaton { index, is_exact, query_len, dfa, .. } = automaton;
+
+                    let words = store.words().map_err(Error::StoreError)?;
+                    let mut stream = words.search(&dfa).into_stream();
+
+                    let mut matches = Vec::new();
+                    let mut highlights = Vec::new();
+
+                    while let Some(input) = stream.next() {
+                        let distance = dfa.eval(input).to_u8();
+                        let is_exact = is_exact && distance == 0 && input.len() == query_len;
+
+                        let doc_indexes = store.word_indexes(input).map_err(Error::StoreError)?;
+                        let doc_indexes = match doc_indexes {
+                            Some(doc_indexes) => doc_indexes,
+                            None => continue,
+                        };
+
+                        matches.reserve(doc_indexes.len());
+                        highlights.reserve(doc_indexes.len());
+
+                        for di in doc_indexes.as_slice() {
+
+                            let attribute = searchables.map_or(Some(di.attribute), |r| r.get(di.attribute));
+                            if let Some(attribute) = attribute {
+                                let match_ = TmpMatch {
+                                    query_index: index as u32,
+                                    distance,
+                                    attribute,
+                                    word_index: di.word_index,
+                                    is_exact,
+                                };
+
+                                let highlight = Highlight {
+                                    attribute: di.attribute,
+                                    char_index: di.char_index,
+                                    char_length: di.char_length,
+                                };
+
+                                matches.push((di.document_id, match_));
+                                highlights.push((di.document_id, highlight));
+                            }
+                        }
+                    }
+
+                    sender.send((matches, highlights)).map_err(|_| Error::SendError)
+                });
+
+            if let Err(Error::StoreError(e)) = result {
+                error!("{}", e);
+            }
+        });
+
+        let iter = receiver.recv().into_iter().chain(iter::from_fn(|| {
+            match recv_end_time.checked_duration_since(Instant::now()) {
+                Some(timeout) => receiver.recv_timeout(timeout).ok(),
+                None => None,
+            }
+        }));
+
+        for (mut rcv_matches, mut rcv_highlights) in iter {
+            matches.append(&mut rcv_matches);
+            highlights.append(&mut rcv_highlights);
+        }
 
-        info!("main query all took {:.2?} (get indexes {:.2?})", start.elapsed(), query_db);
+        drop(receiver);
+
+        info!("main query all took {:.2?}", start.elapsed());
         info!("{} total matches to rewrite", matches.len());
 
         let start = Instant::now();
@@ -401,7 +437,8 @@ where S: Store,
 }
 
 impl<'c, S, FI> QueryBuilder<'c, S, FI>
-where S: Store,
+where S: 'static + Store + Send + Clone,
+      S::Error: Send,
       FI: Fn(DocumentId) -> bool,
 {
     pub fn query(self, query: &str, range: Range<usize>) -> Result<Vec<Document>, S::Error> {
@@ -478,7 +515,8 @@ impl<'c, I, FI, FD> DistinctQueryBuilder<'c, I, FI, FD>
 }
 
 impl<'c, S, FI, FD, K> DistinctQueryBuilder<'c, S, FI, FD>
-where S: Store,
+where S: 'static + Store + Send + Clone,
+      S::Error: Send,
       FI: Fn(DocumentId) -> bool,
       FD: Fn(DocumentId) -> Option<K>,
       K: Hash + Eq,

diff --git a/meilidb-core/src/reordered_attrs.rs b/meilidb-core/src/reordered_attrs.rs
index ad7b2c324..ed11045ab 100644
--- a/meilidb-core/src/reordered_attrs.rs
+++ b/meilidb-core/src/reordered_attrs.rs
@@ -1,4 +1,4 @@
-#[derive(Default)]
+#[derive(Default, Clone)]
 pub struct ReorderedAttrs {
     count: usize,
     reorders: Vec<Option<u16>>,

From 0ee56314fb1fc82d99341d85bbfcb2c9a1e80502 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Mon, 19 Aug 2019 11:10:54 +0200
Subject: [PATCH 15/19] feat: Try to simplify Store trait bound with a rayon
 scope

---
 meilidb-core/src/query_builder.rs | 168 +++++++++++++++---------------
 1 file changed, 85 insertions(+), 83 deletions(-)

diff --git a/meilidb-core/src/query_builder.rs b/meilidb-core/src/query_builder.rs
index 97a750d18..b436f8604 100644
--- a/meilidb-core/src/query_builder.rs
+++ b/meilidb-core/src/query_builder.rs
@@ -316,128 +316,130 @@ fn multiword_rewrite_matches(
 }
 
 impl<'c, S, FI> QueryBuilder<'c, S, FI>
-where S: 'static + Store + Send + Clone,
+where S: Store + Sync,
       S::Error: Send,
 {
     fn query_all(&self, query: &str) -> Result<Vec<RawDocument>, S::Error> {
         let (automatons, query_enhancer) = generate_automatons(query, &self.store)?;
-        let searchables = self.searchable_attrs.clone();
-        let store = self.store.clone();
+        let searchables = self.searchable_attrs.as_ref();
+        let store = &self.store;
 
-        let mut matches = Vec::new();
-        let mut highlights = Vec::new();
-
-        let recv_end_time = Instant::now() + Duration::from_millis(30);
-        let start = Instant::now();
-
-        let (sender, receiver) = crossbeam_channel::bounded(10);
-
-        rayon::spawn(move || {
+        rayon::scope(move |s| {
             enum Error<E> {
                 SendError,
                 StoreError(E),
             }
 
-            let result = automatons
-                .into_iter()
-                .par_bridge()
-                .try_for_each_with((sender, store, searchables.as_ref()), |data, automaton| {
-                    let (sender, store, searchables) = data;
-                    let Automaton { index, is_exact, query_len, dfa, .. } = automaton;
+            let mut matches = Vec::new();
+            let mut highlights = Vec::new();
 
-                    let words = store.words().map_err(Error::StoreError)?;
-                    let mut stream = words.search(&dfa).into_stream();
+            let recv_end_time = Instant::now() + Duration::from_millis(30);
+            let start = Instant::now();
 
-                    let mut matches = Vec::new();
-                    let mut highlights = Vec::new();
+            let (sender, receiver) = crossbeam_channel::bounded(10);
 
-                    while let Some(input) = stream.next() {
-                        let distance = dfa.eval(input).to_u8();
-                        let is_exact = is_exact && distance == 0 && input.len() == query_len;
+            s.spawn(move |_| {
+                let result = automatons
+                    .into_iter()
+                    .par_bridge()
+                    .try_for_each_with((sender, store, searchables), |data, automaton| {
+                        let (sender, store, searchables) = data;
+                        let Automaton { index, is_exact, query_len, dfa, .. } = automaton;
-                        let doc_indexes = store.word_indexes(input).map_err(Error::StoreError)?;
-                        let doc_indexes = match doc_indexes {
-                            Some(doc_indexes) => doc_indexes,
-                            None => continue,
-                        };
+                        let words = store.words().map_err(Error::StoreError)?;
+                        let mut stream = words.search(&dfa).into_stream();
 
-                        matches.reserve(doc_indexes.len());
-                        highlights.reserve(doc_indexes.len());
+                        let mut matches = Vec::new();
+                        let mut highlights = Vec::new();
 
-                        for di in doc_indexes.as_slice() {
+                        while let Some(input) = stream.next() {
+                            let distance = dfa.eval(input).to_u8();
+                            let is_exact = is_exact && distance == 0 && input.len() == query_len;
 
-                            let attribute = searchables.map_or(Some(di.attribute), |r| r.get(di.attribute));
-                            if let Some(attribute) = attribute {
-                                let match_ = TmpMatch {
-                                    query_index: index as u32,
-                                    distance,
-                                    attribute,
-                                    word_index: di.word_index,
-                                    is_exact,
-                                };
+                            let doc_indexes = store.word_indexes(input).map_err(Error::StoreError)?;
+                            let doc_indexes = match doc_indexes {
+                                Some(doc_indexes) => doc_indexes,
+                                None => continue,
+                            };
 
-                                let highlight = Highlight {
-                                    attribute: di.attribute,
-                                    char_index: di.char_index,
-                                    char_length: di.char_length,
-                                };
+                            matches.reserve(doc_indexes.len());
+                            highlights.reserve(doc_indexes.len());
 
-                                matches.push((di.document_id, match_));
-                                highlights.push((di.document_id, highlight));
+                            for di in doc_indexes.as_slice() {
+
+                                let attribute = searchables.map_or(Some(di.attribute), |r| r.get(di.attribute));
+                                if let Some(attribute) = attribute {
+                                    let match_ = TmpMatch {
+                                        query_index: index as u32,
+                                        distance,
+                                        attribute,
+                                        word_index: di.word_index,
+                                        is_exact,
+                                    };
+
+                                    let highlight = Highlight {
+                                        attribute: di.attribute,
+                                        char_index: di.char_index,
+                                        char_length: di.char_length,
+                                    };
+
+                                    matches.push((di.document_id, match_));
+                                    highlights.push((di.document_id, highlight));
+                                }
                             }
                         }
 
-                    sender.send((matches, highlights)).map_err(|_| Error::SendError)
-                });
+                        sender.send((matches, highlights)).map_err(|_| Error::SendError)
+                    });
 
                 if let Err(Error::StoreError(e)) = result {
                     error!("{}", e);
                 }
-        });
+            });
 
-        let iter = receiver.recv().into_iter().chain(iter::from_fn(|| {
-            match recv_end_time.checked_duration_since(Instant::now()) {
-                Some(timeout) => receiver.recv_timeout(timeout).ok(),
-                None => None,
+            let iter = receiver.recv().into_iter().chain(iter::from_fn(|| {
+                match recv_end_time.checked_duration_since(Instant::now()) {
+                    Some(timeout) => receiver.recv_timeout(timeout).ok(),
+                    None => None,
+                }
+            }));
+
+            for (mut rcv_matches, mut rcv_highlights) in iter {
+                matches.append(&mut rcv_matches);
+                highlights.append(&mut rcv_highlights);
             }
-        }));
 
-        for (mut rcv_matches, mut rcv_highlights) in iter {
-            matches.append(&mut rcv_matches);
-            highlights.append(&mut rcv_highlights);
-        }
+            drop(receiver);
 
-        drop(receiver);
+            info!("main query all took {:.2?}", start.elapsed());
+            info!("{} total matches to rewrite", matches.len());
 
-        info!("main query all took {:.2?}", start.elapsed());
-        info!("{} total matches to rewrite", matches.len());
+            let start = Instant::now();
+            let matches = multiword_rewrite_matches(matches, &query_enhancer);
+            info!("multiword rewrite took {:.2?}", start.elapsed());
 
-        let start = Instant::now();
-        let matches = multiword_rewrite_matches(matches, &query_enhancer);
-        info!("multiword rewrite took {:.2?}", start.elapsed());
+            let start = Instant::now();
+            let highlights = {
+                highlights.par_sort_unstable_by_key(|(id, _)| *id);
+                SetBuf::new_unchecked(highlights)
+            };
+            info!("sorting highlights took {:.2?}", start.elapsed());
-        let start = Instant::now();
-        let highlights = {
-            highlights.par_sort_unstable_by_key(|(id, _)| *id);
-            SetBuf::new_unchecked(highlights)
-        };
-        info!("sorting highlights took {:.2?}", start.elapsed());
+            info!("{} total matches to classify", matches.len());
 
-        info!("{} total matches to classify", matches.len());
+            let start = Instant::now();
+            let raw_documents = raw_documents_from(matches, highlights);
+            info!("making raw documents took {:.2?}", start.elapsed());
 
-        let start = Instant::now();
-        let raw_documents = raw_documents_from(matches, highlights);
-        info!("making raw documents took {:.2?}", start.elapsed());
+            info!("{} total documents to classify", raw_documents.len());
 
-        info!("{} total documents to classify", raw_documents.len());
-
-        Ok(raw_documents)
+            Ok(raw_documents)
+        })
     }
 }
 
 impl<'c, S, FI> QueryBuilder<'c, S, FI>
-where S: 'static + Store + Send + Clone,
+where S: Store + Sync,
       S::Error: Send,
       FI: Fn(DocumentId) -> bool,
 {
@@ -515,7 +517,7 @@ impl<'c, I, FI, FD> DistinctQueryBuilder<'c, I, FI, FD>
 }
 
 impl<'c, S, FI, FD, K> DistinctQueryBuilder<'c, S, FI, FD>
-where S: 'static + Store + Send + Clone,
+where S: Store + Sync,
       S::Error: Send,
       FI: Fn(DocumentId) -> bool,
       FD: Fn(DocumentId) -> Option<K>,
       K: Hash + Eq,

From 7dc9ea78fab427202e6b613366af0b6eea79cbda Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Sun, 18 Aug 2019 18:57:41 +0200
Subject: [PATCH 16/19] feat: Make the automaton DFA construction lazy

---
 meilidb-core/src/query_builder.rs | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/meilidb-core/src/query_builder.rs b/meilidb-core/src/query_builder.rs
index b436f8604..2da52189b 100644
--- a/meilidb-core/src/query_builder.rs
+++ b/meilidb-core/src/query_builder.rs
@@ -29,17 +29,27 @@ struct Automaton {
     ngram: usize,
     query_len: usize,
     is_exact: bool,
-    dfa: DFA,
+    is_prefix: bool,
+    query: String,
 }
 
 impl Automaton {
+    fn dfa(&self) -> DFA {
+        if self.is_prefix {
+            build_prefix_dfa(&self.query)
+        } else {
+            build_dfa(&self.query)
+        }
+    }
+
     fn exact(index: usize, ngram: usize, query: &str) -> Automaton {
         Automaton {
             index,
             ngram,
             query_len: query.len(),
             is_exact: true,
-            dfa: build_dfa(query),
+            is_prefix: false,
+            query: query.to_string(),
         }
     }
 
@@ -49,7 +59,8 @@ impl Automaton {
             ngram,
             query_len: query.len(),
             is_exact: true,
-            dfa: build_prefix_dfa(query),
+            is_prefix: true,
+            query: query.to_string(),
         }
     }
 
@@ -59,7 +70,8 @@ impl Automaton {
             ngram,
             query_len: query.len(),
             is_exact: false,
-            dfa: build_dfa(query),
+            is_prefix: false,
+            query: query.to_string(),
         }
     }
 }
@@ -344,7 +356,8 @@ where S: Store + Sync,
                     .par_bridge()
                     .try_for_each_with((sender, store, searchables), |data, automaton| {
                         let (sender, store, searchables) = data;
-                        let Automaton { index, is_exact, query_len, dfa, .. } = automaton;
+                        let Automaton { index, is_exact, query_len, .. } = automaton;
+                        let dfa = automaton.dfa();
 
                         let words = store.words().map_err(Error::StoreError)?;
                         let mut stream = words.search(&dfa).into_stream();

From 67302d09f34a2f1c2efea4b304aa0970a5ab64aa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Sun, 18 Aug 2019 18:58:38 +0200
Subject: [PATCH 17/19] feat: Multiword rewrite while there is time

---
 meilidb-core/src/query_builder.rs | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/meilidb-core/src/query_builder.rs b/meilidb-core/src/query_builder.rs
index 2da52189b..eae3e4bba 100644
--- a/meilidb-core/src/query_builder.rs
+++ b/meilidb-core/src/query_builder.rs
@@ -239,6 +239,12 @@ fn multiword_rewrite_matches(
     // for each attribute of each document
     for same_document_attribute in matches.linear_group_by_key(|(id, m)| (*id, m.attribute)) {
 
+        let elapsed = start.elapsed();
+        if elapsed > Duration::from_millis(10) {
+            info!("abort multiword rewrite after {:.2?}", elapsed);
+            break;
+        }
+
         // padding will only be applied
         // to word indices in the same attribute
         let mut padding = 0;

From 9c5ec110e5d1c5b3fbe7775efbb0a7b46eec5ed4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Wed, 28 Aug 2019 13:23:03 +0200
Subject: [PATCH 18/19] feat: Introduce a way to enable or disable query
 timeouts

---
 meilidb-core/src/query_builder.rs             | 34 +++++++++++++++----
 .../src/database/synonyms_addition.rs         |  2 +-
 meilidb/examples/query-database.rs            |  2 +-
 3 files changed, 30 insertions(+), 8 deletions(-)

diff --git a/meilidb-core/src/query_builder.rs b/meilidb-core/src/query_builder.rs
index eae3e4bba..5847f9f1b 100644
--- a/meilidb-core/src/query_builder.rs
+++ b/meilidb-core/src/query_builder.rs
@@ -185,6 +185,7 @@ pub struct QueryBuilder<'c, S, FI = fn(DocumentId) -> bool> {
     criteria: Criteria<'c>,
     searchable_attrs: Option<ReorderedAttrs>,
     filter: Option<FI>,
+    fetch_timeout: Option<Duration>,
 }
 
 impl<'c, S> QueryBuilder<'c, S, fn(DocumentId) -> bool> {
@@ -193,7 +194,7 @@ impl<'c, S> QueryBuilder<'c, S, fn(DocumentId) -> bool> {
     }
 
     pub fn with_criteria(store: S, criteria: Criteria<'c>) -> Self {
-        QueryBuilder { store, criteria, searchable_attrs: None, filter: None }
+        QueryBuilder { store, criteria, searchable_attrs: None, filter: None, fetch_timeout: None }
     }
 }
 
@@ -207,9 +208,14 @@ impl<'c, S, FI> QueryBuilder<'c, S, FI>
             criteria: self.criteria,
             searchable_attrs: self.searchable_attrs,
             filter: Some(function),
+            fetch_timeout: self.fetch_timeout,
         }
     }
 
+    pub fn with_fetch_timeout(self, timeout: Duration) -> QueryBuilder<'c, S, FI> {
+        QueryBuilder { fetch_timeout: Some(timeout), ..self }
+    }
+
     pub fn with_distinct<F, K>(self, function: F, size: usize) -> DistinctQueryBuilder<'c, S, FI, F>
     where F: Fn(DocumentId) -> Option<K>,
           K: Hash + Eq,
@@ -226,6 +232,7 @@ impl<'c, S, FI> QueryBuilder<'c, S, FI>
 fn multiword_rewrite_matches(
     mut matches: Vec<(DocumentId, TmpMatch)>,
     query_enhancer: &QueryEnhancer,
+    timeout: Option<Duration>,
 ) -> SetBuf<(DocumentId, TmpMatch)>
 {
     let mut padded_matches = Vec::with_capacity(matches.len());
@@ -240,7 +247,7 @@ fn multiword_rewrite_matches(
     for same_document_attribute in matches.linear_group_by_key(|(id, m)| (*id, m.attribute)) {
 
         let elapsed = start.elapsed();
-        if elapsed > Duration::from_millis(10) {
+        if timeout.map_or(false, |timeout| elapsed > timeout) {
             info!("abort multiword rewrite after {:.2?}", elapsed);
             break;
         }
@@ -341,6 +348,7 @@ where S: Store + Sync,
         let (automatons, query_enhancer) = generate_automatons(query, &self.store)?;
         let searchables = self.searchable_attrs.as_ref();
         let store = &self.store;
+        let fetch_timeout = &self.fetch_timeout;
 
         rayon::scope(move |s| {
@@ -351,10 +359,10 @@ where S: Store + Sync,
             let mut matches = Vec::new();
             let mut highlights = Vec::new();
 
-            let recv_end_time = Instant::now() + Duration::from_millis(30);
+            let recv_end_time = fetch_timeout.map(|d| Instant::now() + d * 75 / 100);
             let start = Instant::now();
 
-            let (sender, receiver) = crossbeam_channel::bounded(10);
+            let (sender, receiver) = crossbeam_channel::unbounded();
 
             s.spawn(move |_| {
                 let result = automatons
@@ -417,6 +425,11 @@ where S: Store + Sync,
             });
 
             let iter = receiver.recv().into_iter().chain(iter::from_fn(|| {
+                let recv_end_time = match recv_end_time {
+                    Some(time) => time,
+                    None => return receiver.recv().ok(),
+                };
+
                 match recv_end_time.checked_duration_since(Instant::now()) {
                     Some(timeout) => receiver.recv_timeout(timeout).ok(),
                     None => None,
                 }
@@ -434,7 +447,8 @@ where S: Store + Sync,
             info!("{} total matches to rewrite", matches.len());
 
             let start = Instant::now();
-            let matches = multiword_rewrite_matches(matches, &query_enhancer);
+            let timeout = fetch_timeout.map(|d| d * 25 / 100);
+            let matches = multiword_rewrite_matches(matches, &query_enhancer, timeout);
             info!("multiword rewrite took {:.2?}", start.elapsed());
 
             let start = Instant::now();
@@ -526,7 +540,15 @@ impl<'c, I, FI, FD> DistinctQueryBuilder<'c, I, FI, FD>
         DistinctQueryBuilder {
             inner: self.inner.with_filter(function),
             function: self.function,
-            size: self.size
+            size: self.size,
+        }
+    }
+
+    pub fn with_fetch_timeout(self, timeout: Duration) -> DistinctQueryBuilder<'c, I, FI, FD> {
+        DistinctQueryBuilder {
+            inner: self.inner.with_fetch_timeout(timeout),
+            function: self.function,
+            size: self.size,
         }
     }
 
diff --git a/meilidb-data/src/database/synonyms_addition.rs b/meilidb-data/src/database/synonyms_addition.rs
index c37f0475a..563cb228f 100644
--- a/meilidb-data/src/database/synonyms_addition.rs
+++ b/meilidb-data/src/database/synonyms_addition.rs
@@ -73,7 +73,7 @@ impl<'a> SynonymsAddition<'a> {
 
         // update the "consistent" view of the Index
         let words = main.words_set()?.unwrap_or_default();
-        let ranked_map = lease_inner.ranked_map.clone();;
+        let ranked_map = lease_inner.ranked_map.clone();
         let schema = lease_inner.schema.clone();
         let raw = lease_inner.raw.clone();
         lease_inner.raw.compact();

diff --git a/meilidb/examples/query-database.rs b/meilidb/examples/query-database.rs
index e6368727a..58f91c383 100644
--- a/meilidb/examples/query-database.rs
+++ b/meilidb/examples/query-database.rs
@@ -159,7 +159,7 @@ fn main() -> Result<(), Box<dyn Error>> {
             Ok(query) => {
                 let start_total = Instant::now();
 
-                let builder = index.query_builder();
+                let builder = index.query_builder().with_fetch_timeout(Duration::from_millis(40));
                 let documents = builder.query(&query, 0..opt.number_results)?;
 
                 let mut retrieve_duration = Duration::default();

From 8030a822ab59ed253a3fa99353b491eb545f928b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Wed, 28 Aug 2019 13:42:20 +0200
Subject: [PATCH 19/19] test: Add a way to set up the fetch timeout of the
 query-database example

---
 meilidb/examples/query-database.rs | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/meilidb/examples/query-database.rs b/meilidb/examples/query-database.rs
index 58f91c383..d939c0b70 100644
--- a/meilidb/examples/query-database.rs
+++ b/meilidb/examples/query-database.rs
@@ -24,6 +24,9 @@ pub struct Opt {
     #[structopt(parse(from_os_str))]
     pub database_path: PathBuf,
 
+    #[structopt(long = "fetch-timeout-ms")]
+    pub fetch_timeout_ms: Option<u64>,
+
     /// Fields that must be displayed.
     pub displayed_fields: Vec<String>,
@@ -159,7 +162,13 @@ fn main() -> Result<(), Box<dyn Error>> {
             Ok(query) => {
                 let start_total = Instant::now();
 
-                let builder = index.query_builder().with_fetch_timeout(Duration::from_millis(40));
+                let builder = match opt.fetch_timeout_ms {
+                    Some(timeout_ms) => {
+                        let timeout = Duration::from_millis(timeout_ms);
+                        index.query_builder().with_fetch_timeout(timeout)
+                    },
+                    None => index.query_builder(),
+                };
                 let documents = builder.query(&query, 0..opt.number_results)?;
 
                 let mut retrieve_duration = Duration::default();
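
A note on the fetch-budget pattern behind patches 13 and 17: both bound a loop with a wall-clock budget, checking the deadline inside the innermost iteration and using a labeled break so that partial results gathered so far survive. A minimal standalone sketch of that pattern (the names here are illustrative, not meilidb APIs):

    use std::time::{Duration, Instant};

    // Deadline-bounded nested loops: compute the deadline once, test it in
    // the innermost loop, and break out of every level with a labeled break.
    fn process_with_budget(batches: &[Vec<u32>], budget: Duration) -> Vec<u32> {
        let deadline = Instant::now() + budget;
        let mut kept = Vec::new();
        'batches: for batch in batches {
            for &item in batch {
                if Instant::now() > deadline {
                    break 'batches; // partial results are still returned
                }
                kept.push(item);
            }
        }
        kept
    }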
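The receive side of patches 14 and 18 is worth extracting: block for the first batch so a query always returns something, then keep draining the channel only while the optional deadline allows. `Instant::checked_duration_since` yields `None` once the deadline has passed, which ends the `iter::from_fn` iterator and therefore the drain; the function was nightly-only at the time, hence the `#![feature(checked_duration_since)]` attribute in patch 14, and it was later stabilized. A sketch of that loop on its own, with `None` meaning no timeout as in patch 18:

    use std::iter;
    use std::time::Instant;

    use crossbeam_channel::Receiver;

    fn drain_until<T>(receiver: &Receiver<T>, recv_end_time: Option<Instant>) -> Vec<T> {
        // Block for the first message, then poll with the remaining budget.
        receiver.recv().into_iter().chain(iter::from_fn(|| {
            let recv_end_time = match recv_end_time {
                Some(time) => time,
                None => return receiver.recv().ok(), // no timeout configured
            };
            match recv_end_time.checked_duration_since(Instant::now()) {
                Some(timeout) => receiver.recv_timeout(timeout).ok(),
                None => None, // budget exhausted, stop draining
            }
        })).collect()
    }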
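Patch 15's bound change from `S: 'static + Store + Send + Clone` to `S: Store + Sync` works because `rayon::spawn` requires a `'static` closure, while `rayon::scope` guarantees its tasks finish before the scope returns, so scoped tasks may borrow from the caller's stack. A reduced sketch of the idea, not the meilidb code itself:

    fn total(values: &[u64]) -> u64 {
        let (sender, receiver) = crossbeam_channel::unbounded();
        rayon::scope(move |s| {
            s.spawn(move |_| {
                // `values` is only borrowed; no `'static` bound and no
                // cloning of the owning store is required.
                let _ = sender.send(values.iter().sum::<u64>());
            });
            // The scope returns the closure's value once all tasks join.
            receiver.recv().unwrap_or(0)
        })
    }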
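Finally, patch 18 splits the optional fetch budget instead of keeping two fixed constants: roughly 75% goes to receiving matches from the workers and 25% to the multiword rewrite pass. `Duration` implements multiplication and division by an integer, so the arithmetic needs no conversion; a sketch assuming the same 75/25 split:

    use std::time::Duration;

    fn split_budget(fetch_timeout: Option<Duration>) -> (Option<Duration>, Option<Duration>) {
        let recv_budget = fetch_timeout.map(|d| d * 75 / 100);    // receiving matches
        let rewrite_budget = fetch_timeout.map(|d| d * 25 / 100); // multiword rewrite
        (recv_budget, rewrite_budget)
    }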