diff --git a/meilisearch-core/src/automaton/mod.rs b/meilisearch-core/src/automaton/mod.rs
index ecf99ee1c..ef9bf5324 100644
--- a/meilisearch-core/src/automaton/mod.rs
+++ b/meilisearch-core/src/automaton/mod.rs
@@ -1,17 +1,7 @@
 mod dfa;
 mod query_enhancer;
 
-use std::cmp::Reverse;
-use std::{cmp, fmt, vec};
-
-use fst::{IntoStreamer, Streamer};
-use levenshtein_automata::DFA;
-use meilisearch_tokenizer::{is_cjk, split_query_string};
-use log::debug;
-
-use crate::database::MainT;
-use crate::error::MResult;
-use crate::store;
+use meilisearch_tokenizer::is_cjk;
 
 pub use self::dfa::{build_dfa, build_prefix_dfa, build_exact_dfa};
 pub use self::query_enhancer::QueryEnhancer;
@@ -19,122 +9,6 @@ pub use self::query_enhancer::QueryEnhancerBuilder;
 
 pub const NGRAMS: usize = 3;
 
-pub struct AutomatonProducer {
-    automatons: Vec<AutomatonGroup>,
-}
-
-impl AutomatonProducer {
-    pub fn new(
-        reader: &heed::RoTxn<MainT>,
-        query: &str,
-        main_store: store::Main,
-        postings_list_store: store::PostingsLists,
-        synonyms_store: store::Synonyms,
-    ) -> MResult<(AutomatonProducer, QueryEnhancer)> {
-        let (automatons, query_enhancer) = generate_automatons(
-            reader,
-            query,
-            main_store,
-            postings_list_store,
-            synonyms_store,
-        )?;
-
-        for (i, group) in automatons.iter().enumerate() {
-            debug!("all automatons: group {} automatons {:?}", i, group.automatons);
-        }
-
-        Ok((AutomatonProducer { automatons }, query_enhancer))
-    }
-
-    pub fn into_iter(self) -> vec::IntoIter<AutomatonGroup> {
-        self.automatons.into_iter()
-    }
-}
-
-#[derive(Debug)]
-pub struct AutomatonGroup {
-    pub is_phrase_query: bool,
-    pub automatons: Vec<Automaton>,
-}
-
-impl AutomatonGroup {
-    fn normal(automatons: Vec<Automaton>) -> AutomatonGroup {
-        AutomatonGroup {
-            is_phrase_query: false,
-            automatons,
-        }
-    }
-
-    fn phrase_query(automatons: Vec<Automaton>) -> AutomatonGroup {
-        AutomatonGroup {
-            is_phrase_query: true,
-            automatons,
-        }
-    }
-}
-
-pub struct Automaton {
-    pub index: usize,
-    pub ngram: usize,
-    pub query_len: usize,
-    pub is_exact: bool,
-    pub is_prefix: bool,
-    pub query: String,
-}
-
-impl fmt::Debug for Automaton {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        f.debug_struct("Automaton")
-            .field("index", &self.index)
-            .field("query", &self.query)
-            .field("is_prefix", &self.is_prefix)
-            .finish()
-    }
-}
-
-impl Automaton {
-    pub fn dfa(&self) -> DFA {
-        if self.is_prefix {
-            build_prefix_dfa(&self.query)
-        } else {
-            build_dfa(&self.query)
-        }
-    }
-
-    fn exact(index: usize, ngram: usize, query: &str) -> Automaton {
-        Automaton {
-            index,
-            ngram,
-            query_len: query.len(),
-            is_exact: true,
-            is_prefix: false,
-            query: query.to_string(),
-        }
-    }
-
-    fn prefix_exact(index: usize, ngram: usize, query: &str) -> Automaton {
-        Automaton {
-            index,
-            ngram,
-            query_len: query.len(),
-            is_exact: true,
-            is_prefix: true,
-            query: query.to_string(),
-        }
-    }
-
-    fn non_exact(index: usize, ngram: usize, query: &str) -> Automaton {
-        Automaton {
-            index,
-            ngram,
-            query_len: query.len(),
-            is_exact: false,
-            is_prefix: false,
-            query: query.to_string(),
-        }
-    }
-}
-
 pub fn normalize_str(string: &str) -> String {
     let mut string = string.to_lowercase();
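Note on the removed `Automaton::dfa`: it chooses between an exact and a prefix-tolerant Levenshtein DFA depending on whether the user is still typing the word. For reference, this is roughly how such DFAs behave with the `levenshtein_automata` crate that `build_dfa`/`build_prefix_dfa` wrap. This is a hedged sketch: the distance budget below (2 typos, transpositions counted as one edit) is an assumption, the real parameters live in `dfa.rs`, which this patch does not touch.

```rust
use levenshtein_automata::{Distance, LevenshteinAutomatonBuilder, DFA};

fn main() {
    // Build automatons accepting up to 2 typos, transpositions cost one.
    let builder = LevenshteinAutomatonBuilder::new(2, true);

    // Exact DFA: the whole word must match (modulo typos).
    let exact = builder.build_dfa("york");
    // Prefix DFA: any word starting with the query (modulo typos) matches.
    let prefix = builder.build_prefix_dfa("york");

    // Feed a candidate word byte by byte and read the resulting distance.
    let eval = |dfa: &DFA, input: &str| {
        let mut state = dfa.initial_state();
        for &byte in input.as_bytes() {
            state = dfa.transition(state, byte);
        }
        dfa.distance(state)
    };

    assert_eq!(eval(&exact, "yprk"), Distance::Exact(1)); // one substitution
    assert_eq!(eval(&prefix, "yorkshire"), Distance::Exact(0)); // prefix match
}
```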
@@ -144,167 +18,3 @@ pub fn normalize_str(string: &str) -> String {
 
     string
 }
-
-pub fn split_best_frequency<'a>(
-    reader: &heed::RoTxn<MainT>,
-    word: &'a str,
-    postings_lists_store: store::PostingsLists,
-) -> MResult<Option<(&'a str, &'a str)>> {
-    let chars = word.char_indices().skip(1);
-    let mut best = None;
-
-    for (i, _) in chars {
-        let (left, right) = word.split_at(i);
-
-        let left_freq = postings_lists_store
-            .postings_list(reader, left.as_ref())?
-            .map_or(0, |i| i.len());
-
-        let right_freq = postings_lists_store
-            .postings_list(reader, right.as_ref())?
-            .map_or(0, |i| i.len());
-
-        let min_freq = cmp::min(left_freq, right_freq);
-        if min_freq != 0 && best.map_or(true, |(old, _, _)| min_freq > old) {
-            best = Some((min_freq, left, right));
-        }
-    }
-
-    Ok(best.map(|(_, l, r)| (l, r)))
-}
-
-fn generate_automatons(
-    reader: &heed::RoTxn<MainT>,
-    query: &str,
-    main_store: store::Main,
-    postings_lists_store: store::PostingsLists,
-    synonym_store: store::Synonyms,
-) -> MResult<(Vec<AutomatonGroup>, QueryEnhancer)> {
-    let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
-    let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect();
-    let synonyms = match main_store.synonyms_fst(reader)? {
-        Some(synonym) => synonym,
-        None => fst::Set::default(),
-    };
-
-    let mut automaton_index = 0;
-    let mut automatons = Vec::new();
-    let mut enhancer_builder = QueryEnhancerBuilder::new(&query_words);
-
-    // We must not declare the original words to the query enhancer
-    // *but* we need to push them in the automatons list first
-    let mut original_automatons = Vec::new();
-    let mut original_words = query_words.iter().peekable();
-    while let Some(word) = original_words.next() {
-        let has_following_word = original_words.peek().is_some();
-        let not_prefix_dfa = has_following_word || has_end_whitespace || word.chars().all(is_cjk);
-
-        let automaton = if not_prefix_dfa {
-            Automaton::exact(automaton_index, 1, word)
-        } else {
-            Automaton::prefix_exact(automaton_index, 1, word)
-        };
-        automaton_index += 1;
-        original_automatons.push(automaton);
-    }
-
-    automatons.push(AutomatonGroup::normal(original_automatons));
-
-    for n in 1..=NGRAMS {
-        let mut ngrams = query_words.windows(n).enumerate().peekable();
-        while let Some((query_index, ngram_slice)) = ngrams.next() {
-            let query_range = query_index..query_index + n;
-            let ngram_nb_words = ngram_slice.len();
-            let ngram = ngram_slice.join(" ");
-
-            let has_following_word = ngrams.peek().is_some();
-            let not_prefix_dfa =
-                has_following_word || has_end_whitespace || ngram.chars().all(is_cjk);
-
-            // automaton of synonyms of the ngrams
-            let normalized = normalize_str(&ngram);
-            let lev = if not_prefix_dfa {
-                build_dfa(&normalized)
-            } else {
-                build_prefix_dfa(&normalized)
-            };
-
-            let mut stream = synonyms.search(&lev).into_stream();
-            while let Some(base) = stream.next() {
-                // only trigger alternatives when the last word has been typed
-                // i.e. "new " do not but "new yo" triggers alternatives to "new york"
-                let base = std::str::from_utf8(base).unwrap();
-                let base_nb_words = split_query_string(base).count();
-                if ngram_nb_words != base_nb_words {
-                    continue;
-                }
-
-                if let Some(synonyms) = synonym_store.synonyms(reader, base.as_bytes())? {
-                    let mut stream = synonyms.into_stream();
-                    while let Some(synonyms) = stream.next() {
-                        let synonyms = std::str::from_utf8(synonyms).unwrap();
-                        let synonyms_words: Vec<_> = split_query_string(synonyms).collect();
-                        let nb_synonym_words = synonyms_words.len();
-
-                        let real_query_index = automaton_index;
-                        enhancer_builder.declare(
-                            query_range.clone(),
-                            real_query_index,
-                            &synonyms_words,
-                        );
-
-                        for synonym in synonyms_words {
-                            let automaton = if nb_synonym_words == 1 {
-                                Automaton::exact(automaton_index, n, synonym)
-                            } else {
-                                Automaton::non_exact(automaton_index, n, synonym)
-                            };
-                            automaton_index += 1;
-                            automatons.push(AutomatonGroup::normal(vec![automaton]));
-                        }
-                    }
-                }
-            }
-
-            if n == 1 {
-                if let Some((left, right)) =
-                    split_best_frequency(reader, &normalized, postings_lists_store)?
-                {
-                    let a = Automaton::exact(automaton_index, 1, left);
-                    enhancer_builder.declare(query_range.clone(), automaton_index, &[left]);
-                    automaton_index += 1;
-
-                    let b = Automaton::exact(automaton_index, 1, right);
-                    enhancer_builder.declare(query_range.clone(), automaton_index, &[left]);
-                    automaton_index += 1;
-
-                    automatons.push(AutomatonGroup::phrase_query(vec![a, b]));
-                }
-            } else {
-                // automaton of concatenation of query words
-                let concat = ngram_slice.concat();
-                let normalized = normalize_str(&concat);
-
-                let real_query_index = automaton_index;
-                enhancer_builder.declare(query_range.clone(), real_query_index, &[&normalized]);
-
-                let automaton = Automaton::exact(automaton_index, n, &normalized);
-                automaton_index += 1;
-                automatons.push(AutomatonGroup::normal(vec![automaton]));
-            }
-        }
-    }
-
-    // order automatons, the most important first,
-    // we keep the original automatons at the front.
-    automatons[1..].sort_by_key(|group| {
-        let a = group.automatons.first().unwrap();
-        (
-            Reverse(a.is_exact),
-            a.ngram,
-            Reverse(group.automatons.len()),
-        )
-    });
-
-    Ok((automatons, enhancer_builder.build()))
-}
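All of this removed generation logic survives as `construct_automatons` in `bucket_sort.rs` (see below); only `normalize_str` stays behind in this module. The sliding-window n-gram walk the code relies on is easy to see in isolation. An illustrative sketch, not code from the patch:

```rust
const NGRAMS: usize = 3;

fn main() {
    let query_words = vec!["new", "york", "subway"];

    for n in 1..=NGRAMS {
        for (query_index, ngram_slice) in query_words.windows(n).enumerate() {
            // the range of original query words this ngram covers
            let query_range = query_index..query_index + n;
            // joined for synonym lookup, concatenated to catch missing separators
            let ngram = ngram_slice.join(" ");
            let concat = ngram_slice.concat();
            println!("{:?} -> {:?} / {:?}", query_range, ngram, concat);
        }
    }
}
```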
diff --git a/meilisearch-core/src/bucket_sort.rs b/meilisearch-core/src/bucket_sort.rs
index 0c5fbdee3..9502f2562 100644
--- a/meilisearch-core/src/bucket_sort.rs
+++ b/meilisearch-core/src/bucket_sort.rs
@@ -1,5 +1,5 @@
 use std::ops::Deref;
-use std::fmt;
+use std::{cmp, fmt};
 use std::borrow::Cow;
 use std::mem;
 use std::ops::Range;
@@ -8,43 +8,68 @@ use std::time::{Duration, Instant};
 
 use compact_arena::{SmallArena, Idx32, mk_arena};
 use fst::{IntoStreamer, Streamer};
+use hashbrown::HashMap;
 use levenshtein_automata::DFA;
 use log::debug;
 use meilisearch_tokenizer::{is_cjk, split_query_string};
-use meilisearch_types::{DocIndex, Highlight};
+use meilisearch_types::DocIndex;
 use sdset::{Set, SetBuf};
 use slice_group_by::{GroupBy, GroupByMut};
 
 use crate::automaton::NGRAMS;
-use crate::automaton::{QueryEnhancer, QueryEnhancerBuilder};
 use crate::automaton::{build_dfa, build_prefix_dfa, build_exact_dfa};
-use crate::automaton::{normalize_str, split_best_frequency};
+use crate::automaton::normalize_str;
+use crate::automaton::{QueryEnhancer, QueryEnhancerBuilder};
 use crate::criterion::Criteria;
-use crate::levenshtein::prefix_damerau_levenshtein;
+use crate::distinct_map::{BufferedDistinctMap, DistinctMap};
 use crate::raw_document::RawDocument;
 use crate::{database::MainT, reordered_attrs::ReorderedAttrs};
 use crate::{store, Document, DocumentId, MResult};
 
-pub fn bucket_sort<'c>(
+pub fn bucket_sort<'c, FI>(
     reader: &heed::RoTxn<MainT>,
     query: &str,
     range: Range<usize>,
+    filter: Option<FI>,
     criteria: Criteria<'c>,
     main_store: store::Main,
     postings_lists_store: store::PostingsLists,
     documents_fields_counts_store: store::DocumentsFieldsCounts,
     synonyms_store: store::Synonyms,
 ) -> MResult<Vec<Document>>
+where
+    FI: Fn(DocumentId) -> bool,
 {
+    // We delegate the filter work to the distinct query builder,
+    // specifying a distinct rule that has no effect.
+    if filter.is_some() {
+        let distinct = |_| None;
+        let distinct_size = 1;
+        return bucket_sort_with_distinct(
+            reader,
+            query,
+            range,
+            filter,
+            distinct,
+            distinct_size,
+            criteria,
+            main_store,
+            postings_lists_store,
+            documents_fields_counts_store,
+            synonyms_store,
+        );
+    }
+
     let (automatons, query_enhancer) =
-        construct_automatons2(reader, query, main_store, postings_lists_store, synonyms_store)?;
+        construct_automatons(reader, query, main_store, postings_lists_store, synonyms_store)?;
 
     debug!("{:?}", query_enhancer);
 
     let before_postings_lists_fetching = Instant::now();
     mk_arena!(arena);
-    let mut bare_matches = fetch_matches(reader, &automatons, &mut arena, main_store, postings_lists_store)?;
+    let mut bare_matches =
+        fetch_matches(reader, &automatons, &mut arena, main_store, postings_lists_store)?;
     debug!("bare matches ({}) retrieved in {:.02?}",
         bare_matches.len(),
         before_postings_lists_fetching.elapsed(),
@@ -69,9 +94,6 @@ pub fn bucket_sort<'c>(
         before_raw_documents_building.elapsed(),
     );
 
-    dbg!(mem::size_of::<BareMatch>());
-    dbg!(mem::size_of::<SimpleMatch>());
-
     let mut groups = vec![raw_documents.as_mut_slice()];
 
     'criteria: for criterion in criteria.as_ref() {
@@ -103,31 +125,166 @@ pub fn bucket_sort<'c>(
     }
 
     let iter = raw_documents.into_iter().skip(range.start).take(range.len());
-    let iter = iter.map(|d| {
-        let highlights = d.raw_matches.iter().flat_map(|sm| {
-            let postings_list = &arena[sm.postings_list];
-            let input = postings_list.input();
-            let query = &automatons[sm.query_index as usize].query;
-            postings_list.iter().map(move |m| {
-                let covered_area = if query.len() > input.len() {
-                    input.len()
-                } else {
-                    prefix_damerau_levenshtein(query.as_bytes(), input).1
-                };
-                Highlight { attribute: m.attribute, char_index: m.char_index, char_length: covered_area as u16 }
-            })
-        }).collect();
-
-        Document {
-            id: d.id,
-            highlights,
-            #[cfg(test)] matches: Vec::new(),
-        }
-    });
+    let iter = iter.map(|rd| Document::from_raw(rd, &automatons, &arena));
 
     Ok(iter.collect())
 }
 
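Annotation: delegating to the distinct path with a distinct function that never returns a key means every document is registered "without key", so the distinct pass degenerates into plain filtering. A toy model of why `|_| None` is a no-op (my sketch, simplified types, not code from the patch):

```rust
use std::collections::HashSet;

// Simplified stand-in for the distinct bookkeeping: a document is kept
// unless its distinct key was already seen.
fn is_kept(seen: &mut HashSet<u64>, key: Option<u64>) -> bool {
    match key {
        Some(key) => seen.insert(key), // false on a duplicate key
        None => true,                  // keyless documents are always kept
    }
}

fn main() {
    let distinct = |_id: u32| -> Option<u64> { None }; // the no-op rule
    let mut seen = HashSet::new();
    // With the no-op rule every document survives the distinct pass,
    // so only the filter decides what is returned.
    assert!((0..5).all(|id| is_kept(&mut seen, distinct(id))));
}
```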
+pub fn bucket_sort_with_distinct<'c, FI, FD>(
+    reader: &heed::RoTxn<MainT>,
+    query: &str,
+    range: Range<usize>,
+    filter: Option<FI>,
+    distinct: FD,
+    distinct_size: usize,
+    criteria: Criteria<'c>,
+    main_store: store::Main,
+    postings_lists_store: store::PostingsLists,
+    documents_fields_counts_store: store::DocumentsFieldsCounts,
+    synonyms_store: store::Synonyms,
+) -> MResult<Vec<Document>>
+where
+    FI: Fn(DocumentId) -> bool,
+    FD: Fn(DocumentId) -> Option<u64>,
+{
+    let (automatons, query_enhancer) =
+        construct_automatons(reader, query, main_store, postings_lists_store, synonyms_store)?;
+
+    let before_postings_lists_fetching = Instant::now();
+    mk_arena!(arena);
+    let mut bare_matches = fetch_matches(reader, &automatons, &mut arena, main_store, postings_lists_store)?;
+    debug!("bare matches ({}) retrieved in {:.02?}",
+        bare_matches.len(),
+        before_postings_lists_fetching.elapsed(),
+    );
+
+    let before_raw_documents_presort = Instant::now();
+    bare_matches.sort_unstable_by_key(|sm| sm.document_id);
+    debug!("sort by documents ids took {:.02?}", before_raw_documents_presort.elapsed());
+
+    let before_raw_documents_building = Instant::now();
+    let mut prefiltered_documents = 0;
+    let mut raw_documents = Vec::new();
+    for raw_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) {
+        prefiltered_documents += 1;
+        if let Some(raw_document) = RawDocument::new(raw_matches, &automatons, &mut arena) {
+            raw_documents.push(raw_document);
+        }
+    }
+    debug!("creating {} (original {}) candidates documents took {:.02?}",
+        raw_documents.len(),
+        prefiltered_documents,
+        before_raw_documents_building.elapsed(),
+    );
+
+    let mut groups = vec![raw_documents.as_mut_slice()];
+    let mut key_cache = HashMap::new();
+
+    let mut filter_map = HashMap::new();
+    // these two variables inform on the current distinct map and
+    // on the raw offset of the start of the group where the
+    // range.start bound is located according to the distinct function
+    let mut distinct_map = DistinctMap::new(distinct_size);
+    let mut distinct_raw_offset = 0;
+
+    'criteria: for criterion in criteria.as_ref() {
+        let tmp_groups = mem::replace(&mut groups, Vec::new());
+        let mut buf_distinct = BufferedDistinctMap::new(&mut distinct_map);
+        let mut documents_seen = 0;
+
+        for mut group in tmp_groups {
+            // if this group does not overlap with the requested range,
+            // push it without sorting and splitting it
+            if documents_seen + group.len() < distinct_raw_offset {
+                documents_seen += group.len();
+                groups.push(group);
+                continue;
+            }
+
+            let before_criterion_preparation = Instant::now();
+            criterion.prepare(&mut group, &mut arena, &query_enhancer, &automatons);
+            debug!("{:?} preparation took {:.02?}", criterion.name(), before_criterion_preparation.elapsed());
+
+            let before_criterion_sort = Instant::now();
+            group.sort_unstable_by(|a, b| criterion.evaluate(a, b, &arena));
+            debug!("{:?} evaluation took {:.02?}", criterion.name(), before_criterion_sort.elapsed());
+
+            for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b, &arena)) {
+                // we must compute the real distinguished len of this sub-group
+                for document in group.iter() {
+                    let filter_accepted = match &filter {
+                        Some(filter) => {
+                            let entry = filter_map.entry(document.id);
+                            *entry.or_insert_with(|| (filter)(document.id))
+                        }
+                        None => true,
+                    };
+
+                    if filter_accepted {
+                        let entry = key_cache.entry(document.id);
+                        let key = entry.or_insert_with(|| (distinct)(document.id).map(Rc::new));
+
+                        match key.clone() {
+                            Some(key) => buf_distinct.register(key),
+                            None => buf_distinct.register_without_key(),
+                        };
+                    }
+
+                    // the requested range end is reached: stop computing distinct
+                    if buf_distinct.len() >= range.end {
+                        break;
+                    }
+                }
+
+                documents_seen += group.len();
+                groups.push(group);
+
+                // if this sub-group does not overlap with the requested range
+                // we must update the distinct map and its start index
+                if buf_distinct.len() < range.start {
+                    buf_distinct.transfert_to_internal();
+                    distinct_raw_offset = documents_seen;
+                }
+
+                // we have sorted enough documents if the last document sorted is after
+                // the end of the requested range; we can continue to the next criterion
+                if buf_distinct.len() >= range.end {
+                    continue 'criteria;
+                }
+            }
+        }
+    }
+
+    // once we have classified the documents related to the current
+    // automatons we save that as the next valid result
+    let mut seen = BufferedDistinctMap::new(&mut distinct_map);
+
+    let mut documents = Vec::with_capacity(range.len());
+    for raw_document in raw_documents.into_iter().skip(distinct_raw_offset) {
+        let filter_accepted = match &filter {
+            Some(_) => filter_map.remove(&raw_document.id).unwrap(),
+            None => true,
+        };
+
+        if filter_accepted {
+            let key = key_cache.remove(&raw_document.id).unwrap();
+            let distinct_accepted = match key {
+                Some(key) => seen.register(key),
+                None => seen.register_without_key(),
+            };
+
+            if distinct_accepted && seen.len() > range.start {
+                documents.push(Document::from_raw(raw_document, &automatons, &arena));
+                if documents.len() == range.len() {
+                    break;
+                }
+            }
+        }
+    }
+
+    Ok(documents)
+}
+
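Annotation: the two-level bookkeeping above is easier to follow with a stripped-down model of `DistinctMap`/`BufferedDistinctMap`. This is my sketch, not the code from `distinct_map.rs`: the buffered layer counts candidates for the current criterion pass, and `transfert_to_internal` (the real method name, spelling included) commits them once a whole group is known to sit before `range.start`.

```rust
use std::collections::HashMap;
use std::rc::Rc;

// Minimal model: at most `size` documents may share one distinct key.
struct DistinctMap {
    counts: HashMap<Rc<u64>, usize>,
    size: usize,
    len: usize,
}

struct BufferedDistinctMap<'a> {
    internal: &'a mut DistinctMap,
    buffered: HashMap<Rc<u64>, usize>,
    buffered_len: usize,
}

impl DistinctMap {
    fn new(size: usize) -> DistinctMap {
        DistinctMap { counts: HashMap::new(), size, len: 0 }
    }
}

impl<'a> BufferedDistinctMap<'a> {
    fn new(internal: &'a mut DistinctMap) -> BufferedDistinctMap<'a> {
        BufferedDistinctMap { internal, buffered: HashMap::new(), buffered_len: 0 }
    }

    // Returns true when the document is kept (its key quota is not exhausted).
    fn register(&mut self, key: Rc<u64>) -> bool {
        let seen = self.internal.counts.get(&key).copied().unwrap_or(0)
            + self.buffered.get(&key).copied().unwrap_or(0);
        if seen < self.internal.size {
            *self.buffered.entry(key).or_insert(0) += 1;
            self.buffered_len += 1;
            true
        } else {
            false
        }
    }

    // Documents without a distinct key are always kept.
    fn register_without_key(&mut self) -> bool {
        self.buffered_len += 1;
        true
    }

    // Total accepted so far: committed documents plus buffered ones.
    fn len(&self) -> usize {
        self.internal.len + self.buffered_len
    }

    // Commit the buffer: these documents definitely sit before range.start.
    fn transfert_to_internal(&mut self) {
        for (key, count) in self.buffered.drain() {
            *self.internal.counts.entry(key).or_insert(0) += count;
        }
        self.internal.len += self.buffered_len;
        self.buffered_len = 0;
    }
}

fn main() {
    let mut map = DistinctMap::new(1); // keep one document per key
    let mut buf = BufferedDistinctMap::new(&mut map);
    assert!(buf.register(Rc::new(7)));  // first "key 7" document is kept
    assert!(!buf.register(Rc::new(7))); // duplicates are rejected
    assert!(buf.register_without_key());
    assert_eq!(buf.len(), 2);
    buf.transfert_to_internal(); // commit before moving to the next group
    assert_eq!(buf.len(), 2);
}
```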
 pub struct BareMatch<'tag> {
     pub document_id: DocumentId,
     pub query_index: u16,
@@ -257,7 +414,7 @@ fn fetch_matches<'txn, 'tag>(
     postings_lists_store: store::PostingsLists,
 ) -> MResult<Vec<BareMatch<'tag>>> {
-    let mut before_words_fst = Instant::now();
+    let before_words_fst = Instant::now();
     let words = match main_store.words_fst(reader)? {
         Some(words) => words,
         None => return Ok(Vec::new()),
@@ -273,7 +430,7 @@ fn fetch_matches<'txn, 'tag>(
     for (query_index, automaton) in automatons.iter().enumerate() {
         let before_dfa = Instant::now();
         let dfa = automaton.dfa();
-        let QueryWordAutomaton { query, is_exact, is_prefix, phrase_query } = automaton;
+        let QueryWordAutomaton { query, is_exact, .. } = automaton;
         dfa_time += before_dfa.elapsed();
 
         let mut number_of_words = 0;
@@ -381,7 +538,35 @@ impl QueryWordAutomaton {
     }
 }
 
-fn construct_automatons2(
+fn split_best_frequency<'a>(
+    reader: &heed::RoTxn<MainT>,
+    word: &'a str,
+    postings_lists_store: store::PostingsLists,
+) -> MResult<Option<(&'a str, &'a str)>> {
+    let chars = word.char_indices().skip(1);
+    let mut best = None;
+
+    for (i, _) in chars {
+        let (left, right) = word.split_at(i);
+
+        let left_freq = postings_lists_store
+            .postings_list(reader, left.as_ref())?
+            .map_or(0, |i| i.len());
+
+        let right_freq = postings_lists_store
+            .postings_list(reader, right.as_ref())?
+            .map_or(0, |i| i.len());
+
+        let min_freq = cmp::min(left_freq, right_freq);
+        if min_freq != 0 && best.map_or(true, |(old, _, _)| min_freq > old) {
+            best = Some((min_freq, left, right));
+        }
+    }
+
+    Ok(best.map(|(_, l, r)| (l, r)))
+}
+
+fn construct_automatons(
     reader: &heed::RoTxn<MainT>,
     query: &str,
     main_store: store::Main,
diff --git a/meilisearch-core/src/lib.rs b/meilisearch-core/src/lib.rs
index 01fb05372..fb1975a0b 100644
--- a/meilisearch-core/src/lib.rs
+++ b/meilisearch-core/src/lib.rs
@@ -30,6 +30,10 @@ pub use self::store::Index;
 pub use self::update::{EnqueuedUpdateResult, ProcessedUpdateResult, UpdateStatus, UpdateType};
 pub use meilisearch_types::{DocIndex, DocumentId, Highlight, AttrCount};
 
+use compact_arena::SmallArena;
+use crate::bucket_sort::{QueryWordAutomaton, PostingsListView};
+use crate::levenshtein::prefix_damerau_levenshtein;
+
 #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
 pub struct Document {
     pub id: DocumentId,
@@ -39,6 +43,36 @@ pub struct Document {
     // pub matches: Vec,
 }
 
+impl Document {
+    pub fn from_raw<'a, 'tag, 'txn>(
+        raw_document: RawDocument<'a, 'tag>,
+        automatons: &[QueryWordAutomaton],
+        arena: &SmallArena<'tag, PostingsListView<'txn>>,
+    ) -> Document
+    {
+        let highlights = raw_document.raw_matches.iter().flat_map(|sm| {
+            let postings_list = &arena[sm.postings_list];
+            let input = postings_list.input();
+            let query = &automatons[sm.query_index as usize].query;
+            postings_list.iter().map(move |m| {
+                let covered_area = if query.len() > input.len() {
+                    input.len()
+                } else {
+                    prefix_damerau_levenshtein(query.as_bytes(), input).1
+                };
+
+                Highlight {
+                    attribute: m.attribute,
+                    char_index: m.char_index,
+                    char_length: covered_area as u16,
+                }
+            })
+        }).collect();
+
+        Document { id: raw_document.id, highlights }
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
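Annotation: `from_raw` clips each highlight to the part of the indexed word actually covered by the (possibly shorter or typo'd) query: when the query is longer than the input the whole input is highlighted, otherwise the `.1` component returned by `prefix_damerau_levenshtein` gives the covered length. A rough illustration of the clamp, where `covered_len` is a hypothetical stand-in for that `.1` value, not the real implementation:

```rust
// Hypothetical stand-in for prefix_damerau_levenshtein(query, input).1:
// assume it reports how many bytes of `input` the query covers as a prefix.
fn covered_len(query: &[u8], input: &[u8]) -> usize {
    query.iter().zip(input).take_while(|(q, i)| q == i).count()
}

fn char_length(query: &str, input: &str) -> u16 {
    let covered_area = if query.len() > input.len() {
        input.len() // query overshoots: highlight the whole word
    } else {
        covered_len(query.as_bytes(), input.as_bytes())
    };
    covered_area as u16
}

fn main() {
    assert_eq!(char_length("worl", "world"), 4);   // prefix covers 4 bytes
    assert_eq!(char_length("worlds", "world"), 5); // clamped to input length
}
```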
diff --git a/meilisearch-core/src/query_builder.rs b/meilisearch-core/src/query_builder.rs
index 3a9750ec0..c0a12e34f 100644
--- a/meilisearch-core/src/query_builder.rs
+++ b/meilisearch-core/src/query_builder.rs
@@ -1,7 +1,8 @@
 use std::ops::Range;
 use std::time::Duration;
 
-use crate::{bucket_sort::bucket_sort, database::MainT};
+use crate::database::MainT;
+use crate::bucket_sort::{bucket_sort, bucket_sort_with_distinct};
 use crate::{criterion::Criteria, Document, DocumentId};
 use crate::{reordered_attrs::ReorderedAttrs, store, MResult};
 
@@ -85,11 +86,24 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
         range: Range<usize>,
     ) -> MResult<Vec<Document>> {
         match self.distinct {
-            Some((distinct, distinct_size)) => unimplemented!("distinct"),
+            Some((distinct, distinct_size)) => bucket_sort_with_distinct(
+                reader,
+                query,
+                range,
+                self.filter,
+                distinct,
+                distinct_size,
+                self.criteria,
+                self.main_store,
+                self.postings_lists_store,
+                self.documents_fields_counts_store,
+                self.synonyms_store,
+            ),
             None => bucket_sort(
                 reader,
                 query,
                 range,
+                self.filter,
                 self.criteria,
                 self.main_store,
                 self.postings_lists_store,
diff --git a/meilisearch-core/src/raw_document.rs b/meilisearch-core/src/raw_document.rs
index 4096eeaba..8e511d7eb 100644
--- a/meilisearch-core/src/raw_document.rs
+++ b/meilisearch-core/src/raw_document.rs
@@ -44,7 +44,7 @@ impl<'a, 'tag> RawDocument<'a, 'tag> {
         let pla = &postings_lists[a.postings_list];
         let plb = &postings_lists[b.postings_list];
 
-        let mut iter = itertools::merge_join_by(pla.iter(), plb.iter(), |a, b| {
+        let iter = itertools::merge_join_by(pla.iter(), plb.iter(), |a, b| {
             a.attribute.cmp(&b.attribute).then((a.word_index + 1).cmp(&b.word_index))
         });
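Annotation: the changed line only drops an unneeded `mut` binding. For reference, `itertools::merge_join_by` (already a dependency here) merges two sorted iterators and tags each element by its origin, which is how the two postings lists are aligned by attribute and word index. A standalone sketch with plain integers, not code from the patch:

```rust
use itertools::EitherOrBoth;

fn main() {
    let left = [1, 3, 4];
    let right = [2, 3, 5];

    // Merge two sorted sequences, pairing up elements that compare equal.
    let merged: Vec<_> = itertools::merge_join_by(left, right, |a, b| a.cmp(b))
        .map(|either| match either {
            EitherOrBoth::Both(a, _) => format!("both({})", a),
            EitherOrBoth::Left(a) => format!("left({})", a),
            EitherOrBoth::Right(b) => format!("right({})", b),
        })
        .collect();

    assert_eq!(merged, ["left(1)", "right(2)", "both(3)", "left(4)", "right(5)"]);
}
```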