diff --git a/meilisearch-core/src/bucket_sort.rs b/meilisearch-core/src/bucket_sort.rs index 5413db17f..0c5fbdee3 100644 --- a/meilisearch-core/src/bucket_sort.rs +++ b/meilisearch-core/src/bucket_sort.rs @@ -1,9 +1,6 @@ use std::ops::Deref; use std::fmt; use std::borrow::Cow; -use std::cmp::Ordering; -use std::collections::HashSet; -use std::io::Write; use std::mem; use std::ops::Range; use std::rc::Rc; @@ -17,15 +14,15 @@ use meilisearch_tokenizer::{is_cjk, split_query_string}; use meilisearch_types::{DocIndex, Highlight}; use sdset::{Set, SetBuf}; use slice_group_by::{GroupBy, GroupByMut}; -use itertools::EitherOrBoth; use crate::automaton::NGRAMS; use crate::automaton::{QueryEnhancer, QueryEnhancerBuilder}; use crate::automaton::{build_dfa, build_prefix_dfa, build_exact_dfa}; use crate::automaton::{normalize_str, split_best_frequency}; -use crate::criterion2::*; +use crate::criterion::Criteria; use crate::levenshtein::prefix_damerau_levenshtein; +use crate::raw_document::RawDocument; use crate::{database::MainT, reordered_attrs::ReorderedAttrs}; use crate::{store, Document, DocumentId, MResult}; @@ -33,6 +30,7 @@ pub fn bucket_sort<'c>( reader: &heed::RoTxn, query: &str, range: Range, + criteria: Criteria<'c>, main_store: store::Main, postings_lists_store: store::PostingsLists, documents_fields_counts_store: store::DocumentsFieldsCounts, @@ -76,17 +74,7 @@ pub fn bucket_sort<'c>( let mut groups = vec![raw_documents.as_mut_slice()]; - let criteria = [ - Box::new(Typo) as Box, - Box::new(Words), - Box::new(Proximity), - Box::new(Attribute), - Box::new(WordsPosition), - Box::new(Exact), - Box::new(StableDocId), - ]; - - 'criteria: for criterion in &criteria { + 'criteria: for criterion in criteria.as_ref() { let tmp_groups = mem::replace(&mut groups, Vec::new()); let mut documents_seen = 0; @@ -131,7 +119,7 @@ pub fn bucket_sort<'c>( }).collect(); Document { - id: d.raw_matches[0].document_id, + id: d.id, highlights, #[cfg(test)] matches: Vec::new(), } @@ -140,88 +128,6 @@ pub fn bucket_sort<'c>( Ok(iter.collect()) } -pub struct RawDocument<'a, 'tag> { - pub raw_matches: &'a mut [BareMatch<'tag>], - pub processed_matches: Vec, - /// The list of minimum `distance` found - pub processed_distances: Vec>, -} - -impl<'a, 'tag> RawDocument<'a, 'tag> { - fn new<'txn>( - raw_matches: &'a mut [BareMatch<'tag>], - automatons: &[QueryWordAutomaton], - postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>, - ) -> Option> - { - raw_matches.sort_unstable_by_key(|m| m.query_index); - - let mut previous_word = None; - for i in 0..raw_matches.len() { - let a = &raw_matches[i]; - let auta = &automatons[a.query_index as usize]; - - match auta.phrase_query { - Some((0, _)) => { - let b = match raw_matches.get(i + 1) { - Some(b) => b, - None => { - postings_lists[a.postings_list].rewrite_with(SetBuf::default()); - continue; - } - }; - - if a.query_index + 1 != b.query_index { - postings_lists[a.postings_list].rewrite_with(SetBuf::default()); - continue - } - - let pla = &postings_lists[a.postings_list]; - let plb = &postings_lists[b.postings_list]; - - let mut iter = itertools::merge_join_by(pla.iter(), plb.iter(), |a, b| { - a.attribute.cmp(&b.attribute).then((a.word_index + 1).cmp(&b.word_index)) - }); - - let mut newa = Vec::new(); - let mut newb = Vec::new(); - - for eb in iter { - if let EitherOrBoth::Both(a, b) = eb { - newa.push(*a); - newb.push(*b); - } - } - - if !newa.is_empty() { - previous_word = Some(a.query_index); - } - - postings_lists[a.postings_list].rewrite_with(SetBuf::new_unchecked(newa)); - postings_lists[b.postings_list].rewrite_with(SetBuf::new_unchecked(newb)); - }, - Some((1, _)) => { - if previous_word.take() != Some(a.query_index - 1) { - postings_lists[a.postings_list].rewrite_with(SetBuf::default()); - } - }, - Some((_, _)) => unreachable!(), - None => (), - } - } - - if raw_matches.iter().all(|rm| postings_lists[rm.postings_list].is_empty()) { - return None - } - - Some(RawDocument { - raw_matches, - processed_matches: Vec::new(), - processed_distances: Vec::new(), - }) - } -} - pub struct BareMatch<'tag> { pub document_id: DocumentId, pub query_index: u16, diff --git a/meilisearch-core/src/criterion/attribute.rs b/meilisearch-core/src/criterion/attribute.rs new file mode 100644 index 000000000..3dc6e4282 --- /dev/null +++ b/meilisearch-core/src/criterion/attribute.rs @@ -0,0 +1,48 @@ +use std::cmp::{self, Ordering}; + +use compact_arena::SmallArena; +use slice_group_by::GroupBy; + +use crate::automaton::QueryEnhancer; +use crate::bucket_sort::{SimpleMatch, PostingsListView, QueryWordAutomaton}; +use crate::RawDocument; + +use super::{Criterion, prepare_raw_matches}; + +pub struct Attribute; + +impl Criterion for Attribute { + fn name(&self) -> &str { "attribute" } + + fn prepare<'a, 'tag, 'txn>( + &self, + documents: &mut [RawDocument<'a, 'tag>], + postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>, + query_enhancer: &QueryEnhancer, + automatons: &[QueryWordAutomaton], + ) { + prepare_raw_matches(documents, postings_lists, query_enhancer, automatons); + } + + fn evaluate<'a, 'tag, 'txn>( + &self, + lhs: &RawDocument<'a, 'tag>, + rhs: &RawDocument<'a, 'tag>, + postings_lists: &SmallArena<'tag, PostingsListView<'txn>>, + ) -> Ordering + { + #[inline] + fn best_attribute(matches: &[SimpleMatch]) -> u16 { + let mut best_attribute = u16::max_value(); + for group in matches.linear_group_by_key(|bm| bm.query_index) { + best_attribute = cmp::min(best_attribute, group[0].attribute); + } + best_attribute + } + + let lhs = best_attribute(&lhs.processed_matches); + let rhs = best_attribute(&rhs.processed_matches); + + lhs.cmp(&rhs) + } +} diff --git a/meilisearch-core/src/criterion/document_id.rs b/meilisearch-core/src/criterion/document_id.rs index e4a402d26..596194bca 100644 --- a/meilisearch-core/src/criterion/document_id.rs +++ b/meilisearch-core/src/criterion/document_id.rs @@ -1,16 +1,37 @@ -use crate::criterion::Criterion; -use crate::RawDocument; use std::cmp::Ordering; -#[derive(Debug, Clone, Copy)] +use compact_arena::SmallArena; + +use crate::automaton::QueryEnhancer; +use crate::bucket_sort::{PostingsListView, QueryWordAutomaton}; +use crate::RawDocument; +use super::Criterion; + pub struct DocumentId; impl Criterion for DocumentId { - fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { - lhs.id.cmp(&rhs.id) + fn name(&self) -> &str { "stable document id" } + + fn prepare( + &self, + documents: &mut [RawDocument], + postings_lists: &mut SmallArena, + query_enhancer: &QueryEnhancer, + automatons: &[QueryWordAutomaton], + ) { + // ... } - fn name(&self) -> &str { - "DocumentId" + fn evaluate( + &self, + lhs: &RawDocument, + rhs: &RawDocument, + postings_lists: &SmallArena, + ) -> Ordering + { + let lhs = &lhs.id; + let rhs = &rhs.id; + + lhs.cmp(rhs) } } diff --git a/meilisearch-core/src/criterion/exact.rs b/meilisearch-core/src/criterion/exact.rs index 55a19001b..d82f69462 100644 --- a/meilisearch-core/src/criterion/exact.rs +++ b/meilisearch-core/src/criterion/exact.rs @@ -1,131 +1,51 @@ -use std::cmp::Ordering; +use std::cmp::{Ordering, Reverse}; -use sdset::Set; +use compact_arena::SmallArena; use slice_group_by::GroupBy; -use crate::criterion::Criterion; -use crate::{AttrCount, RawDocument}; +use crate::automaton::QueryEnhancer; +use crate::bucket_sort::{PostingsListView, BareMatch, QueryWordAutomaton}; +use crate::RawDocument; +use super::Criterion; -#[inline] -fn number_exact_matches( - query_index: &[u32], - attribute: &[u16], - is_exact: &[bool], - fields_counts: &Set, -) -> usize { - let mut count = 0; - let mut index = 0; - - for group in query_index.linear_group() { - let len = group.len(); - - let mut found_exact = false; - for (pos, is_exact) in is_exact[index..index + len].iter().enumerate() { - if *is_exact { - found_exact = true; - let attr = &attribute[index + pos]; - if let Ok(pos) = fields_counts.binary_search_by_key(attr, |ac| ac.attr) { - let AttrCount { count, .. } = fields_counts[pos]; - if count == 1 { - return usize::max_value(); - } - } - } - } - - count += found_exact as usize; - index += len; - } - - count -} - -#[derive(Debug, Clone, Copy)] pub struct Exact; impl Criterion for Exact { - fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { - let lhs = { - let query_index = lhs.query_index(); - let is_exact = lhs.is_exact(); - let attribute = lhs.attribute(); - let fields_counts = lhs.fields_counts.as_ref().unwrap(); + fn name(&self) -> &str { "exact" } - number_exact_matches(query_index, attribute, is_exact, fields_counts) - }; + fn prepare( + &self, + documents: &mut [RawDocument], + postings_lists: &mut SmallArena, + query_enhancer: &QueryEnhancer, + automatons: &[QueryWordAutomaton], + ) { + for document in documents { + document.raw_matches.sort_unstable_by_key(|bm| (bm.query_index, Reverse(bm.is_exact))); + } + } - let rhs = { - let query_index = rhs.query_index(); - let is_exact = rhs.is_exact(); - let attribute = rhs.attribute(); - let fields_counts = rhs.fields_counts.as_ref().unwrap(); + fn evaluate( + &self, + lhs: &RawDocument, + rhs: &RawDocument, + postings_lists: &SmallArena, + ) -> Ordering + { + #[inline] + fn sum_exact_query_words(matches: &[BareMatch]) -> usize { + let mut sum_exact_query_words = 0; - number_exact_matches(query_index, attribute, is_exact, fields_counts) - }; + for group in matches.linear_group_by_key(|bm| bm.query_index) { + sum_exact_query_words += group[0].is_exact as usize; + } + + sum_exact_query_words + } + + let lhs = sum_exact_query_words(&lhs.raw_matches); + let rhs = sum_exact_query_words(&rhs.raw_matches); lhs.cmp(&rhs).reverse() } - - fn name(&self) -> &str { - "Exact" - } -} - -#[cfg(test)] -mod tests { - use super::*; - - // typing: "soulier" - // - // doc0: "Soulier bleu" - // doc1: "souliereres rouge" - #[test] - fn easy_case() { - let doc0 = { - let query_index = &[0]; - let attribute = &[0]; - let is_exact = &[true]; - let fields_counts = Set::new(&[AttrCount { attr: 0, count: 2 }]).unwrap(); - - number_exact_matches(query_index, attribute, is_exact, fields_counts) - }; - - let doc1 = { - let query_index = &[0]; - let attribute = &[0]; - let is_exact = &[false]; - let fields_counts = Set::new(&[AttrCount { attr: 0, count: 2 }]).unwrap(); - - number_exact_matches(query_index, attribute, is_exact, fields_counts) - }; - - assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less); - } - - // typing: "soulier" - // - // doc0: { 0. "soulier" } - // doc1: { 0. "soulier bleu et blanc" } - #[test] - fn basic() { - let doc0 = { - let query_index = &[0]; - let attribute = &[0]; - let is_exact = &[true]; - let fields_counts = Set::new(&[AttrCount { attr: 0, count: 1 }]).unwrap(); - - number_exact_matches(query_index, attribute, is_exact, fields_counts) - }; - - let doc1 = { - let query_index = &[0]; - let attribute = &[0]; - let is_exact = &[true]; - let fields_counts = Set::new(&[AttrCount { attr: 0, count: 4 }]).unwrap(); - - number_exact_matches(query_index, attribute, is_exact, fields_counts) - }; - - assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less); - } } diff --git a/meilisearch-core/src/criterion/mod.rs b/meilisearch-core/src/criterion/mod.rs index e94b1b2c7..0d54d89f2 100644 --- a/meilisearch-core/src/criterion/mod.rs +++ b/meilisearch-core/src/criterion/mod.rs @@ -1,58 +1,58 @@ -mod document_id; -mod exact; -mod number_of_words; -mod sort_by_attr; -mod sum_of_typos; -mod sum_of_words_attribute; -mod sum_of_words_position; -mod words_proximity; +use std::cmp::{self, Ordering}; +use compact_arena::SmallArena; +use sdset::SetBuf; +use slice_group_by::GroupBy; + +use crate::automaton::QueryEnhancer; +use crate::bucket_sort::{SimpleMatch, PostingsListView, QueryWordAutomaton}; use crate::RawDocument; -use std::cmp::Ordering; -pub use self::{ - document_id::DocumentId, exact::Exact, number_of_words::NumberOfWords, - sort_by_attr::SortByAttr, sum_of_typos::SumOfTypos, - sum_of_words_attribute::SumOfWordsAttribute, sum_of_words_position::SumOfWordsPosition, - words_proximity::WordsProximity, -}; +mod typo; +mod words; +mod proximity; +mod attribute; +mod words_position; +mod exact; +mod document_id; +mod sort_by_attr; -pub trait Criterion: Send + Sync { - fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering; +pub use self::typo::Typo; +pub use self::words::Words; +pub use self::proximity::Proximity; +pub use self::attribute::Attribute; +pub use self::words_position::WordsPosition; +pub use self::exact::Exact; +pub use self::document_id::DocumentId; +pub use self::sort_by_attr::SortByAttr; +pub trait Criterion { fn name(&self) -> &str; + fn prepare<'a, 'tag, 'txn>( + &self, + documents: &mut [RawDocument<'a, 'tag>], + postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>, + query_enhancer: &QueryEnhancer, + automatons: &[QueryWordAutomaton], + ); + + fn evaluate<'a, 'tag, 'txn>( + &self, + lhs: &RawDocument<'a, 'tag>, + rhs: &RawDocument<'a, 'tag>, + postings_lists: &SmallArena<'tag, PostingsListView<'txn>>, + ) -> Ordering; + #[inline] - fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool { - self.evaluate(lhs, rhs) == Ordering::Equal - } -} - -impl<'a, T: Criterion + ?Sized + Send + Sync> Criterion for &'a T { - fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { - (**self).evaluate(lhs, rhs) - } - - fn name(&self) -> &str { - (**self).name() - } - - fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool { - (**self).eq(lhs, rhs) - } -} - -impl Criterion for Box { - fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { - (**self).evaluate(lhs, rhs) - } - - fn name(&self) -> &str { - (**self).name() - } - - fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool { - (**self).eq(lhs, rhs) + fn eq<'a, 'tag, 'txn>( + &self, + lhs: &RawDocument<'a, 'tag>, + rhs: &RawDocument<'a, 'tag>, + postings_lists: &SmallArena<'tag, PostingsListView<'txn>>, + ) -> bool + { + self.evaluate(lhs, rhs, postings_lists) == Ordering::Equal } } @@ -103,11 +103,11 @@ pub struct Criteria<'a> { impl<'a> Default for Criteria<'a> { fn default() -> Self { CriteriaBuilder::with_capacity(7) - .add(SumOfTypos) - .add(NumberOfWords) - .add(WordsProximity) - .add(SumOfWordsAttribute) - .add(SumOfWordsPosition) + .add(Typo) + .add(Words) + .add(Proximity) + .add(Attribute) + .add(WordsPosition) .add(Exact) .add(DocumentId) .build() @@ -119,3 +119,165 @@ impl<'a> AsRef<[Box]> for Criteria<'a> { &self.inner } } + +fn prepare_query_distances<'a, 'tag, 'txn>( + documents: &mut [RawDocument<'a, 'tag>], + query_enhancer: &QueryEnhancer, + automatons: &[QueryWordAutomaton], + postings_lists: &SmallArena<'tag, PostingsListView<'txn>>, +) { + for document in documents { + if !document.processed_distances.is_empty() { continue } + + let mut processed = Vec::new(); + for m in document.raw_matches.iter() { + if postings_lists[m.postings_list].is_empty() { continue } + + let range = query_enhancer.replacement(m.query_index as u32); + let new_len = cmp::max(range.end as usize, processed.len()); + processed.resize(new_len, None); + + for index in range { + let index = index as usize; + processed[index] = match processed[index] { + Some(distance) if distance > m.distance => Some(m.distance), + Some(distance) => Some(distance), + None => Some(m.distance), + }; + } + } + + document.processed_distances = processed; + } +} + +fn prepare_raw_matches<'a, 'tag, 'txn>( + documents: &mut [RawDocument<'a, 'tag>], + postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>, + query_enhancer: &QueryEnhancer, + automatons: &[QueryWordAutomaton], +) { + for document in documents { + if !document.processed_matches.is_empty() { continue } + + let mut processed = Vec::new(); + for m in document.raw_matches.iter() { + let postings_list = &postings_lists[m.postings_list]; + processed.reserve(postings_list.len()); + for di in postings_list.as_ref() { + let simple_match = SimpleMatch { + query_index: m.query_index, + distance: m.distance, + attribute: di.attribute, + word_index: di.word_index, + is_exact: m.is_exact, + }; + processed.push(simple_match); + } + } + + let processed = multiword_rewrite_matches(&mut processed, query_enhancer, automatons); + document.processed_matches = processed.into_vec(); + } +} + +fn multiword_rewrite_matches( + matches: &mut [SimpleMatch], + query_enhancer: &QueryEnhancer, + automatons: &[QueryWordAutomaton], +) -> SetBuf +{ + matches.sort_unstable_by_key(|m| (m.attribute, m.word_index)); + + let mut padded_matches = Vec::with_capacity(matches.len()); + + // let before_padding = Instant::now(); + // for each attribute of each document + for same_document_attribute in matches.linear_group_by_key(|m| m.attribute) { + // padding will only be applied + // to word indices in the same attribute + let mut padding = 0; + let mut iter = same_document_attribute.linear_group_by_key(|m| m.word_index); + + // for each match at the same position + // in this document attribute + while let Some(same_word_index) = iter.next() { + // find the biggest padding + let mut biggest = 0; + for match_ in same_word_index { + let mut replacement = query_enhancer.replacement(match_.query_index as u32); + let replacement_len = replacement.len(); + let nexts = iter.remainder().linear_group_by_key(|m| m.word_index); + + if let Some(query_index) = replacement.next() { + let word_index = match_.word_index + padding as u16; + let query_index = query_index as u16; + let match_ = SimpleMatch { query_index, word_index, ..*match_ }; + padded_matches.push(match_); + } + + let mut found = false; + + // look ahead and if there already is a match + // corresponding to this padding word, abort the padding + 'padding: for (x, next_group) in nexts.enumerate() { + for (i, query_index) in replacement.clone().enumerate().skip(x) { + let word_index = match_.word_index + padding as u16 + (i + 1) as u16; + let query_index = query_index as u16; + let padmatch = SimpleMatch { query_index, word_index, ..*match_ }; + + for nmatch_ in next_group { + let mut rep = query_enhancer.replacement(nmatch_.query_index as u32); + let query_index = rep.next().unwrap() as u16; + if query_index == padmatch.query_index { + if !found { + // if we find a corresponding padding for the + // first time we must push preceding paddings + for (i, query_index) in replacement.clone().enumerate().take(i) + { + let word_index = match_.word_index + padding as u16 + (i + 1) as u16; + let query_index = query_index as u16; + let match_ = SimpleMatch { query_index, word_index, ..*match_ }; + padded_matches.push(match_); + biggest = biggest.max(i + 1); + } + } + + padded_matches.push(padmatch); + found = true; + continue 'padding; + } + } + } + + // if we do not find a corresponding padding in the + // next groups so stop here and pad what was found + break; + } + + if !found { + // if no padding was found in the following matches + // we must insert the entire padding + for (i, query_index) in replacement.enumerate() { + let word_index = match_.word_index + padding as u16 + (i + 1) as u16; + let query_index = query_index as u16; + let match_ = SimpleMatch { query_index, word_index, ..*match_ }; + padded_matches.push(match_); + } + + biggest = biggest.max(replacement_len - 1); + } + } + + padding += biggest; + } + } + + // debug!("padding matches took {:.02?}", before_padding.elapsed()); + + // With this check we can see that the loop above takes something + // like 43% of the search time even when no rewrite is needed. + // assert_eq!(before_matches, padded_matches); + + SetBuf::from_dirty(padded_matches) +} diff --git a/meilisearch-core/src/criterion/number_of_words.rs b/meilisearch-core/src/criterion/number_of_words.rs deleted file mode 100644 index 6c1218e2f..000000000 --- a/meilisearch-core/src/criterion/number_of_words.rs +++ /dev/null @@ -1,31 +0,0 @@ -use crate::criterion::Criterion; -use crate::RawDocument; -use slice_group_by::GroupBy; -use std::cmp::Ordering; - -#[inline] -fn number_of_query_words(query_index: &[u32]) -> usize { - query_index.linear_group().count() -} - -#[derive(Debug, Clone, Copy)] -pub struct NumberOfWords; - -impl Criterion for NumberOfWords { - fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { - let lhs = { - let query_index = lhs.query_index(); - number_of_query_words(query_index) - }; - let rhs = { - let query_index = rhs.query_index(); - number_of_query_words(query_index) - }; - - lhs.cmp(&rhs).reverse() - } - - fn name(&self) -> &str { - "NumberOfWords" - } -} diff --git a/meilisearch-core/src/criterion/proximity.rs b/meilisearch-core/src/criterion/proximity.rs new file mode 100644 index 000000000..c9c534ca8 --- /dev/null +++ b/meilisearch-core/src/criterion/proximity.rs @@ -0,0 +1,79 @@ +use std::cmp::{self, Ordering}; + +use compact_arena::SmallArena; +use slice_group_by::GroupBy; + +use crate::automaton::QueryEnhancer; +use crate::bucket_sort::{PostingsListView, SimpleMatch, QueryWordAutomaton}; +use crate::RawDocument; + +use super::{Criterion, prepare_raw_matches}; + +pub struct Proximity; + +impl Criterion for Proximity { + fn name(&self) -> &str { "proximity" } + + fn prepare<'a, 'tag, 'txn>( + &self, + documents: &mut [RawDocument<'a, 'tag>], + postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>, + query_enhancer: &QueryEnhancer, + automatons: &[QueryWordAutomaton], + ) { + prepare_raw_matches(documents, postings_lists, query_enhancer, automatons); + } + + fn evaluate<'a, 'tag, 'txn>( + &self, + lhs: &RawDocument<'a, 'tag>, + rhs: &RawDocument<'a, 'tag>, + postings_lists: &SmallArena<'tag, PostingsListView<'txn>>, + ) -> Ordering + { + const MAX_DISTANCE: u16 = 8; + + fn index_proximity(lhs: u16, rhs: u16) -> u16 { + if lhs < rhs { + cmp::min(rhs - lhs, MAX_DISTANCE) + } else { + cmp::min(lhs - rhs, MAX_DISTANCE) + 1 + } + } + + fn attribute_proximity(lhs: SimpleMatch, rhs: SimpleMatch) -> u16 { + if lhs.attribute != rhs.attribute { MAX_DISTANCE } + else { index_proximity(lhs.word_index, rhs.word_index) } + } + + fn min_proximity(lhs: &[SimpleMatch], rhs: &[SimpleMatch]) -> u16 { + let mut min_prox = u16::max_value(); + for a in lhs { + for b in rhs { + let prox = attribute_proximity(*a, *b); + min_prox = cmp::min(min_prox, prox); + } + } + min_prox + } + + fn matches_proximity(matches: &[SimpleMatch],) -> u16 { + let mut proximity = 0; + let mut iter = matches.linear_group_by_key(|m| m.query_index); + + // iterate over groups by windows of size 2 + let mut last = iter.next(); + while let (Some(lhs), Some(rhs)) = (last, iter.next()) { + proximity += min_proximity(lhs, rhs); + last = Some(rhs); + } + + proximity + } + + let lhs = matches_proximity(&lhs.processed_matches); + let rhs = matches_proximity(&rhs.processed_matches); + + lhs.cmp(&rhs) + } +} diff --git a/meilisearch-core/src/criterion/sort_by_attr.rs b/meilisearch-core/src/criterion/sort_by_attr.rs index 89595e5a5..ea1c016da 100644 --- a/meilisearch-core/src/criterion/sort_by_attr.rs +++ b/meilisearch-core/src/criterion/sort_by_attr.rs @@ -2,9 +2,13 @@ use std::cmp::Ordering; use std::error::Error; use std::fmt; +use compact_arena::SmallArena; +use meilisearch_schema::{Schema, SchemaAttr}; + +use crate::automaton::QueryEnhancer; +use crate::bucket_sort::{PostingsListView, QueryWordAutomaton}; use crate::criterion::Criterion; use crate::{RankedMap, RawDocument}; -use meilisearch_schema::{Schema, SchemaAttr}; /// An helper struct that permit to sort documents by /// some of their stored attributes. @@ -28,11 +32,11 @@ use meilisearch_schema::{Schema, SchemaAttr}; /// let custom_ranking = SortByAttr::lower_is_better(&ranked_map, &schema, "published_at")?; /// /// let builder = CriteriaBuilder::with_capacity(8) -/// .add(SumOfTypos) -/// .add(NumberOfWords) -/// .add(WordsProximity) -/// .add(SumOfWordsAttribute) -/// .add(SumOfWordsPosition) +/// .add(Typo) +/// .add(Words) +/// .add(Proximity) +/// .add(Attribute) +/// .add(WordsPosition) /// .add(Exact) /// .add(custom_ranking) /// .add(DocumentId); @@ -86,8 +90,28 @@ impl<'a> SortByAttr<'a> { } } -impl<'a> Criterion for SortByAttr<'a> { - fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { +impl Criterion for SortByAttr<'_> { + fn name(&self) -> &str { + "sort by attribute" + } + + fn prepare<'a, 'tag, 'txn>( + &self, + documents: &mut [RawDocument<'a, 'tag>], + postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>, + query_enhancer: &QueryEnhancer, + automatons: &[QueryWordAutomaton], + ) { + // ... + } + + fn evaluate<'a, 'tag, 'txn>( + &self, + lhs: &RawDocument<'a, 'tag>, + rhs: &RawDocument<'a, 'tag>, + postings_lists: &SmallArena<'tag, PostingsListView<'txn>>, + ) -> Ordering + { let lhs = self.ranked_map.get(lhs.id, self.attr); let rhs = self.ranked_map.get(rhs.id, self.attr); @@ -105,10 +129,6 @@ impl<'a> Criterion for SortByAttr<'a> { (None, None) => Ordering::Equal, } } - - fn name(&self) -> &str { - "SortByAttr" - } } #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] diff --git a/meilisearch-core/src/criterion/sum_of_typos.rs b/meilisearch-core/src/criterion/sum_of_typos.rs deleted file mode 100644 index 5cad73b42..000000000 --- a/meilisearch-core/src/criterion/sum_of_typos.rs +++ /dev/null @@ -1,116 +0,0 @@ -use std::cmp::Ordering; - -use slice_group_by::GroupBy; - -use crate::criterion::Criterion; -use crate::RawDocument; - -// This function is a wrong logarithmic 10 function. -// It is safe to panic on input number higher than 3, -// the number of typos is never bigger than that. -#[inline] -fn custom_log10(n: u8) -> f32 { - match n { - 0 => 0.0, // log(1) - 1 => 0.30102, // log(2) - 2 => 0.47712, // log(3) - 3 => 0.60205, // log(4) - _ => panic!("invalid number"), - } -} - -#[inline] -fn sum_matches_typos(query_index: &[u32], distance: &[u8]) -> usize { - let mut number_words: usize = 0; - let mut sum_typos = 0.0; - let mut index = 0; - - for group in query_index.linear_group() { - sum_typos += custom_log10(distance[index]); - number_words += 1; - index += group.len(); - } - - (number_words as f32 / (sum_typos + 1.0) * 1000.0) as usize -} - -#[derive(Debug, Clone, Copy)] -pub struct SumOfTypos; - -impl Criterion for SumOfTypos { - fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { - let lhs = { - let query_index = lhs.query_index(); - let distance = lhs.distance(); - sum_matches_typos(query_index, distance) - }; - - let rhs = { - let query_index = rhs.query_index(); - let distance = rhs.distance(); - sum_matches_typos(query_index, distance) - }; - - lhs.cmp(&rhs).reverse() - } - - fn name(&self) -> &str { - "SumOfTypos" - } -} - -#[cfg(test)] -mod tests { - use super::*; - - // typing: "Geox CEO" - // - // doc0: "Geox SpA: CEO and Executive" - // doc1: "Mt. Gox CEO Resigns From Bitcoin Foundation" - #[test] - fn one_typo_reference() { - let query_index0 = &[0, 1]; - let distance0 = &[0, 0]; - - let query_index1 = &[0, 1]; - let distance1 = &[1, 0]; - - let doc0 = sum_matches_typos(query_index0, distance0); - let doc1 = sum_matches_typos(query_index1, distance1); - assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less); - } - - // typing: "bouton manchette" - // - // doc0: "bouton manchette" - // doc1: "bouton" - #[test] - fn no_typo() { - let query_index0 = &[0, 1]; - let distance0 = &[0, 0]; - - let query_index1 = &[0]; - let distance1 = &[0]; - - let doc0 = sum_matches_typos(query_index0, distance0); - let doc1 = sum_matches_typos(query_index1, distance1); - assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less); - } - - // typing: "bouton manchztte" - // - // doc0: "bouton manchette" - // doc1: "bouton" - #[test] - fn one_typo() { - let query_index0 = &[0, 1]; - let distance0 = &[0, 1]; - - let query_index1 = &[0]; - let distance1 = &[0]; - - let doc0 = sum_matches_typos(query_index0, distance0); - let doc1 = sum_matches_typos(query_index1, distance1); - assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less); - } -} diff --git a/meilisearch-core/src/criterion/sum_of_words_attribute.rs b/meilisearch-core/src/criterion/sum_of_words_attribute.rs deleted file mode 100644 index 472d771b7..000000000 --- a/meilisearch-core/src/criterion/sum_of_words_attribute.rs +++ /dev/null @@ -1,64 +0,0 @@ -use crate::criterion::Criterion; -use crate::RawDocument; -use slice_group_by::GroupBy; -use std::cmp::Ordering; - -#[inline] -fn sum_matches_attributes(query_index: &[u32], attribute: &[u16]) -> usize { - let mut sum_attributes = 0; - let mut index = 0; - - for group in query_index.linear_group() { - sum_attributes += attribute[index] as usize; - index += group.len(); - } - - sum_attributes -} - -#[derive(Debug, Clone, Copy)] -pub struct SumOfWordsAttribute; - -impl Criterion for SumOfWordsAttribute { - fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { - let lhs = { - let query_index = lhs.query_index(); - let attribute = lhs.attribute(); - sum_matches_attributes(query_index, attribute) - }; - - let rhs = { - let query_index = rhs.query_index(); - let attribute = rhs.attribute(); - sum_matches_attributes(query_index, attribute) - }; - - lhs.cmp(&rhs) - } - - fn name(&self) -> &str { - "SumOfWordsAttribute" - } -} - -#[cfg(test)] -mod tests { - use super::*; - - // typing: "soulier" - // - // doc0: { 0. "Soulier bleu", 1. "bla bla bla" } - // doc1: { 0. "Botte rouge", 1. "Soulier en cuir" } - #[test] - fn title_vs_description() { - let query_index0 = &[0]; - let attribute0 = &[0]; - - let query_index1 = &[0]; - let attribute1 = &[1]; - - let doc0 = sum_matches_attributes(query_index0, attribute0); - let doc1 = sum_matches_attributes(query_index1, attribute1); - assert_eq!(doc0.cmp(&doc1), Ordering::Less); - } -} diff --git a/meilisearch-core/src/criterion/sum_of_words_position.rs b/meilisearch-core/src/criterion/sum_of_words_position.rs deleted file mode 100644 index 70b8843dc..000000000 --- a/meilisearch-core/src/criterion/sum_of_words_position.rs +++ /dev/null @@ -1,64 +0,0 @@ -use crate::criterion::Criterion; -use crate::RawDocument; -use slice_group_by::GroupBy; -use std::cmp::Ordering; - -#[inline] -fn sum_matches_attribute_index(query_index: &[u32], word_index: &[u16]) -> usize { - let mut sum_word_index = 0; - let mut index = 0; - - for group in query_index.linear_group() { - sum_word_index += word_index[index] as usize; - index += group.len(); - } - - sum_word_index -} - -#[derive(Debug, Clone, Copy)] -pub struct SumOfWordsPosition; - -impl Criterion for SumOfWordsPosition { - fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { - let lhs = { - let query_index = lhs.query_index(); - let word_index = lhs.word_index(); - sum_matches_attribute_index(query_index, word_index) - }; - - let rhs = { - let query_index = rhs.query_index(); - let word_index = rhs.word_index(); - sum_matches_attribute_index(query_index, word_index) - }; - - lhs.cmp(&rhs) - } - - fn name(&self) -> &str { - "SumOfWordsPosition" - } -} - -#[cfg(test)] -mod tests { - use super::*; - - // typing: "soulier" - // - // doc0: "Soulier bleu" - // doc1: "Botte rouge et soulier noir" - #[test] - fn easy_case() { - let query_index0 = &[0]; - let word_index0 = &[0]; - - let query_index1 = &[0]; - let word_index1 = &[3]; - - let doc0 = sum_matches_attribute_index(query_index0, word_index0); - let doc1 = sum_matches_attribute_index(query_index1, word_index1); - assert_eq!(doc0.cmp(&doc1), Ordering::Less); - } -} diff --git a/meilisearch-core/src/criterion/typo.rs b/meilisearch-core/src/criterion/typo.rs new file mode 100644 index 000000000..d7907700d --- /dev/null +++ b/meilisearch-core/src/criterion/typo.rs @@ -0,0 +1,67 @@ +use std::cmp::Ordering; + +use compact_arena::SmallArena; + +use crate::automaton::QueryEnhancer; +use crate::bucket_sort::{PostingsListView, QueryWordAutomaton}; +use crate::RawDocument; + +use super::{Criterion, prepare_query_distances}; + +pub struct Typo; + +impl Criterion for Typo { + fn name(&self) -> &str { "typo" } + + fn prepare<'a, 'tag, 'txn>( + &self, + documents: &mut [RawDocument<'a, 'tag>], + postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>, + query_enhancer: &QueryEnhancer, + automatons: &[QueryWordAutomaton], + ) { + prepare_query_distances(documents, query_enhancer, automatons, postings_lists); + } + + fn evaluate( + &self, + lhs: &RawDocument, + rhs: &RawDocument, + postings_lists: &SmallArena, + ) -> Ordering + { + // This function is a wrong logarithmic 10 function. + // It is safe to panic on input number higher than 3, + // the number of typos is never bigger than that. + #[inline] + fn custom_log10(n: u8) -> f32 { + match n { + 0 => 0.0, // log(1) + 1 => 0.30102, // log(2) + 2 => 0.47712, // log(3) + 3 => 0.60205, // log(4) + _ => panic!("invalid number"), + } + } + + #[inline] + fn compute_typos(distances: &[Option]) -> usize { + let mut number_words: usize = 0; + let mut sum_typos = 0.0; + + for distance in distances { + if let Some(distance) = distance { + sum_typos += custom_log10(*distance); + number_words += 1; + } + } + + (number_words as f32 / (sum_typos + 1.0) * 1000.0) as usize + } + + let lhs = compute_typos(&lhs.processed_distances); + let rhs = compute_typos(&rhs.processed_distances); + + lhs.cmp(&rhs).reverse() + } +} diff --git a/meilisearch-core/src/criterion/words.rs b/meilisearch-core/src/criterion/words.rs new file mode 100644 index 000000000..fbe3d9070 --- /dev/null +++ b/meilisearch-core/src/criterion/words.rs @@ -0,0 +1,43 @@ +use std::cmp::Ordering; + +use compact_arena::SmallArena; + +use crate::automaton::QueryEnhancer; +use crate::bucket_sort::{PostingsListView, QueryWordAutomaton}; +use crate::RawDocument; + +use super::{Criterion, prepare_query_distances}; + +pub struct Words; + +impl Criterion for Words { + fn name(&self) -> &str { "words" } + + fn prepare<'a, 'tag, 'txn>( + &self, + documents: &mut [RawDocument<'a, 'tag>], + postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>, + query_enhancer: &QueryEnhancer, + automatons: &[QueryWordAutomaton], + ) { + prepare_query_distances(documents, query_enhancer, automatons, postings_lists); + } + + fn evaluate( + &self, + lhs: &RawDocument, + rhs: &RawDocument, + postings_lists: &SmallArena, + ) -> Ordering + { + #[inline] + fn number_of_query_words(distances: &[Option]) -> usize { + distances.iter().cloned().filter(Option::is_some).count() + } + + let lhs = number_of_query_words(&lhs.processed_distances); + let rhs = number_of_query_words(&rhs.processed_distances); + + lhs.cmp(&rhs).reverse() + } +} diff --git a/meilisearch-core/src/criterion/words_position.rs b/meilisearch-core/src/criterion/words_position.rs new file mode 100644 index 000000000..7df3e1fbd --- /dev/null +++ b/meilisearch-core/src/criterion/words_position.rs @@ -0,0 +1,48 @@ +use std::cmp::Ordering; + +use compact_arena::SmallArena; +use slice_group_by::GroupBy; + +use crate::automaton::QueryEnhancer; +use crate::bucket_sort::{PostingsListView, SimpleMatch, QueryWordAutomaton}; +use crate::RawDocument; + +use super::{Criterion, prepare_raw_matches}; + +pub struct WordsPosition; + +impl Criterion for WordsPosition { + fn name(&self) -> &str { "words position" } + + fn prepare<'a, 'tag, 'txn>( + &self, + documents: &mut [RawDocument<'a, 'tag>], + postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>, + query_enhancer: &QueryEnhancer, + automatons: &[QueryWordAutomaton], + ) { + prepare_raw_matches(documents, postings_lists, query_enhancer, automatons); + } + + fn evaluate<'a, 'tag, 'txn>( + &self, + lhs: &RawDocument<'a, 'tag>, + rhs: &RawDocument<'a, 'tag>, + postings_lists: &SmallArena<'tag, PostingsListView<'txn>>, + ) -> Ordering + { + #[inline] + fn sum_words_position(matches: &[SimpleMatch]) -> usize { + let mut sum_words_position = 0; + for group in matches.linear_group_by_key(|bm| bm.query_index) { + sum_words_position += group[0].word_index as usize; + } + sum_words_position + } + + let lhs = sum_words_position(&lhs.processed_matches); + let rhs = sum_words_position(&rhs.processed_matches); + + lhs.cmp(&rhs) + } +} diff --git a/meilisearch-core/src/criterion/words_proximity.rs b/meilisearch-core/src/criterion/words_proximity.rs deleted file mode 100644 index 579bc7b8c..000000000 --- a/meilisearch-core/src/criterion/words_proximity.rs +++ /dev/null @@ -1,164 +0,0 @@ -use crate::criterion::Criterion; -use crate::RawDocument; -use slice_group_by::GroupBy; -use std::cmp::{self, Ordering}; - -const MAX_DISTANCE: u16 = 8; - -#[inline] -fn clone_tuple((a, b): (&T, &U)) -> (T, U) { - (a.clone(), b.clone()) -} - -fn index_proximity(lhs: u16, rhs: u16) -> u16 { - if lhs < rhs { - cmp::min(rhs - lhs, MAX_DISTANCE) - } else { - cmp::min(lhs - rhs, MAX_DISTANCE) + 1 - } -} - -fn attribute_proximity((lattr, lwi): (u16, u16), (rattr, rwi): (u16, u16)) -> u16 { - if lattr != rattr { - return MAX_DISTANCE; - } - index_proximity(lwi, rwi) -} - -fn min_proximity((lattr, lwi): (&[u16], &[u16]), (rattr, rwi): (&[u16], &[u16])) -> u16 { - let mut min_prox = u16::max_value(); - - for a in lattr.iter().zip(lwi) { - for b in rattr.iter().zip(rwi) { - let a = clone_tuple(a); - let b = clone_tuple(b); - min_prox = cmp::min(min_prox, attribute_proximity(a, b)); - } - } - - min_prox -} - -fn matches_proximity( - query_index: &[u32], - distance: &[u8], - attribute: &[u16], - word_index: &[u16], -) -> u16 { - let mut query_index_groups = query_index.linear_group(); - let mut proximity = 0; - let mut index = 0; - - let get_attr_wi = |index: usize, group_len: usize| { - // retrieve the first distance group (with the lowest values) - let len = distance[index..index + group_len] - .linear_group() - .next() - .unwrap() - .len(); - - let rattr = &attribute[index..index + len]; - let rwi = &word_index[index..index + len]; - - (rattr, rwi) - }; - - let mut last = query_index_groups.next().map(|group| { - let attr_wi = get_attr_wi(index, group.len()); - index += group.len(); - attr_wi - }); - - // iter by windows of size 2 - while let (Some(lhs), Some(rhs)) = (last, query_index_groups.next()) { - let attr_wi = get_attr_wi(index, rhs.len()); - proximity += min_proximity(lhs, attr_wi); - last = Some(attr_wi); - index += rhs.len(); - } - - proximity -} - -#[derive(Debug, Clone, Copy)] -pub struct WordsProximity; - -impl Criterion for WordsProximity { - fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { - let lhs = { - let query_index = lhs.query_index(); - let distance = lhs.distance(); - let attribute = lhs.attribute(); - let word_index = lhs.word_index(); - matches_proximity(query_index, distance, attribute, word_index) - }; - - let rhs = { - let query_index = rhs.query_index(); - let distance = rhs.distance(); - let attribute = rhs.attribute(); - let word_index = rhs.word_index(); - matches_proximity(query_index, distance, attribute, word_index) - }; - - lhs.cmp(&rhs) - } - - fn name(&self) -> &str { - "WordsProximity" - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn three_different_attributes() { - // "soup" "of the" "the day" - // - // { id: 0, attr: 0, attr_index: 0 } - // { id: 1, attr: 1, attr_index: 0 } - // { id: 2, attr: 1, attr_index: 1 } - // { id: 2, attr: 2, attr_index: 0 } - // { id: 3, attr: 3, attr_index: 1 } - - let query_index = &[0, 1, 2, 2, 3]; - let distance = &[0, 0, 0, 0, 0]; - let attribute = &[0, 1, 1, 2, 3]; - let word_index = &[0, 0, 1, 0, 1]; - - // soup -> of = 8 - // + of -> the = 1 - // + the -> day = 8 (not 1) - assert_eq!( - matches_proximity(query_index, distance, attribute, word_index), - 17 - ); - } - - #[test] - fn two_different_attributes() { - // "soup day" "soup of the day" - // - // { id: 0, attr: 0, attr_index: 0 } - // { id: 0, attr: 1, attr_index: 0 } - // { id: 1, attr: 1, attr_index: 1 } - // { id: 2, attr: 1, attr_index: 2 } - // { id: 3, attr: 0, attr_index: 1 } - // { id: 3, attr: 1, attr_index: 3 } - - let query_index = &[0, 0, 1, 2, 3, 3]; - let distance = &[0, 0, 0, 0, 0, 0]; - let attribute = &[0, 1, 1, 1, 0, 1]; - let word_index = &[0, 0, 1, 2, 1, 3]; - - // soup -> of = 1 - // + of -> the = 1 - // + the -> day = 1 - assert_eq!( - matches_proximity(query_index, distance, attribute, word_index), - 3 - ); - } -} diff --git a/meilisearch-core/src/criterion2.rs b/meilisearch-core/src/criterion2.rs deleted file mode 100644 index a82dbf123..000000000 --- a/meilisearch-core/src/criterion2.rs +++ /dev/null @@ -1,514 +0,0 @@ -use std::cmp::{self, Ordering, Reverse}; -use std::borrow::Cow; -use std::sync::atomic::{self, AtomicUsize}; - -use slice_group_by::{GroupBy, GroupByMut}; -use compact_arena::SmallArena; -use sdset::{Set, SetBuf}; -use log::debug; - -use crate::{DocIndex, DocumentId}; -use crate::bucket_sort::{BareMatch, SimpleMatch, RawDocument, PostingsListView, QueryWordAutomaton}; -use crate::automaton::QueryEnhancer; - -type PostingsListsArena<'tag, 'txn> = SmallArena<'tag, PostingsListView<'txn>>; - -pub trait Criterion { - fn name(&self) -> &str; - - fn prepare<'a, 'tag, 'txn>( - &self, - documents: &mut [RawDocument<'a, 'tag>], - postings_lists: &mut PostingsListsArena<'tag, 'txn>, - query_enhancer: &QueryEnhancer, - automatons: &[QueryWordAutomaton], - ); - - fn evaluate<'a, 'tag, 'txn>( - &self, - lhs: &RawDocument<'a, 'tag>, - rhs: &RawDocument<'a, 'tag>, - postings_lists: &PostingsListsArena<'tag, 'txn>, - ) -> Ordering; - - #[inline] - fn eq<'a, 'tag, 'txn>( - &self, - lhs: &RawDocument<'a, 'tag>, - rhs: &RawDocument<'a, 'tag>, - postings_lists: &PostingsListsArena<'tag, 'txn>, - ) -> bool - { - self.evaluate(lhs, rhs, postings_lists) == Ordering::Equal - } -} - -fn prepare_query_distances<'a, 'tag, 'txn>( - documents: &mut [RawDocument<'a, 'tag>], - query_enhancer: &QueryEnhancer, - automatons: &[QueryWordAutomaton], - postings_lists: &PostingsListsArena<'tag, 'txn>, -) { - for document in documents { - if !document.processed_distances.is_empty() { continue } - - let mut processed = Vec::new(); - for m in document.raw_matches.iter() { - if postings_lists[m.postings_list].is_empty() { continue } - - let range = query_enhancer.replacement(m.query_index as u32); - let new_len = cmp::max(range.end as usize, processed.len()); - processed.resize(new_len, None); - - for index in range { - let index = index as usize; - processed[index] = match processed[index] { - Some(distance) if distance > m.distance => Some(m.distance), - Some(distance) => Some(distance), - None => Some(m.distance), - }; - } - } - - document.processed_distances = processed; - } -} - -pub struct Typo; - -impl Criterion for Typo { - fn name(&self) -> &str { "typo" } - - fn prepare<'a, 'tag, 'txn>( - &self, - documents: &mut [RawDocument<'a, 'tag>], - postings_lists: &mut PostingsListsArena<'tag, 'txn>, - query_enhancer: &QueryEnhancer, - automatons: &[QueryWordAutomaton], - ) { - prepare_query_distances(documents, query_enhancer, automatons, postings_lists); - } - - fn evaluate( - &self, - lhs: &RawDocument, - rhs: &RawDocument, - postings_lists: &PostingsListsArena, - ) -> Ordering - { - // This function is a wrong logarithmic 10 function. - // It is safe to panic on input number higher than 3, - // the number of typos is never bigger than that. - #[inline] - fn custom_log10(n: u8) -> f32 { - match n { - 0 => 0.0, // log(1) - 1 => 0.30102, // log(2) - 2 => 0.47712, // log(3) - 3 => 0.60205, // log(4) - _ => panic!("invalid number"), - } - } - - #[inline] - fn compute_typos(distances: &[Option]) -> usize { - let mut number_words: usize = 0; - let mut sum_typos = 0.0; - - for distance in distances { - if let Some(distance) = distance { - sum_typos += custom_log10(*distance); - number_words += 1; - } - } - - (number_words as f32 / (sum_typos + 1.0) * 1000.0) as usize - } - - let lhs = compute_typos(&lhs.processed_distances); - let rhs = compute_typos(&rhs.processed_distances); - - lhs.cmp(&rhs).reverse() - } -} - -pub struct Words; - -impl Criterion for Words { - fn name(&self) -> &str { "words" } - - fn prepare<'a, 'tag, 'txn>( - &self, - documents: &mut [RawDocument<'a, 'tag>], - postings_lists: &mut PostingsListsArena<'tag, 'txn>, - query_enhancer: &QueryEnhancer, - automatons: &[QueryWordAutomaton], - ) { - prepare_query_distances(documents, query_enhancer, automatons, postings_lists); - } - - fn evaluate( - &self, - lhs: &RawDocument, - rhs: &RawDocument, - postings_lists: &PostingsListsArena, - ) -> Ordering - { - #[inline] - fn number_of_query_words(distances: &[Option]) -> usize { - distances.iter().cloned().filter(Option::is_some).count() - } - - let lhs = number_of_query_words(&lhs.processed_distances); - let rhs = number_of_query_words(&rhs.processed_distances); - - lhs.cmp(&rhs).reverse() - } -} - -fn prepare_raw_matches<'a, 'tag, 'txn>( - documents: &mut [RawDocument<'a, 'tag>], - postings_lists: &mut PostingsListsArena<'tag, 'txn>, - query_enhancer: &QueryEnhancer, - automatons: &[QueryWordAutomaton], -) { - for document in documents { - if !document.processed_matches.is_empty() { continue } - - let mut processed = Vec::new(); - for m in document.raw_matches.iter() { - let postings_list = &postings_lists[m.postings_list]; - processed.reserve(postings_list.len()); - for di in postings_list.as_ref() { - let simple_match = SimpleMatch { - query_index: m.query_index, - distance: m.distance, - attribute: di.attribute, - word_index: di.word_index, - is_exact: m.is_exact, - }; - processed.push(simple_match); - } - } - - let processed = multiword_rewrite_matches(&mut processed, query_enhancer, automatons); - document.processed_matches = processed.into_vec(); - } -} - -pub struct Proximity; - -impl Criterion for Proximity { - fn name(&self) -> &str { "proximity" } - - fn prepare<'a, 'tag, 'txn>( - &self, - documents: &mut [RawDocument<'a, 'tag>], - postings_lists: &mut PostingsListsArena<'tag, 'txn>, - query_enhancer: &QueryEnhancer, - automatons: &[QueryWordAutomaton], - ) { - prepare_raw_matches(documents, postings_lists, query_enhancer, automatons); - } - - fn evaluate<'a, 'tag, 'txn>( - &self, - lhs: &RawDocument<'a, 'tag>, - rhs: &RawDocument<'a, 'tag>, - postings_lists: &PostingsListsArena<'tag, 'txn>, - ) -> Ordering - { - const MAX_DISTANCE: u16 = 8; - - fn index_proximity(lhs: u16, rhs: u16) -> u16 { - if lhs < rhs { - cmp::min(rhs - lhs, MAX_DISTANCE) - } else { - cmp::min(lhs - rhs, MAX_DISTANCE) + 1 - } - } - - fn attribute_proximity(lhs: SimpleMatch, rhs: SimpleMatch) -> u16 { - if lhs.attribute != rhs.attribute { MAX_DISTANCE } - else { index_proximity(lhs.word_index, rhs.word_index) } - } - - fn min_proximity(lhs: &[SimpleMatch], rhs: &[SimpleMatch]) -> u16 { - let mut min_prox = u16::max_value(); - for a in lhs { - for b in rhs { - let prox = attribute_proximity(*a, *b); - min_prox = cmp::min(min_prox, prox); - } - } - min_prox - } - - fn matches_proximity(matches: &[SimpleMatch],) -> u16 { - let mut proximity = 0; - let mut iter = matches.linear_group_by_key(|m| m.query_index); - - // iterate over groups by windows of size 2 - let mut last = iter.next(); - while let (Some(lhs), Some(rhs)) = (last, iter.next()) { - proximity += min_proximity(lhs, rhs); - last = Some(rhs); - } - - proximity - } - - let lhs = matches_proximity(&lhs.processed_matches); - let rhs = matches_proximity(&rhs.processed_matches); - - lhs.cmp(&rhs) - } -} - -pub struct Attribute; - -impl Criterion for Attribute { - fn name(&self) -> &str { "attribute" } - - fn prepare<'a, 'tag, 'txn>( - &self, - documents: &mut [RawDocument<'a, 'tag>], - postings_lists: &mut PostingsListsArena<'tag, 'txn>, - query_enhancer: &QueryEnhancer, - automatons: &[QueryWordAutomaton], - ) { - prepare_raw_matches(documents, postings_lists, query_enhancer, automatons); - } - - fn evaluate<'a, 'tag, 'txn>( - &self, - lhs: &RawDocument<'a, 'tag>, - rhs: &RawDocument<'a, 'tag>, - postings_lists: &PostingsListsArena<'tag, 'txn>, - ) -> Ordering - { - #[inline] - fn best_attribute(matches: &[SimpleMatch]) -> u16 { - let mut best_attribute = u16::max_value(); - for group in matches.linear_group_by_key(|bm| bm.query_index) { - best_attribute = cmp::min(best_attribute, group[0].attribute); - } - best_attribute - } - - let lhs = best_attribute(&lhs.processed_matches); - let rhs = best_attribute(&rhs.processed_matches); - - lhs.cmp(&rhs) - } -} - -pub struct WordsPosition; - -impl Criterion for WordsPosition { - fn name(&self) -> &str { "words position" } - - fn prepare<'a, 'tag, 'txn>( - &self, - documents: &mut [RawDocument<'a, 'tag>], - postings_lists: &mut PostingsListsArena<'tag, 'txn>, - query_enhancer: &QueryEnhancer, - automatons: &[QueryWordAutomaton], - ) { - prepare_raw_matches(documents, postings_lists, query_enhancer, automatons); - } - - fn evaluate<'a, 'tag, 'txn>( - &self, - lhs: &RawDocument<'a, 'tag>, - rhs: &RawDocument<'a, 'tag>, - postings_lists: &PostingsListsArena<'tag, 'txn>, - ) -> Ordering - { - #[inline] - fn sum_words_position(matches: &[SimpleMatch]) -> usize { - let mut sum_words_position = 0; - for group in matches.linear_group_by_key(|bm| bm.query_index) { - sum_words_position += group[0].word_index as usize; - } - sum_words_position - } - - let lhs = sum_words_position(&lhs.processed_matches); - let rhs = sum_words_position(&rhs.processed_matches); - - lhs.cmp(&rhs) - } -} - -pub struct Exact; - -impl Criterion for Exact { - fn name(&self) -> &str { "exact" } - - fn prepare( - &self, - documents: &mut [RawDocument], - postings_lists: &mut PostingsListsArena, - query_enhancer: &QueryEnhancer, - automatons: &[QueryWordAutomaton], - ) { - for document in documents { - document.raw_matches.sort_unstable_by_key(|bm| (bm.query_index, Reverse(bm.is_exact))); - } - } - - fn evaluate( - &self, - lhs: &RawDocument, - rhs: &RawDocument, - postings_lists: &PostingsListsArena, - ) -> Ordering - { - #[inline] - fn sum_exact_query_words(matches: &[BareMatch]) -> usize { - let mut sum_exact_query_words = 0; - - for group in matches.linear_group_by_key(|bm| bm.query_index) { - sum_exact_query_words += group[0].is_exact as usize; - } - - sum_exact_query_words - } - - let lhs = sum_exact_query_words(&lhs.raw_matches); - let rhs = sum_exact_query_words(&rhs.raw_matches); - - lhs.cmp(&rhs).reverse() - } -} - -pub struct StableDocId; - -impl Criterion for StableDocId { - fn name(&self) -> &str { "stable document id" } - - fn prepare( - &self, - documents: &mut [RawDocument], - postings_lists: &mut PostingsListsArena, - query_enhancer: &QueryEnhancer, - automatons: &[QueryWordAutomaton], - ) { - // ... - } - - fn evaluate( - &self, - lhs: &RawDocument, - rhs: &RawDocument, - postings_lists: &PostingsListsArena, - ) -> Ordering - { - let lhs = &lhs.raw_matches[0].document_id; - let rhs = &rhs.raw_matches[0].document_id; - - lhs.cmp(rhs) - } -} - -pub fn multiword_rewrite_matches( - matches: &mut [SimpleMatch], - query_enhancer: &QueryEnhancer, - automatons: &[QueryWordAutomaton], -) -> SetBuf -{ - matches.sort_unstable_by_key(|m| (m.attribute, m.word_index)); - - let mut padded_matches = Vec::with_capacity(matches.len()); - - // let before_padding = Instant::now(); - // for each attribute of each document - for same_document_attribute in matches.linear_group_by_key(|m| m.attribute) { - // padding will only be applied - // to word indices in the same attribute - let mut padding = 0; - let mut iter = same_document_attribute.linear_group_by_key(|m| m.word_index); - - // for each match at the same position - // in this document attribute - while let Some(same_word_index) = iter.next() { - // find the biggest padding - let mut biggest = 0; - for match_ in same_word_index { - let mut replacement = query_enhancer.replacement(match_.query_index as u32); - let replacement_len = replacement.len(); - let nexts = iter.remainder().linear_group_by_key(|m| m.word_index); - - if let Some(query_index) = replacement.next() { - let word_index = match_.word_index + padding as u16; - let query_index = query_index as u16; - let match_ = SimpleMatch { query_index, word_index, ..*match_ }; - padded_matches.push(match_); - } - - let mut found = false; - - // look ahead and if there already is a match - // corresponding to this padding word, abort the padding - 'padding: for (x, next_group) in nexts.enumerate() { - for (i, query_index) in replacement.clone().enumerate().skip(x) { - let word_index = match_.word_index + padding as u16 + (i + 1) as u16; - let query_index = query_index as u16; - let padmatch = SimpleMatch { query_index, word_index, ..*match_ }; - - for nmatch_ in next_group { - let mut rep = query_enhancer.replacement(nmatch_.query_index as u32); - let query_index = rep.next().unwrap() as u16; - if query_index == padmatch.query_index { - if !found { - // if we find a corresponding padding for the - // first time we must push preceding paddings - for (i, query_index) in replacement.clone().enumerate().take(i) - { - let word_index = match_.word_index + padding as u16 + (i + 1) as u16; - let query_index = query_index as u16; - let match_ = SimpleMatch { query_index, word_index, ..*match_ }; - padded_matches.push(match_); - biggest = biggest.max(i + 1); - } - } - - padded_matches.push(padmatch); - found = true; - continue 'padding; - } - } - } - - // if we do not find a corresponding padding in the - // next groups so stop here and pad what was found - break; - } - - if !found { - // if no padding was found in the following matches - // we must insert the entire padding - for (i, query_index) in replacement.enumerate() { - let word_index = match_.word_index + padding as u16 + (i + 1) as u16; - let query_index = query_index as u16; - let match_ = SimpleMatch { query_index, word_index, ..*match_ }; - padded_matches.push(match_); - } - - biggest = biggest.max(replacement_len - 1); - } - } - - padding += biggest; - } - } - - // debug!("padding matches took {:.02?}", before_padding.elapsed()); - - // With this check we can see that the loop above takes something - // like 43% of the search time even when no rewrite is needed. - // assert_eq!(before_matches, padded_matches); - - SetBuf::from_dirty(padded_matches) -} diff --git a/meilisearch-core/src/lib.rs b/meilisearch-core/src/lib.rs index 3a54168b4..01fb05372 100644 --- a/meilisearch-core/src/lib.rs +++ b/meilisearch-core/src/lib.rs @@ -20,7 +20,6 @@ mod update; // TODO replace mod bucket_sort; -mod criterion2; pub use self::database::{BoxUpdateFn, Database, MainT, UpdateT}; pub use self::error::{Error, MResult}; @@ -31,62 +30,13 @@ pub use self::store::Index; pub use self::update::{EnqueuedUpdateResult, ProcessedUpdateResult, UpdateStatus, UpdateType}; pub use meilisearch_types::{DocIndex, DocumentId, Highlight, AttrCount}; -#[doc(hidden)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] -pub struct TmpMatch { - pub query_index: u32, - pub distance: u8, - pub attribute: u16, - pub word_index: u16, - pub is_exact: bool, -} - #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] pub struct Document { pub id: DocumentId, pub highlights: Vec, - #[cfg(test)] - pub matches: Vec, -} - -impl Document { - #[cfg(not(test))] - fn from_raw(raw: RawDocument) -> Document { - Document { - id: raw.id, - highlights: raw.highlights, - } - } - - #[cfg(test)] - fn from_raw(raw: RawDocument) -> Document { - let len = raw.query_index().len(); - let mut matches = Vec::with_capacity(len); - - let query_index = raw.query_index(); - let distance = raw.distance(); - let attribute = raw.attribute(); - let word_index = raw.word_index(); - let is_exact = raw.is_exact(); - - for i in 0..len { - let match_ = TmpMatch { - query_index: query_index[i], - distance: distance[i], - attribute: attribute[i], - word_index: word_index[i], - is_exact: is_exact[i], - }; - matches.push(match_); - } - - Document { - id: raw.id, - matches, - highlights: raw.highlights, - } - } + // #[cfg(test)] + // pub matches: Vec, } #[cfg(test)] diff --git a/meilisearch-core/src/query_builder.rs b/meilisearch-core/src/query_builder.rs index c862ae2a2..3a9750ec0 100644 --- a/meilisearch-core/src/query_builder.rs +++ b/meilisearch-core/src/query_builder.rs @@ -1,21 +1,8 @@ -use hashbrown::HashMap; -use std::convert::TryFrom; use std::ops::Range; -use std::rc::Rc; -use std::time::{Duration, Instant}; -use std::{cmp, mem}; - -use fst::{IntoStreamer, Streamer}; -use log::debug; -use sdset::SetBuf; -use slice_group_by::{GroupBy, GroupByMut}; +use std::time::Duration; use crate::{bucket_sort::bucket_sort, database::MainT}; -use crate::automaton::{Automaton, AutomatonGroup, AutomatonProducer, QueryEnhancer}; -use crate::distinct_map::{BufferedDistinctMap, DistinctMap}; -use crate::levenshtein::prefix_damerau_levenshtein; -use crate::raw_document::{raw_documents_from, RawDocument}; -use crate::{criterion::Criteria, Document, DocumentId, Highlight, TmpMatch, AttrCount}; +use crate::{criterion::Criteria, Document, DocumentId}; use crate::{reordered_attrs::ReorderedAttrs, store, MResult}; pub struct QueryBuilder<'c, 'f, 'd> { @@ -30,292 +17,6 @@ pub struct QueryBuilder<'c, 'f, 'd> { synonyms_store: store::Synonyms, } -fn multiword_rewrite_matches( - mut matches: Vec<(DocumentId, TmpMatch)>, - query_enhancer: &QueryEnhancer, -) -> SetBuf<(DocumentId, TmpMatch)> { - let mut padded_matches = Vec::with_capacity(matches.len()); - - let before_sort = Instant::now(); - // we sort the matches by word index to make them rewritable - matches.sort_unstable_by_key(|(id, match_)| (*id, match_.attribute, match_.word_index)); - debug!("sorting dirty matches took {:.02?}", before_sort.elapsed()); - - let before_padding = Instant::now(); - // for each attribute of each document - for same_document_attribute in matches.linear_group_by_key(|(id, m)| (*id, m.attribute)) { - // padding will only be applied - // to word indices in the same attribute - let mut padding = 0; - let mut iter = same_document_attribute.linear_group_by_key(|(_, m)| m.word_index); - - // for each match at the same position - // in this document attribute - while let Some(same_word_index) = iter.next() { - // find the biggest padding - let mut biggest = 0; - for (id, match_) in same_word_index { - let mut replacement = query_enhancer.replacement(match_.query_index); - let replacement_len = replacement.len(); - let nexts = iter.remainder().linear_group_by_key(|(_, m)| m.word_index); - - if let Some(query_index) = replacement.next() { - let word_index = match_.word_index + padding as u16; - let match_ = TmpMatch { - query_index, - word_index, - ..*match_ - }; - padded_matches.push((*id, match_)); - } - - let mut found = false; - - // look ahead and if there already is a match - // corresponding to this padding word, abort the padding - 'padding: for (x, next_group) in nexts.enumerate() { - for (i, query_index) in replacement.clone().enumerate().skip(x) { - let word_index = match_.word_index + padding as u16 + (i + 1) as u16; - let padmatch = TmpMatch { - query_index, - word_index, - ..*match_ - }; - - for (_, nmatch_) in next_group { - let mut rep = query_enhancer.replacement(nmatch_.query_index); - let query_index = rep.next().unwrap(); - if query_index == padmatch.query_index { - if !found { - // if we find a corresponding padding for the - // first time we must push preceding paddings - for (i, query_index) in replacement.clone().enumerate().take(i) - { - let word_index = - match_.word_index + padding as u16 + (i + 1) as u16; - let match_ = TmpMatch { - query_index, - word_index, - ..*match_ - }; - padded_matches.push((*id, match_)); - biggest = biggest.max(i + 1); - } - } - - padded_matches.push((*id, padmatch)); - found = true; - continue 'padding; - } - } - } - - // if we do not find a corresponding padding in the - // next groups so stop here and pad what was found - break; - } - - if !found { - // if no padding was found in the following matches - // we must insert the entire padding - for (i, query_index) in replacement.enumerate() { - let word_index = match_.word_index + padding as u16 + (i + 1) as u16; - let match_ = TmpMatch { - query_index, - word_index, - ..*match_ - }; - padded_matches.push((*id, match_)); - } - - biggest = biggest.max(replacement_len - 1); - } - } - - padding += biggest; - } - } - - for document_matches in padded_matches.linear_group_by_key_mut(|(id, _)| *id) { - document_matches.sort_unstable(); - } - - debug!("padding matches took {:.02?}", before_padding.elapsed()); - - // With this check we can see that the loop above takes something - // like 43% of the search time even when no rewrite is needed. - // assert_eq!(before_matches, padded_matches); - - SetBuf::new_unchecked(padded_matches) -} - -fn fetch_raw_documents( - reader: &heed::RoTxn, - automatons_groups: &[AutomatonGroup], - query_enhancer: &QueryEnhancer, - searchables: Option<&ReorderedAttrs>, - main_store: store::Main, - postings_lists_store: store::PostingsLists, -) -> MResult> { - let mut matches = Vec::new(); - let mut highlights = Vec::new(); - - let words = match main_store.words_fst(reader)? { - Some(words) => words, - None => return Ok(Vec::new()), - }; - - let before_automatons_groups_loop = Instant::now(); - let mut doc_indexes_rewrite = Duration::default(); - let mut retrieve_postings_lists = Duration::default(); - let mut stream_reserve = Duration::default(); - let mut covered_area_time = Duration::default(); - let mut eval_time = Duration::default(); - - for group in automatons_groups { - let AutomatonGroup { is_phrase_query, automatons } = group; - let phrase_query_len = automatons.len(); - - let mut tmp_matches = Vec::new(); - for (id, automaton) in automatons.into_iter().enumerate() { - let Automaton { index, is_exact, query_len, query, .. } = automaton; - let dfa = automaton.dfa(); - - let before_stream_loop = Instant::now(); - let mut stream_count = 0; - - let mut stream = words.search(&dfa).into_stream(); - while let Some(input) = stream.next() { - let before_eval_time = Instant::now(); - let distance = dfa.eval(input).to_u8(); - eval_time += before_eval_time.elapsed(); - - let is_exact = *is_exact && distance == 0 && input.len() == *query_len; - - stream_count += 1; - - let before_covered_area = Instant::now(); - let covered_area = if *query_len > input.len() { - input.len() - } else { - prefix_damerau_levenshtein(query.as_bytes(), input).1 - }; - covered_area_time += before_covered_area.elapsed(); - - let before_retrieve_postings_lists = Instant::now(); - let doc_indexes = match postings_lists_store.postings_list(reader, input)? { - Some(doc_indexes) => doc_indexes, - None => continue, - }; - retrieve_postings_lists += before_retrieve_postings_lists.elapsed(); - - let before_stream_reserve = Instant::now(); - tmp_matches.reserve(doc_indexes.len()); - stream_reserve += before_stream_reserve.elapsed(); - - let before_doc_indexes_rewrite = Instant::now(); - for di in doc_indexes.as_ref() { - let attribute = searchables.map_or(Some(di.attribute), |r| r.get(di.attribute)); - if let Some(attribute) = attribute { - let match_ = TmpMatch { - query_index: *index as u32, - distance, - attribute, - word_index: di.word_index, - is_exact, - }; - - let covered_area = u16::try_from(covered_area).unwrap_or(u16::max_value()); - let covered_area = cmp::min(covered_area, di.char_length); - - let highlight = Highlight { - attribute: di.attribute, - char_index: di.char_index, - char_length: covered_area, - }; - - tmp_matches.push((di.document_id, id, match_, highlight)); - } - } - doc_indexes_rewrite += before_doc_indexes_rewrite.elapsed(); - } - debug!("{:?} took {:.02?} ({} words)", query, before_stream_loop.elapsed(), stream_count); - } - - if *is_phrase_query { - tmp_matches.sort_unstable_by_key(|(id, _, m, _)| (*id, m.attribute, m.word_index)); - for group in tmp_matches.linear_group_by_key(|(id, _, m, _)| (*id, m.attribute)) { - for window in group.windows(2) { - let (ida, ia, ma, ha) = window[0]; - let (idb, ib, mb, hb) = window[1]; - - debug_assert_eq!(ida, idb); - - // if matches must follow and actually follows themselves - if ia + 1 == ib && ma.word_index + 1 == mb.word_index { - // TODO we must make it work for phrase query longer than 2 - // if the second match is the last phrase query word - if ib + 1 == phrase_query_len { - // insert first match - matches.push((ida, ma)); - highlights.push((ida, ha)); - - // insert second match - matches.push((idb, mb)); - highlights.push((idb, hb)); - } - } - } - } - } else { - let before_rerewrite = Instant::now(); - - matches.reserve(tmp_matches.len()); - highlights.reserve(tmp_matches.len()); - - for (id, _, match_, highlight) in tmp_matches { - matches.push((id, match_)); - highlights.push((id, highlight)); - } - debug!("rerewrite took {:.02?}", before_rerewrite.elapsed()); - } - } - debug!("automatons_groups_loop took {:.02?}", before_automatons_groups_loop.elapsed()); - debug!("doc_indexes_rewrite took {:.02?}", doc_indexes_rewrite); - debug!("retrieve_postings_lists took {:.02?}", retrieve_postings_lists); - debug!("stream reserve took {:.02?}", stream_reserve); - debug!("covered area took {:.02?}", covered_area_time); - debug!("eval value took {:.02?}", eval_time); - - // { - // let mut cloned = matches.clone(); - // let before_sort_test = Instant::now(); - // cloned.sort_unstable_by_key(|(id, m)| (*id, m.query_index, m.distance)); - // debug!("sorting test took {:.02?}", before_sort_test.elapsed()); - // } - - let before_multiword_rewrite_matches = Instant::now(); - debug!("number of matches before rewrite {}", matches.len()); - debug!("{:?}", query_enhancer); - let matches = multiword_rewrite_matches(matches, &query_enhancer); - debug!("number of matches after rewrite {}", matches.len()); - debug!("multiword_rewrite_matches took {:.02?}", before_multiword_rewrite_matches.elapsed()); - - let before_highlight_sorting = Instant::now(); - let highlights = { - highlights.sort_unstable_by_key(|(id, _)| *id); - SetBuf::new_unchecked(highlights) - }; - debug!("highlight_sorting {:.02?}", before_highlight_sorting.elapsed()); - - let before_raw_documents = Instant::now(); - let raw_documents = raw_documents_from(matches, highlights); - debug!("raw_documents took {:.02?}", before_raw_documents.elapsed()); - debug!("documents to worry about: {}", raw_documents.len()); - - Ok(raw_documents) -} - impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> { pub fn new( main: store::Main, @@ -389,7 +90,7 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> { reader, query, range, - // self.criteria, + self.criteria, self.main_store, self.postings_lists_store, self.documents_fields_counts_store, diff --git a/meilisearch-core/src/raw_document.rs b/meilisearch-core/src/raw_document.rs index 5ba660d11..4096eeaba 100644 --- a/meilisearch-core/src/raw_document.rs +++ b/meilisearch-core/src/raw_document.rs @@ -1,183 +1,89 @@ -use std::fmt; -use std::sync::Arc; - +use compact_arena::SmallArena; +use itertools::EitherOrBoth; use sdset::SetBuf; -use slice_group_by::GroupBy; -use crate::{DocumentId, Highlight, TmpMatch, AttrCount}; +use crate::bucket_sort::{SimpleMatch, BareMatch, QueryWordAutomaton, PostingsListView}; -#[derive(Clone)] -pub struct RawDocument { - pub id: DocumentId, - pub matches: SharedMatches, - pub highlights: Vec, - pub fields_counts: Option>, +pub struct RawDocument<'a, 'tag> { + pub id: crate::DocumentId, + pub raw_matches: &'a mut [BareMatch<'tag>], + pub processed_matches: Vec, + /// The list of minimum `distance` found + pub processed_distances: Vec>, } -impl RawDocument { - pub fn query_index(&self) -> &[u32] { - let r = self.matches.range; - // it is safe because construction/modifications - // can only be done in this module - unsafe { - &self - .matches - .matches - .query_index - .get_unchecked(r.start..r.end) +impl<'a, 'tag> RawDocument<'a, 'tag> { + pub fn new<'txn>( + raw_matches: &'a mut [BareMatch<'tag>], + automatons: &[QueryWordAutomaton], + postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>, + ) -> Option> + { + raw_matches.sort_unstable_by_key(|m| m.query_index); + + let mut previous_word = None; + for i in 0..raw_matches.len() { + let a = &raw_matches[i]; + let auta = &automatons[a.query_index as usize]; + + match auta.phrase_query { + Some((0, _)) => { + let b = match raw_matches.get(i + 1) { + Some(b) => b, + None => { + postings_lists[a.postings_list].rewrite_with(SetBuf::default()); + continue; + } + }; + + if a.query_index + 1 != b.query_index { + postings_lists[a.postings_list].rewrite_with(SetBuf::default()); + continue + } + + let pla = &postings_lists[a.postings_list]; + let plb = &postings_lists[b.postings_list]; + + let mut iter = itertools::merge_join_by(pla.iter(), plb.iter(), |a, b| { + a.attribute.cmp(&b.attribute).then((a.word_index + 1).cmp(&b.word_index)) + }); + + let mut newa = Vec::new(); + let mut newb = Vec::new(); + + for eb in iter { + if let EitherOrBoth::Both(a, b) = eb { + newa.push(*a); + newb.push(*b); + } + } + + if !newa.is_empty() { + previous_word = Some(a.query_index); + } + + postings_lists[a.postings_list].rewrite_with(SetBuf::new_unchecked(newa)); + postings_lists[b.postings_list].rewrite_with(SetBuf::new_unchecked(newb)); + }, + Some((1, _)) => { + if previous_word.take() != Some(a.query_index - 1) { + postings_lists[a.postings_list].rewrite_with(SetBuf::default()); + } + }, + Some((_, _)) => unreachable!(), + None => (), + } } - } - pub fn distance(&self) -> &[u8] { - let r = self.matches.range; - // it is safe because construction/modifications - // can only be done in this module - unsafe { &self.matches.matches.distance.get_unchecked(r.start..r.end) } - } - - pub fn attribute(&self) -> &[u16] { - let r = self.matches.range; - // it is safe because construction/modifications - // can only be done in this module - unsafe { &self.matches.matches.attribute.get_unchecked(r.start..r.end) } - } - - pub fn word_index(&self) -> &[u16] { - let r = self.matches.range; - // it is safe because construction/modifications - // can only be done in this module - unsafe { - &self - .matches - .matches - .word_index - .get_unchecked(r.start..r.end) + if raw_matches.iter().all(|rm| postings_lists[rm.postings_list].is_empty()) { + return None } - } - pub fn is_exact(&self) -> &[bool] { - let r = self.matches.range; - // it is safe because construction/modifications - // can only be done in this module - unsafe { &self.matches.matches.is_exact.get_unchecked(r.start..r.end) } - } -} - -impl fmt::Debug for RawDocument { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - f.write_str("RawDocument {\r\n")?; - f.write_fmt(format_args!("{:>15}: {:?},\r\n", "id", self.id))?; - f.write_fmt(format_args!( - "{:>15}: {:^5?},\r\n", - "query_index", - self.query_index() - ))?; - f.write_fmt(format_args!( - "{:>15}: {:^5?},\r\n", - "distance", - self.distance() - ))?; - f.write_fmt(format_args!( - "{:>15}: {:^5?},\r\n", - "attribute", - self.attribute() - ))?; - f.write_fmt(format_args!( - "{:>15}: {:^5?},\r\n", - "word_index", - self.word_index() - ))?; - f.write_fmt(format_args!( - "{:>15}: {:^5?},\r\n", - "is_exact", - self.is_exact() - ))?; - f.write_str("}")?; - Ok(()) - } -} - -pub fn raw_documents_from( - matches: SetBuf<(DocumentId, TmpMatch)>, - highlights: SetBuf<(DocumentId, Highlight)> -) -> Vec { - let mut docs_ranges: Vec<(_, Range, _, _)> = Vec::new(); - let mut matches2 = Matches::with_capacity(matches.len()); - - let matches = matches.linear_group_by_key(|(id, _)| *id); - let highlights = highlights.linear_group_by_key(|(id, _)| *id); - - for (mgroup, hgroup) in matches.zip(highlights) { - assert_eq!(mgroup[0].0, hgroup[0].0); - - let document_id = mgroup[0].0; - let start = docs_ranges.last().map(|(_, r, _, _)| r.end).unwrap_or(0); - let end = start + mgroup.len(); - let highlights = hgroup.iter().map(|(_, h)| *h).collect(); - let fields_counts = None; - - docs_ranges.push((document_id, Range { start, end }, highlights, fields_counts)); - // TODO we could try to keep both data - // - the data oriented one and, - // - the raw one, the one that comes from the arguments of this function - // This way we would be able to only produce data oriented lazily. - // - // For example the default first criterion is `SumOfTypos` - // and just needs the `query_index` and the `distance` fields. - // It would probably be good to avoid wasting time sorting other fields of documents - // that will never ever reach the second criterion. - matches2.extend_from_slice(mgroup); - } - - let matches = Arc::new(matches2); - docs_ranges - .into_iter() - .map(|(id, range, highlights, fields_counts)| { - let matches = SharedMatches { range, matches: matches.clone() }; - RawDocument { id, matches, highlights, fields_counts } + Some(RawDocument { + id: raw_matches[0].document_id, + raw_matches, + processed_matches: Vec::new(), + processed_distances: Vec::new(), }) - .collect() -} - -#[derive(Debug, Copy, Clone)] -struct Range { - start: usize, - end: usize, -} - -#[derive(Clone)] -pub struct SharedMatches { - range: Range, - matches: Arc, -} - -#[derive(Clone)] -struct Matches { - query_index: Vec, - distance: Vec, - attribute: Vec, - word_index: Vec, - is_exact: Vec, -} - -impl Matches { - fn with_capacity(cap: usize) -> Matches { - Matches { - query_index: Vec::with_capacity(cap), - distance: Vec::with_capacity(cap), - attribute: Vec::with_capacity(cap), - word_index: Vec::with_capacity(cap), - is_exact: Vec::with_capacity(cap), - } - } - - fn extend_from_slice(&mut self, matches: &[(DocumentId, TmpMatch)]) { - for (_, match_) in matches { - self.query_index.push(match_.query_index); - self.distance.push(match_.distance); - self.attribute.push(match_.attribute); - self.word_index.push(match_.word_index); - self.is_exact.push(match_.is_exact); - } } } diff --git a/meilisearch-http/src/helpers/meilisearch.rs b/meilisearch-http/src/helpers/meilisearch.rs index 8079f7168..fb995750d 100644 --- a/meilisearch-http/src/helpers/meilisearch.rs +++ b/meilisearch-http/src/helpers/meilisearch.rs @@ -310,11 +310,11 @@ impl<'a> SearchBuilder<'a> { if let Some(ranking_rules_order) = ranking_order { for rule in ranking_rules_order { match rule.as_str() { - "_sum_of_typos" => builder.push(SumOfTypos), - "_number_of_words" => builder.push(NumberOfWords), - "_word_proximity" => builder.push(WordsProximity), - "_sum_of_words_attribute" => builder.push(SumOfWordsAttribute), - "_sum_of_words_position" => builder.push(SumOfWordsPosition), + "_typo" => builder.push(Typo), + "_words" => builder.push(Words), + "_proximity" => builder.push(Proximity), + "_attribute" => builder.push(Attribute), + "_words_position" => builder.push(WordsPosition), "_exact" => builder.push(Exact), _ => { let order = match ranking_rules.get(rule.as_str()) { @@ -340,11 +340,11 @@ impl<'a> SearchBuilder<'a> { builder.push(DocumentId); return Ok(Some(builder.build())); } else { - builder.push(SumOfTypos); - builder.push(NumberOfWords); - builder.push(WordsProximity); - builder.push(SumOfWordsAttribute); - builder.push(SumOfWordsPosition); + builder.push(Typo); + builder.push(Words); + builder.push(Proximity); + builder.push(Attribute); + builder.push(WordsPosition); builder.push(Exact); for (rule, order) in ranking_rules.iter() { let custom_ranking = match order {