diff --git a/meilisearch-core/src/bucket_sort.rs b/meilisearch-core/src/bucket_sort.rs index 9b9e5ab44..5413db17f 100644 --- a/meilisearch-core/src/bucket_sort.rs +++ b/meilisearch-core/src/bucket_sort.rs @@ -39,7 +39,6 @@ pub fn bucket_sort<'c>( synonyms_store: store::Synonyms, ) -> MResult> { - // let automatons = construct_automatons(query); let (automatons, query_enhancer) = construct_automatons2(reader, query, main_store, postings_lists_store, synonyms_store)?; @@ -286,14 +285,11 @@ impl<'txn> PostingsListView<'txn> { } pub fn rewrite_with(&mut self, postings_list: SetBuf) { - *self = match self { - PostingsListView::Original { input, .. } => { - PostingsListView::Rewritten { input: input.clone(), postings_list } - }, - PostingsListView::Rewritten { input, .. } => { - PostingsListView::Rewritten { input: input.clone(), postings_list } - }, + let input = match self { + PostingsListView::Original { input, .. } => input.clone(), + PostingsListView::Rewritten { input, .. } => input.clone(), }; + *self = PostingsListView::rewritten(input, postings_list); } pub fn len(&self) -> usize { @@ -565,7 +561,8 @@ fn construct_automatons2( } } - if true && n == 1 { + if n == 1 { + // automatons for split words if let Some((left, right)) = split_best_frequency(reader, &normalized, postings_lists_store)? 
{ let mut left_automaton = QueryWordAutomaton::exact(left); left_automaton.phrase_query = Some((0, 2)); diff --git a/meilisearch-core/src/query_builder.rs b/meilisearch-core/src/query_builder.rs index 7edda5294..c862ae2a2 100644 --- a/meilisearch-core/src/query_builder.rs +++ b/meilisearch-core/src/query_builder.rs @@ -399,346 +399,6 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> { } } -fn raw_query<'c, FI>( - reader: &heed::RoTxn, - - query: &str, - range: Range, - - filter: Option, - timeout: Option, - - criteria: Criteria<'c>, - searchable_attrs: Option, - - main_store: store::Main, - postings_lists_store: store::PostingsLists, - documents_fields_counts_store: store::DocumentsFieldsCounts, - synonyms_store: store::Synonyms, -) -> MResult> -where - FI: Fn(DocumentId) -> bool, -{ - // We delegate the filter work to the distinct query builder, - // specifying a distinct rule that has no effect. - if filter.is_some() { - let distinct = |_| None; - let distinct_size = 1; - return raw_query_with_distinct( - reader, - query, - range, - filter, - distinct, - distinct_size, - timeout, - criteria, - searchable_attrs, - main_store, - postings_lists_store, - documents_fields_counts_store, - synonyms_store, - ); - } - - let start_processing = Instant::now(); - let mut raw_documents_processed = Vec::with_capacity(range.len()); - - let (automaton_producer, query_enhancer) = AutomatonProducer::new( - reader, - query, - main_store, - postings_lists_store, - synonyms_store, - )?; - - let automaton_producer = automaton_producer.into_iter(); - let mut automatons = Vec::new(); - - // aggregate automatons groups by groups after time - for auts in automaton_producer { - automatons.push(auts); - - for (i, group) in automatons.iter().enumerate() { - debug!("group {} automatons {:?}", i, group.automatons); - } - - let before_fetch_raw_documents = Instant::now(); - // we must retrieve the documents associated - // with the current automatons - let mut raw_documents = fetch_raw_documents( - 
reader, - &automatons, - &query_enhancer, - searchable_attrs.as_ref(), - main_store, - postings_lists_store, - )?; - debug!("fetch_raw_documents took {:.02?}", before_fetch_raw_documents.elapsed()); - - // stop processing when time is running out - if let Some(timeout) = timeout { - if !raw_documents_processed.is_empty() && start_processing.elapsed() > timeout { - break; - } - } - - let before_bucket_sort = Instant::now(); - - let mut groups = vec![raw_documents.as_mut_slice()]; - - 'criteria: for criterion in criteria.as_ref() { - let tmp_groups = mem::replace(&mut groups, Vec::new()); - let mut documents_seen = 0; - - for group in tmp_groups { - // if this group does not overlap with the requested range, - // push it without sorting and splitting it - if documents_seen + group.len() < range.start { - documents_seen += group.len(); - groups.push(group); - continue; - } - - // we must pull the fields counts of these documents - // TODO it would be great to had a "dependency" thing for each criterion - // and make it so that we can be lazy on pulling/computing some data. - if criterion.name() == "Exact" { - for document in group.iter_mut() { - let mut fields_counts = Vec::new(); - for result in documents_fields_counts_store.document_fields_counts(reader, document.id)? 
{ - let (attr, count) = result?; - fields_counts.push(AttrCount { attr: attr.0, count }); - } - document.fields_counts = Some(SetBuf::new(fields_counts).unwrap()); - } - } - - - group.sort_unstable_by(|a, b| criterion.evaluate(a, b)); - - for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b)) { - debug!("criterion {} produced a group of size {}", criterion.name(), group.len()); - - documents_seen += group.len(); - groups.push(group); - - - // we have sort enough documents if the last document sorted is after - // the end of the requested range, we can continue to the next criterion - if documents_seen >= range.end { - continue 'criteria; - } - } - } - } - - debug!("bucket_sort took {:.02?}", before_bucket_sort.elapsed()); - - // once we classified the documents related to the current - // automatons we save that as the next valid result - let iter = raw_documents - .into_iter() - .skip(range.start) - .take(range.len()); - raw_documents_processed.clear(); - raw_documents_processed.extend(iter); - - // stop processing when time is running out - if let Some(timeout) = timeout { - if start_processing.elapsed() > timeout { - break; - } - } - } - - // make real documents now that we know - // those must be returned - let documents = raw_documents_processed - .into_iter() - .map(Document::from_raw) - .collect(); - - Ok(documents) -} - -fn raw_query_with_distinct<'c, FI, FD>( - reader: &heed::RoTxn, - - query: &str, - range: Range, - - filter: Option, - - distinct: FD, - distinct_size: usize, - timeout: Option, - - criteria: Criteria<'c>, - searchable_attrs: Option, - - main_store: store::Main, - postings_lists_store: store::PostingsLists, - documents_fields_counts_store: store::DocumentsFieldsCounts, - synonyms_store: store::Synonyms, -) -> MResult> -where - FI: Fn(DocumentId) -> bool, - FD: Fn(DocumentId) -> Option, -{ - let start_processing = Instant::now(); - let mut raw_documents_processed = Vec::new(); - - let (automaton_producer, query_enhancer) = 
AutomatonProducer::new( - reader, - query, - main_store, - postings_lists_store, - synonyms_store, - )?; - - let automaton_producer = automaton_producer.into_iter(); - let mut automatons = Vec::new(); - - // aggregate automatons groups by groups after time - for auts in automaton_producer { - automatons.push(auts); - - // we must retrieve the documents associated - // with the current automatons - let mut raw_documents = fetch_raw_documents( - reader, - &automatons, - &query_enhancer, - searchable_attrs.as_ref(), - main_store, - postings_lists_store, - )?; - - // stop processing when time is running out - if let Some(timeout) = timeout { - if !raw_documents_processed.is_empty() && start_processing.elapsed() > timeout { - break; - } - } - - let mut groups = vec![raw_documents.as_mut_slice()]; - let mut key_cache = HashMap::new(); - - let mut filter_map = HashMap::new(); - // these two variables informs on the current distinct map and - // on the raw offset of the start of the group where the - // range.start bound is located according to the distinct function - let mut distinct_map = DistinctMap::new(distinct_size); - let mut distinct_raw_offset = 0; - - 'criteria: for criterion in criteria.as_ref() { - let tmp_groups = mem::replace(&mut groups, Vec::new()); - let mut buf_distinct = BufferedDistinctMap::new(&mut distinct_map); - let mut documents_seen = 0; - - for group in tmp_groups { - // if this group does not overlap with the requested range, - // push it without sorting and splitting it - if documents_seen + group.len() < distinct_raw_offset { - documents_seen += group.len(); - groups.push(group); - continue; - } - - group.sort_unstable_by(|a, b| criterion.evaluate(a, b)); - - for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b)) { - // we must compute the real distinguished len of this sub-group - for document in group.iter() { - let filter_accepted = match &filter { - Some(filter) => { - let entry = filter_map.entry(document.id); - 
*entry.or_insert_with(|| (filter)(document.id)) - } - None => true, - }; - - if filter_accepted { - let entry = key_cache.entry(document.id); - let key = entry.or_insert_with(|| (distinct)(document.id).map(Rc::new)); - - match key.clone() { - Some(key) => buf_distinct.register(key), - None => buf_distinct.register_without_key(), - }; - } - - // the requested range end is reached: stop computing distinct - if buf_distinct.len() >= range.end { - break; - } - } - - documents_seen += group.len(); - groups.push(group); - - // if this sub-group does not overlap with the requested range - // we must update the distinct map and its start index - if buf_distinct.len() < range.start { - buf_distinct.transfert_to_internal(); - distinct_raw_offset = documents_seen; - } - - // we have sort enough documents if the last document sorted is after - // the end of the requested range, we can continue to the next criterion - if buf_distinct.len() >= range.end { - continue 'criteria; - } - } - } - } - - // once we classified the documents related to the current - // automatons we save that as the next valid result - let mut seen = BufferedDistinctMap::new(&mut distinct_map); - raw_documents_processed.clear(); - - for document in raw_documents.into_iter().skip(distinct_raw_offset) { - let filter_accepted = match &filter { - Some(_) => filter_map.remove(&document.id).unwrap(), - None => true, - }; - - if filter_accepted { - let key = key_cache.remove(&document.id).unwrap(); - let distinct_accepted = match key { - Some(key) => seen.register(key), - None => seen.register_without_key(), - }; - - if distinct_accepted && seen.len() > range.start { - raw_documents_processed.push(document); - if raw_documents_processed.len() == range.len() { - break; - } - } - } - } - - // stop processing when time is running out - if let Some(timeout) = timeout { - if start_processing.elapsed() > timeout { - break; - } - } - } - - // make real documents now that we know - // those must be returned - let 
documents = raw_documents_processed - .into_iter() - .map(Document::from_raw) - .collect(); - - Ok(documents) -} - #[cfg(test)] mod tests { use super::*;