Before improving fields AttrCount

Removing the fields_count fetching halved the search time; we should look at lazily pulling them from the criteria that need them.

ugly-test: Make the fields_count fetching lazy, just before running the exactness criterion.
This commit is contained in:
parent 11f3d7782d
commit ef6a4db182
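In short, the direction this commit works toward, as a minimal runnable sketch: fields counts are pulled only for the documents that reach the Exact criterion, instead of being fetched for every raw document up front. The RawDocument shape is simplified here, and fetch_counts and prepare_for_criterion are illustrative stand-ins, not the real store API.

// Hypothetical sketch of the lazy pull; names are illustrative.
struct RawDocument {
    id: u64,
    fields_counts: Option<Vec<(u16, u16)>>, // (attr, count), sorted by attr
}

// Stand-in for documents_fields_counts_store.document_fields_counts(..).
fn fetch_counts(_id: u64) -> Vec<(u16, u16)> {
    vec![(0, 2)]
}

// Pull the fields counts only when the criterion that needs them is about to run.
fn prepare_for_criterion(criterion_name: &str, group: &mut [RawDocument]) {
    if criterion_name == "Exact" {
        for document in group.iter_mut() {
            if document.fields_counts.is_none() {
                document.fields_counts = Some(fetch_counts(document.id));
            }
        }
    }
}

fn main() {
    let mut group = vec![RawDocument { id: 1, fields_counts: None }];
    prepare_for_criterion("Exact", &mut group);
    assert!(group[0].fields_counts.is_some());
}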
@@ -2,7 +2,7 @@ mod dfa;
 mod query_enhancer;
 
 use std::cmp::Reverse;
-use std::{cmp, vec};
+use std::{cmp, fmt, vec};
 
 use fst::{IntoStreamer, Streamer};
 use levenshtein_automata::DFA;
@@ -68,7 +68,6 @@ impl AutomatonGroup {
     }
 }
 
-#[derive(Debug)]
 pub struct Automaton {
     pub index: usize,
     pub ngram: usize,
@@ -78,6 +77,14 @@ pub struct Automaton {
     pub query: String,
 }
 
+impl fmt::Debug for Automaton {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_struct("Automaton")
+            .field("query", &self.query)
+            .finish()
+    }
+}
+
 impl Automaton {
     pub fn dfa(&self) -> DFA {
         if self.is_prefix {

@@ -1,18 +1,17 @@
 use std::cmp::Ordering;
 
-use meilisearch_schema::SchemaAttr;
 use sdset::Set;
 use slice_group_by::GroupBy;
 
 use crate::criterion::Criterion;
-use crate::RawDocument;
+use crate::{AttrCount, RawDocument};
 
 #[inline]
 fn number_exact_matches(
     query_index: &[u32],
     attribute: &[u16],
     is_exact: &[bool],
-    fields_counts: &Set<(SchemaAttr, u16)>,
+    fields_counts: &Set<AttrCount>,
 ) -> usize {
     let mut count = 0;
     let mut index = 0;
@@ -25,8 +24,8 @@ fn number_exact_matches(
             if *is_exact {
                 found_exact = true;
                 let attr = &attribute[index + pos];
-                if let Ok(pos) = fields_counts.binary_search_by_key(attr, |(a, _)| a.0) {
-                    let (_, count) = fields_counts[pos];
+                if let Ok(pos) = fields_counts.binary_search_by_key(attr, |ac| ac.attr) {
+                    let AttrCount { count, .. } = fields_counts[pos];
                     if count == 1 {
                         return usize::max_value();
                     }
@@ -50,7 +49,7 @@ impl Criterion for Exact {
             let query_index = lhs.query_index();
             let is_exact = lhs.is_exact();
             let attribute = lhs.attribute();
-            let fields_counts = &lhs.fields_counts;
+            let fields_counts = lhs.fields_counts.as_ref().unwrap();
 
             number_exact_matches(query_index, attribute, is_exact, fields_counts)
         };
@@ -59,7 +58,7 @@ impl Criterion for Exact {
             let query_index = rhs.query_index();
             let is_exact = rhs.is_exact();
             let attribute = rhs.attribute();
-            let fields_counts = &rhs.fields_counts;
+            let fields_counts = rhs.fields_counts.as_ref().unwrap();
 
             number_exact_matches(query_index, attribute, is_exact, fields_counts)
         };
@@ -86,7 +85,7 @@ mod tests {
             let query_index = &[0];
             let attribute = &[0];
             let is_exact = &[true];
-            let fields_counts = Set::new(&[(SchemaAttr(0), 2)]).unwrap();
+            let fields_counts = Set::new(&[AttrCount { attr: 0, count: 2 }]).unwrap();
 
             number_exact_matches(query_index, attribute, is_exact, fields_counts)
         };
@@ -95,7 +94,7 @@ mod tests {
             let query_index = &[0];
             let attribute = &[0];
             let is_exact = &[false];
-            let fields_counts = Set::new(&[(SchemaAttr(0), 2)]).unwrap();
+            let fields_counts = Set::new(&[AttrCount { attr: 0, count: 2 }]).unwrap();
 
             number_exact_matches(query_index, attribute, is_exact, fields_counts)
         };
@@ -113,7 +112,7 @@ mod tests {
            let query_index = &[0];
            let attribute = &[0];
            let is_exact = &[true];
-            let fields_counts = Set::new(&[(SchemaAttr(0), 1)]).unwrap();
+            let fields_counts = Set::new(&[AttrCount { attr: 0, count: 1 }]).unwrap();
 
             number_exact_matches(query_index, attribute, is_exact, fields_counts)
         };
@@ -122,7 +121,7 @@ mod tests {
             let query_index = &[0];
             let attribute = &[0];
             let is_exact = &[true];
-            let fields_counts = Set::new(&[(SchemaAttr(0), 4)]).unwrap();
+            let fields_counts = Set::new(&[AttrCount { attr: 0, count: 4 }]).unwrap();
 
             number_exact_matches(query_index, attribute, is_exact, fields_counts)
         };
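The (SchemaAttr, u16) tuples become a dedicated AttrCount struct everywhere the exactness criterion looks up a field count. A self-contained sketch of that lookup, assuming AttrCount has the shape the diff shows, and using a plain sorted slice in place of the sdset Set (whose job is to guarantee the sorted, deduplicated layout the binary search relies on):

// The AttrCount shape mirrors what the diff shows: attr and count are u16.
#[derive(Debug, Clone, Copy)]
struct AttrCount {
    attr: u16,
    count: u16,
}

// `fields_counts` must be sorted by `attr` for the binary search to be valid.
fn field_count(fields_counts: &[AttrCount], attr: u16) -> Option<u16> {
    fields_counts
        .binary_search_by_key(&attr, |ac| ac.attr)
        .ok()
        .map(|pos| fields_counts[pos].count)
}

fn main() {
    let fields_counts = [
        AttrCount { attr: 0, count: 2 },
        AttrCount { attr: 3, count: 1 },
    ];
    assert_eq!(field_count(&fields_counts, 0), Some(2));
    assert_eq!(field_count(&fields_counts, 1), None);
}

In number_exact_matches, a count of 1 means the matched field holds a single value, which the criterion treats as the best possible outcome by returning usize::max_value() immediately.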
@@ -6,6 +6,7 @@ use std::time::{Duration, Instant};
 use std::{cmp, mem};
 
 use fst::{IntoStreamer, Streamer};
+use log::debug;
 use sdset::SetBuf;
 use slice_group_by::{GroupBy, GroupByMut};
@@ -14,7 +15,7 @@ use crate::automaton::{Automaton, AutomatonGroup, AutomatonProducer, QueryEnhancer};
 use crate::distinct_map::{BufferedDistinctMap, DistinctMap};
 use crate::levenshtein::prefix_damerau_levenshtein;
 use crate::raw_document::{raw_documents_from, RawDocument};
-use crate::{criterion::Criteria, Document, DocumentId, Highlight, TmpMatch};
+use crate::{criterion::Criteria, Document, DocumentId, Highlight, TmpMatch, AttrCount};
 use crate::{reordered_attrs::ReorderedAttrs, store, MResult};
 
 pub struct QueryBuilder<'c, 'f, 'd> {
@@ -146,27 +147,18 @@ fn fetch_raw_documents(
     searchables: Option<&ReorderedAttrs>,
     main_store: store::Main,
     postings_lists_store: store::PostingsLists,
-    documents_fields_counts_store: store::DocumentsFieldsCounts,
 ) -> MResult<Vec<RawDocument>> {
     let mut matches = Vec::new();
     let mut highlights = Vec::new();
 
+    let before_automatons_groups_loop = Instant::now();
     for group in automatons_groups {
-        let AutomatonGroup {
-            is_phrase_query,
-            automatons,
-        } = group;
+        let AutomatonGroup { is_phrase_query, automatons } = group;
         let phrase_query_len = automatons.len();
 
         let mut tmp_matches = Vec::new();
         for (id, automaton) in automatons.into_iter().enumerate() {
-            let Automaton {
-                index,
-                is_exact,
-                query_len,
-                query,
-                ..
-            } = automaton;
+            let Automaton { index, is_exact, query_len, query, .. } = automaton;
             let dfa = automaton.dfa();
 
             let words = match main_store.words_fst(reader)? {
@@ -250,26 +242,26 @@ fn fetch_raw_documents(
             }
         }
     }
+    debug!("automatons_groups_loop took {:.02?}", before_automatons_groups_loop.elapsed());
 
+    let before_multiword_rewrite_matches = Instant::now();
     let matches = multiword_rewrite_matches(matches, &query_enhancer);
+    debug!("multiword_rewrite_matches took {:.02?}", before_multiword_rewrite_matches.elapsed());
 
+    let before_highlight_sorting = Instant::now();
     let highlights = {
         highlights.sort_unstable_by_key(|(id, _)| *id);
         SetBuf::new_unchecked(highlights)
     };
+    debug!("highlight_sorting {:.02?}", before_highlight_sorting.elapsed());
 
-    let fields_counts = {
-        let mut fields_counts = Vec::new();
-        for group in matches.linear_group_by_key(|(id, ..)| *id) {
-            let id = group[0].0;
-            for result in documents_fields_counts_store.document_fields_counts(reader, id)? {
-                let (attr, count) = result?;
-                fields_counts.push((id, attr, count));
-            }
-        }
-        SetBuf::new(fields_counts).unwrap()
-    };
-
-    Ok(raw_documents_from(matches, highlights, fields_counts))
+    let before_raw_documents = Instant::now();
+    let raw_documents = raw_documents_from(matches, highlights);
+    debug!("raw_documents took {:.02?}", before_raw_documents.elapsed());
+    debug!("documents to worry about: {}", raw_documents.len());
+
+    Ok(raw_documents)
 }
 
 impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
@@ -434,6 +426,11 @@ where
         for auts in automaton_producer {
             automatons.push(auts);
 
+            for (i, group) in automatons.iter().enumerate() {
+                debug!("group {} automatons {:?}", i, group.automatons);
+            }
+
+            let before_fetch_raw_documents = Instant::now();
             // we must retrieve the documents associated
             // with the current automatons
             let mut raw_documents = fetch_raw_documents(
@@ -443,8 +440,8 @@ where
                 searchable_attrs.as_ref(),
                 main_store,
                 postings_lists_store,
-                documents_fields_counts_store,
             )?;
+            debug!("fetch_raw_documents took {:.02?}", before_fetch_raw_documents.elapsed());
 
             // stop processing when time is running out
             if let Some(timeout) = timeout {
@@ -468,6 +465,20 @@ where
                     continue;
                 }
 
+                // we must pull the fields counts of these documents
+                // TODO it would be great to had a "dependency" thing for each criterion
+                // and make it so that we can be lazy on pulling/computing some data.
+                if criterion.name() == "Exact" {
+                    for document in group.iter_mut() {
+                        let mut fields_counts = Vec::new();
+                        for result in documents_fields_counts_store.document_fields_counts(reader, document.id)? {
+                            let (attr, count) = result?;
+                            fields_counts.push(AttrCount { attr: attr.0, count });
+                        }
+                        document.fields_counts = Some(SetBuf::new(fields_counts).unwrap());
+                    }
+                }
+
                 group.sort_unstable_by(|a, b| criterion.evaluate(a, b));
 
                 for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b)) {
@@ -561,7 +572,6 @@ where
             searchable_attrs.as_ref(),
             main_store,
             postings_lists_store,
-            documents_fields_counts_store,
        )?;
 
         // stop processing when time is running out
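Most of the remaining query-builder changes are timing instrumentation, all following one pattern: take an Instant before a phase and report the elapsed time through log::debug!. A runnable sketch of the pattern, with eprintln! standing in for debug! so no logger setup is needed:

use std::time::Instant;

fn main() {
    // take a timestamp before the phase...
    let before_phase = Instant::now();
    let matches: Vec<u32> = (0..1_000_000).filter(|n| n % 7 == 0).collect();
    // ...and report how long it took once it is done
    eprintln!("phase took {:.02?} ({} matches)", before_phase.elapsed(), matches.len());
}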
@@ -1,18 +1,18 @@
 use std::fmt;
 use std::sync::Arc;
 
-use meilisearch_schema::SchemaAttr;
 use sdset::SetBuf;
 use slice_group_by::GroupBy;
+use log::debug;
 
-use crate::{DocumentId, Highlight, TmpMatch};
+use crate::{DocumentId, Highlight, TmpMatch, AttrCount};
 
 #[derive(Clone)]
 pub struct RawDocument {
     pub id: DocumentId,
     pub matches: SharedMatches,
     pub highlights: Vec<Highlight>,
-    pub fields_counts: SetBuf<(SchemaAttr, u16)>,
+    pub fields_counts: Option<SetBuf<AttrCount>>,
 }
 
 impl RawDocument {
@@ -100,44 +100,47 @@ impl fmt::Debug for RawDocument {
 
 pub fn raw_documents_from(
     matches: SetBuf<(DocumentId, TmpMatch)>,
-    highlights: SetBuf<(DocumentId, Highlight)>,
-    fields_counts: SetBuf<(DocumentId, SchemaAttr, u16)>,
+    highlights: SetBuf<(DocumentId, Highlight)>
 ) -> Vec<RawDocument> {
     let mut docs_ranges: Vec<(_, Range, _, _)> = Vec::new();
     let mut matches2 = Matches::with_capacity(matches.len());
 
     let matches = matches.linear_group_by_key(|(id, _)| *id);
     let highlights = highlights.linear_group_by_key(|(id, _)| *id);
-    let fields_counts = fields_counts.linear_group_by_key(|(id, _, _)| *id);
 
-    for ((mgroup, hgroup), fgroup) in matches.zip(highlights).zip(fields_counts) {
-        debug_assert_eq!(mgroup[0].0, hgroup[0].0);
-        debug_assert_eq!(mgroup[0].0, fgroup[0].0);
+    let mut loops_count = 0;
+
+    for (mgroup, hgroup) in matches.zip(highlights) {
+        loops_count += 1;
+        assert_eq!(mgroup[0].0, hgroup[0].0);
 
         let document_id = mgroup[0].0;
         let start = docs_ranges.last().map(|(_, r, _, _)| r.end).unwrap_or(0);
         let end = start + mgroup.len();
         let highlights = hgroup.iter().map(|(_, h)| *h).collect();
-        let fields_counts = SetBuf::new(fgroup.iter().map(|(_, a, c)| (*a, *c)).collect()).unwrap();
+        let fields_counts = None;
 
         docs_ranges.push((document_id, Range { start, end }, highlights, fields_counts));
+        // TODO we could try to keep both data
+        // - the data oriented one and the raw one,
+        // - the one that comes from the arguments of this function
+        // This way we would be able to only produce data oriented lazily.
+        //
+        // For example the default first criterion is `SumOfTypos`
+        // and just needs the `query_index` and the `distance` fields.
+        // It would probably be good to avoid wasting time sorting other fields of documents
+        // that will never ever reach the second criterion.
         matches2.extend_from_slice(mgroup);
     }
 
+    debug!("loops_counts number is {}", loops_count);
+
     let matches = Arc::new(matches2);
     docs_ranges
         .into_iter()
         .map(|(id, range, highlights, fields_counts)| {
-            let matches = SharedMatches {
-                range,
-                matches: matches.clone(),
-            };
-            RawDocument {
-                id,
-                matches,
-                highlights,
-                fields_counts,
-            }
+            let matches = SharedMatches { range, matches: matches.clone() };
+            RawDocument { id, matches, highlights, fields_counts }
         })
         .collect()
 }
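With the fields counts gone from raw_documents_from, the function only zips the per-document groups of matches and highlights. A small sketch of that grouping, assuming the slice_group_by crate the file already imports; the data here is illustrative:

use slice_group_by::GroupBy;

fn main() {
    // both slices are sorted by document id, as the SetBuf arguments guarantee
    let matches = [(1u64, "m1"), (1, "m2"), (2, "m3")];
    let highlights = [(1u64, "h1"), (2, "h2")];

    let matches = matches.linear_group_by_key(|(id, _)| *id);
    let highlights = highlights.linear_group_by_key(|(id, _)| *id);

    // one group per document on each side, visited in lockstep;
    // the assert mirrors the one the diff promotes from a debug_assert_eq!
    for (mgroup, hgroup) in matches.zip(highlights) {
        assert_eq!(mgroup[0].0, hgroup[0].0);
        println!("document {}: {} matches, {} highlights", mgroup[0].0, mgroup.len(), hgroup.len());
    }
}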