Remove the raw_query functions

This commit is contained in:
Clément Renault 2019-12-11 15:34:30 +01:00
parent efc2be0b7b
commit ea148575cf
No known key found for this signature in database
GPG Key ID: 0151CDAB43460DAE
2 changed files with 6 additions and 349 deletions

View File

@ -39,7 +39,6 @@ pub fn bucket_sort<'c>(
synonyms_store: store::Synonyms, synonyms_store: store::Synonyms,
) -> MResult<Vec<Document>> ) -> MResult<Vec<Document>>
{ {
// let automatons = construct_automatons(query);
let (automatons, query_enhancer) = let (automatons, query_enhancer) =
construct_automatons2(reader, query, main_store, postings_lists_store, synonyms_store)?; construct_automatons2(reader, query, main_store, postings_lists_store, synonyms_store)?;
@ -286,14 +285,11 @@ impl<'txn> PostingsListView<'txn> {
} }
pub fn rewrite_with(&mut self, postings_list: SetBuf<DocIndex>) { pub fn rewrite_with(&mut self, postings_list: SetBuf<DocIndex>) {
*self = match self { let input = match self {
PostingsListView::Original { input, .. } => { PostingsListView::Original { input, .. } => input.clone(),
PostingsListView::Rewritten { input: input.clone(), postings_list } PostingsListView::Rewritten { input, .. } => input.clone(),
},
PostingsListView::Rewritten { input, .. } => {
PostingsListView::Rewritten { input: input.clone(), postings_list }
},
}; };
*self = PostingsListView::rewritten(input, postings_list);
} }
pub fn len(&self) -> usize { pub fn len(&self) -> usize {
@ -565,7 +561,8 @@ fn construct_automatons2(
} }
} }
if true && n == 1 { if n == 1 {
// automatons for splitted words
if let Some((left, right)) = split_best_frequency(reader, &normalized, postings_lists_store)? { if let Some((left, right)) = split_best_frequency(reader, &normalized, postings_lists_store)? {
let mut left_automaton = QueryWordAutomaton::exact(left); let mut left_automaton = QueryWordAutomaton::exact(left);
left_automaton.phrase_query = Some((0, 2)); left_automaton.phrase_query = Some((0, 2));

View File

@ -399,346 +399,6 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
} }
} }
/// Runs `query` and returns the documents located in `range` once the
/// candidates have been ranked by `criteria`.
///
/// When a `filter` is supplied the whole call is forwarded to
/// `raw_query_with_distinct`, using a distinct function that always returns
/// `None` and a `distinct_size` of 1.
///
/// Otherwise automaton groups are pulled one at a time from an
/// `AutomatonProducer`. After each new group, the raw documents are fetched
/// again for all the groups gathered so far, bucket-sorted criterion by
/// criterion, and the `range` window of that ordering replaces the result kept
/// from the previous round. When `timeout` is set, the loop stops once it has
/// elapsed and the last completed result is returned.
///
/// # Errors
///
/// Propagates any error returned by `AutomatonProducer::new`,
/// `fetch_raw_documents` or the `documents_fields_counts_store` iteration.
fn raw_query<'c, FI>(
reader: &heed::RoTxn<MainT>,
query: &str,
range: Range<usize>,
filter: Option<FI>,
timeout: Option<Duration>,
criteria: Criteria<'c>,
searchable_attrs: Option<ReorderedAttrs>,
main_store: store::Main,
postings_lists_store: store::PostingsLists,
documents_fields_counts_store: store::DocumentsFieldsCounts,
synonyms_store: store::Synonyms,
) -> MResult<Vec<Document>>
where
FI: Fn(DocumentId) -> bool,
{
// We delegate the filter work to the distinct query builder,
// specifying a distinct rule that has no effect.
if filter.is_some() {
let distinct = |_| None;
let distinct_size = 1;
return raw_query_with_distinct(
reader,
query,
range,
filter,
distinct,
distinct_size,
timeout,
criteria,
searchable_attrs,
main_store,
postings_lists_store,
documents_fields_counts_store,
synonyms_store,
);
}
let start_processing = Instant::now();
// holds the `range` window of the most recently completed round
let mut raw_documents_processed = Vec::with_capacity(range.len());
let (automaton_producer, query_enhancer) = AutomatonProducer::new(
reader,
query,
main_store,
postings_lists_store,
synonyms_store,
)?;
let automaton_producer = automaton_producer.into_iter();
let mut automatons = Vec::new();
// accumulate the automaton groups one by one, as the producer yields them
for auts in automaton_producer {
automatons.push(auts);
for (i, group) in automatons.iter().enumerate() {
debug!("group {} automatons {:?}", i, group.automatons);
}
let before_fetch_raw_documents = Instant::now();
// we must retrieve the documents associated
// with the current automatons
let mut raw_documents = fetch_raw_documents(
reader,
&automatons,
&query_enhancer,
searchable_attrs.as_ref(),
main_store,
postings_lists_store,
)?;
debug!("fetch_raw_documents took {:.02?}", before_fetch_raw_documents.elapsed());
// stop processing when time is running out, but only if a previous
// round already produced something to return
if let Some(timeout) = timeout {
if !raw_documents_processed.is_empty() && start_processing.elapsed() > timeout {
break;
}
}
let before_bucket_sort = Instant::now();
// start with a single group spanning every fetched document; each
// criterion then splits the groups into finer ones
let mut groups = vec![raw_documents.as_mut_slice()];
'criteria: for criterion in criteria.as_ref() {
let tmp_groups = mem::replace(&mut groups, Vec::new());
let mut documents_seen = 0;
for group in tmp_groups {
// if this group does not overlap with the requested range,
// push it without sorting and splitting it
if documents_seen + group.len() < range.start {
documents_seen += group.len();
groups.push(group);
continue;
}
// we must pull the fields counts of these documents
// TODO it would be great to have a "dependency" thing for each criterion
// and make it so that we can be lazy on pulling/computing some data.
if criterion.name() == "Exact" {
for document in group.iter_mut() {
let mut fields_counts = Vec::new();
for result in documents_fields_counts_store.document_fields_counts(reader, document.id)? {
let (attr, count) = result?;
fields_counts.push(AttrCount { attr: attr.0, count });
}
// NOTE(review): this `unwrap` presumably relies on the store yielding
// the attributes already ordered as `SetBuf::new` requires — confirm
document.fields_counts = Some(SetBuf::new(fields_counts).unwrap());
}
}
group.sort_unstable_by(|a, b| criterion.evaluate(a, b));
for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b)) {
debug!("criterion {} produced a group of size {}", criterion.name(), group.len());
documents_seen += group.len();
groups.push(group);
// we have sorted enough documents if the last document sorted is after
// the end of the requested range, we can continue to the next criterion
if documents_seen >= range.end {
continue 'criteria;
}
}
}
}
debug!("bucket_sort took {:.02?}", before_bucket_sort.elapsed());
// once we classified the documents related to the current
// automatons we save that as the next valid result
let iter = raw_documents
.into_iter()
.skip(range.start)
.take(range.len());
raw_documents_processed.clear();
raw_documents_processed.extend(iter);
// stop processing when time is running out
if let Some(timeout) = timeout {
if start_processing.elapsed() > timeout {
break;
}
}
}
// make real documents now that we know
// those must be returned
let documents = raw_documents_processed
.into_iter()
.map(Document::from_raw)
.collect();
Ok(documents)
}
/// Runs `query` like `raw_query`, additionally applying an optional `filter`
/// and a `distinct` rule while the documents are ranked by `criteria`.
///
/// `filter` tells whether a document id may be returned. `distinct` maps a
/// document id to an optional key, and `distinct_size` is handed to
/// `DistinctMap::new` (presumably the number of documents allowed per key —
/// TODO confirm). Within one round the results of both closures are cached by
/// document id, so each closure is called at most once per document and round.
///
/// `range` is compared against the length of the distinct map, not against raw
/// positions: a document is returned only when it passes the filter, is
/// accepted by the distinct map and the map length is past `range.start`.
///
/// # Errors
///
/// Propagates any error returned by `AutomatonProducer::new` or
/// `fetch_raw_documents`.
///
/// # Panics
///
/// The final pass `unwrap`s the cached filter and key entries, so it panics if
/// it visits a document that no criterion pass evaluated.
/// NOTE(review): this looks reachable when `criteria` is empty — confirm.
fn raw_query_with_distinct<'c, FI, FD>(
reader: &heed::RoTxn<MainT>,
query: &str,
range: Range<usize>,
filter: Option<FI>,
distinct: FD,
distinct_size: usize,
timeout: Option<Duration>,
criteria: Criteria<'c>,
searchable_attrs: Option<ReorderedAttrs>,
main_store: store::Main,
postings_lists_store: store::PostingsLists,
documents_fields_counts_store: store::DocumentsFieldsCounts,
synonyms_store: store::Synonyms,
) -> MResult<Vec<Document>>
where
FI: Fn(DocumentId) -> bool,
FD: Fn(DocumentId) -> Option<u64>,
{
let start_processing = Instant::now();
// holds the documents selected by the most recently completed round
let mut raw_documents_processed = Vec::new();
let (automaton_producer, query_enhancer) = AutomatonProducer::new(
reader,
query,
main_store,
postings_lists_store,
synonyms_store,
)?;
let automaton_producer = automaton_producer.into_iter();
let mut automatons = Vec::new();
// accumulate the automaton groups one by one, as the producer yields them
for auts in automaton_producer {
automatons.push(auts);
// we must retrieve the documents associated
// with the current automatons
let mut raw_documents = fetch_raw_documents(
reader,
&automatons,
&query_enhancer,
searchable_attrs.as_ref(),
main_store,
postings_lists_store,
)?;
// stop processing when time is running out, but only if a previous
// round already produced something to return
if let Some(timeout) = timeout {
if !raw_documents_processed.is_empty() && start_processing.elapsed() > timeout {
break;
}
}
let mut groups = vec![raw_documents.as_mut_slice()];
// per-round caches of the `distinct` and `filter` results, keyed by document id
let mut key_cache = HashMap::new();
let mut filter_map = HashMap::new();
// these two variables inform on the current distinct map and
// on the raw offset of the start of the group where the
// range.start bound is located according to the distinct function
let mut distinct_map = DistinctMap::new(distinct_size);
let mut distinct_raw_offset = 0;
'criteria: for criterion in criteria.as_ref() {
let tmp_groups = mem::replace(&mut groups, Vec::new());
let mut buf_distinct = BufferedDistinctMap::new(&mut distinct_map);
let mut documents_seen = 0;
for group in tmp_groups {
// if this group does not overlap with the requested range,
// push it without sorting and splitting it
if documents_seen + group.len() < distinct_raw_offset {
documents_seen += group.len();
groups.push(group);
continue;
}
group.sort_unstable_by(|a, b| criterion.evaluate(a, b));
for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b)) {
// we must compute the real distinguished len of this sub-group
for document in group.iter() {
let filter_accepted = match &filter {
Some(filter) => {
let entry = filter_map.entry(document.id);
*entry.or_insert_with(|| (filter)(document.id))
}
None => true,
};
if filter_accepted {
let entry = key_cache.entry(document.id);
let key = entry.or_insert_with(|| (distinct)(document.id).map(Rc::new));
match key.clone() {
Some(key) => buf_distinct.register(key),
None => buf_distinct.register_without_key(),
};
}
// the requested range end is reached: stop computing distinct
if buf_distinct.len() >= range.end {
break;
}
}
documents_seen += group.len();
groups.push(group);
// if this sub-group does not overlap with the requested range
// we must update the distinct map and its start index
if buf_distinct.len() < range.start {
buf_distinct.transfert_to_internal();
distinct_raw_offset = documents_seen;
}
// we have sorted enough documents if the last document sorted is after
// the end of the requested range, we can continue to the next criterion
if buf_distinct.len() >= range.end {
continue 'criteria;
}
}
}
}
// once we classified the documents related to the current
// automatons we save that as the next valid result
let mut seen = BufferedDistinctMap::new(&mut distinct_map);
raw_documents_processed.clear();
for document in raw_documents.into_iter().skip(distinct_raw_offset) {
// the cached entries are consumed here; a missing entry makes `unwrap` panic
let filter_accepted = match &filter {
Some(_) => filter_map.remove(&document.id).unwrap(),
None => true,
};
if filter_accepted {
let key = key_cache.remove(&document.id).unwrap();
let distinct_accepted = match key {
Some(key) => seen.register(key),
None => seen.register_without_key(),
};
if distinct_accepted && seen.len() > range.start {
raw_documents_processed.push(document);
if raw_documents_processed.len() == range.len() {
break;
}
}
}
}
// stop processing when time is running out
if let Some(timeout) = timeout {
if start_processing.elapsed() > timeout {
break;
}
}
}
// make real documents now that we know
// those must be returned
let documents = raw_documents_processed
.into_iter()
.map(Document::from_raw)
.collect();
Ok(documents)
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;