mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-01-18 17:11:15 +08:00
wip: Make the new query tree work with the criteria
This commit is contained in:
parent
da8abebfa2
commit
8acbdcbbad
@ -1,5 +1,6 @@
|
|||||||
use std::borrow::Cow;
|
use std::borrow::Cow;
|
||||||
use std::collections::HashSet;
|
use std::collections::HashSet;
|
||||||
|
use std::convert::TryFrom;
|
||||||
use std::mem;
|
use std::mem;
|
||||||
use std::ops::Deref;
|
use std::ops::Deref;
|
||||||
use std::ops::Range;
|
use std::ops::Range;
|
||||||
@ -10,7 +11,6 @@ use std::{cmp, fmt};
|
|||||||
|
|
||||||
use compact_arena::{SmallArena, Idx32, mk_arena};
|
use compact_arena::{SmallArena, Idx32, mk_arena};
|
||||||
use fst::{IntoStreamer, Streamer};
|
use fst::{IntoStreamer, Streamer};
|
||||||
use hashbrown::HashMap;
|
|
||||||
use levenshtein_automata::DFA;
|
use levenshtein_automata::DFA;
|
||||||
use log::debug;
|
use log::debug;
|
||||||
use meilisearch_tokenizer::{is_cjk, split_query_string};
|
use meilisearch_tokenizer::{is_cjk, split_query_string};
|
||||||
@ -49,36 +49,6 @@ pub fn bucket_sort<'c, FI>(
|
|||||||
where
|
where
|
||||||
FI: Fn(DocumentId) -> bool,
|
FI: Fn(DocumentId) -> bool,
|
||||||
{
|
{
|
||||||
let words_set = match unsafe { main_store.static_words_fst(reader)? } {
|
|
||||||
Some(words) => words,
|
|
||||||
None => return Ok(Vec::new()),
|
|
||||||
};
|
|
||||||
|
|
||||||
let context = QTContext {
|
|
||||||
words_set,
|
|
||||||
synonyms: synonyms_store,
|
|
||||||
postings_lists: postings_lists_store,
|
|
||||||
prefix_postings_lists: prefix_postings_lists_cache_store,
|
|
||||||
};
|
|
||||||
|
|
||||||
let (operation, mapping) = create_query_tree(reader, &context, query).unwrap();
|
|
||||||
println!("{:?}", operation);
|
|
||||||
println!("{:?}", mapping);
|
|
||||||
|
|
||||||
let QueryResult { docids, queries } = traverse_query_tree(reader, &context, &operation).unwrap();
|
|
||||||
println!("found {} documents", docids.len());
|
|
||||||
println!("number of postings {:?}", queries.len());
|
|
||||||
|
|
||||||
let before = Instant::now();
|
|
||||||
for ((query, input), matches) in queries {
|
|
||||||
// TODO optimize the filter by skipping docids that have already been seen
|
|
||||||
for matches in matches.linear_group_by_key(|m| m.document_id).filter(|ms| docids.contains(&ms[0].document_id)) {
|
|
||||||
// ...
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
println!("matches cleaned in {:.02?}", before.elapsed());
|
|
||||||
|
|
||||||
// We delegate the filter work to the distinct query builder,
|
// We delegate the filter work to the distinct query builder,
|
||||||
// specifying a distinct rule that has no effect.
|
// specifying a distinct rule that has no effect.
|
||||||
if filter.is_some() {
|
if filter.is_some() {
|
||||||
@ -102,47 +72,58 @@ where
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
let before_bucket_sort = Instant::now();
|
let words_set = match unsafe { main_store.static_words_fst(reader)? } {
|
||||||
|
Some(words) => words,
|
||||||
|
None => return Ok(Vec::new()),
|
||||||
|
};
|
||||||
|
|
||||||
let (mut automatons, mut query_enhancer) =
|
let context = QTContext {
|
||||||
construct_automatons(reader, query, main_store, postings_lists_store, synonyms_store)?;
|
words_set,
|
||||||
|
synonyms: synonyms_store,
|
||||||
|
postings_lists: postings_lists_store,
|
||||||
|
prefix_postings_lists: prefix_postings_lists_cache_store,
|
||||||
|
};
|
||||||
|
|
||||||
if let [automaton] = &automatons[..] {
|
let (operation, mapping) = create_query_tree(reader, &context, query).unwrap();
|
||||||
if automaton.is_prefix && automaton.query.len() <= 4 {
|
println!("{:?}", operation);
|
||||||
let mut prefix = [0; 4];
|
println!("{:?}", mapping);
|
||||||
let len = cmp::min(4, automaton.query.len());
|
|
||||||
prefix[..len].copy_from_slice(&automaton.query.as_bytes()[..len]);
|
|
||||||
|
|
||||||
let mut documents = Vec::new();
|
let QueryResult { docids, queries } = traverse_query_tree(reader, &context, &operation).unwrap();
|
||||||
let iter = prefix_documents_cache_store.prefix_documents(reader, prefix)?;
|
println!("found {} documents", docids.len());
|
||||||
for result in iter.skip(range.start).take(range.len()) {
|
println!("number of postings {:?}", queries.len());
|
||||||
let (docid, highlights) = result?;
|
|
||||||
documents.push(Document::from_highlights(docid, &highlights));
|
let before = Instant::now();
|
||||||
|
|
||||||
|
let mut bare_matches = Vec::new();
|
||||||
|
mk_arena!(arena);
|
||||||
|
for ((query, input), matches) in queries {
|
||||||
|
|
||||||
|
let postings_list_view = PostingsListView::original(Rc::from(input), Rc::new(matches));
|
||||||
|
// TODO optimize the filter by skipping docids that have already been seen
|
||||||
|
let mut offset = 0;
|
||||||
|
for matches in postings_list_view.linear_group_by_key(|m| m.document_id) {
|
||||||
|
let document_id = matches[0].document_id;
|
||||||
|
if docids.contains(&document_id) {
|
||||||
|
let range = postings_list_view.range(offset, matches.len());
|
||||||
|
let posting_list_index = arena.add(range);
|
||||||
|
let bare_match = BareMatch {
|
||||||
|
document_id,
|
||||||
|
query_index: u16::try_from(query.id).unwrap(),
|
||||||
|
distance: 0,
|
||||||
|
is_exact: true, // TODO where can I find this info?
|
||||||
|
postings_list: posting_list_index,
|
||||||
|
};
|
||||||
|
|
||||||
|
bare_matches.push(bare_match);
|
||||||
}
|
}
|
||||||
|
|
||||||
if !documents.is_empty() {
|
offset += matches.len();
|
||||||
return Ok(documents);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
debug!("{:?}", query_enhancer);
|
println!("matches cleaned in {:.02?}", before.elapsed());
|
||||||
|
|
||||||
let before_postings_lists_fetching = Instant::now();
|
let before_bucket_sort = Instant::now();
|
||||||
mk_arena!(arena);
|
|
||||||
let mut bare_matches =
|
|
||||||
fetch_matches(
|
|
||||||
reader,
|
|
||||||
&automatons,
|
|
||||||
&mut arena,
|
|
||||||
main_store,
|
|
||||||
postings_lists_store,
|
|
||||||
prefix_postings_lists_cache_store,
|
|
||||||
)?;
|
|
||||||
debug!("bare matches ({}) retrieved in {:.02?}",
|
|
||||||
bare_matches.len(),
|
|
||||||
before_postings_lists_fetching.elapsed(),
|
|
||||||
);
|
|
||||||
|
|
||||||
let before_raw_documents_presort = Instant::now();
|
let before_raw_documents_presort = Instant::now();
|
||||||
bare_matches.sort_unstable_by_key(|sm| sm.document_id);
|
bare_matches.sort_unstable_by_key(|sm| sm.document_id);
|
||||||
@ -152,14 +133,11 @@ where
|
|||||||
let mut prefiltered_documents = 0;
|
let mut prefiltered_documents = 0;
|
||||||
let mut raw_documents = Vec::new();
|
let mut raw_documents = Vec::new();
|
||||||
for bare_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) {
|
for bare_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) {
|
||||||
prefiltered_documents += 1;
|
let raw_document = RawDocument::new(bare_matches, &mut arena, searchable_attrs.as_ref());
|
||||||
if let Some(raw_document) = RawDocument::new(bare_matches, &automatons, &mut arena, searchable_attrs.as_ref()) {
|
raw_documents.push(raw_document);
|
||||||
raw_documents.push(raw_document);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
debug!("creating {} (original {}) candidates documents took {:.02?}",
|
debug!("creating {} candidates documents took {:.02?}",
|
||||||
raw_documents.len(),
|
raw_documents.len(),
|
||||||
prefiltered_documents,
|
|
||||||
before_raw_documents_building.elapsed(),
|
before_raw_documents_building.elapsed(),
|
||||||
);
|
);
|
||||||
|
|
||||||
@ -178,8 +156,7 @@ where
|
|||||||
let ctx = ContextMut {
|
let ctx = ContextMut {
|
||||||
reader,
|
reader,
|
||||||
postings_lists: &mut arena,
|
postings_lists: &mut arena,
|
||||||
query_enhancer: &mut query_enhancer,
|
query_mapping: &mapping,
|
||||||
automatons: &mut automatons,
|
|
||||||
documents_fields_counts_store,
|
documents_fields_counts_store,
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -188,8 +165,7 @@ where
|
|||||||
|
|
||||||
let ctx = Context {
|
let ctx = Context {
|
||||||
postings_lists: &arena,
|
postings_lists: &arena,
|
||||||
query_enhancer: &query_enhancer,
|
query_mapping: &mapping,
|
||||||
automatons: &automatons,
|
|
||||||
};
|
};
|
||||||
|
|
||||||
let must_count = criterion.name() == "proximity";
|
let must_count = criterion.name() == "proximity";
|
||||||
@ -223,7 +199,7 @@ where
|
|||||||
debug!("proximity evaluation called {} times", proximity_count.load(Ordering::Relaxed));
|
debug!("proximity evaluation called {} times", proximity_count.load(Ordering::Relaxed));
|
||||||
|
|
||||||
let iter = raw_documents.into_iter().skip(range.start).take(range.len());
|
let iter = raw_documents.into_iter().skip(range.start).take(range.len());
|
||||||
let iter = iter.map(|rd| Document::from_raw(rd, &automatons, &arena, searchable_attrs.as_ref()));
|
let iter = iter.map(|rd| Document::from_raw(rd, &arena, searchable_attrs.as_ref()));
|
||||||
let documents = iter.collect();
|
let documents = iter.collect();
|
||||||
|
|
||||||
debug!("bucket sort took {:.02?}", before_bucket_sort.elapsed());
|
debug!("bucket sort took {:.02?}", before_bucket_sort.elapsed());
|
||||||
@ -251,163 +227,7 @@ where
|
|||||||
FI: Fn(DocumentId) -> bool,
|
FI: Fn(DocumentId) -> bool,
|
||||||
FD: Fn(DocumentId) -> Option<u64>,
|
FD: Fn(DocumentId) -> Option<u64>,
|
||||||
{
|
{
|
||||||
let (mut automatons, mut query_enhancer) =
|
unimplemented!()
|
||||||
construct_automatons(reader, query, main_store, postings_lists_store, synonyms_store)?;
|
|
||||||
|
|
||||||
let before_postings_lists_fetching = Instant::now();
|
|
||||||
mk_arena!(arena);
|
|
||||||
let mut bare_matches = fetch_matches(
|
|
||||||
reader,
|
|
||||||
&automatons,
|
|
||||||
&mut arena,
|
|
||||||
main_store,
|
|
||||||
postings_lists_store,
|
|
||||||
prefix_postings_lists_cache_store,
|
|
||||||
)?;
|
|
||||||
debug!("bare matches ({}) retrieved in {:.02?}",
|
|
||||||
bare_matches.len(),
|
|
||||||
before_postings_lists_fetching.elapsed(),
|
|
||||||
);
|
|
||||||
|
|
||||||
let before_raw_documents_presort = Instant::now();
|
|
||||||
bare_matches.sort_unstable_by_key(|sm| sm.document_id);
|
|
||||||
debug!("sort by documents ids took {:.02?}", before_raw_documents_presort.elapsed());
|
|
||||||
|
|
||||||
let before_raw_documents_building = Instant::now();
|
|
||||||
let mut prefiltered_documents = 0;
|
|
||||||
let mut raw_documents = Vec::new();
|
|
||||||
for bare_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) {
|
|
||||||
prefiltered_documents += 1;
|
|
||||||
if let Some(raw_document) = RawDocument::new(bare_matches, &automatons, &mut arena, searchable_attrs.as_ref()) {
|
|
||||||
raw_documents.push(raw_document);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
debug!("creating {} (original {}) candidates documents took {:.02?}",
|
|
||||||
raw_documents.len(),
|
|
||||||
prefiltered_documents,
|
|
||||||
before_raw_documents_building.elapsed(),
|
|
||||||
);
|
|
||||||
|
|
||||||
let mut groups = vec![raw_documents.as_mut_slice()];
|
|
||||||
let mut key_cache = HashMap::new();
|
|
||||||
|
|
||||||
let mut filter_map = HashMap::new();
|
|
||||||
// these two variables informs on the current distinct map and
|
|
||||||
// on the raw offset of the start of the group where the
|
|
||||||
// range.start bound is located according to the distinct function
|
|
||||||
let mut distinct_map = DistinctMap::new(distinct_size);
|
|
||||||
let mut distinct_raw_offset = 0;
|
|
||||||
|
|
||||||
'criteria: for criterion in criteria.as_ref() {
|
|
||||||
let tmp_groups = mem::replace(&mut groups, Vec::new());
|
|
||||||
let mut buf_distinct = BufferedDistinctMap::new(&mut distinct_map);
|
|
||||||
let mut documents_seen = 0;
|
|
||||||
|
|
||||||
for mut group in tmp_groups {
|
|
||||||
// if this group does not overlap with the requested range,
|
|
||||||
// push it without sorting and splitting it
|
|
||||||
if documents_seen + group.len() < distinct_raw_offset {
|
|
||||||
documents_seen += group.len();
|
|
||||||
groups.push(group);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
let ctx = ContextMut {
|
|
||||||
reader,
|
|
||||||
postings_lists: &mut arena,
|
|
||||||
query_enhancer: &mut query_enhancer,
|
|
||||||
automatons: &mut automatons,
|
|
||||||
documents_fields_counts_store,
|
|
||||||
};
|
|
||||||
|
|
||||||
let before_criterion_preparation = Instant::now();
|
|
||||||
criterion.prepare(ctx, &mut group)?;
|
|
||||||
debug!("{:?} preparation took {:.02?}", criterion.name(), before_criterion_preparation.elapsed());
|
|
||||||
|
|
||||||
let ctx = Context {
|
|
||||||
postings_lists: &arena,
|
|
||||||
query_enhancer: &query_enhancer,
|
|
||||||
automatons: &automatons,
|
|
||||||
};
|
|
||||||
|
|
||||||
let before_criterion_sort = Instant::now();
|
|
||||||
group.sort_unstable_by(|a, b| criterion.evaluate(&ctx, a, b));
|
|
||||||
debug!("{:?} evaluation took {:.02?}", criterion.name(), before_criterion_sort.elapsed());
|
|
||||||
|
|
||||||
for group in group.binary_group_by_mut(|a, b| criterion.eq(&ctx, a, b)) {
|
|
||||||
// we must compute the real distinguished len of this sub-group
|
|
||||||
for document in group.iter() {
|
|
||||||
let filter_accepted = match &filter {
|
|
||||||
Some(filter) => {
|
|
||||||
let entry = filter_map.entry(document.id);
|
|
||||||
*entry.or_insert_with(|| (filter)(document.id))
|
|
||||||
}
|
|
||||||
None => true,
|
|
||||||
};
|
|
||||||
|
|
||||||
if filter_accepted {
|
|
||||||
let entry = key_cache.entry(document.id);
|
|
||||||
let key = entry.or_insert_with(|| (distinct)(document.id).map(Rc::new));
|
|
||||||
|
|
||||||
match key.clone() {
|
|
||||||
Some(key) => buf_distinct.register(key),
|
|
||||||
None => buf_distinct.register_without_key(),
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
// the requested range end is reached: stop computing distinct
|
|
||||||
if buf_distinct.len() >= range.end {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
documents_seen += group.len();
|
|
||||||
groups.push(group);
|
|
||||||
|
|
||||||
// if this sub-group does not overlap with the requested range
|
|
||||||
// we must update the distinct map and its start index
|
|
||||||
if buf_distinct.len() < range.start {
|
|
||||||
buf_distinct.transfert_to_internal();
|
|
||||||
distinct_raw_offset = documents_seen;
|
|
||||||
}
|
|
||||||
|
|
||||||
// we have sort enough documents if the last document sorted is after
|
|
||||||
// the end of the requested range, we can continue to the next criterion
|
|
||||||
if buf_distinct.len() >= range.end {
|
|
||||||
continue 'criteria;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// once we classified the documents related to the current
|
|
||||||
// automatons we save that as the next valid result
|
|
||||||
let mut seen = BufferedDistinctMap::new(&mut distinct_map);
|
|
||||||
|
|
||||||
let mut documents = Vec::with_capacity(range.len());
|
|
||||||
for raw_document in raw_documents.into_iter().skip(distinct_raw_offset) {
|
|
||||||
let filter_accepted = match &filter {
|
|
||||||
Some(_) => filter_map.remove(&raw_document.id).unwrap(),
|
|
||||||
None => true,
|
|
||||||
};
|
|
||||||
|
|
||||||
if filter_accepted {
|
|
||||||
let key = key_cache.remove(&raw_document.id).unwrap();
|
|
||||||
let distinct_accepted = match key {
|
|
||||||
Some(key) => seen.register(key),
|
|
||||||
None => seen.register_without_key(),
|
|
||||||
};
|
|
||||||
|
|
||||||
if distinct_accepted && seen.len() > range.start {
|
|
||||||
documents.push(Document::from_raw(raw_document, &automatons, &arena, searchable_attrs.as_ref()));
|
|
||||||
if documents.len() == range.len() {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(documents)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct BareMatch<'tag> {
|
pub struct BareMatch<'tag> {
|
||||||
|
@ -9,13 +9,13 @@ pub struct Attribute;
|
|||||||
impl Criterion for Attribute {
|
impl Criterion for Attribute {
|
||||||
fn name(&self) -> &str { "attribute" }
|
fn name(&self) -> &str { "attribute" }
|
||||||
|
|
||||||
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'a, 'r>(
|
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'r>(
|
||||||
&self,
|
&self,
|
||||||
ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a>,
|
ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q>,
|
||||||
documents: &mut [RawDocument<'r, 'tag>],
|
documents: &mut [RawDocument<'r, 'tag>],
|
||||||
) -> MResult<()>
|
) -> MResult<()>
|
||||||
{
|
{
|
||||||
prepare_bare_matches(documents, ctx.postings_lists, ctx.query_enhancer);
|
prepare_bare_matches(documents, ctx.postings_lists, ctx.query_mapping);
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -11,9 +11,9 @@ pub struct Exact;
|
|||||||
impl Criterion for Exact {
|
impl Criterion for Exact {
|
||||||
fn name(&self) -> &str { "exact" }
|
fn name(&self) -> &str { "exact" }
|
||||||
|
|
||||||
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'a, 'r>(
|
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'r>(
|
||||||
&self,
|
&self,
|
||||||
ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a>,
|
ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q>,
|
||||||
documents: &mut [RawDocument<'r, 'tag>],
|
documents: &mut [RawDocument<'r, 'tag>],
|
||||||
) -> MResult<()>
|
) -> MResult<()>
|
||||||
{
|
{
|
||||||
|
@ -1,13 +1,16 @@
|
|||||||
use std::cmp::{self, Ordering};
|
use std::cmp::{self, Ordering};
|
||||||
|
use std::collections::HashMap;
|
||||||
|
use std::ops::Range;
|
||||||
|
|
||||||
use compact_arena::SmallArena;
|
use compact_arena::SmallArena;
|
||||||
use sdset::SetBuf;
|
use sdset::SetBuf;
|
||||||
use slice_group_by::GroupBy;
|
use slice_group_by::GroupBy;
|
||||||
|
|
||||||
use crate::{store, RawDocument, MResult};
|
|
||||||
use crate::automaton::QueryEnhancer;
|
use crate::automaton::QueryEnhancer;
|
||||||
use crate::bucket_sort::{SimpleMatch, PostingsListView, QueryWordAutomaton};
|
use crate::bucket_sort::{SimpleMatch, PostingsListView, QueryWordAutomaton};
|
||||||
use crate::database::MainT;
|
use crate::database::MainT;
|
||||||
|
use crate::query_tree::QueryId;
|
||||||
|
use crate::{store, RawDocument, MResult};
|
||||||
|
|
||||||
mod typo;
|
mod typo;
|
||||||
mod words;
|
mod words;
|
||||||
@ -30,26 +33,26 @@ pub use self::sort_by_attr::SortByAttr;
|
|||||||
pub trait Criterion {
|
pub trait Criterion {
|
||||||
fn name(&self) -> &str;
|
fn name(&self) -> &str;
|
||||||
|
|
||||||
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'a, 'r>(
|
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'r>(
|
||||||
&self,
|
&self,
|
||||||
_ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a>,
|
_ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q>,
|
||||||
_documents: &mut [RawDocument<'r, 'tag>],
|
_documents: &mut [RawDocument<'r, 'tag>],
|
||||||
) -> MResult<()>
|
) -> MResult<()>
|
||||||
{
|
{
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn evaluate<'p, 'tag, 'txn, 'q, 'a, 'r>(
|
fn evaluate<'p, 'tag, 'txn, 'q, 'r>(
|
||||||
&self,
|
&self,
|
||||||
ctx: &Context<'p, 'tag, 'txn, 'q, 'a>,
|
ctx: &Context<'p, 'tag, 'txn, 'q>,
|
||||||
lhs: &RawDocument<'r, 'tag>,
|
lhs: &RawDocument<'r, 'tag>,
|
||||||
rhs: &RawDocument<'r, 'tag>,
|
rhs: &RawDocument<'r, 'tag>,
|
||||||
) -> Ordering;
|
) -> Ordering;
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
fn eq<'p, 'tag, 'txn, 'q, 'a, 'r>(
|
fn eq<'p, 'tag, 'txn, 'q, 'r>(
|
||||||
&self,
|
&self,
|
||||||
ctx: &Context<'p, 'tag, 'txn, 'q, 'a>,
|
ctx: &Context<'p, 'tag, 'txn, 'q>,
|
||||||
lhs: &RawDocument<'r, 'tag>,
|
lhs: &RawDocument<'r, 'tag>,
|
||||||
rhs: &RawDocument<'r, 'tag>,
|
rhs: &RawDocument<'r, 'tag>,
|
||||||
) -> bool
|
) -> bool
|
||||||
@ -58,18 +61,16 @@ pub trait Criterion {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a> {
|
pub struct ContextMut<'h, 'p, 'tag, 'txn, 'q> {
|
||||||
pub reader: &'h heed::RoTxn<MainT>,
|
pub reader: &'h heed::RoTxn<MainT>,
|
||||||
pub postings_lists: &'p mut SmallArena<'tag, PostingsListView<'txn>>,
|
pub postings_lists: &'p mut SmallArena<'tag, PostingsListView<'txn>>,
|
||||||
pub query_enhancer: &'q mut QueryEnhancer,
|
pub query_mapping: &'q HashMap<QueryId, Range<usize>>,
|
||||||
pub automatons: &'a mut [QueryWordAutomaton],
|
|
||||||
pub documents_fields_counts_store: store::DocumentsFieldsCounts,
|
pub documents_fields_counts_store: store::DocumentsFieldsCounts,
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct Context<'p, 'tag, 'txn, 'q, 'a> {
|
pub struct Context<'p, 'tag, 'txn, 'q> {
|
||||||
pub postings_lists: &'p SmallArena<'tag, PostingsListView<'txn>>,
|
pub postings_lists: &'p SmallArena<'tag, PostingsListView<'txn>>,
|
||||||
pub query_enhancer: &'q QueryEnhancer,
|
pub query_mapping: &'q HashMap<QueryId, Range<usize>>,
|
||||||
pub automatons: &'a [QueryWordAutomaton],
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Default)]
|
#[derive(Default)]
|
||||||
@ -138,7 +139,7 @@ impl<'a> AsRef<[Box<dyn Criterion + 'a>]> for Criteria<'a> {
|
|||||||
|
|
||||||
fn prepare_query_distances<'a, 'tag, 'txn>(
|
fn prepare_query_distances<'a, 'tag, 'txn>(
|
||||||
documents: &mut [RawDocument<'a, 'tag>],
|
documents: &mut [RawDocument<'a, 'tag>],
|
||||||
query_enhancer: &QueryEnhancer,
|
query_mapping: &HashMap<QueryId, Range<usize>>,
|
||||||
postings_lists: &SmallArena<'tag, PostingsListView<'txn>>,
|
postings_lists: &SmallArena<'tag, PostingsListView<'txn>>,
|
||||||
) {
|
) {
|
||||||
for document in documents {
|
for document in documents {
|
||||||
@ -148,7 +149,7 @@ fn prepare_query_distances<'a, 'tag, 'txn>(
|
|||||||
for m in document.bare_matches.iter() {
|
for m in document.bare_matches.iter() {
|
||||||
if postings_lists[m.postings_list].is_empty() { continue }
|
if postings_lists[m.postings_list].is_empty() { continue }
|
||||||
|
|
||||||
let range = query_enhancer.replacement(m.query_index as u32);
|
let range = query_mapping[&(m.query_index as usize)].clone();
|
||||||
let new_len = cmp::max(range.end as usize, processed.len());
|
let new_len = cmp::max(range.end as usize, processed.len());
|
||||||
processed.resize(new_len, None);
|
processed.resize(new_len, None);
|
||||||
|
|
||||||
@ -169,7 +170,7 @@ fn prepare_query_distances<'a, 'tag, 'txn>(
|
|||||||
fn prepare_bare_matches<'a, 'tag, 'txn>(
|
fn prepare_bare_matches<'a, 'tag, 'txn>(
|
||||||
documents: &mut [RawDocument<'a, 'tag>],
|
documents: &mut [RawDocument<'a, 'tag>],
|
||||||
postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>,
|
postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>,
|
||||||
query_enhancer: &QueryEnhancer,
|
query_mapping: &HashMap<QueryId, Range<usize>>,
|
||||||
) {
|
) {
|
||||||
for document in documents {
|
for document in documents {
|
||||||
if !document.processed_matches.is_empty() { continue }
|
if !document.processed_matches.is_empty() { continue }
|
||||||
@ -190,14 +191,14 @@ fn prepare_bare_matches<'a, 'tag, 'txn>(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let processed = multiword_rewrite_matches(&mut processed, query_enhancer);
|
let processed = multiword_rewrite_matches(&mut processed, query_mapping);
|
||||||
document.processed_matches = processed.into_vec();
|
document.processed_matches = processed.into_vec();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn multiword_rewrite_matches(
|
fn multiword_rewrite_matches(
|
||||||
matches: &mut [SimpleMatch],
|
matches: &mut [SimpleMatch],
|
||||||
query_enhancer: &QueryEnhancer,
|
query_mapping: &HashMap<QueryId, Range<usize>>,
|
||||||
) -> SetBuf<SimpleMatch>
|
) -> SetBuf<SimpleMatch>
|
||||||
{
|
{
|
||||||
matches.sort_unstable_by_key(|m| (m.attribute, m.word_index));
|
matches.sort_unstable_by_key(|m| (m.attribute, m.word_index));
|
||||||
@ -218,7 +219,7 @@ fn multiword_rewrite_matches(
|
|||||||
// find the biggest padding
|
// find the biggest padding
|
||||||
let mut biggest = 0;
|
let mut biggest = 0;
|
||||||
for match_ in same_word_index {
|
for match_ in same_word_index {
|
||||||
let mut replacement = query_enhancer.replacement(match_.query_index as u32);
|
let mut replacement = query_mapping[&(match_.query_index as usize)].clone();
|
||||||
let replacement_len = replacement.len();
|
let replacement_len = replacement.len();
|
||||||
let nexts = iter.remainder().linear_group_by_key(|m| m.word_index);
|
let nexts = iter.remainder().linear_group_by_key(|m| m.word_index);
|
||||||
|
|
||||||
@ -240,7 +241,7 @@ fn multiword_rewrite_matches(
|
|||||||
let padmatch = SimpleMatch { query_index, word_index, ..*match_ };
|
let padmatch = SimpleMatch { query_index, word_index, ..*match_ };
|
||||||
|
|
||||||
for nmatch_ in next_group {
|
for nmatch_ in next_group {
|
||||||
let mut rep = query_enhancer.replacement(nmatch_.query_index as u32);
|
let mut rep = query_mapping[&(nmatch_.query_index as usize)].clone();
|
||||||
let query_index = rep.next().unwrap() as u16;
|
let query_index = rep.next().unwrap() as u16;
|
||||||
if query_index == padmatch.query_index {
|
if query_index == padmatch.query_index {
|
||||||
if !found {
|
if !found {
|
||||||
|
@ -11,13 +11,13 @@ pub struct Proximity;
|
|||||||
impl Criterion for Proximity {
|
impl Criterion for Proximity {
|
||||||
fn name(&self) -> &str { "proximity" }
|
fn name(&self) -> &str { "proximity" }
|
||||||
|
|
||||||
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'a, 'r>(
|
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'r>(
|
||||||
&self,
|
&self,
|
||||||
ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a>,
|
ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q>,
|
||||||
documents: &mut [RawDocument<'r, 'tag>],
|
documents: &mut [RawDocument<'r, 'tag>],
|
||||||
) -> MResult<()>
|
) -> MResult<()>
|
||||||
{
|
{
|
||||||
prepare_bare_matches(documents, ctx.postings_lists, ctx.query_enhancer);
|
prepare_bare_matches(documents, ctx.postings_lists, ctx.query_mapping);
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -7,13 +7,13 @@ pub struct Typo;
|
|||||||
impl Criterion for Typo {
|
impl Criterion for Typo {
|
||||||
fn name(&self) -> &str { "typo" }
|
fn name(&self) -> &str { "typo" }
|
||||||
|
|
||||||
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'a, 'r>(
|
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'r>(
|
||||||
&self,
|
&self,
|
||||||
ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a>,
|
ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q>,
|
||||||
documents: &mut [RawDocument<'r, 'tag>],
|
documents: &mut [RawDocument<'r, 'tag>],
|
||||||
) -> MResult<()>
|
) -> MResult<()>
|
||||||
{
|
{
|
||||||
prepare_query_distances(documents, ctx.query_enhancer, ctx.postings_lists);
|
prepare_query_distances(documents, ctx.query_mapping, ctx.postings_lists);
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -7,13 +7,13 @@ pub struct Words;
|
|||||||
impl Criterion for Words {
|
impl Criterion for Words {
|
||||||
fn name(&self) -> &str { "words" }
|
fn name(&self) -> &str { "words" }
|
||||||
|
|
||||||
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'a, 'r>(
|
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'r>(
|
||||||
&self,
|
&self,
|
||||||
ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a>,
|
ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q>,
|
||||||
documents: &mut [RawDocument<'r, 'tag>],
|
documents: &mut [RawDocument<'r, 'tag>],
|
||||||
) -> MResult<()>
|
) -> MResult<()>
|
||||||
{
|
{
|
||||||
prepare_query_distances(documents, ctx.query_enhancer, ctx.postings_lists);
|
prepare_query_distances(documents, ctx.query_mapping, ctx.postings_lists);
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -9,13 +9,13 @@ pub struct WordsPosition;
|
|||||||
impl Criterion for WordsPosition {
|
impl Criterion for WordsPosition {
|
||||||
fn name(&self) -> &str { "words position" }
|
fn name(&self) -> &str { "words position" }
|
||||||
|
|
||||||
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'a, 'r>(
|
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'r>(
|
||||||
&self,
|
&self,
|
||||||
ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a>,
|
ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q>,
|
||||||
documents: &mut [RawDocument<'r, 'tag>],
|
documents: &mut [RawDocument<'r, 'tag>],
|
||||||
) -> MResult<()>
|
) -> MResult<()>
|
||||||
{
|
{
|
||||||
prepare_bare_matches(documents, ctx.postings_lists, ctx.query_enhancer);
|
prepare_bare_matches(documents, ctx.postings_lists, ctx.query_mapping);
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -97,17 +97,19 @@ impl Document {
|
|||||||
#[cfg(not(test))]
|
#[cfg(not(test))]
|
||||||
pub fn from_raw<'a, 'tag, 'txn>(
|
pub fn from_raw<'a, 'tag, 'txn>(
|
||||||
raw_document: RawDocument<'a, 'tag>,
|
raw_document: RawDocument<'a, 'tag>,
|
||||||
automatons: &[QueryWordAutomaton],
|
// automatons: &[QueryWordAutomaton],
|
||||||
arena: &SmallArena<'tag, PostingsListView<'txn>>,
|
arena: &SmallArena<'tag, PostingsListView<'txn>>,
|
||||||
searchable_attrs: Option<&ReorderedAttrs>,
|
searchable_attrs: Option<&ReorderedAttrs>,
|
||||||
) -> Document
|
) -> Document
|
||||||
{
|
{
|
||||||
let highlights = highlights_from_raw_document(
|
// let highlights = highlights_from_raw_document(
|
||||||
&raw_document,
|
// &raw_document,
|
||||||
automatons,
|
// automatons,
|
||||||
arena,
|
// arena,
|
||||||
searchable_attrs,
|
// searchable_attrs,
|
||||||
);
|
// );
|
||||||
|
|
||||||
|
let highlights = Vec::new();
|
||||||
|
|
||||||
Document { id: raw_document.id, highlights }
|
Document { id: raw_document.id, highlights }
|
||||||
}
|
}
|
||||||
@ -115,19 +117,21 @@ impl Document {
|
|||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
pub fn from_raw<'a, 'tag, 'txn>(
|
pub fn from_raw<'a, 'tag, 'txn>(
|
||||||
raw_document: RawDocument<'a, 'tag>,
|
raw_document: RawDocument<'a, 'tag>,
|
||||||
automatons: &[QueryWordAutomaton],
|
// automatons: &[QueryWordAutomaton],
|
||||||
arena: &SmallArena<'tag, PostingsListView<'txn>>,
|
arena: &SmallArena<'tag, PostingsListView<'txn>>,
|
||||||
searchable_attrs: Option<&ReorderedAttrs>,
|
searchable_attrs: Option<&ReorderedAttrs>,
|
||||||
) -> Document
|
) -> Document
|
||||||
{
|
{
|
||||||
use crate::bucket_sort::SimpleMatch;
|
use crate::bucket_sort::SimpleMatch;
|
||||||
|
|
||||||
let highlights = highlights_from_raw_document(
|
// let highlights = highlights_from_raw_document(
|
||||||
&raw_document,
|
// &raw_document,
|
||||||
automatons,
|
// automatons,
|
||||||
arena,
|
// arena,
|
||||||
searchable_attrs,
|
// searchable_attrs,
|
||||||
);
|
// );
|
||||||
|
|
||||||
|
let highlights = Vec::new();
|
||||||
|
|
||||||
let mut matches = Vec::new();
|
let mut matches = Vec::new();
|
||||||
for sm in raw_document.processed_matches {
|
for sm in raw_document.processed_matches {
|
||||||
|
@ -1,5 +1,4 @@
|
|||||||
use compact_arena::SmallArena;
|
use compact_arena::SmallArena;
|
||||||
use itertools::EitherOrBoth;
|
|
||||||
use sdset::SetBuf;
|
use sdset::SetBuf;
|
||||||
use crate::DocIndex;
|
use crate::DocIndex;
|
||||||
use crate::bucket_sort::{SimpleMatch, BareMatch, QueryWordAutomaton, PostingsListView};
|
use crate::bucket_sort::{SimpleMatch, BareMatch, QueryWordAutomaton, PostingsListView};
|
||||||
@ -19,10 +18,9 @@ pub struct RawDocument<'a, 'tag> {
|
|||||||
impl<'a, 'tag> RawDocument<'a, 'tag> {
|
impl<'a, 'tag> RawDocument<'a, 'tag> {
|
||||||
pub fn new<'txn>(
|
pub fn new<'txn>(
|
||||||
bare_matches: &'a mut [BareMatch<'tag>],
|
bare_matches: &'a mut [BareMatch<'tag>],
|
||||||
automatons: &[QueryWordAutomaton],
|
|
||||||
postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>,
|
postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>,
|
||||||
searchable_attrs: Option<&ReorderedAttrs>,
|
searchable_attrs: Option<&ReorderedAttrs>,
|
||||||
) -> Option<RawDocument<'a, 'tag>>
|
) -> RawDocument<'a, 'tag>
|
||||||
{
|
{
|
||||||
if let Some(reordered_attrs) = searchable_attrs {
|
if let Some(reordered_attrs) = searchable_attrs {
|
||||||
for bm in bare_matches.iter() {
|
for bm in bare_matches.iter() {
|
||||||
@ -42,70 +40,12 @@ impl<'a, 'tag> RawDocument<'a, 'tag> {
|
|||||||
|
|
||||||
bare_matches.sort_unstable_by_key(|m| m.query_index);
|
bare_matches.sort_unstable_by_key(|m| m.query_index);
|
||||||
|
|
||||||
let mut previous_word = None;
|
RawDocument {
|
||||||
for i in 0..bare_matches.len() {
|
|
||||||
let a = &bare_matches[i];
|
|
||||||
let auta = &automatons[a.query_index as usize];
|
|
||||||
|
|
||||||
match auta.phrase_query {
|
|
||||||
Some((0, _)) => {
|
|
||||||
let b = match bare_matches.get(i + 1) {
|
|
||||||
Some(b) => b,
|
|
||||||
None => {
|
|
||||||
postings_lists[a.postings_list].rewrite_with(SetBuf::default());
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
if a.query_index + 1 != b.query_index {
|
|
||||||
postings_lists[a.postings_list].rewrite_with(SetBuf::default());
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
let pla = &postings_lists[a.postings_list];
|
|
||||||
let plb = &postings_lists[b.postings_list];
|
|
||||||
|
|
||||||
let iter = itertools::merge_join_by(pla.iter(), plb.iter(), |a, b| {
|
|
||||||
a.attribute.cmp(&b.attribute).then((a.word_index + 1).cmp(&b.word_index))
|
|
||||||
});
|
|
||||||
|
|
||||||
let mut newa = Vec::new();
|
|
||||||
let mut newb = Vec::new();
|
|
||||||
|
|
||||||
for eb in iter {
|
|
||||||
if let EitherOrBoth::Both(a, b) = eb {
|
|
||||||
newa.push(*a);
|
|
||||||
newb.push(*b);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if !newa.is_empty() {
|
|
||||||
previous_word = Some(a.query_index);
|
|
||||||
}
|
|
||||||
|
|
||||||
postings_lists[a.postings_list].rewrite_with(SetBuf::new_unchecked(newa));
|
|
||||||
postings_lists[b.postings_list].rewrite_with(SetBuf::new_unchecked(newb));
|
|
||||||
},
|
|
||||||
Some((1, _)) => {
|
|
||||||
if previous_word.take() != Some(a.query_index - 1) {
|
|
||||||
postings_lists[a.postings_list].rewrite_with(SetBuf::default());
|
|
||||||
}
|
|
||||||
},
|
|
||||||
Some((_, _)) => unreachable!(),
|
|
||||||
None => (),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if bare_matches.iter().all(|rm| postings_lists[rm.postings_list].is_empty()) {
|
|
||||||
return None
|
|
||||||
}
|
|
||||||
|
|
||||||
Some(RawDocument {
|
|
||||||
id: bare_matches[0].document_id,
|
id: bare_matches[0].document_id,
|
||||||
bare_matches,
|
bare_matches,
|
||||||
processed_matches: Vec::new(),
|
processed_matches: Vec::new(),
|
||||||
processed_distances: Vec::new(),
|
processed_distances: Vec::new(),
|
||||||
contains_one_word_field: false,
|
contains_one_word_field: false,
|
||||||
})
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user