2019-12-12 00:02:10 +08:00
|
|
|
use compact_arena::SmallArena;
|
|
|
|
use itertools::EitherOrBoth;
|
2019-07-08 01:55:15 +08:00
|
|
|
use sdset::SetBuf;
|
2019-12-13 20:22:54 +08:00
|
|
|
use crate::DocIndex;
|
2019-12-12 00:02:10 +08:00
|
|
|
use crate::bucket_sort::{SimpleMatch, BareMatch, QueryWordAutomaton, PostingsListView};
|
2019-12-13 20:22:54 +08:00
|
|
|
use crate::reordered_attrs::ReorderedAttrs;
|
2019-07-08 01:55:15 +08:00
|
|
|
|
2019-12-12 00:02:10 +08:00
|
|
|
pub struct RawDocument<'a, 'tag> {
|
|
|
|
pub id: crate::DocumentId,
|
2019-12-13 19:38:54 +08:00
|
|
|
pub bare_matches: &'a mut [BareMatch<'tag>],
|
2019-12-12 00:02:10 +08:00
|
|
|
pub processed_matches: Vec<SimpleMatch>,
|
|
|
|
/// The list of minimum `distance` found
|
|
|
|
pub processed_distances: Vec<Option<u8>>,
|
2019-12-13 18:14:12 +08:00
|
|
|
/// Does this document contains a field
|
|
|
|
/// with one word that is exactly matching
|
|
|
|
pub contains_one_word_field: bool,
|
2019-07-08 01:55:15 +08:00
|
|
|
}
|
|
|
|
|
2019-12-12 00:02:10 +08:00
|
|
|
impl<'a, 'tag> RawDocument<'a, 'tag> {
|
|
|
|
pub fn new<'txn>(
|
2019-12-13 19:38:54 +08:00
|
|
|
bare_matches: &'a mut [BareMatch<'tag>],
|
2019-12-12 00:02:10 +08:00
|
|
|
automatons: &[QueryWordAutomaton],
|
|
|
|
postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>,
|
2019-12-13 20:22:54 +08:00
|
|
|
searchable_attrs: Option<&ReorderedAttrs>,
|
2019-12-12 00:02:10 +08:00
|
|
|
) -> Option<RawDocument<'a, 'tag>>
|
|
|
|
{
|
2019-12-13 20:22:54 +08:00
|
|
|
if let Some(reordered_attrs) = searchable_attrs {
|
|
|
|
for bm in bare_matches.iter() {
|
|
|
|
let postings_list = &postings_lists[bm.postings_list];
|
|
|
|
|
|
|
|
let mut rewritten = Vec::new();
|
|
|
|
for di in postings_list.iter() {
|
|
|
|
if let Some(attribute) = reordered_attrs.get(di.attribute) {
|
|
|
|
rewritten.push(DocIndex { attribute, ..*di });
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
let new_postings = SetBuf::from_dirty(rewritten);
|
|
|
|
postings_lists[bm.postings_list].rewrite_with(new_postings);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-12-13 19:38:54 +08:00
|
|
|
bare_matches.sort_unstable_by_key(|m| m.query_index);
|
2019-12-12 00:02:10 +08:00
|
|
|
|
|
|
|
let mut previous_word = None;
|
2019-12-13 19:38:54 +08:00
|
|
|
for i in 0..bare_matches.len() {
|
|
|
|
let a = &bare_matches[i];
|
2019-12-12 00:02:10 +08:00
|
|
|
let auta = &automatons[a.query_index as usize];
|
|
|
|
|
|
|
|
match auta.phrase_query {
|
|
|
|
Some((0, _)) => {
|
2019-12-13 19:38:54 +08:00
|
|
|
let b = match bare_matches.get(i + 1) {
|
2019-12-12 00:02:10 +08:00
|
|
|
Some(b) => b,
|
|
|
|
None => {
|
|
|
|
postings_lists[a.postings_list].rewrite_with(SetBuf::default());
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
if a.query_index + 1 != b.query_index {
|
|
|
|
postings_lists[a.postings_list].rewrite_with(SetBuf::default());
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
let pla = &postings_lists[a.postings_list];
|
|
|
|
let plb = &postings_lists[b.postings_list];
|
|
|
|
|
2019-12-12 00:36:53 +08:00
|
|
|
let iter = itertools::merge_join_by(pla.iter(), plb.iter(), |a, b| {
|
2019-12-12 00:02:10 +08:00
|
|
|
a.attribute.cmp(&b.attribute).then((a.word_index + 1).cmp(&b.word_index))
|
|
|
|
});
|
|
|
|
|
|
|
|
let mut newa = Vec::new();
|
|
|
|
let mut newb = Vec::new();
|
|
|
|
|
|
|
|
for eb in iter {
|
|
|
|
if let EitherOrBoth::Both(a, b) = eb {
|
|
|
|
newa.push(*a);
|
|
|
|
newb.push(*b);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if !newa.is_empty() {
|
|
|
|
previous_word = Some(a.query_index);
|
|
|
|
}
|
|
|
|
|
|
|
|
postings_lists[a.postings_list].rewrite_with(SetBuf::new_unchecked(newa));
|
|
|
|
postings_lists[b.postings_list].rewrite_with(SetBuf::new_unchecked(newb));
|
|
|
|
},
|
|
|
|
Some((1, _)) => {
|
|
|
|
if previous_word.take() != Some(a.query_index - 1) {
|
|
|
|
postings_lists[a.postings_list].rewrite_with(SetBuf::default());
|
|
|
|
}
|
|
|
|
},
|
|
|
|
Some((_, _)) => unreachable!(),
|
|
|
|
None => (),
|
|
|
|
}
|
2019-10-18 19:05:28 +08:00
|
|
|
}
|
2019-07-08 01:55:15 +08:00
|
|
|
|
2019-12-13 19:38:54 +08:00
|
|
|
if bare_matches.iter().all(|rm| postings_lists[rm.postings_list].is_empty()) {
|
2019-12-12 00:02:10 +08:00
|
|
|
return None
|
2019-10-18 19:05:28 +08:00
|
|
|
}
|
2019-07-16 01:34:53 +08:00
|
|
|
|
2019-12-12 00:02:10 +08:00
|
|
|
Some(RawDocument {
|
2019-12-13 19:38:54 +08:00
|
|
|
id: bare_matches[0].document_id,
|
|
|
|
bare_matches,
|
2019-12-12 00:02:10 +08:00
|
|
|
processed_matches: Vec::new(),
|
|
|
|
processed_distances: Vec::new(),
|
2019-12-13 18:14:12 +08:00
|
|
|
contains_one_word_field: false,
|
2019-10-18 19:05:28 +08:00
|
|
|
})
|
2019-07-08 01:55:15 +08:00
|
|
|
}
|
|
|
|
}
|