Introduce a special word_derivations function for Proximity

This commit is contained in:
Kerollmops 2021-03-09 17:48:05 +01:00
parent facfb4b615
commit d301859bbd
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
2 changed files with 48 additions and 39 deletions

View File

@ -67,7 +67,7 @@ pub trait Context {
fn word_prefix_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>>;
fn words_fst<'t>(&self) -> &'t fst::Set<Cow<[u8]>>;
fn in_prefix_cache(&self, word: &str) -> bool;
fn docid_word_positions(&self, docid: DocumentId, word: &str) -> heed::Result<Option<RoaringBitmap>>;
fn docid_words_positions(&self, docid: DocumentId) -> heed::Result<HashMap<String, RoaringBitmap>>;
}
pub struct CriteriaBuilder<'t> {
rtxn: &'t heed::RoTxn<'t>,
@ -107,9 +107,13 @@ impl<'a> Context for CriteriaBuilder<'a> {
self.words_prefixes_fst.contains(word)
}
fn docid_word_positions(&self, docid: DocumentId, word: &str) -> heed::Result<Option<RoaringBitmap>> {
let key = (docid, word);
self.index.docid_word_positions.get(self.rtxn, &key)
/// Gathers, for the given `docid`, every `(word, positions)` entry found in the
/// `docid_word_positions` database and returns them as a map keyed by word.
///
/// The lookup is a prefix iteration on `(docid, "")`; presumably the empty word
/// makes the prefix cover every key belonging to this document (depends on the
/// key codec, which is not visible here).
///
/// Returns the first `heed` error encountered, either when opening the
/// iterator or when decoding an entry.
fn docid_words_positions(&self, docid: DocumentId) -> heed::Result<HashMap<String, RoaringBitmap>> {
    let entries = self.index.docid_word_positions.prefix_iter(self.rtxn, &(docid, ""))?;
    // Collecting into a `Result<HashMap<_, _>, _>` stops at the first failing entry.
    entries
        .map(|entry| entry.map(|((_, word), positions)| (word.to_string(), positions)))
        .collect()
}
}
@ -391,7 +395,7 @@ pub mod test {
self.word_prefix_docids.contains_key(&word.to_string())
}
fn docid_word_positions(&self, _docid: DocumentId, _word: &str) -> heed::Result<Option<RoaringBitmap>> {
/// Test-context stub of `docid_words_positions`.
///
/// Not implemented: calling it panics through `todo!()`, so tests relying on
/// this context cannot exercise code paths that need per-document word positions.
fn docid_words_positions(&self, _docid: DocumentId) -> heed::Result<HashMap<String, RoaringBitmap>> {
// Placeholder until the test context stores word positions.
todo!()
}
}

View File

@ -1,14 +1,13 @@
use std::borrow::Cow;
use std::collections::btree_map::{self, BTreeMap};
use std::collections::hash_map::{HashMap, Entry};
use std::collections::hash_map::HashMap;
use std::mem::take;
use roaring::RoaringBitmap;
use log::debug;
use crate::{DocumentId, Position, search::{query_tree::QueryKind, word_derivations}};
use crate::{DocumentId, Position, search::{query_tree::QueryKind}};
use crate::search::query_tree::{maximum_proximity, Operation, Query};
use crate::search::WordDerivationsCache;
use crate::search::{build_dfa, WordDerivationsCache};
use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids, resolve_query_tree};
pub struct Proximity<'t> {
@ -358,7 +357,7 @@ fn resolve_plane_sweep_candidates(
docid: DocumentId,
consecutive: bool,
rocache: &mut HashMap<&'a Operation, Vec<(Position, u8, Position)>>,
dwpcache: &mut HashMap<String, Option<RoaringBitmap>>,
words_positions: &HashMap<String, RoaringBitmap>,
wdcache: &mut WordDerivationsCache,
) -> anyhow::Result<Vec<(Position, u8, Position)>>
{
@ -400,7 +399,7 @@ fn resolve_plane_sweep_candidates(
let mut groups_positions = Vec::with_capacity(groups_len);
for operation in operations {
let positions = resolve_operation(ctx, operation, docid, rocache, dwpcache, wdcache)?;
let positions = resolve_operation(ctx, operation, docid, rocache, words_positions, wdcache)?;
groups_positions.push(positions.into_iter());
}
@ -476,7 +475,7 @@ fn resolve_plane_sweep_candidates(
query_tree: &'a Operation,
docid: DocumentId,
rocache: &mut HashMap<&'a Operation, Vec<(Position, u8, Position)>>,
dwpcache: &mut HashMap<String, Option<RoaringBitmap>>,
words_positions: &HashMap<String, RoaringBitmap>,
wdcache: &mut WordDerivationsCache,
) -> anyhow::Result<Vec<(Position, u8, Position)>>
{
@ -487,44 +486,34 @@ fn resolve_plane_sweep_candidates(
}
let result = match query_tree {
And(ops) => plane_sweep(ctx, ops, docid, false, rocache, dwpcache, wdcache)?,
Consecutive(ops) => plane_sweep(ctx, ops, docid, true, rocache, dwpcache, wdcache)?,
And(ops) => plane_sweep(ctx, ops, docid, false, rocache, words_positions, wdcache)?,
Consecutive(ops) => plane_sweep(ctx, ops, docid, true, rocache, words_positions, wdcache)?,
Or(_, ops) => {
let mut result = Vec::new();
for op in ops {
result.extend(resolve_operation(ctx, op, docid, rocache, dwpcache, wdcache)?)
result.extend(resolve_operation(ctx, op, docid, rocache, words_positions, wdcache)?)
}
result.sort_unstable();
result
},
Operation::Query(Query {prefix, kind}) => {
let fst = ctx.words_fst();
let words = match kind {
Operation::Query(Query { prefix, kind }) => {
let mut result = Vec::new();
match kind {
QueryKind::Exact { word, .. } => {
if *prefix {
Cow::Borrowed(word_derivations(word, true, 0, fst, wdcache)?)
let iter = word_derivations(word, true, 0, &words_positions)
.flat_map(|positions| positions.iter().map(|p| (p, 0, p)));
result.extend(iter);
} else {
Cow::Owned(vec![(word.to_string(), 0)])
if let Some(positions) = words_positions.get(word) {
result.extend(positions.iter().map(|p| (p, 0, p)));
}
}
},
QueryKind::Tolerant { typo, word } => {
Cow::Borrowed(word_derivations(word, *prefix, *typo, fst, wdcache)?)
}
};
let mut result = Vec::new();
for (word, _) in words.as_ref() {
let positions = match dwpcache.entry(word.to_string()) {
Entry::Occupied(entry) => entry.into_mut(),
Entry::Vacant(entry) => {
let positions = ctx.docid_word_positions(docid, word)?;
entry.insert(positions)
}
};
if let Some(positions) = positions {
let iter = positions.iter().map(|p| (p, 0, p));
let iter = word_derivations(word, *prefix, *typo, &words_positions)
.flat_map(|positions| positions.iter().map(|p| (p, 0, p)));
result.extend(iter);
}
}
@ -538,18 +527,34 @@ fn resolve_plane_sweep_candidates(
Ok(result)
}
let mut word_positions_cache = HashMap::new();
/// Yields the positions bitmaps of the document words accepted by the automaton
/// built from `word`.
///
/// A DFA is built once with `build_dfa(word, max_typo, is_prefix)` and every key
/// of `words_positions` is evaluated against it. Only the entries for which the
/// DFA reports `Distance::Exact(_)` are kept; `Distance::AtLeast(_)` entries are
/// discarded.
///
/// - `word`: the query word the automaton is built from.
/// - `is_prefix`: forwarded to `build_dfa` (presumably enables prefix matching).
/// - `max_typo`: forwarded to `build_dfa` (presumably the accepted edit distance).
/// - `words_positions`: the words of one document with their positions.
///
/// The returned iterator borrows from `words_positions` and is lazy; its order
/// follows the `HashMap` iteration order, which is unspecified.
fn word_derivations<'a>(
    word: &str,
    is_prefix: bool,
    max_typo: u8,
    words_positions: &'a HashMap<String, RoaringBitmap>,
) -> impl Iterator<Item = &'a RoaringBitmap>
{
    use levenshtein_automata::Distance;

    let automaton = build_dfa(word, max_typo, is_prefix);
    words_positions
        .iter()
        // Keep only the document words the automaton fully accepts.
        .filter(move |(candidate, _)| matches!(automaton.eval(candidate), Distance::Exact(_)))
        .map(|(_, bitmap)| bitmap)
}
let mut resolve_operation_cache = HashMap::new();
let mut candidates = BTreeMap::new();
for docid in allowed_candidates {
word_positions_cache.clear();
let words_positions = ctx.docid_words_positions(docid)?;
resolve_operation_cache.clear();
let positions = resolve_operation(
ctx,
query_tree,
docid,
&mut resolve_operation_cache,
&mut word_positions_cache,
&words_positions,
wdcache,
)?;
let best_proximity = positions.into_iter().min_by_key(|(_, proximity, _)| *proximity);