Introduce a special word_derivations function for Proximity

This commit is contained in:
Kerollmops 2021-03-09 17:48:05 +01:00
parent facfb4b615
commit d301859bbd
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
2 changed files with 48 additions and 39 deletions

View File

@ -67,7 +67,7 @@ pub trait Context {
fn word_prefix_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>>; fn word_prefix_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>>;
fn words_fst<'t>(&self) -> &'t fst::Set<Cow<[u8]>>; fn words_fst<'t>(&self) -> &'t fst::Set<Cow<[u8]>>;
fn in_prefix_cache(&self, word: &str) -> bool; fn in_prefix_cache(&self, word: &str) -> bool;
fn docid_word_positions(&self, docid: DocumentId, word: &str) -> heed::Result<Option<RoaringBitmap>>; fn docid_words_positions(&self, docid: DocumentId) -> heed::Result<HashMap<String, RoaringBitmap>>;
} }
pub struct CriteriaBuilder<'t> { pub struct CriteriaBuilder<'t> {
rtxn: &'t heed::RoTxn<'t>, rtxn: &'t heed::RoTxn<'t>,
@ -107,9 +107,13 @@ impl<'a> Context for CriteriaBuilder<'a> {
self.words_prefixes_fst.contains(word) self.words_prefixes_fst.contains(word)
} }
// Side-by-side diff rows: the left column appears to be the removed `docid_word_positions`
// (a single `get` on the `(docid, word)` key), the right column the added
// `docid_words_positions`.
//
// The added version takes only `docid` and returns a `HashMap` from word to its positions
// bitmap. It walks `self.index.docid_word_positions` with `prefix_iter` on `(docid, "")`,
// presumably visiting every word stored for that document (confirm against the key codec).
// Errors from opening the iterator or from any entry are propagated with `?`.
fn docid_word_positions(&self, docid: DocumentId, word: &str) -> heed::Result<Option<RoaringBitmap>> { fn docid_words_positions(&self, docid: DocumentId) -> heed::Result<HashMap<String, RoaringBitmap>> {
let key = (docid, word); let mut words_positions = HashMap::new();
self.index.docid_word_positions.get(self.rtxn, &key) for result in self.index.docid_word_positions.prefix_iter(self.rtxn, &(docid, ""))? {
// The docid part of the key is ignored; only the word and its positions are kept.
let ((_, word), positions) = result?;
// The word is copied into an owned `String` so the map does not borrow from the iterator.
words_positions.insert(word.to_string(), positions);
}
Ok(words_positions)
} }
} }
@ -391,7 +395,7 @@ pub mod test {
self.word_prefix_docids.contains_key(&word.to_string()) self.word_prefix_docids.contains_key(&word.to_string())
} }
// Test-context stub, shown in both diff columns (old signature on the left, new on the right).
// Both versions are unimplemented: the body is `todo!()`, so calling this method panics.
fn docid_word_positions(&self, _docid: DocumentId, _word: &str) -> heed::Result<Option<RoaringBitmap>> { fn docid_words_positions(&self, _docid: DocumentId) -> heed::Result<HashMap<String, RoaringBitmap>> {
todo!() todo!()
} }
} }

View File

@ -1,14 +1,13 @@
use std::borrow::Cow;
use std::collections::btree_map::{self, BTreeMap}; use std::collections::btree_map::{self, BTreeMap};
use std::collections::hash_map::{HashMap, Entry}; use std::collections::hash_map::HashMap;
use std::mem::take; use std::mem::take;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use log::debug; use log::debug;
use crate::{DocumentId, Position, search::{query_tree::QueryKind, word_derivations}}; use crate::{DocumentId, Position, search::{query_tree::QueryKind}};
use crate::search::query_tree::{maximum_proximity, Operation, Query}; use crate::search::query_tree::{maximum_proximity, Operation, Query};
use crate::search::WordDerivationsCache; use crate::search::{build_dfa, WordDerivationsCache};
use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids, resolve_query_tree}; use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids, resolve_query_tree};
pub struct Proximity<'t> { pub struct Proximity<'t> {
@ -358,7 +357,7 @@ fn resolve_plane_sweep_candidates(
docid: DocumentId, docid: DocumentId,
consecutive: bool, consecutive: bool,
rocache: &mut HashMap<&'a Operation, Vec<(Position, u8, Position)>>, rocache: &mut HashMap<&'a Operation, Vec<(Position, u8, Position)>>,
dwpcache: &mut HashMap<String, Option<RoaringBitmap>>, words_positions: &HashMap<String, RoaringBitmap>,
wdcache: &mut WordDerivationsCache, wdcache: &mut WordDerivationsCache,
) -> anyhow::Result<Vec<(Position, u8, Position)>> ) -> anyhow::Result<Vec<(Position, u8, Position)>>
{ {
@ -400,7 +399,7 @@ fn resolve_plane_sweep_candidates(
let mut groups_positions = Vec::with_capacity(groups_len); let mut groups_positions = Vec::with_capacity(groups_len);
for operation in operations { for operation in operations {
let positions = resolve_operation(ctx, operation, docid, rocache, dwpcache, wdcache)?; let positions = resolve_operation(ctx, operation, docid, rocache, words_positions, wdcache)?;
groups_positions.push(positions.into_iter()); groups_positions.push(positions.into_iter());
} }
@ -476,7 +475,7 @@ fn resolve_plane_sweep_candidates(
query_tree: &'a Operation, query_tree: &'a Operation,
docid: DocumentId, docid: DocumentId,
rocache: &mut HashMap<&'a Operation, Vec<(Position, u8, Position)>>, rocache: &mut HashMap<&'a Operation, Vec<(Position, u8, Position)>>,
dwpcache: &mut HashMap<String, Option<RoaringBitmap>>, words_positions: &HashMap<String, RoaringBitmap>,
wdcache: &mut WordDerivationsCache, wdcache: &mut WordDerivationsCache,
) -> anyhow::Result<Vec<(Position, u8, Position)>> ) -> anyhow::Result<Vec<(Position, u8, Position)>>
{ {
@ -487,44 +486,34 @@ fn resolve_plane_sweep_candidates(
} }
let result = match query_tree { let result = match query_tree {
And(ops) => plane_sweep(ctx, ops, docid, false, rocache, dwpcache, wdcache)?, And(ops) => plane_sweep(ctx, ops, docid, false, rocache, words_positions, wdcache)?,
Consecutive(ops) => plane_sweep(ctx, ops, docid, true, rocache, dwpcache, wdcache)?, Consecutive(ops) => plane_sweep(ctx, ops, docid, true, rocache, words_positions, wdcache)?,
Or(_, ops) => { Or(_, ops) => {
let mut result = Vec::new(); let mut result = Vec::new();
for op in ops { for op in ops {
result.extend(resolve_operation(ctx, op, docid, rocache, dwpcache, wdcache)?) result.extend(resolve_operation(ctx, op, docid, rocache, words_positions, wdcache)?)
} }
result.sort_unstable(); result.sort_unstable();
result result
}, },
Operation::Query(Query {prefix, kind}) => { Operation::Query(Query { prefix, kind }) => {
let fst = ctx.words_fst(); let mut result = Vec::new();
let words = match kind { match kind {
QueryKind::Exact { word, .. } => { QueryKind::Exact { word, .. } => {
if *prefix { if *prefix {
Cow::Borrowed(word_derivations(word, true, 0, fst, wdcache)?) let iter = word_derivations(word, true, 0, &words_positions)
.flat_map(|positions| positions.iter().map(|p| (p, 0, p)));
result.extend(iter);
} else { } else {
Cow::Owned(vec![(word.to_string(), 0)]) if let Some(positions) = words_positions.get(word) {
result.extend(positions.iter().map(|p| (p, 0, p)));
}
} }
}, },
QueryKind::Tolerant { typo, word } => { QueryKind::Tolerant { typo, word } => {
Cow::Borrowed(word_derivations(word, *prefix, *typo, fst, wdcache)?) let iter = word_derivations(word, *prefix, *typo, &words_positions)
} .flat_map(|positions| positions.iter().map(|p| (p, 0, p)));
};
let mut result = Vec::new();
for (word, _) in words.as_ref() {
let positions = match dwpcache.entry(word.to_string()) {
Entry::Occupied(entry) => entry.into_mut(),
Entry::Vacant(entry) => {
let positions = ctx.docid_word_positions(docid, word)?;
entry.insert(positions)
}
};
if let Some(positions) = positions {
let iter = positions.iter().map(|p| (p, 0, p));
result.extend(iter); result.extend(iter);
} }
} }
@ -538,18 +527,34 @@ fn resolve_plane_sweep_candidates(
Ok(result) Ok(result)
} }
// Diff note: the leading `let mut word_positions_cache = HashMap::new();` on the first row
// appears to be the removed left-hand line; everything else in this block is the added
// `word_derivations` helper.
//
// `word_derivations` builds a DFA from `word`, `max_typo` and `is_prefix` with `build_dfa`,
// then lazily yields the positions bitmap of every entry in `words_positions` whose key the
// DFA evaluates to `Distance::Exact`. Entries evaluating to `Distance::AtLeast` are skipped.
// The returned bitmaps borrow from `words_positions` (lifetime `'a`), and the iteration order
// is that of the `HashMap`, so callers should not rely on it.
let mut word_positions_cache = HashMap::new(); fn word_derivations<'a>(
word: &str,
is_prefix: bool,
max_typo: u8,
words_positions: &'a HashMap<String, RoaringBitmap>,
) -> impl Iterator<Item = &'a RoaringBitmap>
{
let dfa = build_dfa(word, max_typo, is_prefix);
// `move` transfers ownership of `dfa` into the closure so it lives as long as the iterator.
words_positions.iter().filter_map(move |(document_word, positions)| {
use levenshtein_automata::Distance;
match dfa.eval(document_word) {
// The matched distance value itself is discarded; only the positions are returned.
Distance::Exact(_) => Some(positions),
Distance::AtLeast(_) => None,
}
})
}
let mut resolve_operation_cache = HashMap::new(); let mut resolve_operation_cache = HashMap::new();
let mut candidates = BTreeMap::new(); let mut candidates = BTreeMap::new();
for docid in allowed_candidates { for docid in allowed_candidates {
word_positions_cache.clear(); let words_positions = ctx.docid_words_positions(docid)?;
resolve_operation_cache.clear(); resolve_operation_cache.clear();
let positions = resolve_operation( let positions = resolve_operation(
ctx, ctx,
query_tree, query_tree,
docid, docid,
&mut resolve_operation_cache, &mut resolve_operation_cache,
&mut word_positions_cache, &words_positions,
wdcache, wdcache,
)?; )?;
let best_proximity = positions.into_iter().min_by_key(|(_, proximity, _)| *proximity); let best_proximity = positions.into_iter().min_by_key(|(_, proximity, _)| *proximity);