From d301859bbd1c445040abd1bb1b833abcffc71e7c Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 9 Mar 2021 17:48:05 +0100 Subject: [PATCH] Introduce a special word_derivations function for Proximity --- milli/src/search/criteria/mod.rs | 14 +++-- milli/src/search/criteria/proximity.rs | 73 ++++++++++++++------------ 2 files changed, 48 insertions(+), 39 deletions(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index b2fd7803d..22f081871 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -67,7 +67,7 @@ pub trait Context { fn word_prefix_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result>; fn words_fst<'t>(&self) -> &'t fst::Set>; fn in_prefix_cache(&self, word: &str) -> bool; - fn docid_word_positions(&self, docid: DocumentId, word: &str) -> heed::Result>; + fn docid_words_positions(&self, docid: DocumentId) -> heed::Result>; } pub struct CriteriaBuilder<'t> { rtxn: &'t heed::RoTxn<'t>, @@ -107,9 +107,13 @@ impl<'a> Context for CriteriaBuilder<'a> { self.words_prefixes_fst.contains(word) } - fn docid_word_positions(&self, docid: DocumentId, word: &str) -> heed::Result> { - let key = (docid, word); - self.index.docid_word_positions.get(self.rtxn, &key) + fn docid_words_positions(&self, docid: DocumentId) -> heed::Result> { + let mut words_positions = HashMap::new(); + for result in self.index.docid_word_positions.prefix_iter(self.rtxn, &(docid, ""))? { + let ((_, word), positions) = result?; + words_positions.insert(word.to_string(), positions); + } + Ok(words_positions) } } @@ -391,7 +395,7 @@ pub mod test { self.word_prefix_docids.contains_key(&word.to_string()) } - fn docid_word_positions(&self, _docid: DocumentId, _word: &str) -> heed::Result> { + fn docid_words_positions(&self, _docid: DocumentId) -> heed::Result> { todo!() } } diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index e5f010177..b62eb8cfd 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -1,14 +1,13 @@ -use std::borrow::Cow; use std::collections::btree_map::{self, BTreeMap}; -use std::collections::hash_map::{HashMap, Entry}; +use std::collections::hash_map::HashMap; use std::mem::take; use roaring::RoaringBitmap; use log::debug; -use crate::{DocumentId, Position, search::{query_tree::QueryKind, word_derivations}}; +use crate::{DocumentId, Position, search::{query_tree::QueryKind}}; use crate::search::query_tree::{maximum_proximity, Operation, Query}; -use crate::search::WordDerivationsCache; +use crate::search::{build_dfa, WordDerivationsCache}; use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids, resolve_query_tree}; pub struct Proximity<'t> { @@ -358,7 +357,7 @@ fn resolve_plane_sweep_candidates( docid: DocumentId, consecutive: bool, rocache: &mut HashMap<&'a Operation, Vec<(Position, u8, Position)>>, - dwpcache: &mut HashMap>, + words_positions: &HashMap, wdcache: &mut WordDerivationsCache, ) -> anyhow::Result> { @@ -400,7 +399,7 @@ fn resolve_plane_sweep_candidates( let mut groups_positions = Vec::with_capacity(groups_len); for operation in operations { - let positions = resolve_operation(ctx, operation, docid, rocache, dwpcache, wdcache)?; + let positions = resolve_operation(ctx, operation, docid, rocache, words_positions, wdcache)?; groups_positions.push(positions.into_iter()); } @@ -476,7 +475,7 @@ fn resolve_plane_sweep_candidates( query_tree: &'a Operation, docid: DocumentId, rocache: &mut HashMap<&'a Operation, Vec<(Position, u8, Position)>>, - dwpcache: &mut HashMap>, + words_positions: &HashMap, wdcache: &mut WordDerivationsCache, ) -> anyhow::Result> { @@ -487,44 +486,34 @@ fn resolve_plane_sweep_candidates( } let result = match query_tree { - And(ops) => plane_sweep(ctx, ops, docid, false, rocache, dwpcache, wdcache)?, - Consecutive(ops) => plane_sweep(ctx, ops, docid, true, rocache, dwpcache, wdcache)?, + And(ops) => plane_sweep(ctx, ops, docid, false, rocache, words_positions, wdcache)?, + Consecutive(ops) => plane_sweep(ctx, ops, docid, true, rocache, words_positions, wdcache)?, Or(_, ops) => { let mut result = Vec::new(); for op in ops { - result.extend(resolve_operation(ctx, op, docid, rocache, dwpcache, wdcache)?) + result.extend(resolve_operation(ctx, op, docid, rocache, words_positions, wdcache)?) } result.sort_unstable(); result }, - Operation::Query(Query {prefix, kind}) => { - let fst = ctx.words_fst(); - let words = match kind { + Operation::Query(Query { prefix, kind }) => { + let mut result = Vec::new(); + match kind { QueryKind::Exact { word, .. } => { if *prefix { - Cow::Borrowed(word_derivations(word, true, 0, fst, wdcache)?) + let iter = word_derivations(word, true, 0, &words_positions) + .flat_map(|positions| positions.iter().map(|p| (p, 0, p))); + result.extend(iter); } else { - Cow::Owned(vec![(word.to_string(), 0)]) + if let Some(positions) = words_positions.get(word) { + result.extend(positions.iter().map(|p| (p, 0, p))); + } } }, QueryKind::Tolerant { typo, word } => { - Cow::Borrowed(word_derivations(word, *prefix, *typo, fst, wdcache)?) - } - }; - - let mut result = Vec::new(); - for (word, _) in words.as_ref() { - let positions = match dwpcache.entry(word.to_string()) { - Entry::Occupied(entry) => entry.into_mut(), - Entry::Vacant(entry) => { - let positions = ctx.docid_word_positions(docid, word)?; - entry.insert(positions) - } - }; - - if let Some(positions) = positions { - let iter = positions.iter().map(|p| (p, 0, p)); + let iter = word_derivations(word, *prefix, *typo, &words_positions) + .flat_map(|positions| positions.iter().map(|p| (p, 0, p))); result.extend(iter); } } @@ -538,18 +527,34 @@ fn resolve_plane_sweep_candidates( Ok(result) } - let mut word_positions_cache = HashMap::new(); + fn word_derivations<'a>( + word: &str, + is_prefix: bool, + max_typo: u8, + words_positions: &'a HashMap, + ) -> impl Iterator + { + let dfa = build_dfa(word, max_typo, is_prefix); + words_positions.iter().filter_map(move |(document_word, positions)| { + use levenshtein_automata::Distance; + match dfa.eval(document_word) { + Distance::Exact(_) => Some(positions), + Distance::AtLeast(_) => None, + } + }) + } + let mut resolve_operation_cache = HashMap::new(); let mut candidates = BTreeMap::new(); for docid in allowed_candidates { - word_positions_cache.clear(); + let words_positions = ctx.docid_words_positions(docid)?; resolve_operation_cache.clear(); let positions = resolve_operation( ctx, query_tree, docid, &mut resolve_operation_cache, - &mut word_positions_cache, + &words_positions, wdcache, )?; let best_proximity = positions.into_iter().min_by_key(|(_, proximity, _)| *proximity);