From e923a3ed6af5554728a52f05f74ab936b875a143 Mon Sep 17 00:00:00 2001 From: many Date: Wed, 9 Jun 2021 17:28:12 +0200 Subject: [PATCH] Replace Consecutive by Phrase in query tree Replace Consecutive by Phrase in query tree in order to remove theorical bugs, due of the Consecutive enum type. --- milli/src/search/criteria/attribute.rs | 11 +++- milli/src/search/criteria/mod.rs | 32 +++++------ milli/src/search/criteria/proximity.rs | 69 ++++++++++++++++------- milli/src/search/criteria/typo.rs | 39 ++++++------- milli/src/search/matching_words.rs | 7 ++- milli/src/search/query_tree.rs | 76 ++++++++++++-------------- 6 files changed, 130 insertions(+), 104 deletions(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 6818e02fd..f825623f6 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -578,7 +578,6 @@ fn linear_compute_candidates( fn compute_candidate_rank(branches: &FlattenedQueryTree, words_positions: HashMap) -> u64 { let mut min_rank = u64::max_value(); for branch in branches { - let branch_len = branch.len(); let mut branch_rank = Vec::with_capacity(branch_len); for derivates in branch { @@ -661,7 +660,7 @@ fn linear_compute_candidates( // TODO can we keep refs of Query fn flatten_query_tree(query_tree: &Operation) -> FlattenedQueryTree { - use crate::search::criteria::Operation::{And, Or, Consecutive}; + use crate::search::criteria::Operation::{And, Or, Phrase}; fn and_recurse(head: &Operation, tail: &[Operation]) -> FlattenedQueryTree { match tail.split_first() { @@ -683,7 +682,7 @@ fn flatten_query_tree(query_tree: &Operation) -> FlattenedQueryTree { fn recurse(op: &Operation) -> FlattenedQueryTree { match op { - And(ops) | Consecutive(ops) => { + And(ops) => { ops.split_first().map_or_else(Vec::new, |(h, t)| and_recurse(h, t)) }, Or(_, ops) => if ops.iter().all(|op| op.query().is_some()) { @@ -691,6 +690,12 @@ fn flatten_query_tree(query_tree: &Operation) -> FlattenedQueryTree { } else { ops.iter().map(recurse).flatten().collect() }, + Phrase(words) => { + let queries = words.iter().map(|word| { + vec![Query {prefix: false, kind: QueryKind::exact(word.clone())}] + }).collect(); + vec![queries] + } Operation::Query(query) => vec![vec![vec![query.clone()]]], } } diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index e4ca66b2c..b14d75ddb 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -1,7 +1,6 @@ use std::collections::HashMap; use std::borrow::Cow; -use anyhow::bail; use roaring::RoaringBitmap; use crate::{FieldId, TreeLevel, search::{word_derivations, WordDerivationsCache}}; @@ -239,7 +238,7 @@ pub fn resolve_query_tree<'t>( wdcache: &mut WordDerivationsCache, ) -> anyhow::Result { - use Operation::{And, Consecutive, Or, Query}; + use Operation::{And, Phrase, Or, Query}; match query_tree { And(ops) => { @@ -261,26 +260,23 @@ pub fn resolve_query_tree<'t>( } Ok(candidates) }, - Consecutive(ops) => { + Phrase(words) => { let mut candidates = RoaringBitmap::new(); let mut first_loop = true; - for slice in ops.windows(2) { - match (&slice[0], &slice[1]) { - (Operation::Query(left), Operation::Query(right)) => { - match query_pair_proximity_docids(ctx, left, right, 1, wdcache)? { - pair_docids if pair_docids.is_empty() => { - return Ok(RoaringBitmap::new()) - }, - pair_docids if first_loop => { - candidates = pair_docids; - first_loop = false; - }, - pair_docids => { - candidates.intersect_with(&pair_docids); - }, + for slice in words.windows(2) { + let (left, right) = (&slice[0], &slice[1]); + match ctx.word_pair_proximity_docids(left, right, 1)? { + Some(pair_docids) => { + if pair_docids.is_empty() { + return Ok(RoaringBitmap::new()); + } else if first_loop { + candidates = pair_docids; + first_loop = false; + } else { + candidates &= pair_docids; } }, - _ => bail!("invalid consecutive query type"), + None => return Ok(RoaringBitmap::new()) } } Ok(candidates) diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index d190ef031..5b33b8fd9 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -171,12 +171,33 @@ fn resolve_candidates<'t>( wdcache: &mut WordDerivationsCache, ) -> anyhow::Result> { - use Operation::{And, Consecutive, Or, Query}; + use Operation::{And, Phrase, Or}; let result = match query_tree { And(ops) => mdfs(ctx, ops, proximity, cache, wdcache)?, - Consecutive(ops) => if proximity == 0 { - mdfs(ctx, ops, 0, cache, wdcache)? + Phrase(words) => if proximity == 0 { + let most_left = words.first().map(|w| Query {prefix: false, kind: QueryKind::exact(w.clone())}); + let most_right = words.last().map(|w| Query {prefix: false, kind: QueryKind::exact(w.clone())}); + let mut candidates = None; + for slice in words.windows(2) { + let (left, right) = (&slice[0], &slice[1]); + match ctx.word_pair_proximity_docids(left, right, 1)? { + Some(pair_docids) => { + match candidates.as_mut() { + Some(candidates) => *candidates &= pair_docids, + None => candidates = Some(pair_docids), + } + }, + None => { + candidates = None; + break; + } + } + } + match (most_left, most_right, candidates) { + (Some(l), Some(r), Some(c)) => vec![(l, r, c)], + _otherwise => Default::default(), + } } else { Default::default() }, @@ -188,7 +209,7 @@ fn resolve_candidates<'t>( } output }, - Query(q) => if proximity == 0 { + Operation::Query(q) => if proximity == 0 { let candidates = query_docids(ctx, q, wdcache)?; vec![(q.clone(), q.clone(), candidates)] } else { @@ -306,14 +327,9 @@ fn resolve_plane_sweep_candidates( ) -> anyhow::Result> { /// FIXME may be buggy with query like "new new york" - fn plane_sweep<'a>( - ctx: &dyn Context, - operations: &'a [Operation], - docid: DocumentId, + fn plane_sweep( + groups_positions: Vec>, consecutive: bool, - rocache: &mut HashMap<&'a Operation, Vec<(Position, u8, Position)>>, - words_positions: &HashMap, - wdcache: &mut WordDerivationsCache, ) -> anyhow::Result> { fn compute_groups_proximity( @@ -362,13 +378,9 @@ fn resolve_plane_sweep_candidates( } } - let groups_len = operations.len(); - let mut groups_positions = Vec::with_capacity(groups_len); + let groups_len = groups_positions.len(); - for operation in operations { - let positions = resolve_operation(ctx, operation, docid, rocache, words_positions, wdcache)?; - groups_positions.push(positions.into_iter()); - } + let mut groups_positions: Vec<_> = groups_positions.into_iter().map(|pos| pos.into_iter()).collect(); // Pop top elements of each list. let mut current = Vec::with_capacity(groups_len); @@ -441,15 +453,32 @@ fn resolve_plane_sweep_candidates( wdcache: &mut WordDerivationsCache, ) -> anyhow::Result> { - use Operation::{And, Consecutive, Or}; + use Operation::{And, Phrase, Or}; if let Some(result) = rocache.get(query_tree) { return Ok(result.clone()); } let result = match query_tree { - And(ops) => plane_sweep(ctx, ops, docid, false, rocache, words_positions, wdcache)?, - Consecutive(ops) => plane_sweep(ctx, ops, docid, true, rocache, words_positions, wdcache)?, + And(ops) => { + let mut groups_positions = Vec::with_capacity(ops.len()); + for operation in ops { + let positions = resolve_operation(ctx, operation, docid, rocache, words_positions, wdcache)?; + groups_positions.push(positions); + } + plane_sweep(groups_positions, false)? + }, + Phrase(words) => { + let mut groups_positions = Vec::with_capacity(words.len()); + for word in words { + let positions = match words_positions.get(word) { + Some(positions) => positions.iter().map(|p| (p, 0, p)).collect(), + None => vec![], + }; + groups_positions.push(positions); + } + plane_sweep(groups_positions, true)? + }, Or(_, ops) => { let mut result = Vec::new(); for op in ops { diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index a844417eb..d075b6bca 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -1,6 +1,5 @@ use std::{borrow::Cow, collections::HashMap, mem::take}; -use anyhow::bail; use log::debug; use roaring::RoaringBitmap; @@ -13,7 +12,6 @@ use super::{ CriterionParameters, CriterionResult, query_docids, - query_pair_proximity_docids, resolve_query_tree, }; @@ -174,12 +172,14 @@ fn alterate_query_tree( wdcache: &mut WordDerivationsCache, ) -> anyhow::Result<()> { - use Operation::{And, Consecutive, Or}; + use Operation::{And, Phrase, Or}; match operation { - And(ops) | Consecutive(ops) | Or(_, ops) => { + And(ops) | Or(_, ops) => { ops.iter_mut().try_for_each(|op| recurse(words_fst, op, number_typos, wdcache)) }, + // Because Phrases don't allow typos, no alteration can be done. + Phrase(_words) => return Ok(()), Operation::Query(q) => { if let QueryKind::Tolerant { typo, word } = &q.kind { // if no typo is allowed we don't call word_derivations function, @@ -228,32 +228,29 @@ fn resolve_candidates<'t>( wdcache: &mut WordDerivationsCache, ) -> anyhow::Result { - use Operation::{And, Consecutive, Or, Query}; + use Operation::{And, Phrase, Or, Query}; match query_tree { And(ops) => { mdfs(ctx, ops, number_typos, cache, wdcache) }, - Consecutive(ops) => { + Phrase(words) => { let mut candidates = RoaringBitmap::new(); let mut first_loop = true; - for slice in ops.windows(2) { - match (&slice[0], &slice[1]) { - (Operation::Query(left), Operation::Query(right)) => { - match query_pair_proximity_docids(ctx, left, right, 1, wdcache)? { - pair_docids if pair_docids.is_empty() => { - return Ok(RoaringBitmap::new()) - }, - pair_docids if first_loop => { - candidates = pair_docids; - first_loop = false; - }, - pair_docids => { - candidates.intersect_with(&pair_docids); - }, + for slice in words.windows(2) { + let (left, right) = (&slice[0], &slice[1]); + match ctx.word_pair_proximity_docids(left, right, 1)? { + Some(pair_docids) => { + if pair_docids.is_empty() { + return Ok(RoaringBitmap::new()); + } else if first_loop { + candidates = pair_docids; + first_loop = false; + } else { + candidates &= pair_docids; } }, - _ => bail!("invalid consecutive query type"), + None => return Ok(RoaringBitmap::new()) } } Ok(candidates) diff --git a/milli/src/search/matching_words.rs b/milli/src/search/matching_words.rs index 17649849d..c56db4e96 100644 --- a/milli/src/search/matching_words.rs +++ b/milli/src/search/matching_words.rs @@ -52,13 +52,18 @@ impl MatchingWords { fn fetch_queries(tree: &Operation) -> HashSet<(&str, u8, IsPrefix)> { fn resolve_ops<'a>(tree: &'a Operation, out: &mut HashSet<(&'a str, u8, IsPrefix)>) { match tree { - Operation::Or(_, ops) | Operation::And(ops) | Operation::Consecutive(ops) => { + Operation::Or(_, ops) | Operation::And(ops) => { ops.as_slice().iter().for_each(|op| resolve_ops(op, out)); }, Operation::Query(Query { prefix, kind }) => { let typo = if kind.is_exact() { 0 } else { kind.typo() }; out.insert((kind.word(), typo, *prefix)); }, + Operation::Phrase(words) => { + for word in words { + out.insert((word, 0, false)); + } + } } } diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index c87ccfe9b..234fd3266 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -15,7 +15,8 @@ type IsPrefix = bool; #[derive(Clone, PartialEq, Eq, Hash)] pub enum Operation { And(Vec), - Consecutive(Vec), + // serie of consecutive non prefix and exact words + Phrase(Vec), Or(IsOptionalWord, Vec), Query(Query), } @@ -28,9 +29,8 @@ impl fmt::Debug for Operation { writeln!(f, "{:1$}AND", "", depth * 2)?; children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1)) }, - Operation::Consecutive(children) => { - writeln!(f, "{:1$}CONSECUTIVE", "", depth * 2)?; - children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1)) + Operation::Phrase(children) => { + writeln!(f, "{:2$}PHRASE {:?}", "", children, depth * 2) }, Operation::Or(true, children) => { writeln!(f, "{:1$}OR(WORD)", "", depth * 2)?; @@ -49,14 +49,6 @@ impl fmt::Debug for Operation { } impl Operation { - fn phrase(words: Vec) -> Operation { - Operation::consecutive( - words.into_iter().map(|s| { - Operation::Query(Query { prefix: false, kind: QueryKind::exact(s) }) - }).collect() - ) - } - fn and(mut ops: Vec) -> Self { if ops.len() == 1 { ops.pop().unwrap() @@ -73,11 +65,11 @@ impl Operation { } } - fn consecutive(mut ops: Vec) -> Self { - if ops.len() == 1 { - ops.pop().unwrap() + fn phrase(mut words: Vec) -> Self { + if words.len() == 1 { + Self::Query(Query {prefix: false, kind: QueryKind::exact(words.pop().unwrap())}) } else { - Self::Consecutive(ops) + Self::Phrase(words) } } @@ -256,10 +248,10 @@ fn split_best_frequency(ctx: &impl Context, word: &str) -> heed::Result>, wo /// Returns the maximum number of typos that this Operation allows. pub fn maximum_typo(operation: &Operation) -> usize { - use Operation::{Or, And, Query, Consecutive}; + use Operation::{Or, And, Query, Phrase}; match operation { Or(_, ops) => ops.iter().map(maximum_typo).max().unwrap_or(0), - And(ops) | Consecutive(ops) => ops.iter().map(maximum_typo).sum::(), + And(ops) => ops.iter().map(maximum_typo).sum::(), Query(q) => q.kind.typo() as usize, + // no typo allowed in phrases + Phrase(_) => 0, } } /// Returns the maximum proximity that this Operation allows. pub fn maximum_proximity(operation: &Operation) -> usize { - use Operation::{Or, And, Query, Consecutive}; + use Operation::{Or, And, Query, Phrase}; match operation { Or(_, ops) => ops.iter().map(maximum_proximity).max().unwrap_or(0), And(ops) => { ops.iter().map(maximum_proximity).sum::() + ops.len().saturating_sub(1) * 7 }, - Query(_) | Consecutive(_) => 0, + Query(_) | Phrase(_) => 0, } } @@ -765,9 +759,9 @@ mod test { let expected = Operation::Or(false, vec![ Operation::And(vec![ Operation::Or(false, vec![ - Operation::Consecutive(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("word".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }), + Operation::Phrase(vec![ + "word".to_string(), + "split".to_string(), ]), Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "wordsplit".to_string()) }), ]), @@ -789,9 +783,9 @@ mod test { let tokens = result.tokens(); let expected = Operation::And(vec![ - Operation::Consecutive(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("friends".to_string()) }), + Operation::Phrase(vec![ + "hey".to_string(), + "friends".to_string(), ]), Operation::Query(Query { prefix: false, kind: QueryKind::exact("wooop".to_string()) }), ]); @@ -809,13 +803,13 @@ mod test { let tokens = result.tokens(); let expected = Operation::And(vec![ - Operation::Consecutive(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("friends".to_string()) }), + Operation::Phrase(vec![ + "hey".to_string(), + "friends".to_string(), ]), - Operation::Consecutive(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("wooop".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("wooop".to_string()) }), + Operation::Phrase(vec![ + "wooop".to_string(), + "wooop".to_string(), ]), ]); @@ -870,9 +864,9 @@ mod test { let result = analyzer.analyze(query); let tokens = result.tokens(); - let expected = Operation::Consecutive(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("my".to_string()) }), + let expected = Operation::Phrase(vec![ + "hey".to_string(), + "my".to_string(), ]); let (query_tree, _) = TestContext::default().build(true, true, None, tokens).unwrap().unwrap(); @@ -940,9 +934,9 @@ mod test { let tokens = result.tokens(); let expected = Operation::And(vec![ - Operation::Consecutive(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("my".to_string()) }), + Operation::Phrase(vec![ + "hey".to_string(), + "my".to_string(), ]), Operation::Query(Query { prefix: false, kind: QueryKind::exact("good".to_string()) }), ]);