Replace Consecutive by Phrase in query tree
Replace Consecutive by Phrase in the query tree in order to remove theoretical bugs caused by the Consecutive enum type.
This commit is contained in:
parent bc02031793
commit e923a3ed6a
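
For context before the diff: the change swaps the query tree's `Consecutive(Vec<Operation>)` variant for `Phrase(Vec<String>)` (see the `@@ -15,7 +15,8 @@` hunk below). `Consecutive` could legally wrap any sub-operation, so every resolver needed a runtime escape hatch such as `bail!("invalid consecutive query type")`; `Phrase` only carries the exact, non-prefix words of the phrase, so those invalid shapes can no longer be expressed. A simplified before/after sketch, not the real definitions: only the relevant variants are shown, and the `Query` payload (a struct with `prefix` and `kind` fields in the codebase) is reduced to a bare `String`.

// Simplified sketch of the variant swap, with Query reduced to a String.
#[allow(dead_code)]
enum OperationBefore {
    // A "phrase" was an arbitrary list of sub-operations, so nested Or/And or
    // prefix queries were representable and had to be rejected at runtime.
    Consecutive(Vec<OperationBefore>),
    Query(String),
}

#[allow(dead_code)]
enum OperationAfter {
    // A phrase is now just the ordered list of its exact, non-prefix words.
    Phrase(Vec<String>),
    Query(String),
}

fn main() {
    let _before = OperationBefore::Consecutive(vec![
        OperationBefore::Query("hey".to_string()),
        OperationBefore::Query("friends".to_string()),
    ]);
    let _after = OperationAfter::Phrase(vec!["hey".to_string(), "friends".to_string()]);
}
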
@@ -578,7 +578,6 @@ fn linear_compute_candidates(
     fn compute_candidate_rank(branches: &FlattenedQueryTree, words_positions: HashMap<String, RoaringBitmap>) -> u64 {
         let mut min_rank = u64::max_value();
         for branch in branches {
-
             let branch_len = branch.len();
             let mut branch_rank = Vec::with_capacity(branch_len);
             for derivates in branch {
@@ -661,7 +660,7 @@ fn linear_compute_candidates(
 
 // TODO can we keep refs of Query
 fn flatten_query_tree(query_tree: &Operation) -> FlattenedQueryTree {
-    use crate::search::criteria::Operation::{And, Or, Consecutive};
+    use crate::search::criteria::Operation::{And, Or, Phrase};
 
     fn and_recurse(head: &Operation, tail: &[Operation]) -> FlattenedQueryTree {
         match tail.split_first() {
@@ -683,7 +682,7 @@ fn flatten_query_tree(query_tree: &Operation) -> FlattenedQueryTree {
 
     fn recurse(op: &Operation) -> FlattenedQueryTree {
        match op {
-            And(ops) | Consecutive(ops) => {
+            And(ops) => {
                 ops.split_first().map_or_else(Vec::new, |(h, t)| and_recurse(h, t))
             },
             Or(_, ops) => if ops.iter().all(|op| op.query().is_some()) {
@@ -691,6 +690,12 @@ fn flatten_query_tree(query_tree: &Operation) -> FlattenedQueryTree {
             } else {
                 ops.iter().map(recurse).flatten().collect()
             },
+            Phrase(words) => {
+                let queries = words.iter().map(|word| {
+                    vec![Query {prefix: false, kind: QueryKind::exact(word.clone())}]
+                }).collect();
+                vec![queries]
+            }
             Operation::Query(query) => vec![vec![vec![query.clone()]]],
         }
     }
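
A note on the `flatten_query_tree` hunk just above: the new `Phrase(words)` arm flattens a phrase into a single branch in which each word becomes its own one-element derivation list (the real `FlattenedQueryTree` is a `Vec<Vec<Vec<Query>>>`, as the `Operation::Query` arm shows). A hypothetical, simplified illustration of that shape, with `Query` again reduced to a plain `String` and `flatten_phrase` an invented helper name:

// Hypothetical, simplified sketch of the shape produced by the new Phrase arm.
type FlattenedQueryTree = Vec<Vec<Vec<String>>>; // branches -> positions -> derivations

fn flatten_phrase(words: &[String]) -> FlattenedQueryTree {
    // One branch; every word of the phrase is its own single-entry derivation list.
    let queries = words.iter().map(|word| vec![word.clone()]).collect();
    vec![queries]
}

fn main() {
    let words = vec!["word".to_string(), "split".to_string()];
    assert_eq!(
        flatten_phrase(&words),
        vec![vec![vec!["word".to_string()], vec!["split".to_string()]]],
    );
}
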
@@ -1,7 +1,6 @@
 use std::collections::HashMap;
 use std::borrow::Cow;
 
-use anyhow::bail;
 use roaring::RoaringBitmap;
 
 use crate::{FieldId, TreeLevel, search::{word_derivations, WordDerivationsCache}};
@@ -239,7 +238,7 @@ pub fn resolve_query_tree<'t>(
         wdcache: &mut WordDerivationsCache,
     ) -> anyhow::Result<RoaringBitmap>
     {
-        use Operation::{And, Consecutive, Or, Query};
+        use Operation::{And, Phrase, Or, Query};
 
         match query_tree {
             And(ops) => {
@@ -261,26 +260,23 @@ pub fn resolve_query_tree<'t>(
                 }
                 Ok(candidates)
             },
-            Consecutive(ops) => {
+            Phrase(words) => {
                 let mut candidates = RoaringBitmap::new();
                 let mut first_loop = true;
-                for slice in ops.windows(2) {
-                    match (&slice[0], &slice[1]) {
-                        (Operation::Query(left), Operation::Query(right)) => {
-                            match query_pair_proximity_docids(ctx, left, right, 1, wdcache)? {
-                                pair_docids if pair_docids.is_empty() => {
-                                    return Ok(RoaringBitmap::new())
-                                },
-                                pair_docids if first_loop => {
+                for slice in words.windows(2) {
+                    let (left, right) = (&slice[0], &slice[1]);
+                    match ctx.word_pair_proximity_docids(left, right, 1)? {
+                        Some(pair_docids) => {
+                            if pair_docids.is_empty() {
+                                return Ok(RoaringBitmap::new());
+                            } else if first_loop {
                                 candidates = pair_docids;
                                 first_loop = false;
-                                },
-                                pair_docids => {
-                                    candidates.intersect_with(&pair_docids);
-                                },
+                            } else {
+                                candidates &= pair_docids;
                             }
                         },
-                        _ => bail!("invalid consecutive query type"),
+                        None => return Ok(RoaringBitmap::new())
                     }
                 }
                 Ok(candidates)
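
The new `Phrase` arm of `resolve_query_tree` above replaces the old `Consecutive` handling: instead of pattern-matching pairs of sub-operations and bailing on anything else, it walks the phrase's words pairwise and intersects the documents in which each consecutive pair occurs at proximity 1; a missing or empty pair means no document can contain the whole phrase. A minimal standalone sketch of that logic, where `resolve_phrase` is an invented name and the `pair_docids` closure stands in for the `ctx.word_pair_proximity_docids(left, right, 1)` lookup used in the hunk:

use roaring::RoaringBitmap;

// Minimal sketch of the pairwise intersection performed by the new Phrase arm.
// `pair_docids(left, right)` returns the documents where `left` is immediately
// followed by `right`, or None when that word pair was never indexed.
fn resolve_phrase(
    words: &[String],
    pair_docids: impl Fn(&str, &str) -> Option<RoaringBitmap>,
) -> RoaringBitmap {
    let mut candidates = RoaringBitmap::new();
    let mut first_loop = true;
    for slice in words.windows(2) {
        match pair_docids(slice[0].as_str(), slice[1].as_str()) {
            // An empty or missing pair: no document contains the whole phrase.
            Some(docids) if docids.is_empty() => return RoaringBitmap::new(),
            None => return RoaringBitmap::new(),
            Some(docids) if first_loop => {
                candidates = docids;
                first_loop = false;
            }
            Some(docids) => candidates &= docids,
        }
    }
    candidates
}

fn main() {
    // Toy usage: only the ("hey", "friends") pair is indexed, in documents 1 and 3.
    let docids = resolve_phrase(
        &["hey".to_string(), "friends".to_string()],
        |left, right| (left == "hey" && right == "friends")
            .then(|| RoaringBitmap::from_iter([1u32, 3])),
    );
    assert_eq!(docids.len(), 2);
}

As in the hunk, a single-word phrase never reaches the loop; the reworked `Operation::phrase` constructor further down in the diff collapses such phrases into a plain exact `Query` instead.
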
@@ -171,12 +171,33 @@ fn resolve_candidates<'t>(
         wdcache: &mut WordDerivationsCache,
     ) -> anyhow::Result<Vec<(Query, Query, RoaringBitmap)>>
     {
-        use Operation::{And, Consecutive, Or, Query};
+        use Operation::{And, Phrase, Or};
 
         let result = match query_tree {
             And(ops) => mdfs(ctx, ops, proximity, cache, wdcache)?,
-            Consecutive(ops) => if proximity == 0 {
-                mdfs(ctx, ops, 0, cache, wdcache)?
+            Phrase(words) => if proximity == 0 {
+                let most_left = words.first().map(|w| Query {prefix: false, kind: QueryKind::exact(w.clone())});
+                let most_right = words.last().map(|w| Query {prefix: false, kind: QueryKind::exact(w.clone())});
+                let mut candidates = None;
+                for slice in words.windows(2) {
+                    let (left, right) = (&slice[0], &slice[1]);
+                    match ctx.word_pair_proximity_docids(left, right, 1)? {
+                        Some(pair_docids) => {
+                            match candidates.as_mut() {
+                                Some(candidates) => *candidates &= pair_docids,
+                                None => candidates = Some(pair_docids),
+                            }
+                        },
+                        None => {
+                            candidates = None;
+                            break;
+                        }
+                    }
+                }
+                match (most_left, most_right, candidates) {
+                    (Some(l), Some(r), Some(c)) => vec![(l, r, c)],
+                    _otherwise => Default::default(),
+                }
             } else {
                 Default::default()
             },
@@ -188,7 +209,7 @@ fn resolve_candidates<'t>(
                 }
                 output
             },
-            Query(q) => if proximity == 0 {
+            Operation::Query(q) => if proximity == 0 {
                 let candidates = query_docids(ctx, q, wdcache)?;
                 vec![(q.clone(), q.clone(), candidates)]
             } else {
@@ -306,14 +327,9 @@ fn resolve_plane_sweep_candidates(
 ) -> anyhow::Result<BTreeMap<u8, RoaringBitmap>>
 {
     /// FIXME may be buggy with query like "new new york"
-    fn plane_sweep<'a>(
-        ctx: &dyn Context,
-        operations: &'a [Operation],
-        docid: DocumentId,
+    fn plane_sweep(
+        groups_positions: Vec<Vec<(Position, u8, Position)>>,
         consecutive: bool,
-        rocache: &mut HashMap<&'a Operation, Vec<(Position, u8, Position)>>,
-        words_positions: &HashMap<String, RoaringBitmap>,
-        wdcache: &mut WordDerivationsCache,
     ) -> anyhow::Result<Vec<(Position, u8, Position)>>
     {
         fn compute_groups_proximity(
@@ -362,13 +378,9 @@ fn resolve_plane_sweep_candidates(
             }
         }
 
-        let groups_len = operations.len();
-        let mut groups_positions = Vec::with_capacity(groups_len);
+        let groups_len = groups_positions.len();
 
-        for operation in operations {
-            let positions = resolve_operation(ctx, operation, docid, rocache, words_positions, wdcache)?;
-            groups_positions.push(positions.into_iter());
-        }
+        let mut groups_positions: Vec<_> = groups_positions.into_iter().map(|pos| pos.into_iter()).collect();
 
         // Pop top elements of each list.
         let mut current = Vec::with_capacity(groups_len);
@@ -441,15 +453,32 @@ fn resolve_plane_sweep_candidates(
         wdcache: &mut WordDerivationsCache,
     ) -> anyhow::Result<Vec<(Position, u8, Position)>>
     {
-        use Operation::{And, Consecutive, Or};
+        use Operation::{And, Phrase, Or};
 
         if let Some(result) = rocache.get(query_tree) {
             return Ok(result.clone());
         }
 
         let result = match query_tree {
-            And(ops) => plane_sweep(ctx, ops, docid, false, rocache, words_positions, wdcache)?,
-            Consecutive(ops) => plane_sweep(ctx, ops, docid, true, rocache, words_positions, wdcache)?,
+            And(ops) => {
+                let mut groups_positions = Vec::with_capacity(ops.len());
+                for operation in ops {
+                    let positions = resolve_operation(ctx, operation, docid, rocache, words_positions, wdcache)?;
+                    groups_positions.push(positions);
+                }
+                plane_sweep(groups_positions, false)?
+            },
+            Phrase(words) => {
+                let mut groups_positions = Vec::with_capacity(words.len());
+                for word in words {
+                    let positions = match words_positions.get(word) {
+                        Some(positions) => positions.iter().map(|p| (p, 0, p)).collect(),
+                        None => vec![],
+                    };
+                    groups_positions.push(positions);
+                }
+                plane_sweep(groups_positions, true)?
+            },
             Or(_, ops) => {
                 let mut result = Vec::new();
                 for op in ops {
@@ -1,6 +1,5 @@
 use std::{borrow::Cow, collections::HashMap, mem::take};
 
-use anyhow::bail;
 use log::debug;
 use roaring::RoaringBitmap;
 
@@ -13,7 +12,6 @@ use super::{
     CriterionParameters,
     CriterionResult,
     query_docids,
-    query_pair_proximity_docids,
     resolve_query_tree,
 };
 
@@ -174,12 +172,14 @@ fn alterate_query_tree(
         wdcache: &mut WordDerivationsCache,
     ) -> anyhow::Result<()>
     {
-        use Operation::{And, Consecutive, Or};
+        use Operation::{And, Phrase, Or};
 
         match operation {
-            And(ops) | Consecutive(ops) | Or(_, ops) => {
+            And(ops) | Or(_, ops) => {
                 ops.iter_mut().try_for_each(|op| recurse(words_fst, op, number_typos, wdcache))
             },
+            // Because Phrases don't allow typos, no alteration can be done.
+            Phrase(_words) => return Ok(()),
             Operation::Query(q) => {
                 if let QueryKind::Tolerant { typo, word } = &q.kind {
                     // if no typo is allowed we don't call word_derivations function,
@@ -228,32 +228,29 @@ fn resolve_candidates<'t>(
         wdcache: &mut WordDerivationsCache,
     ) -> anyhow::Result<RoaringBitmap>
     {
-        use Operation::{And, Consecutive, Or, Query};
+        use Operation::{And, Phrase, Or, Query};
 
         match query_tree {
             And(ops) => {
                 mdfs(ctx, ops, number_typos, cache, wdcache)
             },
-            Consecutive(ops) => {
+            Phrase(words) => {
                 let mut candidates = RoaringBitmap::new();
                 let mut first_loop = true;
-                for slice in ops.windows(2) {
-                    match (&slice[0], &slice[1]) {
-                        (Operation::Query(left), Operation::Query(right)) => {
-                            match query_pair_proximity_docids(ctx, left, right, 1, wdcache)? {
-                                pair_docids if pair_docids.is_empty() => {
-                                    return Ok(RoaringBitmap::new())
-                                },
-                                pair_docids if first_loop => {
+                for slice in words.windows(2) {
+                    let (left, right) = (&slice[0], &slice[1]);
+                    match ctx.word_pair_proximity_docids(left, right, 1)? {
+                        Some(pair_docids) => {
+                            if pair_docids.is_empty() {
+                                return Ok(RoaringBitmap::new());
+                            } else if first_loop {
                                 candidates = pair_docids;
                                 first_loop = false;
-                                },
-                                pair_docids => {
-                                    candidates.intersect_with(&pair_docids);
-                                },
+                            } else {
+                                candidates &= pair_docids;
                             }
                         },
-                        _ => bail!("invalid consecutive query type"),
+                        None => return Ok(RoaringBitmap::new())
                     }
                 }
                 Ok(candidates)
@@ -52,13 +52,18 @@ impl MatchingWords {
 fn fetch_queries(tree: &Operation) -> HashSet<(&str, u8, IsPrefix)> {
     fn resolve_ops<'a>(tree: &'a Operation, out: &mut HashSet<(&'a str, u8, IsPrefix)>) {
         match tree {
-            Operation::Or(_, ops) | Operation::And(ops) | Operation::Consecutive(ops) => {
+            Operation::Or(_, ops) | Operation::And(ops) => {
                 ops.as_slice().iter().for_each(|op| resolve_ops(op, out));
             },
             Operation::Query(Query { prefix, kind }) => {
                 let typo = if kind.is_exact() { 0 } else { kind.typo() };
                 out.insert((kind.word(), typo, *prefix));
             },
+            Operation::Phrase(words) => {
+                for word in words {
+                    out.insert((word, 0, false));
+                }
+            }
         }
     }
 
@@ -15,7 +15,8 @@ type IsPrefix = bool;
 #[derive(Clone, PartialEq, Eq, Hash)]
 pub enum Operation {
     And(Vec<Operation>),
-    Consecutive(Vec<Operation>),
+    // serie of consecutive non prefix and exact words
+    Phrase(Vec<String>),
     Or(IsOptionalWord, Vec<Operation>),
     Query(Query),
 }
@@ -28,9 +29,8 @@ impl fmt::Debug for Operation {
                     writeln!(f, "{:1$}AND", "", depth * 2)?;
                     children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1))
                 },
-                Operation::Consecutive(children) => {
-                    writeln!(f, "{:1$}CONSECUTIVE", "", depth * 2)?;
-                    children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1))
+                Operation::Phrase(children) => {
+                    writeln!(f, "{:2$}PHRASE {:?}", "", children, depth * 2)
                 },
                 Operation::Or(true, children) => {
                     writeln!(f, "{:1$}OR(WORD)", "", depth * 2)?;
@@ -49,14 +49,6 @@ impl fmt::Debug for Operation {
 }
 
 impl Operation {
-    fn phrase(words: Vec<String>) -> Operation {
-        Operation::consecutive(
-            words.into_iter().map(|s| {
-                Operation::Query(Query { prefix: false, kind: QueryKind::exact(s) })
-            }).collect()
-        )
-    }
-
     fn and(mut ops: Vec<Self>) -> Self {
         if ops.len() == 1 {
             ops.pop().unwrap()
@@ -73,11 +65,11 @@ impl Operation {
         }
     }
 
-    fn consecutive(mut ops: Vec<Self>) -> Self {
-        if ops.len() == 1 {
-            ops.pop().unwrap()
+    fn phrase(mut words: Vec<String>) -> Self {
+        if words.len() == 1 {
+            Self::Query(Query {prefix: false, kind: QueryKind::exact(words.pop().unwrap())})
         } else {
-            Self::Consecutive(ops)
+            Self::Phrase(words)
         }
     }
 
@@ -256,10 +248,10 @@ fn split_best_frequency(ctx: &impl Context, word: &str) -> heed::Result<Option<O
         }
     }
 
-    Ok(best.map(|(_, left, right)| Operation::Consecutive(
+    Ok(best.map(|(_, left, right)| Operation::Phrase(
         vec![
-            Operation::Query(Query { prefix: false, kind: QueryKind::exact(left.to_string()) }),
-            Operation::Query(Query { prefix: false, kind: QueryKind::exact(right.to_string()) })
+            left.to_string(),
+            right.to_string()
         ]
     )))
 }
@@ -494,24 +486,26 @@ fn create_primitive_query(query: TokenStream, stop_words: Option<Set<&[u8]>>, wo
 
 /// Returns the maximum number of typos that this Operation allows.
 pub fn maximum_typo(operation: &Operation) -> usize {
-    use Operation::{Or, And, Query, Consecutive};
+    use Operation::{Or, And, Query, Phrase};
     match operation {
         Or(_, ops) => ops.iter().map(maximum_typo).max().unwrap_or(0),
-        And(ops) | Consecutive(ops) => ops.iter().map(maximum_typo).sum::<usize>(),
+        And(ops) => ops.iter().map(maximum_typo).sum::<usize>(),
         Query(q) => q.kind.typo() as usize,
+        // no typo allowed in phrases
+        Phrase(_) => 0,
     }
 }
 
 /// Returns the maximum proximity that this Operation allows.
 pub fn maximum_proximity(operation: &Operation) -> usize {
-    use Operation::{Or, And, Query, Consecutive};
+    use Operation::{Or, And, Query, Phrase};
     match operation {
         Or(_, ops) => ops.iter().map(maximum_proximity).max().unwrap_or(0),
         And(ops) => {
             ops.iter().map(maximum_proximity).sum::<usize>()
             + ops.len().saturating_sub(1) * 7
         },
-        Query(_) | Consecutive(_) => 0,
+        Query(_) | Phrase(_) => 0,
     }
 }
 
@@ -765,9 +759,9 @@ mod test {
         let expected = Operation::Or(false, vec![
             Operation::And(vec![
                 Operation::Or(false, vec![
-                    Operation::Consecutive(vec![
-                        Operation::Query(Query { prefix: false, kind: QueryKind::exact("word".to_string()) }),
-                        Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }),
+                    Operation::Phrase(vec![
+                        "word".to_string(),
+                        "split".to_string(),
                     ]),
                     Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "wordsplit".to_string()) }),
                 ]),
@@ -789,9 +783,9 @@ mod test {
         let tokens = result.tokens();
 
         let expected = Operation::And(vec![
-            Operation::Consecutive(vec![
-                Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }),
-                Operation::Query(Query { prefix: false, kind: QueryKind::exact("friends".to_string()) }),
+            Operation::Phrase(vec![
+                "hey".to_string(),
+                "friends".to_string(),
             ]),
             Operation::Query(Query { prefix: false, kind: QueryKind::exact("wooop".to_string()) }),
         ]);
@@ -809,13 +803,13 @@ mod test {
         let tokens = result.tokens();
 
         let expected = Operation::And(vec![
-            Operation::Consecutive(vec![
-                Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }),
-                Operation::Query(Query { prefix: false, kind: QueryKind::exact("friends".to_string()) }),
+            Operation::Phrase(vec![
+                "hey".to_string(),
+                "friends".to_string(),
             ]),
-            Operation::Consecutive(vec![
-                Operation::Query(Query { prefix: false, kind: QueryKind::exact("wooop".to_string()) }),
-                Operation::Query(Query { prefix: false, kind: QueryKind::exact("wooop".to_string()) }),
+            Operation::Phrase(vec![
+                "wooop".to_string(),
+                "wooop".to_string(),
             ]),
         ]);
 
@@ -870,9 +864,9 @@ mod test {
         let result = analyzer.analyze(query);
         let tokens = result.tokens();
 
-        let expected = Operation::Consecutive(vec![
-            Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }),
-            Operation::Query(Query { prefix: false, kind: QueryKind::exact("my".to_string()) }),
+        let expected = Operation::Phrase(vec![
+            "hey".to_string(),
+            "my".to_string(),
         ]);
         let (query_tree, _) = TestContext::default().build(true, true, None, tokens).unwrap().unwrap();
 
@@ -940,9 +934,9 @@ mod test {
         let tokens = result.tokens();
 
         let expected = Operation::And(vec![
-            Operation::Consecutive(vec![
-                Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }),
-                Operation::Query(Query { prefix: false, kind: QueryKind::exact("my".to_string()) }),
+            Operation::Phrase(vec![
+                "hey".to_string(),
+                "my".to_string(),
             ]),
             Operation::Query(Query { prefix: false, kind: QueryKind::exact("good".to_string()) }),
         ]);