From b389be48a02c5fe2de20590466077119df82a227 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 8 Aug 2022 10:37:31 +0200 Subject: [PATCH] Factorize phrase computation --- milli/src/search/criteria/mod.rs | 76 +++++++++++++------------- milli/src/search/criteria/proximity.rs | 39 +------------ 2 files changed, 42 insertions(+), 73 deletions(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 4613acb4f..ae9e0c218 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -326,43 +326,7 @@ pub fn resolve_query_tree<'t>( } Ok(candidates) } - Phrase(words) => { - let mut candidates = RoaringBitmap::new(); - let mut first_iter = true; - let winsize = words.len().min(7); - - for win in words.windows(winsize) { - // Get all the documents with the matching distance for each word pairs. - let mut bitmaps = Vec::with_capacity(winsize.pow(2)); - for (offset, s1) in win.iter().enumerate() { - for (dist, s2) in win.iter().skip(offset + 1).enumerate() { - match ctx.word_pair_proximity_docids(s1, s2, dist as u8 + 1)? { - Some(m) => bitmaps.push(m), - // If there are no document for this distance, there will be no - // results for the phrase query. - None => return Ok(RoaringBitmap::new()), - } - } - } - - // We sort the bitmaps so that we perform the small intersections first, which is faster. - bitmaps.sort_unstable_by(|a, b| a.len().cmp(&b.len())); - - for bitmap in bitmaps { - if first_iter { - candidates = bitmap; - first_iter = false; - } else { - candidates &= bitmap; - } - // There will be no match, return early - if candidates.is_empty() { - break; - } - } - } - Ok(candidates) - } + Phrase(words) => resolve_phrase(ctx, &words), Or(_, ops) => { let mut candidates = RoaringBitmap::new(); for op in ops { @@ -378,6 +342,44 @@ pub fn resolve_query_tree<'t>( resolve_operation(ctx, query_tree, wdcache) } +pub fn resolve_phrase<'t>(ctx: &'t dyn Context, phrase: &[String]) -> Result { + let mut candidates = RoaringBitmap::new(); + let mut first_iter = true; + let winsize = phrase.len().min(7); + + for win in phrase.windows(winsize) { + // Get all the documents with the matching distance for each word pairs. + let mut bitmaps = Vec::with_capacity(winsize.pow(2)); + for (offset, s1) in win.iter().enumerate() { + for (dist, s2) in win.iter().skip(offset + 1).enumerate() { + match ctx.word_pair_proximity_docids(s1, s2, dist as u8 + 1)? { + Some(m) => bitmaps.push(m), + // If there are no document for this distance, there will be no + // results for the phrase query. + None => return Ok(RoaringBitmap::new()), + } + } + } + + // We sort the bitmaps so that we perform the small intersections first, which is faster. + bitmaps.sort_unstable_by(|a, b| a.len().cmp(&b.len())); + + for bitmap in bitmaps { + if first_iter { + candidates = bitmap; + first_iter = false; + } else { + candidates &= bitmap; + } + // There will be no match, return early + if candidates.is_empty() { + break; + } + } + } + Ok(candidates) +} + fn all_word_pair_proximity_docids, U: AsRef>( ctx: &dyn Context, left_words: &[(T, u8)], diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index 30919585b..e942a7bef 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -6,8 +6,8 @@ use log::debug; use roaring::RoaringBitmap; use super::{ - query_docids, query_pair_proximity_docids, resolve_query_tree, Context, Criterion, - CriterionParameters, CriterionResult, + query_docids, query_pair_proximity_docids, resolve_phrase, resolve_query_tree, Context, + Criterion, CriterionParameters, CriterionResult, }; use crate::search::query_tree::{maximum_proximity, Operation, Query, QueryKind}; use crate::search::{build_dfa, WordDerivationsCache}; @@ -192,42 +192,9 @@ fn resolve_candidates<'t>( let most_right = words .last() .map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) }); - let mut candidates = RoaringBitmap::new(); - let mut first_iter = true; - let winsize = words.len().min(7); - for win in words.windows(winsize) { - // Get all the documents with the matching distance for each word pairs. - let mut bitmaps = Vec::with_capacity(winsize.pow(2)); - for (offset, s1) in win.iter().enumerate() { - for (dist, s2) in win.iter().skip(offset + 1).enumerate() { - match ctx.word_pair_proximity_docids(s1, s2, dist as u8 + 1)? { - Some(m) => bitmaps.push(m), - // If there are no document for this distance, there will be no - // results for the phrase query. - None => return Ok(Default::default()), - } - } - } - - // We sort the bitmaps so that we perform the small intersections first, which is faster. - bitmaps.sort_unstable_by(|a, b| a.len().cmp(&b.len())); - - for bitmap in bitmaps { - if first_iter { - candidates = bitmap; - first_iter = false; - } else { - candidates &= bitmap; - } - // There will be no match, return early - if candidates.is_empty() { - break; - } - } - } match (most_left, most_right) { - (Some(l), Some(r)) => vec![(l, r, candidates)], + (Some(l), Some(r)) => vec![(l, r, resolve_phrase(ctx, &words)?)], _otherwise => Default::default(), } } else {