From d852dc0d2bacede8b9ce3806bba9d3a9a4e29573 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Tue, 1 Feb 2022 20:10:16 +0100 Subject: [PATCH 1/2] fix phrase search --- cli/src/main.rs | 12 +++++++- milli/src/search/criteria/mod.rs | 49 +++++++++++++++++++++++--------- 2 files changed, 46 insertions(+), 15 deletions(-) diff --git a/cli/src/main.rs b/cli/src/main.rs index 9d807e8c6..11e203f4d 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -448,8 +448,10 @@ impl Search { #[derive(Debug, StructOpt)] struct SettingsUpdate { - #[structopt(short, long)] + #[structopt(long)] filterable_attributes: Option>, + #[structopt(long)] + criteria: Option>, } impl Performer for SettingsUpdate { @@ -468,6 +470,14 @@ impl Performer for SettingsUpdate { } } + if let Some(criteria) = self.criteria { + if !criteria.is_empty() { + update.set_criteria(criteria); + } else { + update.reset_criteria(); + } + } + let mut bars = Vec::new(); let progesses = MultiProgress::new(); for _ in 0..4 { diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 0cad7c013..40b426198 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -1,6 +1,7 @@ use std::borrow::Cow; use std::collections::HashMap; +use itertools::Itertools; use roaring::RoaringBitmap; use self::asc_desc::AscDesc; @@ -318,21 +319,41 @@ pub fn resolve_query_tree<'t>( } Phrase(words) => { let mut candidates = RoaringBitmap::new(); - let mut first_loop = true; - for slice in words.windows(2) { - let (left, right) = (&slice[0], &slice[1]); - match ctx.word_pair_proximity_docids(left, right, 1)? 
{ - Some(pair_docids) => { - if pair_docids.is_empty() { - return Ok(RoaringBitmap::new()); - } else if first_loop { - candidates = pair_docids; - first_loop = false; - } else { - candidates &= pair_docids; - } + let mut first_iter = true; + let winsize = words.len().min(7); + + for win in words.windows(winsize) { + // Get all the word pairs and their compute their relative distance + let dists = win + .iter() + .enumerate() + .cartesian_product(win.iter().enumerate()) + .filter(|(x, y)| y > x) + .map(|((pos1, s1), (pos2, s2))| (s1, s2, pos2 - pos1)); + + let mut bitmaps = Vec::with_capacity(winsize.pow(2)); + + for (s1, s2, d) in dists { + match ctx.word_pair_proximity_docids(s1, s2, d as u8)? { + Some(m) => bitmaps.push(m), + None => return Ok(RoaringBitmap::new()), + } + } + + // We sort the bitmaps so that we perform the small intersections first, which is faster. + bitmaps.sort_unstable_by(|a, b| a.len().cmp(&b.len())); + + for bitmap in bitmaps { + if first_iter { + candidates = bitmap; + first_iter = false; + } else { + candidates &= bitmap; + } + // There will be no match, return early + if candidates.is_empty() { + break; } - None => return Ok(RoaringBitmap::new()), } } Ok(candidates) From 13de2510474f55851ef770d7671542fec12d47ec Mon Sep 17 00:00:00 2001 From: ad hoc Date: Thu, 3 Feb 2022 15:01:34 +0100 Subject: [PATCH 2/2] rewrite word pair distance gathering --- milli/src/search/criteria/mod.rs | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 40b426198..8306f5d0e 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -1,7 +1,6 @@ use std::borrow::Cow; use std::collections::HashMap; -use itertools::Itertools; use roaring::RoaringBitmap; use self::asc_desc::AscDesc; @@ -323,20 +322,16 @@ pub fn resolve_query_tree<'t>( let winsize = words.len().min(7); for win in words.windows(winsize) { - // Get all the word 
pairs and their compute their relative distance - let dists = win - .iter() - .enumerate() - .cartesian_product(win.iter().enumerate()) - .filter(|(x, y)| y > x) - .map(|((pos1, s1), (pos2, s2))| (s1, s2, pos2 - pos1)); - + // Get all the documents with the matching distance for each word pair. let mut bitmaps = Vec::with_capacity(winsize.pow(2)); - - for (s1, s2, d) in dists { - match ctx.word_pair_proximity_docids(s1, s2, d as u8)? { - Some(m) => bitmaps.push(m), - None => return Ok(RoaringBitmap::new()), + for (offset, s1) in win.iter().enumerate() { + for (dist, s2) in win.iter().skip(offset).enumerate() { + match ctx.word_pair_proximity_docids(s1, s2, dist as u8 + 1)? { + Some(m) => bitmaps.push(m), + // If there are no documents for this distance, there will be no + // results for the phrase query. + None => return Ok(RoaringBitmap::new()), + } } }