Merge #664

664: Fix phrase search containing stop words r=ManyTheFish a=Samyak2 # Pull Request This a WIP draft PR I wanted to create to let other potential contributors know that I'm working on this issue. I'll be completing this in a few hours from opening this. ## Related issue Fixes #661 and towards fixing meilisearch/meilisearch#2905 ## What does this PR do? - [x] Change Phrase Operation to use a `Vec<Option<String>>` instead of `Vec<String>` where `None` corresponds to a stop word - [x] Update all other uses of phrase operation - [x] Update `resolve_phrase` - [x] Update `create_primitive_query`? - [x] Add test ## PR checklist Please check if your PR fulfills the following requirements: - [x] Does this PR fix an existing issue, or have you listed the changes applied in the PR description (and why they are needed)? - [x] Have you read the contributing guidelines? - [x] Have you made sure that the title is accurate and descriptive of the changes? Co-authored-by: Samyak S Sarnayak <samyak201@gmail.com> Co-authored-by: Samyak Sarnayak <samyak201@gmail.com>
2024-11-27 04:25:06 +08:00 · 2022-10-29 13:42:52 +00:00 · 2022-10-29 13:42:52 +00:00 · c965200010
commit c965200010
parent 2e539249cb ecb88143f9
8 changed files with 156 additions and 58 deletions
--- a/milli/src/search/criteria/attribute.rs
+++ b/milli/src/search/criteria/attribute.rs
@ -579,6 +579,7 @@ fn flatten_query_tree(query_tree: &Operation) -> FlattenedQueryTree {
            Phrase(words) => {
                let queries = words
                    .iter()
+                    .filter_map(|w| w.as_ref())
                    .map(|word| vec![Query { prefix: false, kind: QueryKind::exact(word.clone()) }])
                    .collect();
                vec![queries]
--- a/milli/src/search/criteria/exactness.rs
+++ b/milli/src/search/criteria/exactness.rs
@ -299,9 +299,11 @@ fn attribute_start_with_docids(
            }
            Phrase(phrase) => {
                for word in phrase {
-                    let wc = ctx.word_position_docids(word, pos)?;
-                    if let Some(word_candidates) = wc {
-                        attribute_candidates_array.push(word_candidates);
+                    if let Some(word) = word {
+                        let wc = ctx.word_position_docids(word, pos)?;
+                        if let Some(word_candidates) = wc {
+                            attribute_candidates_array.push(word_candidates);
+                        }
                    }
                    pos += 1;
                }
@ -323,7 +325,7 @@ fn intersection_of(mut rbs: Vec<&RoaringBitmap>) -> RoaringBitmap {

 #[derive(Debug, Clone)]
 pub enum ExactQueryPart {
-    Phrase(Vec<String>),
+    Phrase(Vec<Option<String>>),
    Synonyms(Vec<String>),
 }

--- a/milli/src/search/criteria/mod.rs
+++ b/milli/src/search/criteria/mod.rs
@ -418,15 +418,32 @@ pub fn resolve_query_tree(
    resolve_operation(ctx, query_tree, wdcache)
 }

-pub fn resolve_phrase(ctx: &dyn Context, phrase: &[String]) -> Result<RoaringBitmap> {
+pub fn resolve_phrase(ctx: &dyn Context, phrase: &[Option<String>]) -> Result<RoaringBitmap> {
    let mut candidates = RoaringBitmap::new();
    let mut first_iter = true;
    let winsize = phrase.len().min(3);
+
+    if phrase.is_empty() {
+        return Ok(candidates);
+    }
+
    for win in phrase.windows(winsize) {
        // Get all the documents with the matching distance for each word pairs.
        let mut bitmaps = Vec::with_capacity(winsize.pow(2));
-        for (offset, s1) in win.iter().enumerate() {
-            for (dist, s2) in win.iter().skip(offset + 1).enumerate() {
+        for (offset, s1) in win.iter().enumerate().filter_map(|(index, word)| {
+            if let Some(word) = word {
+                Some((index, word))
+            } else {
+                None
+            }
+        }) {
+            for (dist, s2) in win.iter().skip(offset + 1).enumerate().filter_map(|(index, word)| {
+                if let Some(word) = word {
+                    Some((index, word))
+                } else {
+                    None
+                }
+            }) {
                if dist == 0 {
                    match ctx.word_pair_proximity_docids(s1, s2, 1)? {
                        Some(m) => bitmaps.push(m),
--- a/milli/src/search/criteria/proximity.rs
+++ b/milli/src/search/criteria/proximity.rs
@ -4,6 +4,7 @@ use std::mem::take;

 use log::debug;
 use roaring::RoaringBitmap;
+use slice_group_by::GroupBy;

 use super::{
    query_docids, query_pair_proximity_docids, resolve_phrase, resolve_query_tree, Context,
@ -187,10 +188,15 @@ fn resolve_candidates<'t>(
            Phrase(words) => {
                if proximity == 0 {
                    let most_left = words
-                        .first()
+                        .iter()
+                        .filter_map(|o| o.as_ref())
+                        .next()
                        .map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) });
                    let most_right = words
-                        .last()
+                        .iter()
+                        .rev()
+                        .filter_map(|o| o.as_ref())
+                        .next()
                        .map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) });

                    match (most_left, most_right) {
@ -473,14 +479,34 @@ fn resolve_plane_sweep_candidates(
            }
            Phrase(words) => {
                let mut groups_positions = Vec::with_capacity(words.len());
-                for word in words {
-                    let positions = match words_positions.get(word) {
-                        Some(positions) => positions.iter().map(|p| (p, 0, p)).collect(),
-                        None => return Ok(vec![]),
-                    };
-                    groups_positions.push(positions);
+
+                // group stop_words together.
+                for words in words.linear_group_by_key(Option::is_none) {
+                    // skip if it's a group of stop words.
+                    if matches!(words.first(), None | Some(None)) {
+                        continue;
+                    }
+                    // make a consecutive plane-sweep on the subgroup of words.
+                    let mut subgroup = Vec::with_capacity(words.len());
+                    for word in words.into_iter().map(|w| w.as_deref().unwrap()) {
+                        match words_positions.get(word) {
+                            Some(positions) => {
+                                subgroup.push(positions.iter().map(|p| (p, 0, p)).collect())
+                            }
+                            None => return Ok(vec![]),
+                        }
+                    }
+                    match subgroup.len() {
+                        0 => {}
+                        1 => groups_positions.push(subgroup.pop().unwrap()),
+                        _ => groups_positions.push(plane_sweep(subgroup, true)?),
+                    }
+                }
+                match groups_positions.len() {
+                    0 => vec![],
+                    1 => groups_positions.pop().unwrap(),
+                    _ => plane_sweep(groups_positions, false)?,
                }
-                plane_sweep(groups_positions, true)?
            }
            Or(_, ops) => {
                let mut result = Vec::new();
--- a/milli/src/search/criteria/typo.rs
+++ b/milli/src/search/criteria/typo.rs
@ -9,6 +9,7 @@ use super::{
    query_docids, resolve_query_tree, Candidates, Context, Criterion, CriterionParameters,
    CriterionResult,
 };
+use crate::search::criteria::resolve_phrase;
 use crate::search::query_tree::{maximum_typo, Operation, Query, QueryKind};
 use crate::search::{word_derivations, WordDerivationsCache};
 use crate::Result;
@ -256,27 +257,7 @@ fn resolve_candidates<'t>(

        match query_tree {
            And(ops) => mdfs(ctx, ops, number_typos, cache, wdcache),
-            Phrase(words) => {
-                let mut candidates = RoaringBitmap::new();
-                let mut first_loop = true;
-                for slice in words.windows(2) {
-                    let (left, right) = (&slice[0], &slice[1]);
-                    match ctx.word_pair_proximity_docids(left, right, 1)? {
-                        Some(pair_docids) => {
-                            if pair_docids.is_empty() {
-                                return Ok(RoaringBitmap::new());
-                            } else if first_loop {
-                                candidates = pair_docids;
-                                first_loop = false;
-                            } else {
-                                candidates &= pair_docids;
-                            }
-                        }
-                        None => return Ok(RoaringBitmap::new()),
-                    }
-                }
-                Ok(candidates)
-            }
+            Phrase(words) => resolve_phrase(ctx, words),
            Or(_, ops) => {
                let mut candidates = RoaringBitmap::new();
                for op in ops {
--- a/milli/src/search/query_tree.rs
+++ b/milli/src/search/query_tree.rs
@ -4,7 +4,6 @@ use std::{fmt, mem};

 use charabia::classifier::ClassifiedTokenIter;
 use charabia::{SeparatorKind, TokenKind};
-use fst::Set;
 use roaring::RoaringBitmap;
 use slice_group_by::GroupBy;

@ -18,8 +17,9 @@ type IsPrefix = bool;
 #[derive(Clone, PartialEq, Eq, Hash)]
 pub enum Operation {
    And(Vec<Operation>),
-    // serie of consecutive non prefix and exact words
-    Phrase(Vec<String>),
+    // series of consecutive non prefix and exact words
+    // `None` means a stop word.
+    Phrase(Vec<Option<String>>),
    Or(IsOptionalWord, Vec<Operation>),
    Query(Query),
 }
@ -75,9 +75,13 @@ impl Operation {
        }
    }

-    fn phrase(mut words: Vec<String>) -> Self {
+    fn phrase(mut words: Vec<Option<String>>) -> Self {
        if words.len() == 1 {
-            Self::Query(Query { prefix: false, kind: QueryKind::exact(words.pop().unwrap()) })
+            if let Some(word) = words.pop().unwrap() {
+                Self::Query(Query { prefix: false, kind: QueryKind::exact(word) })
+            } else {
+                Self::Phrase(words)
+            }
        } else {
            Self::Phrase(words)
        }
@ -264,8 +268,7 @@ impl<'a> QueryTreeBuilder<'a> {
        &self,
        query: ClassifiedTokenIter<A>,
    ) -> Result<Option<(Operation, PrimitiveQuery, MatchingWords)>> {
-        let stop_words = self.index.stop_words(self.rtxn)?;
-        let primitive_query = create_primitive_query(query, stop_words, self.words_limit);
+        let primitive_query = create_primitive_query(query, self.words_limit);
        if !primitive_query.is_empty() {
            let qt = create_query_tree(
                self,
@ -370,7 +373,10 @@ fn create_query_tree(
            PrimitiveQueryPart::Word(word, prefix) => {
                let mut children = synonyms(ctx, &[&word])?.unwrap_or_default();
                if let Some((left, right)) = split_best_frequency(ctx, &word)? {
-                    children.push(Operation::Phrase(vec![left.to_string(), right.to_string()]));
+                    children.push(Operation::Phrase(vec![
+                        Some(left.to_string()),
+                        Some(right.to_string()),
+                    ]));
                }
                let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?;
                let exact_words = ctx.exact_words();
@ -583,7 +589,11 @@ fn create_matching_words(
            PrimitiveQueryPart::Phrase(words) => {
                let ids: Vec<_> =
                    (0..words.len()).into_iter().map(|i| id + i as PrimitiveWordId).collect();
-                let words = words.into_iter().map(|w| MatchingWord::new(w, 0, false)).collect();
+                let words = words
+                    .into_iter()
+                    .filter_map(|w| w)
+                    .map(|w| MatchingWord::new(w, 0, false))
+                    .collect();
                matching_words.push((words, ids));
            }
        }
@ -685,7 +695,7 @@ pub type PrimitiveQuery = Vec<PrimitiveQueryPart>;

 #[derive(Debug, Clone)]
 pub enum PrimitiveQueryPart {
-    Phrase(Vec<String>),
+    Phrase(Vec<Option<String>>),
    Word(String, IsPrefix),
 }

@ -710,7 +720,6 @@ impl PrimitiveQueryPart {
 /// the primitive query is an intermediate state to build the query tree.
 fn create_primitive_query<A>(
    query: ClassifiedTokenIter<A>,
-    stop_words: Option<Set<&[u8]>>,
    words_limit: Option<usize>,
 ) -> PrimitiveQuery
 where
@ -735,9 +744,14 @@ where
                // 2. if the word is not the last token of the query and is not a stop_word we push it as a non-prefix word,
                // 3. if the word is the last token of the query we push it as a prefix word.
                if quoted {
-                    phrase.push(token.lemma().to_string());
+                    if let TokenKind::StopWord = token.kind {
+                        phrase.push(None)
+                    } else {
+                        phrase.push(Some(token.lemma().to_string()));
+                    }
                } else if peekable.peek().is_some() {
-                    if !stop_words.as_ref().map_or(false, |swords| swords.contains(token.lemma())) {
+                    if let TokenKind::StopWord = token.kind {
+                    } else {
                        primitive_query
                            .push(PrimitiveQueryPart::Word(token.lemma().to_string(), false));
                    }
@ -820,7 +834,7 @@ mod test {
            words_limit: Option<usize>,
            query: ClassifiedTokenIter<A>,
        ) -> Result<Option<(Operation, PrimitiveQuery)>> {
-            let primitive_query = create_primitive_query(query, None, words_limit);
+            let primitive_query = create_primitive_query(query, words_limit);
            if !primitive_query.is_empty() {
                let qt = create_query_tree(
                    self,
@ -1065,7 +1079,7 @@ mod test {
        OR
          AND
            OR
-              PHRASE ["word", "split"]
+              PHRASE [Some("word"), Some("split")]
              Tolerant { word: "wordsplit", max typo: 2 }
            Exact { word: "fish" }
          Tolerant { word: "wordsplitfish", max typo: 1 }
@ -1084,7 +1098,7 @@ mod test {

        insta::assert_debug_snapshot!(query_tree, @r###"
        OR
-          PHRASE ["quickbrown", "fox"]
+          PHRASE [Some("quickbrown"), Some("fox")]
          PrefixTolerant { word: "quickbrownfox", max typo: 2 }
        "###);
    }
@ -1101,7 +1115,7 @@ mod test {

        insta::assert_debug_snapshot!(query_tree, @r###"
        AND
-          PHRASE ["hey", "friends"]
+          PHRASE [Some("hey"), Some("friends")]
          Exact { word: "wooop" }
        "###);
    }
@ -1138,8 +1152,8 @@ mod test {

        insta::assert_debug_snapshot!(query_tree, @r###"
        AND
-          PHRASE ["hey", "friends"]
-          PHRASE ["wooop", "wooop"]
+          PHRASE [Some("hey"), Some("friends")]
+          PHRASE [Some("wooop"), Some("wooop")]
        "###);
    }

@ -1187,7 +1201,7 @@ mod test {
            .unwrap();

        insta::assert_debug_snapshot!(query_tree, @r###"
-        PHRASE ["hey", "my"]
+        PHRASE [Some("hey"), Some("my")]
        "###);
    }

@ -1252,7 +1266,7 @@ mod test {

        insta::assert_debug_snapshot!(query_tree, @r###"
        AND
-          PHRASE ["hey", "my"]
+          PHRASE [Some("hey"), Some("my")]
          Exact { word: "good" }
        "###);
    }
--- a/milli/tests/search/mod.rs
+++ b/milli/tests/search/mod.rs
@ -15,6 +15,7 @@ use slice_group_by::GroupBy;
 mod distinct;
 mod facet_distribution;
 mod filters;
+mod phrase_search;
 mod query_criteria;
 mod sort;
 mod typo_tolerance;
--- a/milli/tests/search/phrase_search.rs
+++ b/milli/tests/search/phrase_search.rs
@ -0,0 +1,56 @@
+use milli::update::{IndexerConfig, Settings};
+use milli::{Criterion, Index, Search, TermsMatchingStrategy};
+
+use crate::search::Criterion::{Attribute, Exactness, Proximity};
+
+fn set_stop_words(index: &Index, stop_words: &[&str]) {
+    let mut wtxn = index.write_txn().unwrap();
+    let config = IndexerConfig::default();
+
+    let mut builder = Settings::new(&mut wtxn, &index, &config);
+    let stop_words = stop_words.into_iter().map(|s| s.to_string()).collect();
+    builder.set_stop_words(stop_words);
+    builder.execute(|_| (), || false).unwrap();
+    wtxn.commit().unwrap();
+}
+
+fn test_phrase_search_with_stop_words_given_criteria(criteria: &[Criterion]) {
+    let index = super::setup_search_index_with_criteria(&criteria);
+
+    // Add stop_words
+    set_stop_words(&index, &["a", "an", "the", "of"]);
+
+    // Phrase search containing stop words
+    let txn = index.read_txn().unwrap();
+
+    let mut search = Search::new(&txn, &index);
+    search.query("\"the use of force\"");
+    search.limit(10);
+    search.authorize_typos(false);
+    search.terms_matching_strategy(TermsMatchingStrategy::All);
+
+    let result = search.execute().unwrap();
+    // 1 document should match
+    assert_eq!(result.documents_ids.len(), 1);
+
+    // test for a single stop word only, no other search terms
+    let mut search = Search::new(&txn, &index);
+    search.query("\"the\"");
+    search.limit(10);
+    search.authorize_typos(false);
+    search.terms_matching_strategy(TermsMatchingStrategy::All);
+    let result = search.execute().unwrap();
+    assert_eq!(result.documents_ids.len(), 0);
+}
+
+#[test]
+fn test_phrase_search_with_stop_words_no_criteria() {
+    let criteria = [];
+    test_phrase_search_with_stop_words_given_criteria(&criteria);
+}
+
+#[test]
+fn test_phrase_search_with_stop_words_all_criteria() {
+    let criteria = [Proximity, Attribute, Exactness];
+    test_phrase_search_with_stop_words_given_criteria(&criteria);
+}