Support negative phrases

2025-01-19 01:18:31 +08:00 · 2024-03-28 15:51:43 +01:00 · 2024-03-28 15:51:43 +01:00 · 877f4b1045
commit 877f4b1045
parent 69f8b2730d
4 changed files with 85 additions and 18 deletions
--- a/milli/src/search/new/matches/matching_words.rs
+++ b/milli/src/search/new/matches/matching_words.rs
@ -240,6 +240,7 @@ pub(crate) mod tests {
    use super::super::super::located_query_terms_from_tokens;
    use super::*;
    use crate::index::tests::TempIndex;
    use crate::search::new::query_term::ExtractedTokens;
    pub(crate) fn temp_index_with_documents() -> TempIndex {
        let temp_index = TempIndex::new();
@ -261,7 +262,8 @@ pub(crate) mod tests {
        let mut builder = TokenizerBuilder::default();
        let tokenizer = builder.build();
        let tokens = tokenizer.tokenize("split this world");
-        let (query_terms, _) = located_query_terms_from_tokens(&mut ctx, tokens, None).unwrap();
+        let ExtractedTokens { query_terms, .. } =
            located_query_terms_from_tokens(&mut ctx, tokens, None).unwrap();
        let matching_words = MatchingWords::new(ctx, query_terms);
        assert_eq!(
--- a/milli/src/search/new/mod.rs
+++ b/milli/src/search/new/mod.rs
@ -33,7 +33,9 @@ use interner::{DedupInterner, Interner};
 pub use logger::visual::VisualSearchLogger;
 pub use logger::{DefaultSearchLogger, SearchLogger};
 use query_graph::{QueryGraph, QueryNode};
-use query_term::{located_query_terms_from_tokens, LocatedQueryTerm, Phrase, QueryTerm};
+use query_term::{
    located_query_terms_from_tokens, ExtractedTokens, LocatedQueryTerm, Phrase, QueryTerm,
 };
 use ranking_rules::{
    BoxRankingRule, PlaceholderQuery, RankingRule, RankingRuleOutput, RankingRuleQueryTrait,
 };
@ -223,6 +225,21 @@ fn resolve_negative_words(
    Ok(negative_bitmap)
 }
 #[tracing::instrument(level = "trace", skip_all, target = "search")]
 fn resolve_negative_phrases(
    ctx: &mut SearchContext,
    negative_phrases: &[LocatedQueryTerm],
 ) -> Result<RoaringBitmap> {
    let mut negative_bitmap = RoaringBitmap::new();
    for term in negative_phrases {
        let query_term = ctx.term_interner.get(term.value);
        if let Some(phrase) = query_term.original_phrase() {
            negative_bitmap |= ctx.get_phrase_docids(phrase)?;
        }
    }
    Ok(negative_bitmap)
 }
 /// Return the list of initialised ranking rules to be used for a placeholder search.
 fn get_ranking_rules_for_placeholder_search<'ctx>(
    ctx: &SearchContext<'ctx>,
@ -636,12 +653,15 @@ pub fn execute_search(
        let tokens = tokenizer.tokenize(query);
        drop(entered);
-        let (query_terms, negative_words) =
+        let ExtractedTokens { query_terms, negative_words, negative_phrases } =
            located_query_terms_from_tokens(ctx, tokens, words_limit)?;
-        used_negative_operator = !negative_words.is_empty();
+        used_negative_operator = !negative_words.is_empty() || !negative_phrases.is_empty();
        let ignored_documents = resolve_negative_words(ctx, &negative_words)?;
        let ignored_phrases = resolve_negative_phrases(ctx, &negative_phrases)?;
        universe -= ignored_documents;
        universe -= ignored_phrases;
        if query_terms.is_empty() {
            // Do a placeholder search instead
--- a/milli/src/search/new/query_term/mod.rs
+++ b/milli/src/search/new/query_term/mod.rs
@ -9,7 +9,9 @@ use std::ops::RangeInclusive;
 use either::Either;
 pub use ntypo_subset::NTypoTermSubset;
-pub use parse_query::{located_query_terms_from_tokens, make_ngram, number_of_typos_allowed};
+pub use parse_query::{
    located_query_terms_from_tokens, make_ngram, number_of_typos_allowed, ExtractedTokens,
 };
 pub use phrase::Phrase;
 use super::interner::{DedupInterner, Interned};
@ -478,6 +480,11 @@ impl QueryTerm {
    pub fn original_word(&self, ctx: &SearchContext) -> String {
        ctx.word_interner.get(self.original).clone()
    }
    pub fn original_phrase(&self) -> Option<Interned<Phrase>> {
        self.zero_typo.phrase
    }
    pub fn all_computed_derivations(&self) -> (Vec<Interned<String>>, Vec<Interned<Phrase>>) {
        let mut words = BTreeSet::new();
        let mut phrases = BTreeSet::new();
--- a/milli/src/search/new/query_term/parse_query.rs
+++ b/milli/src/search/new/query_term/parse_query.rs
@ -9,21 +9,34 @@ use crate::search::new::query_term::{Lazy, Phrase, QueryTerm};
 use crate::search::new::Word;
 use crate::{Result, SearchContext, MAX_WORD_LENGTH};
 #[derive(Clone)]
 /// Extraction of the content of a query.
 pub struct ExtractedTokens {
    /// The terms to search for in the database.
    pub query_terms: Vec<LocatedQueryTerm>,
    /// The words that must not appear in the results.
    pub negative_words: Vec<Word>,
    /// The phrases that must not appear in the results.
    pub negative_phrases: Vec<LocatedQueryTerm>,
 }
 /// Convert the tokenised search query into a list of located query terms.
 #[tracing::instrument(level = "trace", skip_all, target = "search::query")]
 pub fn located_query_terms_from_tokens(
    ctx: &mut SearchContext,
    query: NormalizedTokenIter,
    words_limit: Option<usize>,
-) -> Result<(Vec<LocatedQueryTerm>, Vec<Word>)> {
+) -> Result<ExtractedTokens> {
    let nbr_typos = number_of_typos_allowed(ctx)?;
-    let mut located_terms = Vec::new();
+    let mut query_terms = Vec::new();
    let mut negative_phrase = false;
    let mut phrase: Option<PhraseBuilder> = None;
    let mut encountered_whitespace = true;
    let mut negative_next_token = false;
    let mut negative_words = Vec::new();
    let mut negative_phrases = Vec::new();
    let parts_limit = words_limit.unwrap_or(usize::MAX);
@ -37,8 +50,8 @@ pub fn located_query_terms_from_tokens(
        }
        // early return if word limit is exceeded
-        if located_terms.len() >= parts_limit {
+        if query_terms.len() >= parts_limit {
-            return Ok((located_terms, negative_words));
+            return Ok(ExtractedTokens { query_terms, negative_words, negative_phrases });
        }
        match token.kind {
@ -71,7 +84,7 @@ pub fn located_query_terms_from_tokens(
                                value: ctx.term_interner.push(term),
                                positions: position..=position,
                            };
-                            located_terms.push(located_term);
+                            query_terms.push(located_term);
                        }
                        TokenKind::StopWord | TokenKind::Separator(_) | TokenKind::Unknown => (),
                    }
@ -88,7 +101,7 @@ pub fn located_query_terms_from_tokens(
                        value: ctx.term_interner.push(term),
                        positions: position..=position,
                    };
-                    located_terms.push(located_term);
+                    query_terms.push(located_term);
                }
            }
            TokenKind::Separator(separator_kind) => {
@ -104,7 +117,14 @@ pub fn located_query_terms_from_tokens(
                    let phrase = if separator_kind == SeparatorKind::Hard {
                        if let Some(phrase) = phrase {
                            if let Some(located_query_term) = phrase.build(ctx) {
-                                located_terms.push(located_query_term)
+                                // as we are evaluating a negative operator we put the phrase
                                // in the negative one *but* we don't reset the negative operator
                                // as we are immediatly starting a new negative phrase.
                                if negative_phrase {
                                    negative_phrases.push(located_query_term);
                                } else {
                                    query_terms.push(located_query_term);
                                }
                            }
                            Some(PhraseBuilder::empty())
                        } else {
@ -125,12 +145,24 @@ pub fn located_query_terms_from_tokens(
                        // Per the check above, quote_count > 0
                        quote_count -= 1;
                        if let Some(located_query_term) = phrase.build(ctx) {
-                            located_terms.push(located_query_term)
+                            // we were evaluating a negative operator so we
                            // put the phrase in the negative phrases
                            if negative_phrase {
                                negative_phrases.push(located_query_term);
                                negative_phrase = false;
                            } else {
                                query_terms.push(located_query_term);
                            }
                        }
                    }
                    // Start new phrase if the token ends with an opening quote
-                    (quote_count % 2 == 1).then_some(PhraseBuilder::empty())
+                    if quote_count % 2 == 1 {
                        negative_phrase = negative_next_token;
                        Some(PhraseBuilder::empty())
                    } else {
                        None
                    }
                };
                negative_next_token =
@ -146,11 +178,16 @@ pub fn located_query_terms_from_tokens(
    // If a quote is never closed, we consider all of the end of the query as a phrase.
    if let Some(phrase) = phrase.take() {
        if let Some(located_query_term) = phrase.build(ctx) {
-            located_terms.push(located_query_term);
+            // put the phrase in the negative set if we are evaluating a negative operator.
            if negative_phrase {
                negative_phrases.push(located_query_term);
            } else {
                query_terms.push(located_query_term);
            }
        }
    }
-    Ok((located_terms, negative_words))
+    Ok(ExtractedTokens { query_terms, negative_words, negative_phrases })
 }
 pub fn number_of_typos_allowed<'ctx>(
@ -331,8 +368,9 @@ mod tests {
        let rtxn = index.read_txn()?;
        let mut ctx = SearchContext::new(&index, &rtxn);
        // panics with `attempt to add with overflow` before <https://github.com/meilisearch/meilisearch/issues/3785>
-        let (located_query_terms, _) = located_query_terms_from_tokens(&mut ctx, tokens, None)?;
+        let ExtractedTokens { query_terms, .. } =
-        assert!(located_query_terms.is_empty());
+            located_query_terms_from_tokens(&mut ctx, tokens, None)?;
        assert!(query_terms.is_empty());
        Ok(())
    }