Mirror of https://github.com/meilisearch/meilisearch.git, synced 2024-11-23 10:37:41 +08:00
Expose a first working version of the negative keyword
parent 5ea017b922
commit e4a3e603b3
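In outline, the change below spans two files: the search entry point gains a resolve_negative_words helper wired into execute_search, and the query parser (located_query_terms_from_tokens) starts treating a word that follows a bare "-" separator, outside any quoted phrase, as a negative keyword whose documents are excluded from the results. A minimal standalone sketch of that contract, assuming plain whitespace tokenization (illustrative code, not the charabia-based implementation):

/// Split a whitespace-separated query into (positive, negative) words,
/// treating `-word` as a negative keyword. Quoted phrases are out of scope
/// here; the real parser ignores `-` while a phrase is open.
fn split_negative_keywords(query: &str) -> (Vec<String>, Vec<String>) {
    let mut positive = Vec::new();
    let mut negative = Vec::new();
    for token in query.split_whitespace() {
        match token.strip_prefix('-') {
            Some(word) if !word.is_empty() => negative.push(word.to_string()),
            _ => positive.push(token.to_string()),
        }
    }
    (positive, negative)
}

fn main() {
    let (positive, negative) = split_negative_keywords("spider -man");
    assert_eq!(positive, ["spider"]);
    assert_eq!(negative, ["man"]);
}

In the real parser, "-man" arrives as a "-" separator token followed by a word token, which is why the diff threads a negative_next_token flag across loop iterations rather than matching a string prefix.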
@@ -209,6 +209,20 @@ fn resolve_universe(
     )
 }
 
+#[tracing::instrument(level = "trace", skip_all, target = "search")]
+fn resolve_negative_words(
+    ctx: &mut SearchContext,
+    negative_words: &[Word],
+) -> Result<RoaringBitmap> {
+    let mut negative_bitmap = RoaringBitmap::new();
+    for &word in negative_words {
+        if let Some(bitmap) = ctx.word_docids(word)? {
+            negative_bitmap |= bitmap;
+        }
+    }
+    Ok(negative_bitmap)
+}
+
 /// Return the list of initialised ranking rules to be used for a placeholder search.
 fn get_ranking_rules_for_placeholder_search<'ctx>(
     ctx: &SearchContext<'ctx>,
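resolve_negative_words is a plain union: each negative word's word_docids bitmap is ORed into an accumulator, and words with no posting list are skipped. A self-contained sketch of the same set algebra with the roaring crate, using made-up document ids:

use roaring::RoaringBitmap;

fn main() {
    // Hypothetical posting lists for two negative words; `None` stands for a
    // word that is absent from the index and therefore skipped.
    let posting_lists = [
        Some(RoaringBitmap::from_iter([2u32, 4, 7])),
        None,
        Some(RoaringBitmap::from_iter([4u32, 9])),
    ];

    // Same accumulation as `resolve_negative_words`: OR every existing
    // posting list into one bitmap of documents to ignore.
    let mut negative_bitmap = RoaringBitmap::new();
    for bitmap in posting_lists.into_iter().flatten() {
        negative_bitmap |= bitmap;
    }

    assert_eq!(negative_bitmap.iter().collect::<Vec<_>>(), vec![2, 4, 7, 9]);
}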
@@ -620,7 +634,12 @@ pub fn execute_search(
         let tokens = tokenizer.tokenize(query);
         drop(entered);
 
-        let query_terms = located_query_terms_from_tokens(ctx, tokens, words_limit)?;
+        let (query_terms, negative_words) =
+            located_query_terms_from_tokens(ctx, tokens, words_limit)?;
+
+        let ignored_documents = resolve_negative_words(ctx, &negative_words)?;
+        universe -= ignored_documents;
+
         if query_terms.is_empty() {
             // Do a placeholder search instead
             None
@@ -630,6 +649,7 @@ pub fn execute_search(
     } else {
         None
     };
+
     let bucket_sort_output = if let Some(query_terms) = query_terms {
         let (graph, new_located_query_terms) = QueryGraph::from_query(ctx, &query_terms)?;
         located_query_terms = Some(new_located_query_terms);
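Two details of the wiring matter here: universe must be mutable at this point for -= to compile (RoaringBitmap implements in-place set difference via SubAssign), and the subtraction runs before the query_terms.is_empty() branch, so a query consisting only of negative words reaches the placeholder path with an already-filtered universe. A minimal sketch of that ordering, with made-up document ids:

use roaring::RoaringBitmap;

fn main() {
    let mut universe = RoaringBitmap::from_iter(0u32..5);
    // Pretend resolve_negative_words returned docs 1 and 3.
    let ignored_documents = RoaringBitmap::from_iter([1u32, 3]);

    // Filter first...
    universe -= ignored_documents;

    // ...then branch: a query like "-man" leaves no positive terms, yet the
    // placeholder path still sees the filtered universe.
    let query_terms: Vec<String> = Vec::new();
    if query_terms.is_empty() {
        assert_eq!(universe.iter().collect::<Vec<_>>(), vec![0, 2, 4]);
    }
}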
@@ -6,6 +6,7 @@ use charabia::{SeparatorKind, TokenKind};
 use super::compute_derivations::partially_initialized_term_from_word;
 use super::{LocatedQueryTerm, ZeroTypoTerm};
 use crate::search::new::query_term::{Lazy, Phrase, QueryTerm};
+use crate::search::new::Word;
 use crate::{Result, SearchContext, MAX_WORD_LENGTH};
 
 /// Convert the tokenised search query into a list of located query terms.
@@ -14,12 +15,14 @@ pub fn located_query_terms_from_tokens(
     ctx: &mut SearchContext,
     query: NormalizedTokenIter,
     words_limit: Option<usize>,
-) -> Result<Vec<LocatedQueryTerm>> {
+) -> Result<(Vec<LocatedQueryTerm>, Vec<Word>)> {
     let nbr_typos = number_of_typos_allowed(ctx)?;
 
     let mut located_terms = Vec::new();
 
     let mut phrase: Option<PhraseBuilder> = None;
+    let mut negative_next_token = false;
+    let mut negative_words = Vec::new();
 
     let parts_limit = words_limit.unwrap_or(usize::MAX);
 
@@ -33,7 +36,7 @@ pub fn located_query_terms_from_tokens(
         }
         // early return if word limit is exceeded
         if located_terms.len() >= parts_limit {
-            return Ok(located_terms);
+            return Ok((located_terms, negative_words));
         }
 
         match token.kind {
@@ -46,6 +49,11 @@ pub fn located_query_terms_from_tokens(
                 // 3. if the word is the last token of the query we push it as a prefix word.
                 if let Some(phrase) = &mut phrase {
                     phrase.push_word(ctx, &token, position)
+                } else if negative_next_token {
+                    let word = token.lemma().to_string();
+                    let word = Word::Original(ctx.word_interner.insert(word));
+                    negative_words.push(word);
+                    negative_next_token = false;
                 } else if peekable.peek().is_some() {
                     match token.kind {
                         TokenKind::Word => {
@@ -63,7 +71,7 @@ pub fn located_query_terms_from_tokens(
                             };
                             located_terms.push(located_term);
                         }
-                        TokenKind::StopWord | TokenKind::Separator(_) | TokenKind::Unknown => {}
+                        TokenKind::StopWord | TokenKind::Separator(_) | TokenKind::Unknown => (),
                     }
                 } else {
                     let word = token.lemma();
@@ -122,6 +130,10 @@ pub fn located_query_terms_from_tokens(
                     // Start new phrase if the token ends with an opening quote
                     (quote_count % 2 == 1).then_some(PhraseBuilder::empty())
                 };
+
+                if phrase.is_none() && token.lemma() == "-" {
+                    negative_next_token = true;
+                }
             }
             _ => (),
         }
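Detection of a negative word spans two loop iterations: a separator whose lemma is exactly "-" (outside any open phrase) arms negative_next_token, and the next word token is interned and routed into negative_words instead of the query terms. A standalone sketch of that flag-based state machine over a pre-tokenized stream (toy token type, not charabia's):

#[derive(Clone, Copy)]
enum Kind {
    Word,
    Separator,
}

fn main() {
    // charabia would emit "spider -man" roughly as: a word, a "-" separator,
    // then another word.
    let tokens = [("spider", Kind::Word), ("-", Kind::Separator), ("man", Kind::Word)];

    let mut terms = Vec::new();
    let mut negative_words = Vec::new();
    let mut negative_next_token = false;

    for (lemma, kind) in tokens {
        match kind {
            Kind::Word if negative_next_token => {
                // The diff interns the lemma at this point:
                // Word::Original(ctx.word_interner.insert(word)).
                negative_words.push(lemma.to_string());
                negative_next_token = false;
            }
            Kind::Word => terms.push(lemma.to_string()),
            // The real check also requires that no phrase is open:
            // `phrase.is_none() && token.lemma() == "-"`.
            Kind::Separator if lemma == "-" => negative_next_token = true,
            Kind::Separator => {}
        }
    }

    assert_eq!(terms, ["spider"]);
    assert_eq!(negative_words, ["man"]);
}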
@@ -134,7 +146,7 @@ pub fn located_query_terms_from_tokens(
         }
     }
 
-    Ok(located_terms)
+    Ok((located_terms, negative_words))
 }
 
 pub fn number_of_typos_allowed<'ctx>(
@@ -317,6 +329,7 @@ mod tests {
         // panics with `attempt to add with overflow` before <https://github.com/meilisearch/meilisearch/issues/3785>
         let located_query_terms = located_query_terms_from_tokens(&mut ctx, tokens, None)?;
         assert!(located_query_terms.is_empty());
+
         Ok(())
     }
 }