Expose a first working version of the negative-keyword search feature

This commit is contained in:
Clément Renault 2024-03-26 17:31:56 +01:00
parent 5ea017b922
commit e4a3e603b3
No known key found for this signature in database
GPG Key ID: F250A4C4E3AE5F5F
2 changed files with 38 additions and 5 deletions

View File

@ -209,6 +209,20 @@ fn resolve_universe(
) )
} }
#[tracing::instrument(level = "trace", skip_all, target = "search")]
/// Compute the set of documents to exclude from the search results.
///
/// Returns the union of the docids associated with each negated word.
/// Words that do not appear in the index have no docids and therefore
/// contribute nothing to the resulting bitmap; any lookup error is
/// propagated to the caller.
fn resolve_negative_words(
    ctx: &mut SearchContext,
    negative_words: &[Word],
) -> Result<RoaringBitmap> {
    negative_words.iter().try_fold(RoaringBitmap::new(), |mut excluded, &word| {
        // A `None` here simply means the word is unknown to the index.
        if let Some(docids) = ctx.word_docids(word)? {
            excluded |= docids;
        }
        Ok(excluded)
    })
}
/// Return the list of initialised ranking rules to be used for a placeholder search. /// Return the list of initialised ranking rules to be used for a placeholder search.
fn get_ranking_rules_for_placeholder_search<'ctx>( fn get_ranking_rules_for_placeholder_search<'ctx>(
ctx: &SearchContext<'ctx>, ctx: &SearchContext<'ctx>,
@ -620,7 +634,12 @@ pub fn execute_search(
let tokens = tokenizer.tokenize(query); let tokens = tokenizer.tokenize(query);
drop(entered); drop(entered);
let query_terms = located_query_terms_from_tokens(ctx, tokens, words_limit)?; let (query_terms, negative_words) =
located_query_terms_from_tokens(ctx, tokens, words_limit)?;
let ignored_documents = resolve_negative_words(ctx, &negative_words)?;
universe -= ignored_documents;
if query_terms.is_empty() { if query_terms.is_empty() {
// Do a placeholder search instead // Do a placeholder search instead
None None
@ -630,6 +649,7 @@ pub fn execute_search(
} else { } else {
None None
}; };
let bucket_sort_output = if let Some(query_terms) = query_terms { let bucket_sort_output = if let Some(query_terms) = query_terms {
let (graph, new_located_query_terms) = QueryGraph::from_query(ctx, &query_terms)?; let (graph, new_located_query_terms) = QueryGraph::from_query(ctx, &query_terms)?;
located_query_terms = Some(new_located_query_terms); located_query_terms = Some(new_located_query_terms);

View File

@ -6,6 +6,7 @@ use charabia::{SeparatorKind, TokenKind};
use super::compute_derivations::partially_initialized_term_from_word; use super::compute_derivations::partially_initialized_term_from_word;
use super::{LocatedQueryTerm, ZeroTypoTerm}; use super::{LocatedQueryTerm, ZeroTypoTerm};
use crate::search::new::query_term::{Lazy, Phrase, QueryTerm}; use crate::search::new::query_term::{Lazy, Phrase, QueryTerm};
use crate::search::new::Word;
use crate::{Result, SearchContext, MAX_WORD_LENGTH}; use crate::{Result, SearchContext, MAX_WORD_LENGTH};
/// Convert the tokenised search query into a list of located query terms. /// Convert the tokenised search query into a list of located query terms.
@ -14,12 +15,14 @@ pub fn located_query_terms_from_tokens(
ctx: &mut SearchContext, ctx: &mut SearchContext,
query: NormalizedTokenIter, query: NormalizedTokenIter,
words_limit: Option<usize>, words_limit: Option<usize>,
) -> Result<Vec<LocatedQueryTerm>> { ) -> Result<(Vec<LocatedQueryTerm>, Vec<Word>)> {
let nbr_typos = number_of_typos_allowed(ctx)?; let nbr_typos = number_of_typos_allowed(ctx)?;
let mut located_terms = Vec::new(); let mut located_terms = Vec::new();
let mut phrase: Option<PhraseBuilder> = None; let mut phrase: Option<PhraseBuilder> = None;
let mut negative_next_token = false;
let mut negative_words = Vec::new();
let parts_limit = words_limit.unwrap_or(usize::MAX); let parts_limit = words_limit.unwrap_or(usize::MAX);
@ -33,7 +36,7 @@ pub fn located_query_terms_from_tokens(
} }
// early return if word limit is exceeded // early return if word limit is exceeded
if located_terms.len() >= parts_limit { if located_terms.len() >= parts_limit {
return Ok(located_terms); return Ok((located_terms, negative_words));
} }
match token.kind { match token.kind {
@ -46,6 +49,11 @@ pub fn located_query_terms_from_tokens(
// 3. if the word is the last token of the query we push it as a prefix word. // 3. if the word is the last token of the query we push it as a prefix word.
if let Some(phrase) = &mut phrase { if let Some(phrase) = &mut phrase {
phrase.push_word(ctx, &token, position) phrase.push_word(ctx, &token, position)
} else if negative_next_token {
let word = token.lemma().to_string();
let word = Word::Original(ctx.word_interner.insert(word));
negative_words.push(word);
negative_next_token = false;
} else if peekable.peek().is_some() { } else if peekable.peek().is_some() {
match token.kind { match token.kind {
TokenKind::Word => { TokenKind::Word => {
@ -63,7 +71,7 @@ pub fn located_query_terms_from_tokens(
}; };
located_terms.push(located_term); located_terms.push(located_term);
} }
TokenKind::StopWord | TokenKind::Separator(_) | TokenKind::Unknown => {} TokenKind::StopWord | TokenKind::Separator(_) | TokenKind::Unknown => (),
} }
} else { } else {
let word = token.lemma(); let word = token.lemma();
@ -122,6 +130,10 @@ pub fn located_query_terms_from_tokens(
// Start new phrase if the token ends with an opening quote // Start new phrase if the token ends with an opening quote
(quote_count % 2 == 1).then_some(PhraseBuilder::empty()) (quote_count % 2 == 1).then_some(PhraseBuilder::empty())
}; };
if phrase.is_none() && token.lemma() == "-" {
negative_next_token = true;
}
} }
_ => (), _ => (),
} }
@ -134,7 +146,7 @@ pub fn located_query_terms_from_tokens(
} }
} }
Ok(located_terms) Ok((located_terms, negative_words))
} }
pub fn number_of_typos_allowed<'ctx>( pub fn number_of_typos_allowed<'ctx>(
@ -317,6 +329,7 @@ mod tests {
// panics with `attempt to add with overflow` before <https://github.com/meilisearch/meilisearch/issues/3785> // panics with `attempt to add with overflow` before <https://github.com/meilisearch/meilisearch/issues/3785>
let located_query_terms = located_query_terms_from_tokens(&mut ctx, tokens, None)?; let located_query_terms = located_query_terms_from_tokens(&mut ctx, tokens, None)?;
assert!(located_query_terms.is_empty()); assert!(located_query_terms.is_empty());
Ok(()) Ok(())
} }
} }