From 12fb509d8470e6d0c3a424756c9838a1efe306d2 Mon Sep 17 00:00:00 2001
From: tamo
Date: Wed, 31 Mar 2021 14:41:22 +0200
Subject: [PATCH] Integrate the stop_words in the querytree

remove the stop_words from the querytree except if it was a prefix or a typo
---
 milli/src/search/query_tree.rs | 60 ++++++++++++++++++++++++----------
 1 file changed, 42 insertions(+), 18 deletions(-)

diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs
index f7367d826..fb5b5b87c 100644
--- a/milli/src/search/query_tree.rs
+++ b/milli/src/search/query_tree.rs
@@ -1,6 +1,7 @@
 use std::collections::HashSet;
 use std::{fmt, cmp, mem};
 
+use fst::Set;
 use levenshtein_automata::{DFA, Distance};
 use meilisearch_tokenizer::{TokenKind, tokenizer::TokenStream};
 use roaring::RoaringBitmap;
@@ -154,6 +155,10 @@ impl fmt::Debug for Query {
 
 trait Context {
     fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
+    fn stop_words(&self) -> anyhow::Result<Option<Set<&[u8]>>>;
+    fn is_stop_word(&self, word: &str) -> anyhow::Result<bool> {
+        Ok(self.stop_words()?.map_or(false, |s| s.contains(word)))
+    }
     fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>>;
     fn word_documents_count(&self, word: &str) -> heed::Result<Option<u64>> {
         match self.word_docids(word)? {
@@ -183,6 +188,10 @@ impl<'a> Context for QueryTreeBuilder<'a> {
     fn synonyms<S: AsRef<str>>(&self, _words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>> {
         Ok(None)
     }
+
+    fn stop_words(&self) -> anyhow::Result<Option<Set<&[u8]>>> {
+        self.index.stop_words(self.rtxn)
+    }
 }
 
 impl<'a> QueryTreeBuilder<'a> {
@@ -331,8 +340,7 @@ fn create_query_tree(
     optional_words: bool,
     authorize_typos: bool,
     query: PrimitiveQuery,
-) -> anyhow::Result<Operation>
-{
+) -> anyhow::Result<Operation> {
     /// Matches on the `PrimitiveQueryPart` and create an operation from it.
     fn resolve_primitive_part(
         ctx: &impl Context,
@@ -350,7 +358,12 @@ fn create_query_tree(
                 if let Some(child) = split_best_frequency(ctx, &word)? {
                     children.push(child);
                 }
-                children.push(Operation::Query(Query { prefix, kind: typos(word, authorize_typos) }));
+
+                let is_stop_word = ctx.is_stop_word(&word)?;
+                let query = Query { prefix, kind: typos(word, authorize_typos) };
+                if query.prefix || query.kind.is_tolerant() || !is_stop_word {
+                    children.push(Operation::Query(query));
+                }
                 Ok(Operation::or(false, children))
             },
             // create a CONSECUTIVE operation wrapping all word in the phrase
@@ -365,12 +378,11 @@ fn create_query_tree(
         ctx: &impl Context,
         authorize_typos: bool,
         query: &[PrimitiveQueryPart],
-    ) -> anyhow::Result<Operation>
-    {
+    ) -> anyhow::Result<Operation> {
         const MAX_NGRAM: usize = 3;
         let mut op_children = Vec::new();
 
-        for sub_query in query.linear_group_by(|a, b| !(a.is_phrase() || b.is_phrase()) ) {
+        for sub_query in query.linear_group_by(|a, b| !(a.is_phrase() || b.is_phrase())) {
             let mut or_op_children = Vec::new();
 
             for ngram in 1..=MAX_NGRAM.min(sub_query.len()) {
@@ -381,23 +393,31 @@ fn create_query_tree(
 
                     match group {
                         [part] => {
-                            let operation = resolve_primitive_part(ctx, authorize_typos, part.clone())?;
+                            let operation =
+                                resolve_primitive_part(ctx, authorize_typos, part.clone())?;
                             and_op_children.push(operation);
-                        },
+                        }
                         words => {
-                            let is_prefix = words.last().map(|part| part.is_prefix()).unwrap_or(false);
-                            let words: Vec<_> = words.iter().filter_map(| part| {
-                                if let PrimitiveQueryPart::Word(word, _) = part {
-                                    Some(word.as_str())
-                                } else {
-                                    None
-                                }
-                            }).collect();
+                            let is_prefix = words.last().map_or(false, |part| part.is_prefix());
+                            let words: Vec<_> = words
+                                .iter()
+                                .filter_map(|part| {
+                                    if let PrimitiveQueryPart::Word(word, _) = part {
+                                        Some(word.as_str())
+                                    } else {
+                                        None
+                                    }
+                                })
+                                .collect();
                             let mut operations = synonyms(ctx, &words)?.unwrap_or_default();
                             let concat = words.concat();
+
+                            let is_stop_word = ctx.is_stop_word(&concat)?;
                             let query = Query { prefix: is_prefix, kind: typos(concat, authorize_typos) };
-                            operations.push(Operation::Query(query));
-                            and_op_children.push(Operation::or(false, operations));
+                            if query.prefix || query.kind.is_tolerant() || !is_stop_word {
+                                operations.push(Operation::Query(query));
+                                and_op_children.push(Operation::or(false, operations));
+                            }
                         }
                     }
 
@@ -581,6 +601,10 @@ mod test {
             let words: Vec<_> = words.iter().map(|s| s.as_ref().to_owned()).collect();
             Ok(self.synonyms.get(&words).cloned())
         }
+
+        fn stop_words(&self) -> anyhow::Result<Option<Set<&[u8]>>> {
+            Ok(None)
+        }
     }
 
     impl Default for TestContext {