From da036dcc3e63203c1a19d7ade7cf0ec0a4ebc598 Mon Sep 17 00:00:00 2001 From: tamo Date: Thu, 8 Apr 2021 15:12:37 +0200 Subject: [PATCH 1/2] Revert "Integrate the stop_words in the querytree" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 12fb509d8470e6d0c3a424756c9838a1efe306d2. We revert this commit because it's causing the bug #150. The initial algorithm we implemented for the stop_words was: 1. remove the stop_words from the dataset 2. keep the stop_words in the query to see if we can generate new words by integrating typos or if the word was a prefix => This was causing the bug since, in the case of “The hobbit”, we were **always** looking for something starting with “t he” or “th e” instead of ignoring the word completely. For now we are going to fix the bug by completely ignoring the stop_words in the query. This could cause another problem were someone mistyped a normal word and ended up typing a stop_word. For example imagine someone searching for the music “Won't he do it”. If that person misplace one space and write “Won' the do it” then we will loose a part of the request. One fix would be to update our query tree to something like that: --------------------- OR OR TOLERANT hobbit # the first option is to ignore the stop_word AND CONSECUTIVE # the second option is to do as we are doing EXACT t # currently EXACT he TOLERANT hobbit --------------------- This would increase drastically the size of our query tree on request with a lot of stop_words. For example think of “The Lord Of The Rings”. For now whatsoever we decided we were going to ignore this problem and consider that it doesn't reduce too much the relevancy of the search to do that while it improves the performances. --- milli/src/search/query_tree.rs | 60 ++++++++++------------------------ 1 file changed, 18 insertions(+), 42 deletions(-) diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index fb5b5b87c..f7367d826 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -1,7 +1,6 @@ use std::collections::HashSet; use std::{fmt, cmp, mem}; -use fst::Set; use levenshtein_automata::{DFA, Distance}; use meilisearch_tokenizer::{TokenKind, tokenizer::TokenStream}; use roaring::RoaringBitmap; @@ -155,10 +154,6 @@ impl fmt::Debug for Query { trait Context { fn word_docids(&self, word: &str) -> heed::Result>; - fn stop_words(&self) -> anyhow::Result>>; - fn is_stop_word(&self, word: &str) -> anyhow::Result { - Ok(self.stop_words()?.map_or(false, |s| s.contains(word))) - } fn synonyms>(&self, words: &[S]) -> heed::Result>>>; fn word_documents_count(&self, word: &str) -> heed::Result> { match self.word_docids(word)? { @@ -188,10 +183,6 @@ impl<'a> Context for QueryTreeBuilder<'a> { fn synonyms>(&self, _words: &[S]) -> heed::Result>>> { Ok(None) } - - fn stop_words(&self) -> anyhow::Result>> { - self.index.stop_words(self.rtxn) - } } impl<'a> QueryTreeBuilder<'a> { @@ -340,7 +331,8 @@ fn create_query_tree( optional_words: bool, authorize_typos: bool, query: PrimitiveQuery, -) -> anyhow::Result { +) -> anyhow::Result +{ /// Matches on the `PrimitiveQueryPart` and create an operation from it. fn resolve_primitive_part( ctx: &impl Context, @@ -358,12 +350,7 @@ fn create_query_tree( if let Some(child) = split_best_frequency(ctx, &word)? { children.push(child); } - - let is_stop_word = ctx.is_stop_word(&word)?; - let query = Query { prefix, kind: typos(word, authorize_typos) }; - if query.prefix || query.kind.is_tolerant() || !is_stop_word { - children.push(Operation::Query(query)); - } + children.push(Operation::Query(Query { prefix, kind: typos(word, authorize_typos) })); Ok(Operation::or(false, children)) }, // create a CONSECUTIVE operation wrapping all word in the phrase @@ -378,11 +365,12 @@ fn create_query_tree( ctx: &impl Context, authorize_typos: bool, query: &[PrimitiveQueryPart], - ) -> anyhow::Result { + ) -> anyhow::Result + { const MAX_NGRAM: usize = 3; let mut op_children = Vec::new(); - for sub_query in query.linear_group_by(|a, b| !(a.is_phrase() || b.is_phrase())) { + for sub_query in query.linear_group_by(|a, b| !(a.is_phrase() || b.is_phrase()) ) { let mut or_op_children = Vec::new(); for ngram in 1..=MAX_NGRAM.min(sub_query.len()) { @@ -393,31 +381,23 @@ fn create_query_tree( match group { [part] => { - let operation = - resolve_primitive_part(ctx, authorize_typos, part.clone())?; + let operation = resolve_primitive_part(ctx, authorize_typos, part.clone())?; and_op_children.push(operation); - } + }, words => { - let is_prefix = words.last().map_or(false, |part| part.is_prefix()); - let words: Vec<_> = words - .iter() - .filter_map(|part| { - if let PrimitiveQueryPart::Word(word, _) = part { - Some(word.as_str()) - } else { - None - } - }) - .collect(); + let is_prefix = words.last().map(|part| part.is_prefix()).unwrap_or(false); + let words: Vec<_> = words.iter().filter_map(| part| { + if let PrimitiveQueryPart::Word(word, _) = part { + Some(word.as_str()) + } else { + None + } + }).collect(); let mut operations = synonyms(ctx, &words)?.unwrap_or_default(); let concat = words.concat(); - - let is_stop_word = ctx.is_stop_word(&concat)?; let query = Query { prefix: is_prefix, kind: typos(concat, authorize_typos) }; - if query.prefix || query.kind.is_tolerant() || !is_stop_word { - operations.push(Operation::Query(query)); - and_op_children.push(Operation::or(false, operations)); - } + operations.push(Operation::Query(query)); + and_op_children.push(Operation::or(false, operations)); } } @@ -601,10 +581,6 @@ mod test { let words: Vec<_> = words.iter().map(|s| s.as_ref().to_owned()).collect(); Ok(self.synonyms.get(&words).cloned()) } - - fn stop_words(&self) -> anyhow::Result>> { - Ok(None) - } } impl Default for TestContext { From dcb00b2e54480a199a4d60ccef31d4c3d021af6b Mon Sep 17 00:00:00 2001 From: tamo Date: Thu, 8 Apr 2021 21:21:20 +0200 Subject: [PATCH 2/2] test a new implementation of the stop_words --- milli/src/search/mod.rs | 9 ++++++++- milli/src/search/query_tree.rs | 22 +++++++++++++--------- milli/src/update/settings.rs | 7 ++++--- 3 files changed, 25 insertions(+), 13 deletions(-) diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index c88800f38..a8cde213b 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -91,7 +91,14 @@ impl<'a> Search<'a> { let mut builder = QueryTreeBuilder::new(self.rtxn, self.index); builder.optional_words(self.optional_words); builder.authorize_typos(self.authorize_typos); - let analyzer = Analyzer::>::new(AnalyzerConfig::default()); + // We make sure that the analyzer is aware of the stop words + // this ensures that the query builder is able to properly remove them. + let mut config = AnalyzerConfig::default(); + let stop_words = self.index.stop_words(self.rtxn)?; + if let Some(ref stop_words) = stop_words { + config.stop_words(stop_words); + } + let analyzer = Analyzer::new(config); let result = analyzer.analyze(query); let tokens = result.tokens(); builder.build(tokens)? diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index f7367d826..1941f0c6f 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -1,6 +1,7 @@ use std::collections::HashSet; use std::{fmt, cmp, mem}; +use fst::Set; use levenshtein_automata::{DFA, Distance}; use meilisearch_tokenizer::{TokenKind, tokenizer::TokenStream}; use roaring::RoaringBitmap; @@ -220,7 +221,8 @@ impl<'a> QueryTreeBuilder<'a> { /// forcing all query words to match documents without any typo /// (the criterion `typo` will be ignored) pub fn build(&self, query: TokenStream) -> anyhow::Result> { - let primitive_query = create_primitive_query(query); + let stop_words = self.index.stop_words(self.rtxn)?; + let primitive_query = create_primitive_query(query, stop_words); if !primitive_query.is_empty() { create_query_tree(self, self.optional_words, self.authorize_typos, primitive_query).map(Some) } else { @@ -370,7 +372,7 @@ fn create_query_tree( const MAX_NGRAM: usize = 3; let mut op_children = Vec::new(); - for sub_query in query.linear_group_by(|a, b| !(a.is_phrase() || b.is_phrase()) ) { + for sub_query in query.linear_group_by(|a, b| !(a.is_phrase() || b.is_phrase())) { let mut or_op_children = Vec::new(); for ngram in 1..=MAX_NGRAM.min(sub_query.len()) { @@ -385,8 +387,8 @@ fn create_query_tree( and_op_children.push(operation); }, words => { - let is_prefix = words.last().map(|part| part.is_prefix()).unwrap_or(false); - let words: Vec<_> = words.iter().filter_map(| part| { + let is_prefix = words.last().map_or(false, |part| part.is_prefix()); + let words: Vec<_> = words.iter().filter_map(|part| { if let PrimitiveQueryPart::Word(word, _) = part { Some(word.as_str()) } else { @@ -474,7 +476,7 @@ impl PrimitiveQueryPart { /// Create primitive query from tokenized query string, /// the primitive query is an intermediate state to build the query tree. -fn create_primitive_query(query: TokenStream) -> PrimitiveQuery { +fn create_primitive_query(query: TokenStream, stop_words: Option>) -> PrimitiveQuery { let mut primitive_query = Vec::new(); let mut phrase = Vec::new(); let mut quoted = false; @@ -482,14 +484,16 @@ fn create_primitive_query(query: TokenStream) -> PrimitiveQuery { let mut peekable = query.peekable(); while let Some(token) = peekable.next() { match token.kind { - TokenKind::Word => { + TokenKind::Word | TokenKind::StopWord => { // 1. if the word is quoted we push it in a phrase-buffer waiting for the ending quote, - // 2. if the word is not the last token of the query we push it as a non-prefix word, + // 2. if the word is not the last token of the query and is not a stop_word we push it as a non-prefix word, // 3. if the word is the last token of the query we push it as a prefix word. if quoted { phrase.push(token.word.to_string()); } else if peekable.peek().is_some() { - primitive_query.push(PrimitiveQueryPart::Word(token.word.to_string(), false)); + if !stop_words.as_ref().map_or(false, |swords| swords.contains(token.word.as_ref())) { + primitive_query.push(PrimitiveQueryPart::Word(token.word.to_string(), false)); + } } else { primitive_query.push(PrimitiveQueryPart::Word(token.word.to_string(), true)); } @@ -563,7 +567,7 @@ mod test { query: TokenStream, ) -> anyhow::Result> { - let primitive_query = create_primitive_query(query); + let primitive_query = create_primitive_query(query, None); if !primitive_query.is_empty() { create_query_tree(self, optional_words, authorize_typos, primitive_query).map(Some) } else { diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 5ad942ad6..a858aa1a9 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -602,12 +602,13 @@ mod tests { assert_eq!(stop_words.as_fst().as_bytes(), expected.as_fst().as_bytes()); // when we search for something that is a non prefix stop_words it should be ignored + // thus we should get a placeholder search (all the results = 3) let result = index.search(&rtxn).query("the ").execute().unwrap(); - assert!(result.documents_ids.is_empty()); + assert_eq!(result.documents_ids.len(), 3); let result = index.search(&rtxn).query("i ").execute().unwrap(); - assert!(result.documents_ids.is_empty()); + assert_eq!(result.documents_ids.len(), 3); let result = index.search(&rtxn).query("are ").execute().unwrap(); - assert!(result.documents_ids.is_empty()); + assert_eq!(result.documents_ids.len(), 3); let result = index.search(&rtxn).query("dog").execute().unwrap(); assert_eq!(result.documents_ids.len(), 2); // we have two maxims talking about doggos