Revert "Integrate the stop_words in the querytree"
This reverts commit 12fb509d84.
We revert this commit because it was causing bug #150.
The initial algorithm we implemented for the stop_words was:
1. remove the stop_words from the dataset
2. keep the stop_words in the query, to see if we could generate new words by
   integrating typos or if the word was a prefix

=> This was causing the bug: in the case of “The hobbit”, we were
**always** looking for something starting with “t he” or “th e”
instead of ignoring the word completely.
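
To make the failure mode concrete, here is a minimal sketch (our illustration, not the actual meilisearch code) of the kind of word splitting that step 2 performs on a stop_word kept in the query:
---------------------
// Hypothetical helper: enumerate every two-way split of a word, the way
// the query tree explores "generated" words for typo/prefix matching.
fn splits(word: &str) -> Vec<(&str, &str)> {
    (1..word.len()).map(|i| word.split_at(i)).collect()
}

fn main() {
    // For “The hobbit”, the kept stop_word “the” yields exactly the two
    // pairs mentioned above, so the engine was always searching for
    // “t he” or “th e” instead of skipping the word.
    assert_eq!(splits("the"), vec![("t", "he"), ("th", "e")]);
}
---------------------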
For now we are going to fix the bug by completely ignoring the
stop_words in the query.
This could cause another problem, where someone mistypes a normal word and
ends up typing a stop_word.
For example, imagine someone searching for the song “Won't he do it”.
If that person misplaces one space and writes “Won' the do it”, then we
will lose part of the request.
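
To make this concrete, here is a minimal sketch of that stop_word filtering, assuming a plain HashSet of stop words (the real engine stores them in an fst::Set, as the diff below shows), and of the information it loses on the mistyped query:
---------------------
use std::collections::HashSet;

// Hypothetical filter: drop every stop word from the query terms.
fn strip_stop_words<'a>(query: &[&'a str], stop_words: &HashSet<&str>) -> Vec<&'a str> {
    query.iter().copied().filter(|w| !stop_words.contains(w)).collect()
}

fn main() {
    let stop_words: HashSet<&str> = ["the", "of"].into_iter().collect();
    // “Won't he do it” mistyped as “Won' the do it”: the accidental
    // stop word “the” is dropped and the “he” part of the request is lost.
    let query = ["won'", "the", "do", "it"];
    assert_eq!(strip_stop_words(&query, &stop_words), vec!["won'", "do", "it"]);
}
---------------------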
One fix would be to update our query tree to something like this:
---------------------
OR
  OR
    TOLERANT hobbit   # the first option is to ignore the stop_word
    AND
      CONSECUTIVE     # the second option is to do as we are doing
        EXACT t       # currently
        EXACT he
      TOLERANT hobbit
---------------------
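
As a hypothetical sketch, such a tree could be built with a simplified stand-in for the real `Operation` type used by `create_query_tree` below (the enum here is illustrative, not the actual API):
---------------------
// Illustrative stand-in for the query tree's Operation type.
#[derive(Debug)]
enum Op {
    Or(Vec<Op>),
    And(Vec<Op>),
    Consecutive(Vec<Op>),
    Exact(&'static str),
    Tolerant(&'static str),
}

fn main() {
    let tree = Op::Or(vec![
        // first option: ignore the stop_word entirely
        Op::Tolerant("hobbit"),
        // second option: keep the “t he” split, as we do currently
        Op::And(vec![
            Op::Consecutive(vec![Op::Exact("t"), Op::Exact("he")]),
            Op::Tolerant("hobbit"),
        ]),
    ]);
    println!("{:#?}", tree);
}
---------------------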
This would drastically increase the size of our query tree on requests
with a lot of stop_words; for example, think of “The Lord Of The Rings”.
For now, however, we decided to ignore this problem, considering that
ignoring the stop_words doesn't reduce the relevancy of the search too
much, while it does improve performance.
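
As a back-of-the-envelope model (assuming each stop_word occurrence contributes one two-way OR like the tree above, which is our reading, not a measured figure):
---------------------
// Hypothetical growth model: one two-way OR per stop_word occurrence.
fn branch_count(stop_word_occurrences: u32) -> u64 {
    2u64.pow(stop_word_occurrences)
}

fn main() {
    // “The Lord Of The Rings” has three stop_word occurrences
    // (“The”, “Of”, “The”), so the tree would carry 2^3 = 8 variants.
    assert_eq!(branch_count(3), 8);
}
---------------------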
parent f9eab6e0de
commit da036dcc3e
@@ -1,7 +1,6 @@
 use std::collections::HashSet;
 use std::{fmt, cmp, mem};
 
-use fst::Set;
 use levenshtein_automata::{DFA, Distance};
 use meilisearch_tokenizer::{TokenKind, tokenizer::TokenStream};
 use roaring::RoaringBitmap;
@@ -155,10 +154,6 @@ impl fmt::Debug for Query {
 
 trait Context {
     fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
-    fn stop_words(&self) -> anyhow::Result<Option<Set<&[u8]>>>;
-    fn is_stop_word(&self, word: &str) -> anyhow::Result<bool> {
-        Ok(self.stop_words()?.map_or(false, |s| s.contains(word)))
-    }
     fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>>;
     fn word_documents_count(&self, word: &str) -> heed::Result<Option<u64>> {
         match self.word_docids(word)? {
@@ -188,10 +183,6 @@ impl<'a> Context for QueryTreeBuilder<'a> {
     fn synonyms<S: AsRef<str>>(&self, _words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>> {
         Ok(None)
     }
-
-    fn stop_words(&self) -> anyhow::Result<Option<Set<&[u8]>>> {
-        self.index.stop_words(self.rtxn)
-    }
 }
 
 impl<'a> QueryTreeBuilder<'a> {
@@ -340,7 +331,8 @@ fn create_query_tree(
     optional_words: bool,
     authorize_typos: bool,
     query: PrimitiveQuery,
-) -> anyhow::Result<Operation> {
+) -> anyhow::Result<Operation>
+{
     /// Matches on the `PrimitiveQueryPart` and create an operation from it.
     fn resolve_primitive_part(
         ctx: &impl Context,
@@ -358,12 +350,7 @@ fn create_query_tree(
                 if let Some(child) = split_best_frequency(ctx, &word)? {
                     children.push(child);
                 }
-
-                let is_stop_word = ctx.is_stop_word(&word)?;
-                let query = Query { prefix, kind: typos(word, authorize_typos) };
-                if query.prefix || query.kind.is_tolerant() || !is_stop_word {
-                    children.push(Operation::Query(query));
-                }
+                children.push(Operation::Query(Query { prefix, kind: typos(word, authorize_typos) }));
                 Ok(Operation::or(false, children))
             },
             // create a CONSECUTIVE operation wrapping all word in the phrase
@@ -378,11 +365,12 @@ fn create_query_tree(
         ctx: &impl Context,
         authorize_typos: bool,
         query: &[PrimitiveQueryPart],
-    ) -> anyhow::Result<Operation> {
+    ) -> anyhow::Result<Operation>
+    {
         const MAX_NGRAM: usize = 3;
         let mut op_children = Vec::new();
 
-        for sub_query in query.linear_group_by(|a, b| !(a.is_phrase() || b.is_phrase())) {
+        for sub_query in query.linear_group_by(|a, b| !(a.is_phrase() || b.is_phrase()) ) {
             let mut or_op_children = Vec::new();
 
             for ngram in 1..=MAX_NGRAM.min(sub_query.len()) {
@@ -393,33 +381,25 @@ fn create_query_tree(
 
                 match group {
                     [part] => {
-                        let operation =
-                            resolve_primitive_part(ctx, authorize_typos, part.clone())?;
+                        let operation = resolve_primitive_part(ctx, authorize_typos, part.clone())?;
                         and_op_children.push(operation);
-                    }
+                    },
                     words => {
-                        let is_prefix = words.last().map_or(false, |part| part.is_prefix());
-                        let words: Vec<_> = words
-                            .iter()
-                            .filter_map(|part| {
+                        let is_prefix = words.last().map(|part| part.is_prefix()).unwrap_or(false);
+                        let words: Vec<_> = words.iter().filter_map(| part| {
                             if let PrimitiveQueryPart::Word(word, _) = part {
                                 Some(word.as_str())
                             } else {
                                 None
                             }
-                            })
-                            .collect();
+                        }).collect();
                         let mut operations = synonyms(ctx, &words)?.unwrap_or_default();
                         let concat = words.concat();
-
-                        let is_stop_word = ctx.is_stop_word(&concat)?;
                         let query = Query { prefix: is_prefix, kind: typos(concat, authorize_typos) };
-                        if query.prefix || query.kind.is_tolerant() || !is_stop_word {
                             operations.push(Operation::Query(query));
-                        }
                         and_op_children.push(Operation::or(false, operations));
                     }
                 }
             }
 
             if !is_last {
                 let ngrams = ngrams(ctx, authorize_typos, tail)?;
@@ -601,10 +581,6 @@ mod test {
         let words: Vec<_> = words.iter().map(|s| s.as_ref().to_owned()).collect();
         Ok(self.synonyms.get(&words).cloned())
     }
-
-    fn stop_words(&self) -> anyhow::Result<Option<Set<&[u8]>>> {
-        Ok(None)
-    }
 }
 
 impl Default for TestContext {