mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-11-27 04:25:06 +08:00
Merge pull request #160 from shekhirin/query-words-limit
Support query words limit
This commit is contained in:
commit
28a8df2f0a
@ -1,25 +1,27 @@
|
|||||||
use std::borrow::Cow;
|
use std::borrow::Cow;
|
||||||
use std::collections::hash_map::{HashMap, Entry};
|
use std::collections::hash_map::{Entry, HashMap};
|
||||||
use std::fmt;
|
use std::fmt;
|
||||||
use std::mem::take;
|
use std::mem::take;
|
||||||
use std::str::Utf8Error;
|
use std::str::Utf8Error;
|
||||||
use std::time::Instant;
|
use std::time::Instant;
|
||||||
|
|
||||||
use fst::{IntoStreamer, Streamer};
|
use fst::{IntoStreamer, Streamer};
|
||||||
use levenshtein_automata::{DFA, LevenshteinAutomatonBuilder as LevBuilder};
|
use levenshtein_automata::{LevenshteinAutomatonBuilder as LevBuilder, DFA};
|
||||||
use log::debug;
|
use log::debug;
|
||||||
use meilisearch_tokenizer::{AnalyzerConfig, Analyzer};
|
use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
|
||||||
use once_cell::sync::Lazy;
|
use once_cell::sync::Lazy;
|
||||||
use roaring::bitmap::RoaringBitmap;
|
use roaring::bitmap::RoaringBitmap;
|
||||||
|
|
||||||
use crate::search::criteria::fetcher::{FetcherResult, Fetcher};
|
use distinct::{Distinct, DocIter, FacetDistinct, MapDistinct, NoopDistinct};
|
||||||
use crate::{Index, DocumentId};
|
|
||||||
use distinct::{MapDistinct, FacetDistinct, Distinct, DocIter, NoopDistinct};
|
|
||||||
use self::query_tree::QueryTreeBuilder;
|
|
||||||
|
|
||||||
pub use self::facet::FacetIter;
|
use crate::search::criteria::fetcher::{Fetcher, FetcherResult};
|
||||||
pub use self::facet::{FacetCondition, FacetDistribution, FacetNumberOperator, FacetStringOperator};
|
use crate::{DocumentId, Index};
|
||||||
|
|
||||||
|
pub use self::facet::{
|
||||||
|
FacetCondition, FacetDistribution, FacetIter, FacetNumberOperator, FacetStringOperator,
|
||||||
|
};
|
||||||
pub use self::query_tree::MatchingWords;
|
pub use self::query_tree::MatchingWords;
|
||||||
|
use self::query_tree::QueryTreeBuilder;
|
||||||
|
|
||||||
// Building these factories is not free.
|
// Building these factories is not free.
|
||||||
static LEVDIST0: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(0, true));
|
static LEVDIST0: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(0, true));
|
||||||
@ -38,6 +40,7 @@ pub struct Search<'a> {
|
|||||||
limit: usize,
|
limit: usize,
|
||||||
optional_words: bool,
|
optional_words: bool,
|
||||||
authorize_typos: bool,
|
authorize_typos: bool,
|
||||||
|
words_limit: usize,
|
||||||
rtxn: &'a heed::RoTxn<'a>,
|
rtxn: &'a heed::RoTxn<'a>,
|
||||||
index: &'a Index,
|
index: &'a Index,
|
||||||
}
|
}
|
||||||
@ -51,6 +54,7 @@ impl<'a> Search<'a> {
|
|||||||
limit: 20,
|
limit: 20,
|
||||||
optional_words: true,
|
optional_words: true,
|
||||||
authorize_typos: true,
|
authorize_typos: true,
|
||||||
|
words_limit: 10,
|
||||||
rtxn,
|
rtxn,
|
||||||
index,
|
index,
|
||||||
}
|
}
|
||||||
@ -81,6 +85,11 @@ impl<'a> Search<'a> {
|
|||||||
self
|
self
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Sets the maximum number of query words and phrases kept when building the query tree.
///
/// The stored value is later passed to `QueryTreeBuilder::words_limit`.
/// Returns `self` so that setter calls can be chained.
pub fn words_limit(&mut self, value: usize) -> &mut Search<'a> {
|
||||||
|
self.words_limit = value;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
pub fn facet_condition(&mut self, condition: FacetCondition) -> &mut Search<'a> {
|
pub fn facet_condition(&mut self, condition: FacetCondition) -> &mut Search<'a> {
|
||||||
self.facet_condition = Some(condition);
|
self.facet_condition = Some(condition);
|
||||||
self
|
self
|
||||||
@ -94,6 +103,7 @@ impl<'a> Search<'a> {
|
|||||||
let mut builder = QueryTreeBuilder::new(self.rtxn, self.index);
|
let mut builder = QueryTreeBuilder::new(self.rtxn, self.index);
|
||||||
builder.optional_words(self.optional_words);
|
builder.optional_words(self.optional_words);
|
||||||
builder.authorize_typos(self.authorize_typos);
|
builder.authorize_typos(self.authorize_typos);
|
||||||
|
builder.words_limit(self.words_limit);
|
||||||
// We make sure that the analyzer is aware of the stop words
|
// We make sure that the analyzer is aware of the stop words
|
||||||
// this ensures that the query builder is able to properly remove them.
|
// this ensures that the query builder is able to properly remove them.
|
||||||
let mut config = AnalyzerConfig::default();
|
let mut config = AnalyzerConfig::default();
|
||||||
@ -154,14 +164,12 @@ impl<'a> Search<'a> {
|
|||||||
matching_words: MatchingWords,
|
matching_words: MatchingWords,
|
||||||
mut criteria: Fetcher,
|
mut criteria: Fetcher,
|
||||||
) -> anyhow::Result<SearchResult> {
|
) -> anyhow::Result<SearchResult> {
|
||||||
|
|
||||||
let mut offset = self.offset;
|
let mut offset = self.offset;
|
||||||
let mut initial_candidates = RoaringBitmap::new();
|
let mut initial_candidates = RoaringBitmap::new();
|
||||||
let mut excluded_documents = RoaringBitmap::new();
|
let mut excluded_documents = RoaringBitmap::new();
|
||||||
let mut documents_ids = Vec::with_capacity(self.limit);
|
let mut documents_ids = Vec::with_capacity(self.limit);
|
||||||
|
|
||||||
while let Some(FetcherResult { candidates, bucket_candidates, .. }) = criteria.next()? {
|
while let Some(FetcherResult { candidates, bucket_candidates, .. }) = criteria.next()? {
|
||||||
|
|
||||||
debug!("Number of candidates found {}", candidates.len());
|
debug!("Number of candidates found {}", candidates.len());
|
||||||
|
|
||||||
let excluded = take(&mut excluded_documents);
|
let excluded = take(&mut excluded_documents);
|
||||||
@ -195,6 +203,7 @@ impl fmt::Debug for Search<'_> {
|
|||||||
limit,
|
limit,
|
||||||
optional_words,
|
optional_words,
|
||||||
authorize_typos,
|
authorize_typos,
|
||||||
|
words_limit,
|
||||||
rtxn: _,
|
rtxn: _,
|
||||||
index: _,
|
index: _,
|
||||||
} = self;
|
} = self;
|
||||||
@ -205,6 +214,7 @@ impl fmt::Debug for Search<'_> {
|
|||||||
.field("limit", limit)
|
.field("limit", limit)
|
||||||
.field("optional_words", optional_words)
|
.field("optional_words", optional_words)
|
||||||
.field("authorize_typos", authorize_typos)
|
.field("authorize_typos", authorize_typos)
|
||||||
|
.field("words_limit", words_limit)
|
||||||
.finish()
|
.finish()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -225,8 +235,7 @@ pub fn word_derivations<'c>(
|
|||||||
max_typo: u8,
|
max_typo: u8,
|
||||||
fst: &fst::Set<Cow<[u8]>>,
|
fst: &fst::Set<Cow<[u8]>>,
|
||||||
cache: &'c mut WordDerivationsCache,
|
cache: &'c mut WordDerivationsCache,
|
||||||
) -> Result<&'c [(String, u8)], Utf8Error>
|
) -> Result<&'c [(String, u8)], Utf8Error> {
|
||||||
{
|
|
||||||
match cache.entry((word.to_string(), is_prefix, max_typo)) {
|
match cache.entry((word.to_string(), is_prefix, max_typo)) {
|
||||||
Entry::Occupied(entry) => Ok(entry.into_mut()),
|
Entry::Occupied(entry) => Ok(entry.into_mut()),
|
||||||
Entry::Vacant(entry) => {
|
Entry::Vacant(entry) => {
|
||||||
|
@ -170,6 +170,7 @@ pub struct QueryTreeBuilder<'a> {
|
|||||||
index: &'a Index,
|
index: &'a Index,
|
||||||
optional_words: bool,
|
optional_words: bool,
|
||||||
authorize_typos: bool,
|
authorize_typos: bool,
|
||||||
|
words_limit: Option<usize>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> Context for QueryTreeBuilder<'a> {
|
impl<'a> Context for QueryTreeBuilder<'a> {
|
||||||
@ -190,7 +191,7 @@ impl<'a> QueryTreeBuilder<'a> {
|
|||||||
/// Create a `QueryTreeBuilder` from a heed ReadOnly transaction `rtxn`
|
/// Create a `QueryTreeBuilder` from a heed ReadOnly transaction `rtxn`
|
||||||
/// and an Index `index`.
|
/// and an Index `index`.
|
||||||
pub fn new(rtxn: &'a heed::RoTxn<'a>, index: &'a Index) -> Self {
|
pub fn new(rtxn: &'a heed::RoTxn<'a>, index: &'a Index) -> Self {
|
||||||
Self { rtxn, index, optional_words: true, authorize_typos: true }
|
Self { rtxn, index, optional_words: true, authorize_typos: true, words_limit: None }
|
||||||
}
|
}
|
||||||
|
|
||||||
/// if `optional_words` is set to `false` the query tree will be
|
/// if `optional_words` is set to `false` the query tree will be
|
||||||
@ -213,6 +214,13 @@ impl<'a> QueryTreeBuilder<'a> {
|
|||||||
self
|
self
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Limit the number of words and phrases that will be taken for query building.
|
||||||
|
/// Any word or phrase beyond `words_limit` will be ignored.
/// When this method is never called the limit stays `None` and no limit is applied.
|
||||||
|
pub fn words_limit(&mut self, words_limit: usize) -> &mut Self {
|
||||||
|
self.words_limit = Some(words_limit);
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
/// Build the query tree:
|
/// Build the query tree:
|
||||||
/// - if `optional_words` is set to `false` the query tree will be
|
/// - if `optional_words` is set to `false` the query tree will be
|
||||||
/// generated forcing all query words to be present in each matching documents
|
/// generated forcing all query words to be present in each matching documents
|
||||||
@ -222,7 +230,7 @@ impl<'a> QueryTreeBuilder<'a> {
|
|||||||
/// (the criterion `typo` will be ignored)
|
/// (the criterion `typo` will be ignored)
|
||||||
pub fn build(&self, query: TokenStream) -> anyhow::Result<Option<Operation>> {
|
pub fn build(&self, query: TokenStream) -> anyhow::Result<Option<Operation>> {
|
||||||
let stop_words = self.index.stop_words(self.rtxn)?;
|
let stop_words = self.index.stop_words(self.rtxn)?;
|
||||||
let primitive_query = create_primitive_query(query, stop_words);
|
let primitive_query = create_primitive_query(query, stop_words, self.words_limit);
|
||||||
if !primitive_query.is_empty() {
|
if !primitive_query.is_empty() {
|
||||||
create_query_tree(self, self.optional_words, self.authorize_typos, primitive_query).map(Some)
|
create_query_tree(self, self.optional_words, self.authorize_typos, primitive_query).map(Some)
|
||||||
} else {
|
} else {
|
||||||
@ -476,13 +484,18 @@ impl PrimitiveQueryPart {
|
|||||||
|
|
||||||
/// Create primitive query from tokenized query string,
|
/// Create primitive query from tokenized query string,
|
||||||
/// the primitive query is an intermediate state to build the query tree.
|
/// the primitive query is an intermediate state to build the query tree.
|
||||||
fn create_primitive_query(query: TokenStream, stop_words: Option<Set<&[u8]>>) -> PrimitiveQuery {
|
fn create_primitive_query(query: TokenStream, stop_words: Option<Set<&[u8]>>, words_limit: Option<usize>) -> PrimitiveQuery {
|
||||||
let mut primitive_query = Vec::new();
|
let mut primitive_query = Vec::new();
|
||||||
let mut phrase = Vec::new();
|
let mut phrase = Vec::new();
|
||||||
let mut quoted = false;
|
let mut quoted = false;
|
||||||
|
|
||||||
|
let parts_limit = words_limit.unwrap_or(usize::MAX);
|
||||||
|
|
||||||
let mut peekable = query.peekable();
|
let mut peekable = query.peekable();
|
||||||
while let Some(token) = peekable.next() {
|
while let Some(token) = peekable.next() {
|
||||||
|
// early return once the words limit has been reached
|
||||||
|
if primitive_query.len() >= parts_limit { return primitive_query }
|
||||||
|
|
||||||
match token.kind {
|
match token.kind {
|
||||||
TokenKind::Word | TokenKind::StopWord => {
|
TokenKind::Word | TokenKind::StopWord => {
|
||||||
// 1. if the word is quoted we push it in a phrase-buffer waiting for the ending quote,
|
// 1. if the word is quoted we push it in a phrase-buffer waiting for the ending quote,
|
||||||
@ -564,10 +577,11 @@ mod test {
|
|||||||
&self,
|
&self,
|
||||||
optional_words: bool,
|
optional_words: bool,
|
||||||
authorize_typos: bool,
|
authorize_typos: bool,
|
||||||
|
words_limit: Option<usize>,
|
||||||
query: TokenStream,
|
query: TokenStream,
|
||||||
) -> anyhow::Result<Option<Operation>>
|
) -> anyhow::Result<Option<Operation>>
|
||||||
{
|
{
|
||||||
let primitive_query = create_primitive_query(query, None);
|
let primitive_query = create_primitive_query(query, None, words_limit);
|
||||||
if !primitive_query.is_empty() {
|
if !primitive_query.is_empty() {
|
||||||
create_query_tree(self, optional_words, authorize_typos, primitive_query).map(Some)
|
create_query_tree(self, optional_words, authorize_typos, primitive_query).map(Some)
|
||||||
} else {
|
} else {
|
||||||
@ -660,7 +674,7 @@ mod test {
|
|||||||
Operation::Query(Query { prefix: true, kind: QueryKind::tolerant(2, "heyfriends".to_string()) }),
|
Operation::Query(Query { prefix: true, kind: QueryKind::tolerant(2, "heyfriends".to_string()) }),
|
||||||
]);
|
]);
|
||||||
|
|
||||||
let query_tree = TestContext::default().build(false, true, tokens).unwrap().unwrap();
|
let query_tree = TestContext::default().build(false, true, None, tokens).unwrap().unwrap();
|
||||||
|
|
||||||
assert_eq!(expected, query_tree);
|
assert_eq!(expected, query_tree);
|
||||||
}
|
}
|
||||||
@ -680,7 +694,7 @@ mod test {
|
|||||||
Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "heyfriends".to_string()) }),
|
Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "heyfriends".to_string()) }),
|
||||||
]);
|
]);
|
||||||
|
|
||||||
let query_tree = TestContext::default().build(false, true, tokens).unwrap().unwrap();
|
let query_tree = TestContext::default().build(false, true, None, tokens).unwrap().unwrap();
|
||||||
|
|
||||||
assert_eq!(expected, query_tree);
|
assert_eq!(expected, query_tree);
|
||||||
}
|
}
|
||||||
@ -711,7 +725,7 @@ mod test {
|
|||||||
Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "helloworld".to_string()) }),
|
Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "helloworld".to_string()) }),
|
||||||
]);
|
]);
|
||||||
|
|
||||||
let query_tree = TestContext::default().build(false, true, tokens).unwrap().unwrap();
|
let query_tree = TestContext::default().build(false, true, None, tokens).unwrap().unwrap();
|
||||||
|
|
||||||
assert_eq!(expected, query_tree);
|
assert_eq!(expected, query_tree);
|
||||||
}
|
}
|
||||||
@ -756,7 +770,7 @@ mod test {
|
|||||||
]),
|
]),
|
||||||
]);
|
]);
|
||||||
|
|
||||||
let query_tree = TestContext::default().build(false, true, tokens).unwrap().unwrap();
|
let query_tree = TestContext::default().build(false, true, None, tokens).unwrap().unwrap();
|
||||||
|
|
||||||
assert_eq!(expected, query_tree);
|
assert_eq!(expected, query_tree);
|
||||||
}
|
}
|
||||||
@ -776,7 +790,7 @@ mod test {
|
|||||||
Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "ngrams".to_string()) }),
|
Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "ngrams".to_string()) }),
|
||||||
]);
|
]);
|
||||||
|
|
||||||
let query_tree = TestContext::default().build(false, true, tokens).unwrap().unwrap();
|
let query_tree = TestContext::default().build(false, true, None, tokens).unwrap().unwrap();
|
||||||
|
|
||||||
assert_eq!(expected, query_tree);
|
assert_eq!(expected, query_tree);
|
||||||
}
|
}
|
||||||
@ -802,7 +816,7 @@ mod test {
|
|||||||
Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "wordsplitfish".to_string()) }),
|
Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "wordsplitfish".to_string()) }),
|
||||||
]);
|
]);
|
||||||
|
|
||||||
let query_tree = TestContext::default().build(false, true, tokens).unwrap().unwrap();
|
let query_tree = TestContext::default().build(false, true, None, tokens).unwrap().unwrap();
|
||||||
|
|
||||||
assert_eq!(expected, query_tree);
|
assert_eq!(expected, query_tree);
|
||||||
}
|
}
|
||||||
@ -822,7 +836,7 @@ mod test {
|
|||||||
Operation::Query(Query { prefix: false, kind: QueryKind::exact("wooop".to_string()) }),
|
Operation::Query(Query { prefix: false, kind: QueryKind::exact("wooop".to_string()) }),
|
||||||
]);
|
]);
|
||||||
|
|
||||||
let query_tree = TestContext::default().build(false, true, tokens).unwrap().unwrap();
|
let query_tree = TestContext::default().build(false, true, None, tokens).unwrap().unwrap();
|
||||||
|
|
||||||
assert_eq!(expected, query_tree);
|
assert_eq!(expected, query_tree);
|
||||||
}
|
}
|
||||||
@ -861,7 +875,7 @@ mod test {
|
|||||||
Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "heymyfriend".to_string()) }),
|
Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "heymyfriend".to_string()) }),
|
||||||
]),
|
]),
|
||||||
]);
|
]);
|
||||||
let query_tree = TestContext::default().build(true, true, tokens).unwrap().unwrap();
|
let query_tree = TestContext::default().build(true, true, None, tokens).unwrap().unwrap();
|
||||||
|
|
||||||
assert_eq!(expected, query_tree);
|
assert_eq!(expected, query_tree);
|
||||||
}
|
}
|
||||||
@ -877,7 +891,7 @@ mod test {
|
|||||||
Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }),
|
Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }),
|
||||||
Operation::Query(Query { prefix: false, kind: QueryKind::exact("my".to_string()) }),
|
Operation::Query(Query { prefix: false, kind: QueryKind::exact("my".to_string()) }),
|
||||||
]);
|
]);
|
||||||
let query_tree = TestContext::default().build(true, true, tokens).unwrap().unwrap();
|
let query_tree = TestContext::default().build(true, true, None, tokens).unwrap().unwrap();
|
||||||
|
|
||||||
assert_eq!(expected, query_tree);
|
assert_eq!(expected, query_tree);
|
||||||
}
|
}
|
||||||
@ -911,7 +925,7 @@ mod test {
|
|||||||
Operation::Query(Query { prefix: false, kind: QueryKind::exact("friend".to_string()) }),
|
Operation::Query(Query { prefix: false, kind: QueryKind::exact("friend".to_string()) }),
|
||||||
]),
|
]),
|
||||||
]);
|
]);
|
||||||
let query_tree = TestContext::default().build(true, true, tokens).unwrap().unwrap();
|
let query_tree = TestContext::default().build(true, true, None, tokens).unwrap().unwrap();
|
||||||
|
|
||||||
assert_eq!(expected, query_tree);
|
assert_eq!(expected, query_tree);
|
||||||
}
|
}
|
||||||
@ -930,7 +944,7 @@ mod test {
|
|||||||
]),
|
]),
|
||||||
Operation::Query(Query { prefix: false, kind: QueryKind::exact("heyfriends".to_string()) }),
|
Operation::Query(Query { prefix: false, kind: QueryKind::exact("heyfriends".to_string()) }),
|
||||||
]);
|
]);
|
||||||
let query_tree = TestContext::default().build(false, false, tokens).unwrap().unwrap();
|
let query_tree = TestContext::default().build(false, false, None, tokens).unwrap().unwrap();
|
||||||
|
|
||||||
assert_eq!(expected, query_tree);
|
assert_eq!(expected, query_tree);
|
||||||
}
|
}
|
||||||
@ -943,7 +957,7 @@ mod test {
|
|||||||
let tokens = result.tokens();
|
let tokens = result.tokens();
|
||||||
|
|
||||||
let context = TestContext::default();
|
let context = TestContext::default();
|
||||||
let query_tree = context.build(false, true, tokens).unwrap().unwrap();
|
let query_tree = context.build(false, true, None, tokens).unwrap().unwrap();
|
||||||
|
|
||||||
let expected = hashset!{
|
let expected = hashset!{
|
||||||
("word", 0, false),
|
("word", 0, false),
|
||||||
@ -967,4 +981,24 @@ mod test {
|
|||||||
let words = fetch_queries(&query_tree);
|
let words = fetch_queries(&query_tree);
|
||||||
assert_eq!(expected, words);
|
assert_eq!(expected, words);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Checks that a words limit of 2 keeps only the first two parts of the query.
#[test]
|
||||||
|
fn words_limit() {
|
||||||
|
let query = "\"hey my\" good friend";
|
||||||
|
let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
|
||||||
|
let result = analyzer.analyze(query);
|
||||||
|
let tokens = result.tokens();
|
||||||
|
|
||||||
|
// The quoted phrase "hey my" counts as a single part and "good" is the second one,
// so the trailing word "friend" is expected to be left out of the tree.
let expected = Operation::And(vec![
|
||||||
|
Operation::Consecutive(vec![
|
||||||
|
Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }),
|
||||||
|
Operation::Query(Query { prefix: false, kind: QueryKind::exact("my".to_string()) }),
|
||||||
|
]),
|
||||||
|
Operation::Query(Query { prefix: false, kind: QueryKind::exact("good".to_string()) }),
|
||||||
|
]);
|
||||||
|
|
||||||
|
// Arguments: optional_words = false, authorize_typos = false, words_limit = Some(2).
let query_tree = TestContext::default().build(false, false, Some(2), tokens).unwrap().unwrap();
|
||||||
|
|
||||||
|
assert_eq!(expected, query_tree);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user