From 6fa00c61d2d3c2e24fa4d4e7e70046a34f251ab6 Mon Sep 17 00:00:00 2001 From: Alexey Shekhirin Date: Tue, 13 Apr 2021 20:10:58 +0300 Subject: [PATCH] feat(search): support words_limit --- milli/src/search/mod.rs | 35 +++++++++++------- milli/src/search/query_tree.rs | 66 +++++++++++++++++++++++++--------- 2 files changed, 72 insertions(+), 29 deletions(-) diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 7324ea72a..174fff35c 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -1,25 +1,27 @@ use std::borrow::Cow; -use std::collections::hash_map::{HashMap, Entry}; +use std::collections::hash_map::{Entry, HashMap}; use std::fmt; use std::mem::take; use std::str::Utf8Error; use std::time::Instant; use fst::{IntoStreamer, Streamer}; -use levenshtein_automata::{DFA, LevenshteinAutomatonBuilder as LevBuilder}; +use levenshtein_automata::{LevenshteinAutomatonBuilder as LevBuilder, DFA}; use log::debug; -use meilisearch_tokenizer::{AnalyzerConfig, Analyzer}; +use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; use once_cell::sync::Lazy; use roaring::bitmap::RoaringBitmap; -use crate::search::criteria::fetcher::{FetcherResult, Fetcher}; -use crate::{Index, DocumentId}; -use distinct::{MapDistinct, FacetDistinct, Distinct, DocIter, NoopDistinct}; -use self::query_tree::QueryTreeBuilder; +use distinct::{Distinct, DocIter, FacetDistinct, MapDistinct, NoopDistinct}; -pub use self::facet::FacetIter; -pub use self::facet::{FacetCondition, FacetDistribution, FacetNumberOperator, FacetStringOperator}; +use crate::search::criteria::fetcher::{Fetcher, FetcherResult}; +use crate::{DocumentId, Index}; + +pub use self::facet::{ + FacetCondition, FacetDistribution, FacetIter, FacetNumberOperator, FacetStringOperator, +}; pub use self::query_tree::MatchingWords; +use self::query_tree::QueryTreeBuilder; // Building these factories is not free. static LEVDIST0: Lazy = Lazy::new(|| LevBuilder::new(0, true)); @@ -38,6 +40,7 @@ pub struct Search<'a> { limit: usize, optional_words: bool, authorize_typos: bool, + words_limit: usize, rtxn: &'a heed::RoTxn<'a>, index: &'a Index, } @@ -51,6 +54,7 @@ impl<'a> Search<'a> { limit: 20, optional_words: true, authorize_typos: true, + words_limit: 10, rtxn, index, } @@ -81,6 +85,11 @@ impl<'a> Search<'a> { self } + pub fn words_limit(&mut self, value: usize) -> &mut Search<'a> { + self.words_limit = value; + self + } + pub fn facet_condition(&mut self, condition: FacetCondition) -> &mut Search<'a> { self.facet_condition = Some(condition); self @@ -94,6 +103,7 @@ impl<'a> Search<'a> { let mut builder = QueryTreeBuilder::new(self.rtxn, self.index); builder.optional_words(self.optional_words); builder.authorize_typos(self.authorize_typos); + builder.words_limit(self.words_limit); // We make sure that the analyzer is aware of the stop words // this ensures that the query builder is able to properly remove them. let mut config = AnalyzerConfig::default(); @@ -154,14 +164,12 @@ impl<'a> Search<'a> { matching_words: MatchingWords, mut criteria: Fetcher, ) -> anyhow::Result { - let mut offset = self.offset; let mut initial_candidates = RoaringBitmap::new(); let mut excluded_documents = RoaringBitmap::new(); let mut documents_ids = Vec::with_capacity(self.limit); while let Some(FetcherResult { candidates, bucket_candidates, .. }) = criteria.next()? { - debug!("Number of candidates found {}", candidates.len()); let excluded = take(&mut excluded_documents); @@ -195,6 +203,7 @@ impl fmt::Debug for Search<'_> { limit, optional_words, authorize_typos, + words_limit, rtxn: _, index: _, } = self; @@ -205,6 +214,7 @@ impl fmt::Debug for Search<'_> { .field("limit", limit) .field("optional_words", optional_words) .field("authorize_typos", authorize_typos) + .field("words_limit", words_limit) .finish() } } @@ -225,8 +235,7 @@ pub fn word_derivations<'c>( max_typo: u8, fst: &fst::Set>, cache: &'c mut WordDerivationsCache, -) -> Result<&'c [(String, u8)], Utf8Error> -{ +) -> Result<&'c [(String, u8)], Utf8Error> { match cache.entry((word.to_string(), is_prefix, max_typo)) { Entry::Occupied(entry) => Ok(entry.into_mut()), Entry::Vacant(entry) => { diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index d21227507..492b98a1e 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -170,6 +170,7 @@ pub struct QueryTreeBuilder<'a> { index: &'a Index, optional_words: bool, authorize_typos: bool, + words_limit: Option, } impl<'a> Context for QueryTreeBuilder<'a> { @@ -190,7 +191,7 @@ impl<'a> QueryTreeBuilder<'a> { /// Create a `QueryTreeBuilder` from a heed ReadOnly transaction `rtxn` /// and an Index `index`. pub fn new(rtxn: &'a heed::RoTxn<'a>, index: &'a Index) -> Self { - Self { rtxn, index, optional_words: true, authorize_typos: true } + Self { rtxn, index, optional_words: true, authorize_typos: true, words_limit: None } } /// if `optional_words` is set to `false` the query tree will be @@ -213,6 +214,13 @@ impl<'a> QueryTreeBuilder<'a> { self } + /// Limit words and phrases that will be taken for query building. + /// Any beyond `words_limit` will be ignored. + pub fn words_limit(&mut self, words_limit: usize) -> &mut Self { + self.words_limit = Some(words_limit); + self + } + /// Build the query tree: /// - if `optional_words` is set to `false` the query tree will be /// generated forcing all query words to be present in each matching documents @@ -222,7 +230,7 @@ impl<'a> QueryTreeBuilder<'a> { /// (the criterion `typo` will be ignored) pub fn build(&self, query: TokenStream) -> anyhow::Result> { let stop_words = self.index.stop_words(self.rtxn)?; - let primitive_query = create_primitive_query(query, stop_words); + let primitive_query = create_primitive_query(query, stop_words, self.words_limit); if !primitive_query.is_empty() { create_query_tree(self, self.optional_words, self.authorize_typos, primitive_query).map(Some) } else { @@ -476,13 +484,18 @@ impl PrimitiveQueryPart { /// Create primitive query from tokenized query string, /// the primitive query is an intermediate state to build the query tree. -fn create_primitive_query(query: TokenStream, stop_words: Option>) -> PrimitiveQuery { +fn create_primitive_query(query: TokenStream, stop_words: Option>, words_limit: Option) -> PrimitiveQuery { let mut primitive_query = Vec::new(); let mut phrase = Vec::new(); let mut quoted = false; + let parts_limit = words_limit.unwrap_or(usize::MAX); + let mut peekable = query.peekable(); while let Some(token) = peekable.next() { + // early return if word limit is exceeded + if primitive_query.len() >= parts_limit { return primitive_query } + match token.kind { TokenKind::Word | TokenKind::StopWord => { // 1. if the word is quoted we push it in a phrase-buffer waiting for the ending quote, @@ -564,10 +577,11 @@ mod test { &self, optional_words: bool, authorize_typos: bool, + words_limit: Option, query: TokenStream, ) -> anyhow::Result> { - let primitive_query = create_primitive_query(query, None); + let primitive_query = create_primitive_query(query, None, words_limit); if !primitive_query.is_empty() { create_query_tree(self, optional_words, authorize_typos, primitive_query).map(Some) } else { @@ -660,7 +674,7 @@ mod test { Operation::Query(Query { prefix: true, kind: QueryKind::tolerant(2, "heyfriends".to_string()) }), ]); - let query_tree = TestContext::default().build(false, true, tokens).unwrap().unwrap(); + let query_tree = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -680,7 +694,7 @@ mod test { Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "heyfriends".to_string()) }), ]); - let query_tree = TestContext::default().build(false, true, tokens).unwrap().unwrap(); + let query_tree = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -711,7 +725,7 @@ mod test { Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "helloworld".to_string()) }), ]); - let query_tree = TestContext::default().build(false, true, tokens).unwrap().unwrap(); + let query_tree = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -756,7 +770,7 @@ mod test { ]), ]); - let query_tree = TestContext::default().build(false, true, tokens).unwrap().unwrap(); + let query_tree = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -776,7 +790,7 @@ mod test { Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "ngrams".to_string()) }), ]); - let query_tree = TestContext::default().build(false, true, tokens).unwrap().unwrap(); + let query_tree = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -802,7 +816,7 @@ mod test { Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "wordsplitfish".to_string()) }), ]); - let query_tree = TestContext::default().build(false, true, tokens).unwrap().unwrap(); + let query_tree = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -822,7 +836,7 @@ mod test { Operation::Query(Query { prefix: false, kind: QueryKind::exact("wooop".to_string()) }), ]); - let query_tree = TestContext::default().build(false, true, tokens).unwrap().unwrap(); + let query_tree = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -861,7 +875,7 @@ mod test { Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "heymyfriend".to_string()) }), ]), ]); - let query_tree = TestContext::default().build(true, true, tokens).unwrap().unwrap(); + let query_tree = TestContext::default().build(true, true, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -877,7 +891,7 @@ mod test { Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }), Operation::Query(Query { prefix: false, kind: QueryKind::exact("my".to_string()) }), ]); - let query_tree = TestContext::default().build(true, true, tokens).unwrap().unwrap(); + let query_tree = TestContext::default().build(true, true, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -911,7 +925,7 @@ mod test { Operation::Query(Query { prefix: false, kind: QueryKind::exact("friend".to_string()) }), ]), ]); - let query_tree = TestContext::default().build(true, true, tokens).unwrap().unwrap(); + let query_tree = TestContext::default().build(true, true, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -930,7 +944,7 @@ mod test { ]), Operation::Query(Query { prefix: false, kind: QueryKind::exact("heyfriends".to_string()) }), ]); - let query_tree = TestContext::default().build(false, false, tokens).unwrap().unwrap(); + let query_tree = TestContext::default().build(false, false, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -943,7 +957,7 @@ mod test { let tokens = result.tokens(); let context = TestContext::default(); - let query_tree = context.build(false, true, tokens).unwrap().unwrap(); + let query_tree = context.build(false, true, None, tokens).unwrap().unwrap(); let expected = hashset!{ ("word", 0, false), @@ -967,4 +981,24 @@ mod test { let words = fetch_queries(&query_tree); assert_eq!(expected, words); } + + #[test] + fn words_limit() { + let query = "\"hey my\" good friend"; + let analyzer = Analyzer::new(AnalyzerConfig::>::default()); + let result = analyzer.analyze(query); + let tokens = result.tokens(); + + let expected = Operation::And(vec![ + Operation::Consecutive(vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }), + Operation::Query(Query { prefix: false, kind: QueryKind::exact("my".to_string()) }), + ]), + Operation::Query(Query { prefix: false, kind: QueryKind::exact("good".to_string()) }), + ]); + + let query_tree = TestContext::default().build(false, false, Some(2), tokens).unwrap().unwrap(); + + assert_eq!(expected, query_tree); + } }