From 5af63c74e04251461dd022836a3e4f38ca3df52d Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 24 Feb 2021 17:44:35 +0100 Subject: [PATCH] Speed-up the MatchingWords highlighting struct --- http-ui/src/main.rs | 18 ++--- milli/src/lib.rs | 2 +- milli/src/search/criteria/typo.rs | 2 +- milli/src/search/mod.rs | 51 +++++++++----- milli/src/search/query_tree.rs | 111 +++++++++++++----------------- 5 files changed, 91 insertions(+), 93 deletions(-) diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 2ce7f8bd1..86f965368 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -32,7 +32,7 @@ use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; use milli::facet::FacetValue; use milli::update::UpdateIndexingStep::*; use milli::update::{UpdateBuilder, IndexDocumentsMethod, UpdateFormat}; -use milli::{obkv_to_json, Index, UpdateStore, SearchResult, FacetCondition}; +use milli::{obkv_to_json, Index, UpdateStore, SearchResult, MatchingWords, FacetCondition}; static GLOBAL_THREAD_POOL: OnceCell = OnceCell::new(); @@ -132,7 +132,7 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> { Self { analyzer } } - fn highlight_value(&self, value: Value, words_to_highlight: &HashSet) -> Value { + fn highlight_value(&self, value: Value, matching_words: &MatchingWords) -> Value { match value { Value::Null => Value::Null, Value::Bool(boolean) => Value::Bool(boolean), @@ -142,7 +142,7 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> { let analyzed = self.analyzer.analyze(&old_string); for (word, token) in analyzed.reconstruct() { if token.is_word() { - let to_highlight = words_to_highlight.contains(token.text()); + let to_highlight = matching_words.matches(token.text()); if to_highlight { string.push_str("") } string.push_str(word); if to_highlight { string.push_str("") } @@ -154,12 +154,12 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> { }, Value::Array(values) => { Value::Array(values.into_iter() - .map(|v| self.highlight_value(v, words_to_highlight)) + .map(|v| self.highlight_value(v, matching_words)) .collect()) }, Value::Object(object) => { Value::Object(object.into_iter() - .map(|(k, v)| (k, self.highlight_value(v, words_to_highlight))) + .map(|(k, v)| (k, self.highlight_value(v, matching_words))) .collect()) }, } @@ -168,14 +168,14 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> { fn highlight_record( &self, object: &mut Map, - words_to_highlight: &HashSet, + matching_words: &MatchingWords, attributes_to_highlight: &HashSet, ) { // TODO do we need to create a string for element that are not and needs to be highlight? for (key, value) in object.iter_mut() { if attributes_to_highlight.contains(key) { let old_value = mem::take(value); - *value = self.highlight_value(old_value, words_to_highlight); + *value = self.highlight_value(old_value, matching_words); } } } @@ -722,7 +722,7 @@ async fn main() -> anyhow::Result<()> { search.facet_condition(condition); } - let SearchResult { found_words, candidates, documents_ids } = search.execute().unwrap(); + let SearchResult { matching_words, candidates, documents_ids } = search.execute().unwrap(); let number_of_candidates = candidates.len(); let facets = if query.facet_distribution == Some(true) { @@ -748,7 +748,7 @@ async fn main() -> anyhow::Result<()> { for (_id, obkv) in index.documents(&rtxn, documents_ids).unwrap() { let mut object = obkv_to_json(&displayed_fields, &fields_ids_map, obkv).unwrap(); if !disable_highlighting { - highlighter.highlight_record(&mut object, &found_words, &attributes_to_highlight); + highlighter.highlight_record(&mut object, &matching_words, &attributes_to_highlight); } documents.push(object); diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 0fa966ee8..75d6f9fb3 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -28,7 +28,7 @@ pub use self::heed_codec::{BEU32StrCodec, StrStrU8Codec, ObkvCodec}; pub use self::heed_codec::{RoaringBitmapCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec}; pub use self::heed_codec::{RoaringBitmapLenCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec}; pub use self::index::Index; -pub use self::search::{Search, FacetDistribution, FacetCondition, SearchResult}; +pub use self::search::{Search, FacetDistribution, FacetCondition, SearchResult, MatchingWords}; pub use self::update_store::UpdateStore; pub type FastMap4 = HashMap>; diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index a48b074cc..0b8111997 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -1,8 +1,8 @@ use std::{borrow::Cow, collections::HashMap, mem::take}; use anyhow::bail; -use roaring::RoaringBitmap; use log::debug; +use roaring::RoaringBitmap; use crate::search::query_tree::{maximum_typo, Operation, Query, QueryKind}; use crate::search::word_derivations; diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index f3d5af2da..dbb504368 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -1,10 +1,9 @@ use std::borrow::Cow; -use std::collections::HashSet; use std::fmt; use std::time::Instant; use fst::{IntoStreamer, Streamer, Set}; -use levenshtein_automata::LevenshteinAutomatonBuilder as LevBuilder; +use levenshtein_automata::{DFA, LevenshteinAutomatonBuilder as LevBuilder}; use log::debug; use meilisearch_tokenizer::{AnalyzerConfig, Analyzer}; use once_cell::sync::Lazy; @@ -14,8 +13,9 @@ use crate::search::criteria::{Criterion, CriterionResult}; use crate::search::criteria::{typo::Typo, words::Words, proximity::Proximity}; use crate::{Index, DocumentId}; -pub use self::facet::{FacetCondition, FacetDistribution, FacetNumberOperator, FacetStringOperator}; pub use self::facet::FacetIter; +pub use self::facet::{FacetCondition, FacetDistribution, FacetNumberOperator, FacetStringOperator}; +pub use self::query_tree::MatchingWords; use self::query_tree::QueryTreeBuilder; // Building these factories is not free. @@ -87,6 +87,11 @@ impl<'a> Search<'a> { debug!("facet candidates: {:?} took {:.02?}", facet_candidates, before.elapsed()); + let matching_words = match query_tree.as_ref() { + Some(query_tree) => MatchingWords::from_query_tree(&query_tree), + None => MatchingWords::default(), + }; + // We are testing the typo criteria but there will be more of them soon. let criteria_ctx = criteria::HeedContext::new(self.rtxn, self.index)?; let typo_criterion = Typo::initial(&criteria_ctx, query_tree, facet_candidates)?; @@ -128,8 +133,7 @@ impl<'a> Search<'a> { if limit == 0 { break } } - let found_words = HashSet::new(); - Ok(SearchResult { found_words, candidates: initial_candidates, documents_ids }) + Ok(SearchResult { matching_words, candidates: initial_candidates, documents_ids }) } } @@ -147,26 +151,21 @@ impl fmt::Debug for Search<'_> { #[derive(Default)] pub struct SearchResult { - pub found_words: HashSet, + pub matching_words: MatchingWords, pub candidates: RoaringBitmap, // TODO those documents ids should be associated with their criteria scores. pub documents_ids: Vec, } -pub fn word_derivations(word: &str, is_prefix: bool, max_typo: u8, fst: &fst::Set>) -> anyhow::Result> { - let lev = match max_typo { - 0 => &LEVDIST0, - 1 => &LEVDIST1, - _ => &LEVDIST2, - }; - - let dfa = if is_prefix { - lev.build_prefix_dfa(&word) - } else { - lev.build_dfa(&word) - }; - +pub fn word_derivations( + word: &str, + is_prefix: bool, + max_typo: u8, + fst: &fst::Set>, +) -> anyhow::Result> +{ let mut derived_words = Vec::new(); + let dfa = build_dfa(word, max_typo, is_prefix); let mut stream = fst.search_with_state(&dfa).into_stream(); while let Some((word, state)) = stream.next() { @@ -177,3 +176,17 @@ pub fn word_derivations(word: &str, is_prefix: bool, max_typo: u8, fst: &fst::Se Ok(derived_words) } + +pub fn build_dfa(word: &str, typos: u8, is_prefix: bool) -> DFA { + let lev = match typos { + 0 => &LEVDIST0, + 1 => &LEVDIST1, + _ => &LEVDIST2, + }; + + if is_prefix { + lev.build_prefix_dfa(word) + } else { + lev.build_dfa(word) + } +} diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 47057ad10..114032eb8 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -1,12 +1,13 @@ -use std::borrow::Cow; -use std::collections::BTreeMap; +use std::collections::HashSet; use std::{fmt, cmp, mem}; +use levenshtein_automata::{DFA, Distance}; use meilisearch_tokenizer::{TokenKind, tokenizer::TokenStream}; use roaring::RoaringBitmap; use slice_group_by::GroupBy; use crate::Index; +use super::build_dfa; type IsOptionalWord = bool; type IsPrefix = bool; @@ -113,6 +114,14 @@ impl QueryKind { QueryKind::Tolerant { typo, word } } + pub fn is_tolerant(&self) -> bool { + matches!(self, QueryKind::Tolerant { .. }) + } + + pub fn is_exact(&self) -> bool { + matches!(self, QueryKind::Exact { .. }) + } + pub fn typo(&self) -> u8 { match self { QueryKind::Tolerant { typo, .. } => *typo, @@ -275,69 +284,45 @@ fn synonyms(ctx: &impl Context, word: &[&str]) -> heed::Result + dfas: Vec<(DFA, u8)>, } impl MatchingWords { /// List all words which can be considered as a match for the query tree. - pub fn from_query_tree(tree: &Operation, fst: &fst::Set>) -> Self { - Self { inner: fetch_words(tree, fst).into_iter().collect() } + pub fn from_query_tree(tree: &Operation) -> Self { + Self { + dfas: fetch_queries(tree).into_iter().map(|(w, t, p)| (build_dfa(w, t, p), t)).collect() + } } /// Return true if the word match. - pub fn is_match(&self, word: &str) -> bool { - fn first_char(s: &str) -> Option<&str> { - s.chars().next().map(|c| &s[..c.len_utf8()]) - } - - match first_char(word) { - Some(first) => { - let left = first.to_owned(); - let right = word.to_owned(); - self.inner.range(left..=right).any(|(w, is_prefix)| *is_prefix || *w == word) - }, - None => false - } + pub fn matches(&self, word: &str) -> bool { + self.dfas.iter().any(|(dfa, typo)| match dfa.eval(word) { + Distance::Exact(t) => t <= *typo, + Distance::AtLeast(_) => false, + }) } } -type FetchedWords = Vec<(String, IsPrefix)>; - /// Lists all words which can be considered as a match for the query tree. -fn fetch_words(tree: &Operation, fst: &fst::Set>) -> FetchedWords { - fn resolve_branch(tree: &[Operation], fst: &fst::Set>) -> FetchedWords { - tree.iter().map(|op| resolve_ops(op, fst)).flatten().collect() - } - - fn resolve_query(query: &Query, fst: &fst::Set>) -> FetchedWords { - match query.kind.clone() { - QueryKind::Exact { word, .. } => vec![(word, query.prefix)], - QueryKind::Tolerant { typo, word } => { - if let Ok(words) = super::word_derivations(&word, query.prefix, typo, fst) { - words.into_iter().map(|(w, _)| (w, query.prefix)).collect() - } else { - vec![(word, query.prefix)] - } - } - } - } - - fn resolve_ops(tree: &Operation, fst: &fst::Set>) -> FetchedWords { +fn fetch_queries(tree: &Operation) -> HashSet<(&str, u8, IsPrefix)> { + fn resolve_ops<'a>(tree: &'a Operation, out: &mut HashSet<(&'a str, u8, IsPrefix)>) { match tree { Operation::Or(_, ops) | Operation::And(ops) | Operation::Consecutive(ops) => { - resolve_branch(ops.as_slice(), fst) + ops.as_slice().iter().for_each(|op| resolve_ops(op, out)); }, - Operation::Query(ops) => { - resolve_query(ops, fst) + Operation::Query(Query { prefix, kind }) => { + let typo = if kind.is_exact() { 0 } else { kind.typo() }; + out.insert((kind.word(), typo, *prefix)); }, } } - let mut words = resolve_ops(tree, fst); - words.sort_unstable(); - words.dedup(); - words + let mut queries = HashSet::new(); + resolve_ops(tree, &mut queries); + queries } /// Main function that creates the final query tree from the primitive query. @@ -559,7 +544,7 @@ mod test { use std::collections::HashMap; use fst::Set; - use maplit::hashmap; + use maplit::{hashmap, hashset}; use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; use rand::{Rng, SeedableRng, rngs::StdRng}; @@ -970,26 +955,26 @@ mod test { let context = TestContext::default(); let query_tree = context.build(false, true, tokens).unwrap().unwrap(); - let expected = vec![ - ("city".to_string(), false), - ("earth".to_string(), false), - ("nature".to_string(), false), - ("new".to_string(), false), - ("nyc".to_string(), false), - ("split".to_string(), false), - ("word".to_string(), false), - ("word".to_string(), true), - ("world".to_string(), true), - ("york".to_string(), false), - - ]; + let expected = hashset!{ + ("word", 0, false), + ("nyc", 0, false), + ("wordsplit", 2, false), + ("wordsplitnycworld", 2, true), + ("nature", 0, false), + ("new", 0, false), + ("city", 0, false), + ("world", 1, true), + ("york", 0, false), + ("split", 0, false), + ("nycworld", 1, true), + ("earth", 0, false), + ("wordsplitnyc", 2, false), + }; let mut keys = context.postings.keys().collect::>(); keys.sort_unstable(); - let set = fst::Set::from_iter(keys).unwrap().map_data(|v| Cow::Owned(v)).unwrap(); - - let words = fetch_words(&query_tree, &set); + let words = fetch_queries(&query_tree); assert_eq!(expected, words); } }