From 5af63c74e04251461dd022836a3e4f38ca3df52d Mon Sep 17 00:00:00 2001
From: Kerollmops <clement@meilisearch.com>
Date: Wed, 24 Feb 2021 17:44:35 +0100
Subject: [PATCH] Speed up the MatchingWords highlighting struct

---
 http-ui/src/main.rs               |  18 ++---
 milli/src/lib.rs                  |   2 +-
 milli/src/search/criteria/typo.rs |   2 +-
 milli/src/search/mod.rs           |  51 +++++++++-----
 milli/src/search/query_tree.rs    | 111 +++++++++++++-----------------
 5 files changed, 91 insertions(+), 93 deletions(-)
diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs
index 2ce7f8bd1..86f965368 100644
--- a/http-ui/src/main.rs
+++ b/http-ui/src/main.rs
@@ -32,7 +32,7 @@ use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
 use milli::facet::FacetValue;
 use milli::update::UpdateIndexingStep::*;
 use milli::update::{UpdateBuilder, IndexDocumentsMethod, UpdateFormat};
-use milli::{obkv_to_json, Index, UpdateStore, SearchResult, FacetCondition};
+use milli::{obkv_to_json, Index, UpdateStore, SearchResult, MatchingWords, FacetCondition};
 
 static GLOBAL_THREAD_POOL: OnceCell<ThreadPool> = OnceCell::new();
 
@@ -132,7 +132,7 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> {
         Self { analyzer }
     }
 
-    fn highlight_value(&self, value: Value, words_to_highlight: &HashSet<String>) -> Value {
+    fn highlight_value(&self, value: Value, matching_words: &MatchingWords) -> Value {
         match value {
             Value::Null => Value::Null,
             Value::Bool(boolean) => Value::Bool(boolean),
@@ -142,7 +142,7 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> {
                 let analyzed = self.analyzer.analyze(&old_string);
                 for (word, token) in analyzed.reconstruct() {
                     if token.is_word() {
-                        let to_highlight = words_to_highlight.contains(token.text());
+                        let to_highlight = matching_words.matches(token.text());
                         if to_highlight { string.push_str("<mark>") }
                         string.push_str(word);
                         if to_highlight { string.push_str("</mark>") }
@@ -154,12 +154,12 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> {
             },
             Value::Array(values) => {
                 Value::Array(values.into_iter()
-                    .map(|v| self.highlight_value(v, words_to_highlight))
+                    .map(|v| self.highlight_value(v, matching_words))
                     .collect())
             },
             Value::Object(object) => {
                 Value::Object(object.into_iter()
-                    .map(|(k, v)| (k, self.highlight_value(v, words_to_highlight)))
+                    .map(|(k, v)| (k, self.highlight_value(v, matching_words)))
                     .collect())
             },
         }
@@ -168,14 +168,14 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> {
     fn highlight_record(
         &self,
         object: &mut Map<String, Value>,
-        words_to_highlight: &HashSet<String>,
+        matching_words: &MatchingWords, // decides which tokens get wrapped in <mark>
         attributes_to_highlight: &HashSet<String>,
     ) {
         // TODO do we need to create a string for element that are not and needs to be highlight?
         for (key, value) in object.iter_mut() {
             if attributes_to_highlight.contains(key) {
                 let old_value = mem::take(value);
-                *value = self.highlight_value(old_value, words_to_highlight);
+                *value = self.highlight_value(old_value, matching_words); // rewritten in place
             }
         }
     }
@@ -722,7 +722,7 @@ async fn main() -> anyhow::Result<()> {
                 search.facet_condition(condition);
             }
 
-            let SearchResult { found_words, candidates, documents_ids } = search.execute().unwrap();
+            let SearchResult { matching_words, candidates, documents_ids } = search.execute().unwrap();
 
             let number_of_candidates = candidates.len();
             let facets = if query.facet_distribution == Some(true) {
@@ -748,7 +748,7 @@ async fn main() -> anyhow::Result<()> {
             for (_id, obkv) in index.documents(&rtxn, documents_ids).unwrap() {
                 let mut object = obkv_to_json(&displayed_fields, &fields_ids_map, obkv).unwrap();
                 if !disable_highlighting {
-                    highlighter.highlight_record(&mut object, &found_words, &attributes_to_highlight);
+                    highlighter.highlight_record(&mut object, &matching_words, &attributes_to_highlight);
                 }
 
                 documents.push(object);
diff --git a/milli/src/lib.rs b/milli/src/lib.rs
index 0fa966ee8..75d6f9fb3 100644
--- a/milli/src/lib.rs
+++ b/milli/src/lib.rs
@@ -28,7 +28,7 @@ pub use self::heed_codec::{BEU32StrCodec, StrStrU8Codec, ObkvCodec};
 pub use self::heed_codec::{RoaringBitmapCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec};
 pub use self::heed_codec::{RoaringBitmapLenCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec};
 pub use self::index::Index;
-pub use self::search::{Search, FacetDistribution, FacetCondition, SearchResult};
+pub use self::search::{Search, FacetDistribution, FacetCondition, SearchResult, MatchingWords};
 pub use self::update_store::UpdateStore;
 
 pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>;
diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs
index a48b074cc..0b8111997 100644
--- a/milli/src/search/criteria/typo.rs
+++ b/milli/src/search/criteria/typo.rs
@@ -1,8 +1,8 @@
 use std::{borrow::Cow, collections::HashMap, mem::take};
 
 use anyhow::bail;
-use roaring::RoaringBitmap;
 use log::debug;
+use roaring::RoaringBitmap;
 
 use crate::search::query_tree::{maximum_typo, Operation, Query, QueryKind};
 use crate::search::word_derivations;
diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs
index f3d5af2da..dbb504368 100644
--- a/milli/src/search/mod.rs
+++ b/milli/src/search/mod.rs
@@ -1,10 +1,9 @@
 use std::borrow::Cow;
-use std::collections::HashSet;
 use std::fmt;
 use std::time::Instant;
 
 use fst::{IntoStreamer, Streamer, Set};
-use levenshtein_automata::LevenshteinAutomatonBuilder as LevBuilder;
+use levenshtein_automata::{DFA, LevenshteinAutomatonBuilder as LevBuilder};
 use log::debug;
 use meilisearch_tokenizer::{AnalyzerConfig, Analyzer};
 use once_cell::sync::Lazy;
@@ -14,8 +13,9 @@ use crate::search::criteria::{Criterion, CriterionResult};
 use crate::search::criteria::{typo::Typo, words::Words, proximity::Proximity};
 use crate::{Index, DocumentId};
 
-pub use self::facet::{FacetCondition, FacetDistribution, FacetNumberOperator, FacetStringOperator};
 pub use self::facet::FacetIter;
+pub use self::facet::{FacetCondition, FacetDistribution, FacetNumberOperator, FacetStringOperator};
+pub use self::query_tree::MatchingWords;
 use self::query_tree::QueryTreeBuilder;
 
 // Building these factories is not free.
@@ -87,6 +87,11 @@ impl<'a> Search<'a> {
 
         debug!("facet candidates: {:?} took {:.02?}", facet_candidates, before.elapsed());
 
+        let matching_words = match query_tree.as_ref() { // borrow: the tree is reused below
+            Some(query_tree) => MatchingWords::from_query_tree(query_tree),
+            None => MatchingWords::default(), // no query tree: the default matches no word
+        };
+
         // We are testing the typo criteria but there will be more of them soon.
         let criteria_ctx = criteria::HeedContext::new(self.rtxn, self.index)?;
         let typo_criterion = Typo::initial(&criteria_ctx, query_tree, facet_candidates)?;
@@ -128,8 +133,7 @@ impl<'a> Search<'a> {
             if limit == 0 { break }
         }
 
-        let found_words = HashSet::new();
-        Ok(SearchResult { found_words, candidates: initial_candidates, documents_ids })
+        Ok(SearchResult { matching_words, candidates: initial_candidates, documents_ids })
     }
 }
 
@@ -147,26 +151,21 @@ impl fmt::Debug for Search<'_> {
 
 #[derive(Default)]
 pub struct SearchResult {
-    pub found_words: HashSet<String>,
+    pub matching_words: MatchingWords,
     pub candidates: RoaringBitmap,
     // TODO those documents ids should be associated with their criteria scores.
     pub documents_ids: Vec<DocumentId>,
 }
 
-pub fn word_derivations(word: &str, is_prefix: bool, max_typo: u8, fst: &fst::Set<Cow<[u8]>>) -> anyhow::Result<Vec<(String, u8)>> {
-    let lev = match max_typo {
-        0 => &LEVDIST0,
-        1 => &LEVDIST1,
-        _ => &LEVDIST2,
-    };
-
-    let dfa = if is_prefix {
-        lev.build_prefix_dfa(&word)
-    } else {
-        lev.build_dfa(&word)
-    };
-
+pub fn word_derivations(
+    word: &str,
+    is_prefix: bool,
+    max_typo: u8,
+    fst: &fst::Set<Cow<[u8]>>,
+) -> anyhow::Result<Vec<(String, u8)>>
+{
     let mut derived_words = Vec::new();
+    let dfa = build_dfa(word, max_typo, is_prefix);
     let mut stream = fst.search_with_state(&dfa).into_stream();
 
     while let Some((word, state)) = stream.next() {
@@ -177,3 +176,17 @@ pub fn word_derivations(word: &str, is_prefix: bool, max_typo: u8, fst: &fst::Se
 
     Ok(derived_words)
 }
+
+/// Builds the Levenshtein DFA for `word`, as a prefix automaton when `is_prefix` is set.
+pub fn build_dfa(word: &str, typos: u8, is_prefix: bool) -> DFA {
+    // Every `typos` value above 1 is served by the `LEVDIST2` builder.
+    let builder = match typos {
+        0 => &LEVDIST0,
+        1 => &LEVDIST1,
+        _ => &LEVDIST2,
+    };
+    match is_prefix {
+        true => builder.build_prefix_dfa(word),
+        false => builder.build_dfa(word),
+    }
+}
diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs
index 47057ad10..114032eb8 100644
--- a/milli/src/search/query_tree.rs
+++ b/milli/src/search/query_tree.rs
@@ -1,12 +1,13 @@
-use std::borrow::Cow;
-use std::collections::BTreeMap;
+use std::collections::HashSet;
 use std::{fmt, cmp, mem};
 
+use levenshtein_automata::{DFA, Distance};
 use meilisearch_tokenizer::{TokenKind, tokenizer::TokenStream};
 use roaring::RoaringBitmap;
 use slice_group_by::GroupBy;
 
 use crate::Index;
+use super::build_dfa;
 
 type IsOptionalWord = bool;
 type IsPrefix = bool;
@@ -113,6 +114,14 @@ impl QueryKind {
         QueryKind::Tolerant { typo, word }
     }
 
+    pub fn is_tolerant(&self) -> bool {
+        matches!(self, QueryKind::Tolerant { .. }) // variant check only, fields are ignored
+    }
+
+    pub fn is_exact(&self) -> bool {
+        matches!(self, QueryKind::Exact { .. }) // variant check only, fields are ignored
+    }
+
     pub fn typo(&self) -> u8 {
         match self {
             QueryKind::Tolerant { typo, .. } => *typo,
@@ -275,69 +284,45 @@ fn synonyms(ctx: &impl Context, word: &[&str]) -> heed::Result<Option<Vec<Operat
 }
 
 /// The query tree builder is the interface to build a query tree.
+#[derive(Default)]
 pub struct MatchingWords {
-    inner: BTreeMap<String, IsPrefix>
+    dfas: Vec<(DFA, u8)>, // each automaton paired with the typo count it was built with
 }
 
 impl MatchingWords {
     /// List all words which can be considered as a match for the query tree.
-    pub fn from_query_tree(tree: &Operation, fst: &fst::Set<Cow<[u8]>>) -> Self {
-        Self { inner: fetch_words(tree, fst).into_iter().collect() }
+    pub fn from_query_tree(tree: &Operation) -> Self {
+        // one automaton per distinct query, each stored next to the typo budget it was built with
+        let dfas = fetch_queries(tree).into_iter().map(|(w, t, p)| (build_dfa(w, t, p), t));
+        Self { dfas: dfas.collect() }
     }
 
     /// Return true if the word match.
-    pub fn is_match(&self, word: &str) -> bool {
-        fn first_char(s: &str) -> Option<&str> {
-            s.chars().next().map(|c| &s[..c.len_utf8()])
-        }
-
-        match first_char(word) {
-            Some(first) => {
-                let left = first.to_owned();
-                let right = word.to_owned();
-                self.inner.range(left..=right).any(|(w, is_prefix)| *is_prefix || *w == word)
-            },
-            None => false
-        }
+    pub fn matches(&self, word: &str) -> bool {
+        // only an `Exact` distance within the stored budget counts; `AtLeast` never does
+        self.dfas.iter().any(|(dfa, max_typo)| {
+            matches!(dfa.eval(word), Distance::Exact(dist) if dist <= *max_typo)
+        })
     }
 }
 
-type FetchedWords = Vec<(String, IsPrefix)>;
-
 /// Lists all words which can be considered as a match for the query tree.
-fn fetch_words(tree: &Operation, fst: &fst::Set<Cow<[u8]>>) -> FetchedWords {
-    fn resolve_branch(tree: &[Operation], fst: &fst::Set<Cow<[u8]>>) -> FetchedWords {
-        tree.iter().map(|op| resolve_ops(op, fst)).flatten().collect()
-    }
-
-    fn resolve_query(query: &Query, fst: &fst::Set<Cow<[u8]>>) -> FetchedWords {
-        match query.kind.clone() {
-            QueryKind::Exact { word, .. } => vec![(word, query.prefix)],
-            QueryKind::Tolerant { typo, word } => {
-                if let Ok(words) = super::word_derivations(&word, query.prefix, typo, fst) {
-                    words.into_iter().map(|(w, _)| (w, query.prefix)).collect()
-                } else {
-                    vec![(word, query.prefix)]
-                }
-            }
-        }
-    }
-
-    fn resolve_ops(tree: &Operation, fst: &fst::Set<Cow<[u8]>>) -> FetchedWords {
+fn fetch_queries(tree: &Operation) -> HashSet<(&str, u8, IsPrefix)> {
+    fn visit<'t>(tree: &'t Operation, seen: &mut HashSet<(&'t str, u8, IsPrefix)>) {
         match tree {
             Operation::Or(_, ops) | Operation::And(ops) | Operation::Consecutive(ops) => {
-                resolve_branch(ops.as_slice(), fst)
+                for child in ops.as_slice() { visit(child, seen) }
             },
-            Operation::Query(ops) => {
-                resolve_query(ops, fst)
+            Operation::Query(Query { prefix, kind }) => {
+                let budget = if kind.is_exact() { 0 } else { kind.typo() }; // exact: no typo
+                seen.insert((kind.word(), budget, *prefix));
             },
         }
     }
 
-    let mut words = resolve_ops(tree, fst);
-    words.sort_unstable();
-    words.dedup();
-    words
+    let mut seen = HashSet::new();
+    visit(tree, &mut seen);
+    seen
 }
 
 /// Main function that creates the final query tree from the primitive query.
@@ -559,7 +544,7 @@ mod test {
     use std::collections::HashMap;
 
     use fst::Set;
-    use maplit::hashmap;
+    use maplit::{hashmap, hashset};
     use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
     use rand::{Rng, SeedableRng, rngs::StdRng};
 
@@ -970,26 +955,26 @@ mod test {
         let context = TestContext::default();
         let query_tree = context.build(false, true, tokens).unwrap().unwrap();
 
-        let expected = vec![
-            ("city".to_string(), false),
-            ("earth".to_string(), false),
-            ("nature".to_string(), false),
-            ("new".to_string(), false),
-            ("nyc".to_string(), false),
-            ("split".to_string(), false),
-            ("word".to_string(), false),
-            ("word".to_string(), true),
-            ("world".to_string(), true),
-            ("york".to_string(), false),
-
-        ];
+        let expected = hashset!{
+            ("word",                0, false),
+            ("nyc",                 0, false),
+            ("wordsplit",           2, false),
+            ("wordsplitnycworld",   2, true),
+            ("nature",              0, false),
+            ("new",                 0, false),
+            ("city",                0, false),
+            ("world",               1, true),
+            ("york",                0, false),
+            ("split",               0, false),
+            ("nycworld",            1, true),
+            ("earth",               0, false),
+            ("wordsplitnyc",        2, false),
+        };
 
         let mut keys = context.postings.keys().collect::<Vec<_>>();
         keys.sort_unstable();
-        let set = fst::Set::from_iter(keys).unwrap().map_data(|v| Cow::Owned(v)).unwrap();
-
-        let words = fetch_words(&query_tree, &set);
 
+        let words = fetch_queries(&query_tree);
         assert_eq!(expected, words);
     }
 }