From 00336c5154c8e1bb2f08ca88f0c09fa130541d00 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Thu, 16 Jan 2020 14:24:45 +0100
Subject: [PATCH] Reintroduce a basic highlight display

---
 meilisearch-core/src/bucket_sort.rs   | 357 +-------------------------
 meilisearch-core/src/criterion/mod.rs |   2 +-
 meilisearch-core/src/lib.rs           |  45 ++--
 meilisearch-core/src/query_tree.rs    |   3 +-
 meilisearch-core/src/raw_document.rs  |   2 +-
 5 files changed, 23 insertions(+), 386 deletions(-)

diff --git a/meilisearch-core/src/bucket_sort.rs b/meilisearch-core/src/bucket_sort.rs
index bd3aac6fd..413e9c732 100644
--- a/meilisearch-core/src/bucket_sort.rs
+++ b/meilisearch-core/src/bucket_sort.rs
@@ -147,7 +147,7 @@ where
                 document_id: *id,
                 query_index: query.id,
                 distance,
-                is_exact: true, // TODO where can I find this info?
+                is_exact,
                 postings_list: posting_list_index,
             };
 
@@ -384,358 +384,3 @@ impl Deref for PostingsListView<'_> {
         }
     }
 }
-
-fn fetch_matches<'txn, 'tag>(
-    reader: &'txn heed::RoTxn<MainT>,
-    automatons: &[QueryWordAutomaton],
-    arena: &mut SmallArena<'tag, PostingsListView<'txn>>,
-    main_store: store::Main,
-    postings_lists_store: store::PostingsLists,
-    pplc_store: store::PrefixPostingsListsCache,
-) -> MResult<Vec<BareMatch<'tag>>>
-{
-    let before_words_fst = Instant::now();
-    let words = match unsafe { main_store.static_words_fst(reader)? } {
-        Some(words) => words,
-        None => return Ok(Vec::new()),
-    };
-    debug!("words fst took {:.02?}", before_words_fst.elapsed());
-    debug!("words fst len {} and size {}", words.len(), words.as_fst().as_bytes().len());
-
-    let mut total_postings_lists = Vec::new();
-    let mut documents_ids = HashSet::<DocumentId>::new();
-
-    let mut dfa_time = Duration::default();
-    let mut postings_lists_fetching_time = Duration::default();
-    let automatons_loop = Instant::now();
-
-    for (query_index, automaton) in automatons.iter().enumerate() {
-        let QueryWordAutomaton { query, is_exact, is_prefix, .. } = automaton;
-
-        let before_word_postings_lists_fetching = Instant::now();
-        let mut stream_next_time = Duration::default();
-        let mut number_of_words = 0;
-        let mut postings_lists_original_length = 0;
-        let mut postings_lists_length = 0;
-
-        if *is_prefix && query.len() == 1 {
-            let prefix = [query.as_bytes()[0], 0, 0, 0];
-
-            number_of_words += 1;
-
-            let before_postings_lists_fetching = Instant::now();
-            if let Some(postings) = pplc_store.prefix_postings_list(reader, prefix)? {
-                debug!("Found cached postings list for {:?}", query);
-                postings_lists_original_length += postings.matches.len();
-
-                let input = Rc::from(&prefix[..]);
-                let postings_list = Rc::new(postings.matches);
-                let postings_list_view = PostingsListView::original(input, postings_list);
-
-                let mut offset = 0;
-                for group in postings_list_view.linear_group_by_key(|di| di.document_id) {
-                    let document_id = group[0].document_id;
-
-                    if query_index != 0 && !documents_ids.contains(&document_id) {
-                        offset += group.len();
-                        continue
-                    }
-                    documents_ids.insert(document_id);
-
-                    postings_lists_length += group.len();
-
-                    let range = postings_list_view.range(offset, group.len());
-                    let posting_list_index = arena.add(range);
-                    let bare_match = BareMatch {
-                        document_id,
-                        query_index,
-                        distance: 0,
-                        is_exact: *is_exact,
-                        postings_list: posting_list_index,
-                    };
-
-
-                    total_postings_lists.push(bare_match);
-                    offset += group.len();
-                }
-            }
-            postings_lists_fetching_time += before_postings_lists_fetching.elapsed();
-        }
-        else {
-            let before_dfa = Instant::now();
-            let dfa = automaton.dfa();
-            dfa_time += before_dfa.elapsed();
-
-            let byte = query.as_bytes()[0];
-            let mut stream = if byte == u8::max_value() {
-                words.search(&dfa).ge(&[byte]).into_stream()
-            } else {
-                words.search(&dfa).ge(&[byte]).lt(&[byte + 1]).into_stream()
-            };
-
-            // while let Some(input) = stream.next() {
-            loop {
-                let before_stream_next = Instant::now();
-                let value = stream.next();
-                stream_next_time += before_stream_next.elapsed();
-
-                let input = match value {
-                    Some(input) => input,
-                    None => break,
-                };
-
-                number_of_words += 1;
-
-                let distance = dfa.eval(input).to_u8();
-                let is_exact = *is_exact && distance == 0 && input.len() == query.len();
-
-                let before_postings_lists_fetching = Instant::now();
-                if let Some(Postings { docids, matches }) = postings_lists_store.postings_list(reader, input)? {
-                    postings_lists_original_length += matches.len();
-
-                    let input = Rc::from(input);
-                    let matches = Rc::new(matches);
-                    let postings_list_view = PostingsListView::original(input, matches);
-
-                    let mut offset = 0;
-                    for group in postings_list_view.linear_group_by_key(|di| di.document_id) {
-                        let document_id = group[0].document_id;
-
-                        if query_index != 0 && !documents_ids.contains(&document_id) {
-                            offset += group.len();
-                            continue
-                        }
-                        documents_ids.insert(document_id);
-
-                        postings_lists_length += group.len();
-
-                        let range = postings_list_view.range(offset, group.len());
-                        let posting_list_index = arena.add(range);
-                        let bare_match = BareMatch {
-                            document_id,
-                            query_index,
-                            distance,
-                            is_exact,
-                            postings_list: posting_list_index,
-                        };
-
-                        total_postings_lists.push(bare_match);
-                        offset += group.len();
-                    }
-                }
-                postings_lists_fetching_time += before_postings_lists_fetching.elapsed();
-            }
-        }
-
-        debug!("{:?} gives {} words", query, number_of_words);
-        debug!("{:?} gives postings lists of length {} (original was {})",
-            query, postings_lists_length, postings_lists_original_length);
-        debug!("{:?} took {:.02?} to fetch postings lists",
-            query, before_word_postings_lists_fetching.elapsed());
-        debug!("stream next took {:.02?}", stream_next_time);
-    }
-
-    debug!("automatons loop took {:.02?}", automatons_loop.elapsed());
-    debug!("postings lists fetching took {:.02?}", postings_lists_fetching_time);
-    debug!("dfa creation took {:.02?}", dfa_time);
-
-    Ok(total_postings_lists)
-}
-
-#[derive(Debug)]
-pub struct QueryWordAutomaton {
-    pub query: String,
-    /// Is it a word that must be considered exact
-    /// or is it some derived word (i.e. a synonym)
-    pub is_exact: bool,
-    pub is_prefix: bool,
-    /// If it's a phrase query and what is
-    /// its index an the length of the phrase
-    pub phrase_query: Option<(u16, u16)>,
-}
-
-impl QueryWordAutomaton {
-    pub fn exact(query: &str) -> QueryWordAutomaton {
-        QueryWordAutomaton {
-            query: query.to_string(),
-            is_exact: true,
-            is_prefix: false,
-            phrase_query: None,
-        }
-    }
-
-    pub fn exact_prefix(query: &str) -> QueryWordAutomaton {
-        QueryWordAutomaton {
-            query: query.to_string(),
-            is_exact: true,
-            is_prefix: true,
-            phrase_query: None,
-        }
-    }
-
-    pub fn non_exact(query: &str) -> QueryWordAutomaton {
-        QueryWordAutomaton {
-            query: query.to_string(),
-            is_exact: false,
-            is_prefix: false,
-            phrase_query: None,
-        }
-    }
-
-    pub fn dfa(&self) -> DFA {
-        if self.phrase_query.is_some() {
-            build_exact_dfa(&self.query)
-        } else if self.is_prefix {
-            build_prefix_dfa(&self.query)
-        } else {
-            build_dfa(&self.query)
-        }
-    }
-}
-
-fn split_best_frequency<'a>(
-    reader: &heed::RoTxn<MainT>,
-    word: &'a str,
-    postings_lists_store: store::PostingsLists,
-) -> MResult<Option<(&'a str, &'a str)>> {
-    let chars = word.char_indices().skip(1);
-    let mut best = None;
-
-    for (i, _) in chars {
-        let (left, right) = word.split_at(i);
-
-        let left_freq = postings_lists_store
-            .postings_list(reader, left.as_ref())?
-            .map_or(0, |p| p.docids.len());
-
-        let right_freq = postings_lists_store
-            .postings_list(reader, right.as_ref())?
-            .map_or(0, |p| p.docids.len());
-
-        let min_freq = cmp::min(left_freq, right_freq);
-        if min_freq != 0 && best.map_or(true, |(old, _, _)| min_freq > old) {
-            best = Some((min_freq, left, right));
-        }
-    }
-
-    Ok(best.map(|(_, l, r)| (l, r)))
-}
-
-fn construct_automatons(
-    reader: &heed::RoTxn<MainT>,
-    query: &str,
-    main_store: store::Main,
-    postings_lists_store: store::PostingsLists,
-    synonym_store: store::Synonyms,
-) -> MResult<(Vec<QueryWordAutomaton>, QueryEnhancer)> {
-    let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
-    let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect();
-    let synonyms = match main_store.synonyms_fst(reader)? {
-        Some(synonym) => synonym,
-        None => fst::Set::default(),
-    };
-
-    let mut automaton_index = 0;
-    let mut automatons = Vec::new();
-    let mut enhancer_builder = QueryEnhancerBuilder::new(&query_words);
-
-    // We must not declare the original words to the query enhancer
-    // *but* we need to push them in the automatons list first
-    let mut original_words = query_words.iter().peekable();
-    while let Some(word) = original_words.next() {
-        let has_following_word = original_words.peek().is_some();
-        let not_prefix_dfa = has_following_word || has_end_whitespace || word.chars().all(is_cjk);
-
-        let automaton = if not_prefix_dfa {
-            QueryWordAutomaton::exact(word)
-        } else {
-            QueryWordAutomaton::exact_prefix(word)
-        };
-        automaton_index += 1;
-        automatons.push(automaton);
-    }
-
-    for n in 1..=NGRAMS {
-        let mut ngrams = query_words.windows(n).enumerate().peekable();
-        while let Some((query_index, ngram_slice)) = ngrams.next() {
-            let query_range = query_index..query_index + n;
-            let ngram_nb_words = ngram_slice.len();
-            let ngram = ngram_slice.join(" ");
-
-            let has_following_word = ngrams.peek().is_some();
-            let not_prefix_dfa =
-                has_following_word || has_end_whitespace || ngram.chars().all(is_cjk);
-
-            // automaton of synonyms of the ngrams
-            let normalized = normalize_str(&ngram);
-            let lev = if not_prefix_dfa {
-                build_dfa(&normalized)
-            } else {
-                build_prefix_dfa(&normalized)
-            };
-
-            let mut stream = synonyms.search(&lev).into_stream();
-            while let Some(base) = stream.next() {
-                // only trigger alternatives when the last word has been typed
-                // i.e. "new " do not but "new yo" triggers alternatives to "new york"
-                let base = std::str::from_utf8(base).unwrap();
-                let base_nb_words = split_query_string(base).count();
-                if ngram_nb_words != base_nb_words {
-                    continue;
-                }
-
-                if let Some(synonyms) = synonym_store.synonyms(reader, base.as_bytes())? {
-                    let mut stream = synonyms.into_stream();
-                    while let Some(synonyms) = stream.next() {
-                        let synonyms = std::str::from_utf8(synonyms).unwrap();
-                        let synonyms_words: Vec<_> = split_query_string(synonyms).collect();
-                        let nb_synonym_words = synonyms_words.len();
-
-                        let real_query_index = automaton_index;
-                        enhancer_builder.declare(query_range.clone(), real_query_index, &synonyms_words);
-
-                        for synonym in synonyms_words {
-                            let automaton = if nb_synonym_words == 1 {
-                                QueryWordAutomaton::exact(synonym)
-                            } else {
-                                QueryWordAutomaton::non_exact(synonym)
-                            };
-                            automaton_index += 1;
-                            automatons.push(automaton);
-                        }
-                    }
-                }
-            }
-
-            if n == 1 {
-                // automatons for splitted words
-                if let Some((left, right)) = split_best_frequency(reader, &normalized, postings_lists_store)? {
-                    let mut left_automaton = QueryWordAutomaton::exact(left);
-                    left_automaton.phrase_query = Some((0, 2));
-                    enhancer_builder.declare(query_range.clone(), automaton_index, &[left]);
-                    automaton_index += 1;
-                    automatons.push(left_automaton);
-
-                    let mut right_automaton = QueryWordAutomaton::exact(right);
-                    right_automaton.phrase_query = Some((1, 2));
-                    enhancer_builder.declare(query_range.clone(), automaton_index, &[right]);
-                    automaton_index += 1;
-                    automatons.push(right_automaton);
-                }
-            } else {
-                // automaton of concatenation of query words
-                let concat = ngram_slice.concat();
-                let normalized = normalize_str(&concat);
-
-                let real_query_index = automaton_index;
-                enhancer_builder.declare(query_range.clone(), real_query_index, &[&normalized]);
-
-                let automaton = QueryWordAutomaton::exact(&normalized);
-                automaton_index += 1;
-                automatons.push(automaton);
-            }
-        }
-    }
-
-    Ok((automatons, enhancer_builder.build()))
-}
diff --git a/meilisearch-core/src/criterion/mod.rs b/meilisearch-core/src/criterion/mod.rs
index 989d173e3..044a3943f 100644
--- a/meilisearch-core/src/criterion/mod.rs
+++ b/meilisearch-core/src/criterion/mod.rs
@@ -7,7 +7,7 @@ use sdset::SetBuf;
 use slice_group_by::GroupBy;
 
 use crate::automaton::QueryEnhancer;
-use crate::bucket_sort::{SimpleMatch, PostingsListView, QueryWordAutomaton};
+use crate::bucket_sort::{SimpleMatch, PostingsListView};
 use crate::database::MainT;
 use crate::query_tree::QueryId;
 use crate::{store, RawDocument, MResult};
diff --git a/meilisearch-core/src/lib.rs b/meilisearch-core/src/lib.rs
index 6c0ac5be8..a2722488a 100644
--- a/meilisearch-core/src/lib.rs
+++ b/meilisearch-core/src/lib.rs
@@ -32,7 +32,7 @@ pub use meilisearch_types::{DocIndex, DocumentId, Highlight};
 pub use query_words_mapper::QueryWordsMapper;
 
 use compact_arena::SmallArena;
-use crate::bucket_sort::{QueryWordAutomaton, PostingsListView};
+use crate::bucket_sort::PostingsListView;
 use crate::levenshtein::prefix_damerau_levenshtein;
 use crate::reordered_attrs::ReorderedAttrs;
 
@@ -47,7 +47,6 @@ pub struct Document {
 
 fn highlights_from_raw_document<'a, 'tag, 'txn>(
     raw_document: &RawDocument<'a, 'tag>,
-    automatons: &[QueryWordAutomaton],
     arena: &SmallArena<'tag, PostingsListView<'txn>>,
     searchable_attrs: Option<&ReorderedAttrs>,
 ) -> Vec<Highlight>
@@ -57,14 +56,14 @@
     for bm in raw_document.bare_matches.iter() {
         let postings_list = &arena[bm.postings_list];
         let input = postings_list.input();
-        let query = &automatons[bm.query_index as usize].query;
+        // let query = &automatons[bm.query_index as usize].query;
 
         for di in postings_list.iter() {
-            let covered_area = if query.len() > input.len() {
-                input.len()
-            } else {
-                prefix_damerau_levenshtein(query.as_bytes(), input).1
-            };
+            // let covered_area = if query.len() > input.len() {
+            //     input.len()
+            // } else {
+            //     prefix_damerau_levenshtein(query.as_bytes(), input).1
+            // };
 
             let attribute = searchable_attrs
                 .and_then(|sa| sa.reverse(di.attribute))
@@ -73,7 +72,7 @@
             let highlight = Highlight {
                 attribute: attribute,
                 char_index: di.char_index,
-                char_length: covered_area as u16,
+                char_length: di.char_length,
             };
 
             highlights.push(highlight);
@@ -97,19 +96,15 @@ impl Document {
     #[cfg(not(test))]
    pub fn from_raw<'a, 'tag, 'txn>(
         raw_document: RawDocument<'a, 'tag>,
-        // automatons: &[QueryWordAutomaton],
         arena: &SmallArena<'tag, PostingsListView<'txn>>,
         searchable_attrs: Option<&ReorderedAttrs>,
     ) -> Document
     {
-        // let highlights = highlights_from_raw_document(
-        //     &raw_document,
-        //     automatons,
-        //     arena,
-        //     searchable_attrs,
-        // );
-
-        let highlights = Vec::new();
+        let highlights = highlights_from_raw_document(
+            &raw_document,
+            arena,
+            searchable_attrs,
+        );
 
         Document { id: raw_document.id, highlights }
     }
@@ -117,21 +112,17 @@ impl Document {
     #[cfg(test)]
     pub fn from_raw<'a, 'tag, 'txn>(
         raw_document: RawDocument<'a, 'tag>,
-        // automatons: &[QueryWordAutomaton],
         arena: &SmallArena<'tag, PostingsListView<'txn>>,
         searchable_attrs: Option<&ReorderedAttrs>,
     ) -> Document
     {
         use crate::bucket_sort::SimpleMatch;
 
-        // let highlights = highlights_from_raw_document(
-        //     &raw_document,
-        //     automatons,
-        //     arena,
-        //     searchable_attrs,
-        // );
-
-        let highlights = Vec::new();
+        let highlights = highlights_from_raw_document(
+            &raw_document,
+            arena,
+            searchable_attrs,
+        );
 
         let mut matches = Vec::new();
         for sm in raw_document.processed_matches {
diff --git a/meilisearch-core/src/query_tree.rs b/meilisearch-core/src/query_tree.rs
index d3a1ad0ec..089eaa3af 100644
--- a/meilisearch-core/src/query_tree.rs
+++ b/meilisearch-core/src/query_tree.rs
@@ -53,7 +53,8 @@ impl Operation {
     }
 
     fn phrase2(id: QueryId, prefix: bool, (left, right): (&str, &str)) -> Operation {
-        Operation::Query(Query { id, prefix, kind: QueryKind::Phrase(vec![left.to_owned(), right.to_owned()]) })
+        let kind = QueryKind::Phrase(vec![left.to_owned(), right.to_owned()]);
+        Operation::Query(Query { id, prefix, kind })
     }
 }
 
diff --git a/meilisearch-core/src/raw_document.rs b/meilisearch-core/src/raw_document.rs
index 56fde3e7b..17955824e 100644
--- a/meilisearch-core/src/raw_document.rs
+++ b/meilisearch-core/src/raw_document.rs
@@ -1,7 +1,7 @@
 use compact_arena::SmallArena;
 use sdset::SetBuf;
 use crate::DocIndex;
-use crate::bucket_sort::{SimpleMatch, BareMatch, QueryWordAutomaton, PostingsListView};
+use crate::bucket_sort::{SimpleMatch, BareMatch, PostingsListView};
 use crate::reordered_attrs::ReorderedAttrs;
 
 pub struct RawDocument<'a, 'tag> {