From 6b6db2f8e62e4bc47b241a445c2e0daf598f715a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= <renault.cle@gmail.com>
Date: Tue, 25 Jun 2019 12:27:15 +0200
Subject: [PATCH] feat: Introduce the Highlight type to simplify the data
 oriented design

---
 meilidb-core/src/lib.rs           | 145 +++++++-----------------------
 meilidb-core/src/query_builder.rs |  30 ++++---
 2 files changed, 52 insertions(+), 123 deletions(-)
diff --git a/meilidb-core/src/lib.rs b/meilidb-core/src/lib.rs
index 9c1c1c88d..f764d544e 100644
--- a/meilidb-core/src/lib.rs
+++ b/meilidb-core/src/lib.rs
@@ -60,97 +60,43 @@ pub struct DocIndex {
 ///
 /// The order of the field is important because it defines
 /// the way these structures are ordered between themselves.
-///
-/// The word in itself is not important.
-// TODO do data oriented programming ? very arrays ?
 #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
-pub struct Match {
-    /// The word index in the query sentence.
-    /// Same as the `attribute_index` but for the query words.
-    ///
-    /// Used to retrieve the automaton that match this word.
-    pub query_index: u32,
-
-    /// The distance the word has with the query word
-    /// (i.e. the Levenshtein distance).
-    pub distance: u8,
-
+pub struct Highlight {
     /// The attribute in the document where the word was found
     /// along with the index in it.
     pub attribute: u16,
-    pub word_index: u16,
 
-    /// Whether the word that match is an exact match or a prefix.
-    pub is_exact: bool,
-
-    /// The position in bytes where the word was found
-    /// along with the length of it.
+    /// The position in bytes where the word was found.
     ///
     /// It informs on the original word area in the text indexed
     /// without needing to run the tokenizer again.
     pub char_index: u16,
+
+    /// The length in bytes of the found word.
+    ///
+    /// Together with `char_index`, it delimits the original word area in
+    /// the indexed text without needing to run the tokenizer again.
     pub char_length: u16,
 }
 
-impl Match {
-    pub fn zero() -> Self {
-        Match {
-            query_index: 0,
-            distance: 0,
-            attribute: 0,
-            word_index: 0,
-            is_exact: false,
-            char_index: 0,
-            char_length: 0,
-        }
-    }
-
-    pub fn max() -> Self {
-        Match {
-            query_index: u32::max_value(),
-            distance: u8::max_value(),
-            attribute: u16::max_value(),
-            word_index: u16::max_value(),
-            is_exact: true,
-            char_index: u16::max_value(),
-            char_length: u16::max_value(),
-        }
-    }
+#[derive(Debug, PartialOrd, Ord, PartialEq, Eq)]
+struct TmpMatch {
+    pub query_index: u32,
+    pub distance: u8,
+    pub attribute: u16,
+    pub word_index: u16,
+    pub is_exact: bool,
 }
 
 #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
 pub struct Document {
     pub id: DocumentId,
-    pub matches: Vec<Match>,
+    pub highlights: Vec<Highlight>,
 }
 
 impl Document {
-    fn from_raw(raw: &RawDocument) -> Document {
-        let len = raw.matches.range.len();
-        let mut matches = Vec::with_capacity(len);
-
-        let query_index = raw.query_index();
-        let distance = raw.distance();
-        let attribute = raw.attribute();
-        let word_index = raw.word_index();
-        let is_exact = raw.is_exact();
-        let char_index = raw.char_index();
-        let char_length = raw.char_length();
-
-        for i in 0..len {
-            let match_ = Match {
-                query_index: query_index[i],
-                distance: distance[i],
-                attribute: attribute[i],
-                word_index: word_index[i],
-                is_exact: is_exact[i],
-                char_index: char_index[i],
-                char_length: char_length[i],
-            };
-            matches.push(match_);
-        }
-
-        Document { id: raw.id, matches }
+    fn from_raw(raw: RawDocument) -> Document {
+        Document { id: raw.id, highlights: raw.highlights }
     }
 }
 
@@ -158,11 +104,12 @@ impl Document {
 pub struct RawDocument {
     pub id: DocumentId,
     pub matches: SharedMatches,
+    pub highlights: Vec<Highlight>,
 }
 
 impl RawDocument {
-    fn new(id: DocumentId, range: Range, matches: Arc<Matches>) -> RawDocument {
-        RawDocument { id, matches: SharedMatches { range, matches } }
+    fn new(id: DocumentId, matches: SharedMatches, highlights: Vec<Highlight>) -> RawDocument {
+        RawDocument { id, matches, highlights }
     }
 
     pub fn query_index(&self) -> &[u32] {
@@ -199,20 +146,6 @@ impl RawDocument {
         // can only be done in this module
         unsafe { &self.matches.matches.is_exact.get_unchecked(r.start..r.end) }
     }
-
-    pub fn char_index(&self) -> &[u16] {
-        let r = self.matches.range;
-        // it is safe because construction/modifications
-        // can only be done in this module
-        unsafe { &self.matches.matches.char_index.get_unchecked(r.start..r.end) }
-    }
-
-    pub fn char_length(&self) -> &[u16] {
-        let r = self.matches.range;
-        // it is safe because construction/modifications
-        // can only be done in this module
-        unsafe { &self.matches.matches.char_length.get_unchecked(r.start..r.end) }
-    }
 }
 
 impl fmt::Debug for RawDocument {
@@ -224,27 +157,30 @@ impl fmt::Debug for RawDocument {
             .field("attribute", &self.attribute())
             .field("word_index", &self.word_index())
             .field("is_exact", &self.is_exact())
-            .field("char_index", &self.char_index())
-            .field("char_length", &self.char_length())
             .finish()
     }
 }
 
-pub fn raw_documents_from_matches(matches: SetBuf<(DocumentId, Match)>) -> Vec<RawDocument> {
-    let mut docs_ranges = Vec::<(_, Range)>::new();
+fn raw_documents_from_matches(matches: SetBuf<(DocumentId, TmpMatch, Highlight)>) -> Vec<RawDocument> {
+    let mut docs_ranges = Vec::<(DocumentId, Range, Vec<Highlight>)>::new();
     let mut matches2 = Matches::with_capacity(matches.len());
 
-    for group in matches.linear_group_by(|(a, _), (b, _)| a == b) {
-        let id = group[0].0;
-        let start = docs_ranges.last().map(|(_, r)| r.end).unwrap_or(0);
+    for group in matches.linear_group_by(|(a, _, _), (b, _, _)| a == b) {
+        let document_id = group[0].0;
+        let start = docs_ranges.last().map(|(_, r, _)| r.end).unwrap_or(0);
         let end = start + group.len();
-        docs_ranges.push((id, Range { start, end }));
+
+        let highlights = group.iter().map(|(_, _, h)| *h).collect();
+        docs_ranges.push((document_id, Range { start, end }, highlights));
 
         matches2.extend_from_slice(group);
     }
 
     let matches = Arc::new(matches2);
-    docs_ranges.into_iter().map(|(i, r)| RawDocument::new(i, r, matches.clone())).collect()
+    docs_ranges.into_iter().map(|(i, range, highlights)| {
+        let matches = SharedMatches { range, matches: matches.clone() };
+        RawDocument::new(i, matches, highlights)
+    }).collect()
 }
 
 #[derive(Debug, Copy, Clone)]
@@ -253,12 +189,6 @@ struct Range {
     end: usize,
 }
 
-impl Range {
-    fn len(self) -> usize {
-        self.end - self.start
-    }
-}
-
 #[derive(Clone)]
 pub struct SharedMatches {
     range: Range,
@@ -272,8 +202,6 @@ struct Matches {
     attribute: Vec<u16>,
     word_index: Vec<u16>,
     is_exact: Vec<bool>,
-    char_index: Vec<u16>,
-    char_length: Vec<u16>,
 }
 
 impl Matches {
@@ -284,25 +212,20 @@ impl Matches {
             attribute: Vec::with_capacity(cap),
             word_index: Vec::with_capacity(cap),
             is_exact: Vec::with_capacity(cap),
-            char_index: Vec::with_capacity(cap),
-            char_length: Vec::with_capacity(cap),
         }
     }
 
-    fn extend_from_slice(&mut self, matches: &[(DocumentId, Match)]) {
-        for (_, match_) in matches {
+    fn extend_from_slice(&mut self, matches: &[(DocumentId, TmpMatch, Highlight)]) {
+        for (_, match_, _) in matches {
             self.query_index.push(match_.query_index);
             self.distance.push(match_.distance);
             self.attribute.push(match_.attribute);
             self.word_index.push(match_.word_index);
             self.is_exact.push(match_.is_exact);
-            self.char_index.push(match_.char_index);
-            self.char_length.push(match_.char_length);
         }
     }
 }
 
-
 #[cfg(test)]
 mod tests {
     use super::*;
diff --git a/meilidb-core/src/query_builder.rs b/meilidb-core/src/query_builder.rs
index 63d39c1d2..a59fcfa4a 100644
--- a/meilidb-core/src/query_builder.rs
+++ b/meilidb-core/src/query_builder.rs
@@ -5,7 +5,7 @@ use std::time::Instant;
 use std::{cmp, mem};
 
 use fst::{Streamer, IntoStreamer};
-use hashbrown::{HashMap, HashSet};
+use hashbrown::HashMap;
 use log::info;
 use meilidb_tokenizer::{is_cjk, split_query_string};
 use rayon::slice::ParallelSliceMut;
@@ -18,7 +18,7 @@ use crate::distinct_map::{DistinctMap, BufferedDistinctMap};
 use crate::criterion::Criteria;
 use crate::raw_documents_from_matches;
 use crate::reordered_attrs::ReorderedAttrs;
-use crate::{Match, DocumentId, Store, RawDocument, Document};
+use crate::{TmpMatch, Highlight, DocumentId, Store, RawDocument, Document};
 
 const NGRAMS: usize = 3;
 
@@ -178,12 +178,12 @@ fn generate_automatons<S: Store>(query: &str, store: &S) -> Result<Vec<Automaton
     Ok(automatons)
 }
 
-fn rewrite_matched_positions(matches: &mut [(DocumentId, Match)]) {
-    for document_matches in matches.linear_group_by_mut(|(a, _), (b, _)| a == b) {
+fn rewrite_matched_positions(matches: &mut [(DocumentId, TmpMatch, Highlight)]) {
+    for document_matches in matches.linear_group_by_mut(|(a, _, _), (b, _, _)| a == b) {
         let mut offset = 0;
-        for query_indexes in document_matches.linear_group_by_mut(|(_, a), (_, b)| a.query_index == b.query_index) {
+        for query_indexes in document_matches.linear_group_by_mut(|(_, a, _), (_, b, _)| a.query_index == b.query_index) {
             let word_index = query_indexes[0].1.word_index - offset as u16;
-            for (_, match_) in query_indexes.iter_mut() {
+            for (_, match_, _) in query_indexes.iter_mut() {
                 match_.word_index = word_index;
             }
             offset += query_indexes.len() - 1;
@@ -268,17 +268,19 @@ where S: Store,
                 for di in doc_indexes.as_slice() {
                     let attribute = searchables.map_or(Some(di.attribute), |r| r.get(di.attribute));
                     if let Some(attribute) = attribute {
-                        let match_ = Match {
+                        let match_ = TmpMatch {
                             query_index: query_index as u32,
                             distance,
                             attribute,
                             word_index: di.word_index,
                             is_exact,
+                        };
+                        let highlight = Highlight {
+                            attribute: di.attribute,
                             char_index: di.char_index,
                             char_length: di.char_length,
                         };
-                        matches.push((di.document_id, match_));
-
+                        matches.push((di.document_id, match_, highlight));
                     }
                 }
             }
@@ -289,7 +291,11 @@ where S: Store,
         rewrite_matched_positions(&mut matches);
 
         let total_matches = matches.len();
-        let padded_matches = SetBuf::from_dirty(matches);
+        let padded_matches = {
+            matches.par_sort_unstable();
+            matches.dedup();
+            SetBuf::new_unchecked(matches)
+        };
         let raw_documents = raw_documents_from_matches(padded_matches);
 
         info!("{} total documents to classify", raw_documents.len());
@@ -349,7 +355,7 @@ where S: Store,
 
         let offset = cmp::min(documents.len(), range.start);
         let iter = documents.into_iter().skip(offset).take(range.len());
-        Ok(iter.map(|d| Document::from_raw(&d)).collect())
+        Ok(iter.map(|d| Document::from_raw(d)).collect())
     }
 }
 
@@ -476,7 +482,7 @@ where S: Store,
                 };
 
                 if distinct_accepted && seen.len() > range.start {
-                    out_documents.push(Document::from_raw(&document));
+                    out_documents.push(Document::from_raw(document));
                     if out_documents.len() == range.len() { break }
                 }
             }