From 6b6db2f8e62e4bc47b241a445c2e0daf598f715a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 25 Jun 2019 12:27:15 +0200 Subject: [PATCH] feat: Introduce the Highlight type to simplify the data oriented design --- meilidb-core/src/lib.rs | 145 +++++++----------------------- meilidb-core/src/query_builder.rs | 30 ++++--- 2 files changed, 52 insertions(+), 123 deletions(-) diff --git a/meilidb-core/src/lib.rs b/meilidb-core/src/lib.rs index 9c1c1c88d..f764d544e 100644 --- a/meilidb-core/src/lib.rs +++ b/meilidb-core/src/lib.rs @@ -60,97 +60,43 @@ pub struct DocIndex { /// /// The order of the field is important because it defines /// the way these structures are ordered between themselves. -/// -/// The word in itself is not important. -// TODO do data oriented programming ? very arrays ? #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] -pub struct Match { - /// The word index in the query sentence. - /// Same as the `attribute_index` but for the query words. - /// - /// Used to retrieve the automaton that match this word. - pub query_index: u32, - - /// The distance the word has with the query word - /// (i.e. the Levenshtein distance). - pub distance: u8, - +pub struct Highlight { /// The attribute in the document where the word was found /// along with the index in it. pub attribute: u16, - pub word_index: u16, - /// Whether the word that match is an exact match or a prefix. - pub is_exact: bool, - - /// The position in bytes where the word was found - /// along with the length of it. + /// The position in bytes where the word was found. /// /// It informs on the original word area in the text indexed /// without needing to run the tokenizer again. pub char_index: u16, + + /// The length in bytes of the found word. + /// + /// It informs on the original word area in the text indexed + /// without needing to run the tokenizer again. pub char_length: u16, } -impl Match { - pub fn zero() -> Self { - Match { - query_index: 0, - distance: 0, - attribute: 0, - word_index: 0, - is_exact: false, - char_index: 0, - char_length: 0, - } - } - - pub fn max() -> Self { - Match { - query_index: u32::max_value(), - distance: u8::max_value(), - attribute: u16::max_value(), - word_index: u16::max_value(), - is_exact: true, - char_index: u16::max_value(), - char_length: u16::max_value(), - } - } +#[derive(Debug, PartialOrd, Ord, PartialEq, Eq)] +struct TmpMatch { + pub query_index: u32, + pub distance: u8, + pub attribute: u16, + pub word_index: u16, + pub is_exact: bool, } #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] pub struct Document { pub id: DocumentId, - pub matches: Vec, + pub highlights: Vec, } impl Document { - fn from_raw(raw: &RawDocument) -> Document { - let len = raw.matches.range.len(); - let mut matches = Vec::with_capacity(len); - - let query_index = raw.query_index(); - let distance = raw.distance(); - let attribute = raw.attribute(); - let word_index = raw.word_index(); - let is_exact = raw.is_exact(); - let char_index = raw.char_index(); - let char_length = raw.char_length(); - - for i in 0..len { - let match_ = Match { - query_index: query_index[i], - distance: distance[i], - attribute: attribute[i], - word_index: word_index[i], - is_exact: is_exact[i], - char_index: char_index[i], - char_length: char_length[i], - }; - matches.push(match_); - } - - Document { id: raw.id, matches } + fn from_raw(raw: RawDocument) -> Document { + Document { id: raw.id, highlights: raw.highlights } } } @@ -158,11 +104,12 @@ impl Document { pub struct RawDocument { pub id: DocumentId, pub matches: SharedMatches, + pub highlights: Vec, } impl RawDocument { - fn new(id: DocumentId, range: Range, matches: Arc) -> RawDocument { - RawDocument { id, matches: SharedMatches { range, matches } } + fn new(id: DocumentId, matches: SharedMatches, highlights: Vec) -> RawDocument { + RawDocument { id, matches, highlights } } pub fn query_index(&self) -> &[u32] { @@ -199,20 +146,6 @@ impl RawDocument { // can only be done in this module unsafe { &self.matches.matches.is_exact.get_unchecked(r.start..r.end) } } - - pub fn char_index(&self) -> &[u16] { - let r = self.matches.range; - // it is safe because construction/modifications - // can only be done in this module - unsafe { &self.matches.matches.char_index.get_unchecked(r.start..r.end) } - } - - pub fn char_length(&self) -> &[u16] { - let r = self.matches.range; - // it is safe because construction/modifications - // can only be done in this module - unsafe { &self.matches.matches.char_length.get_unchecked(r.start..r.end) } - } } impl fmt::Debug for RawDocument { @@ -224,27 +157,30 @@ impl fmt::Debug for RawDocument { .field("attribute", &self.attribute()) .field("word_index", &self.word_index()) .field("is_exact", &self.is_exact()) - .field("char_index", &self.char_index()) - .field("char_length", &self.char_length()) .finish() } } -pub fn raw_documents_from_matches(matches: SetBuf<(DocumentId, Match)>) -> Vec { - let mut docs_ranges = Vec::<(_, Range)>::new(); +fn raw_documents_from_matches(matches: SetBuf<(DocumentId, TmpMatch, Highlight)>) -> Vec { + let mut docs_ranges = Vec::<(DocumentId, Range, Vec)>::new(); let mut matches2 = Matches::with_capacity(matches.len()); - for group in matches.linear_group_by(|(a, _), (b, _)| a == b) { - let id = group[0].0; - let start = docs_ranges.last().map(|(_, r)| r.end).unwrap_or(0); + for group in matches.linear_group_by(|(a, _, _), (b, _, _)| a == b) { + let document_id = group[0].0; + let start = docs_ranges.last().map(|(_, r, _)| r.end).unwrap_or(0); let end = start + group.len(); - docs_ranges.push((id, Range { start, end })); + + let highlights = group.iter().map(|(_, _, h)| *h).collect(); + docs_ranges.push((document_id, Range { start, end }, highlights)); matches2.extend_from_slice(group); } let matches = Arc::new(matches2); - docs_ranges.into_iter().map(|(i, r)| RawDocument::new(i, r, matches.clone())).collect() + docs_ranges.into_iter().map(|(i, range, highlights)| { + let matches = SharedMatches { range, matches: matches.clone() }; + RawDocument::new(i, matches, highlights) + }).collect() } #[derive(Debug, Copy, Clone)] @@ -253,12 +189,6 @@ struct Range { end: usize, } -impl Range { - fn len(self) -> usize { - self.end - self.start - } -} - #[derive(Clone)] pub struct SharedMatches { range: Range, @@ -272,8 +202,6 @@ struct Matches { attribute: Vec, word_index: Vec, is_exact: Vec, - char_index: Vec, - char_length: Vec, } impl Matches { @@ -284,25 +212,20 @@ impl Matches { attribute: Vec::with_capacity(cap), word_index: Vec::with_capacity(cap), is_exact: Vec::with_capacity(cap), - char_index: Vec::with_capacity(cap), - char_length: Vec::with_capacity(cap), } } - fn extend_from_slice(&mut self, matches: &[(DocumentId, Match)]) { - for (_, match_) in matches { + fn extend_from_slice(&mut self, matches: &[(DocumentId, TmpMatch, Highlight)]) { + for (_, match_, _) in matches { self.query_index.push(match_.query_index); self.distance.push(match_.distance); self.attribute.push(match_.attribute); self.word_index.push(match_.word_index); self.is_exact.push(match_.is_exact); - self.char_index.push(match_.char_index); - self.char_length.push(match_.char_length); } } } - #[cfg(test)] mod tests { use super::*; diff --git a/meilidb-core/src/query_builder.rs b/meilidb-core/src/query_builder.rs index 63d39c1d2..a59fcfa4a 100644 --- a/meilidb-core/src/query_builder.rs +++ b/meilidb-core/src/query_builder.rs @@ -5,7 +5,7 @@ use std::time::Instant; use std::{cmp, mem}; use fst::{Streamer, IntoStreamer}; -use hashbrown::{HashMap, HashSet}; +use hashbrown::HashMap; use log::info; use meilidb_tokenizer::{is_cjk, split_query_string}; use rayon::slice::ParallelSliceMut; @@ -18,7 +18,7 @@ use crate::distinct_map::{DistinctMap, BufferedDistinctMap}; use crate::criterion::Criteria; use crate::raw_documents_from_matches; use crate::reordered_attrs::ReorderedAttrs; -use crate::{Match, DocumentId, Store, RawDocument, Document}; +use crate::{TmpMatch, Highlight, DocumentId, Store, RawDocument, Document}; const NGRAMS: usize = 3; @@ -178,12 +178,12 @@ fn generate_automatons(query: &str, store: &S) -> Result range.start { - out_documents.push(Document::from_raw(&document)); + out_documents.push(Document::from_raw(document)); if out_documents.len() == range.len() { break } } }