From 89df496f0cd18a8322e7a7f9113536f3684fb3f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 15 Jul 2019 19:34:53 +0200 Subject: [PATCH] feat: Separate highlights from matches to make the code easier to follow --- meilidb-core/src/lib.rs | 2 +- meilidb-core/src/query_builder.rs | 39 ++++++++++++++++++------------- meilidb-core/src/raw_document.rs | 31 +++++++++++++++--------- 3 files changed, 44 insertions(+), 28 deletions(-) diff --git a/meilidb-core/src/lib.rs b/meilidb-core/src/lib.rs index b1a682e40..6f6e46359 100644 --- a/meilidb-core/src/lib.rs +++ b/meilidb-core/src/lib.rs @@ -13,7 +13,7 @@ pub mod criterion; use serde::{Serialize, Deserialize}; use zerocopy::{AsBytes, FromBytes}; -use self::raw_document::raw_documents_from_matches; +use self::raw_document::raw_documents_from; pub use self::query_builder::{QueryBuilder, DistinctQueryBuilder, normalize_str}; pub use self::raw_document::RawDocument; diff --git a/meilidb-core/src/query_builder.rs b/meilidb-core/src/query_builder.rs index 1fb778094..c5a0ac847 100644 --- a/meilidb-core/src/query_builder.rs +++ b/meilidb-core/src/query_builder.rs @@ -17,7 +17,7 @@ use crate::automaton::{build_dfa, build_prefix_dfa}; use crate::criterion::Criteria; use crate::distinct_map::{DistinctMap, BufferedDistinctMap}; use crate::query_enhancer::{QueryEnhancerBuilder, QueryEnhancer}; -use crate::raw_documents_from_matches; +use crate::raw_documents_from; use crate::reordered_attrs::ReorderedAttrs; use crate::{TmpMatch, Highlight, DocumentId, Store, RawDocument, Document}; @@ -215,6 +215,7 @@ where S: Store, }; let mut matches = Vec::new(); + let mut highlights = Vec::new(); while let Some((input, indexed_values)) = stream.next() { for iv in indexed_values { @@ -240,23 +241,21 @@ where S: Store, is_exact, }; - // TODO do not store in the same matches vec let highlight = Highlight { attribute: di.attribute, char_index: di.char_index, char_length: di.char_length, }; - matches.push((di.document_id, match_, highlight)); + matches.push((di.document_id, match_)); + highlights.push((di.document_id, highlight)); } } } } // we sort the matches to make them rewritable - matches.par_sort_unstable_by_key(|(id, match_, _)| { - (*id, match_.attribute, match_.word_index) // query_id ??? - }); + matches.par_sort_unstable_by_key(|(id, match_)| (*id, match_.attribute, match_.word_index)); let mut padded_matches = Vec::with_capacity(matches.len()); for same_document in matches.linear_group_by(|a, b| a.0 == b.0) { @@ -268,7 +267,7 @@ where S: Store, while let Some(same_word_index) = iter.next() { let mut biggest = 0; - for (id, match_, highlight) in same_word_index { + for (id, match_) in same_word_index { let mut replacement = query_enhancer.replacement(match_.query_index); let replacement_len = replacement.len() - 1; @@ -280,7 +279,7 @@ where S: Store, word_index: match_.word_index + padding as u16, ..match_.clone() }; - padded_matches.push((*id, match_, *highlight)); + padded_matches.push((*id, match_)); } let mut found = false; @@ -296,7 +295,7 @@ where S: Store, ..match_.clone() }; - for (_, nmatch_, _) in next_group { + for (_, nmatch_) in next_group { let mut rep = query_enhancer.replacement(nmatch_.query_index); let query_index = rep.next().unwrap(); let nmatch_ = TmpMatch { query_index, ..nmatch_.clone() }; @@ -311,12 +310,12 @@ where S: Store, word_index: match_.word_index + padding as u16 + (i + 1) as u16, ..match_.clone() }; - padded_matches.push((*id, match_, *highlight)); + padded_matches.push((*id, match_)); biggest = biggest.max(i + 1); } } - padded_matches.push((*id, padmatch_, *highlight)); + padded_matches.push((*id, padmatch_)); found = true; continue 'padding; } @@ -337,7 +336,7 @@ where S: Store, word_index: match_.word_index + padding as u16 + (i + 1) as u16, ..match_.clone() }; - padded_matches.push((*id, match_, *highlight)); + padded_matches.push((*id, match_)); } biggest = biggest.max(replacement_len); @@ -350,11 +349,19 @@ where S: Store, } - let total_matches = padded_matches.len(); - padded_matches.par_sort_unstable(); - let padded_matches = SetBuf::new_unchecked(padded_matches); - let raw_documents = raw_documents_from_matches(padded_matches); + let matches = { + padded_matches.par_sort_unstable(); + SetBuf::new_unchecked(padded_matches) + }; + + let highlights = { + highlights.par_sort_unstable_by_key(|(id, _)| *id); + SetBuf::new_unchecked(highlights) + }; + + let total_matches = matches.len(); + let raw_documents = raw_documents_from(matches, highlights); info!("{} total documents to classify", raw_documents.len()); info!("{} total matches to classify", total_matches); diff --git a/meilidb-core/src/raw_document.rs b/meilidb-core/src/raw_document.rs index 7a293439e..5d449a74a 100644 --- a/meilidb-core/src/raw_document.rs +++ b/meilidb-core/src/raw_document.rs @@ -66,25 +66,34 @@ impl fmt::Debug for RawDocument { } } -pub fn raw_documents_from_matches(matches: SetBuf<(DocumentId, TmpMatch, Highlight)>) -> Vec { +pub fn raw_documents_from( + matches: SetBuf<(DocumentId, TmpMatch)>, + highlights: SetBuf<(DocumentId, Highlight)>, +) -> Vec +{ let mut docs_ranges: Vec<(_, Range, _)> = Vec::new(); let mut matches2 = Matches::with_capacity(matches.len()); - for group in matches.linear_group_by(|(a, _, _), (b, _, _)| a == b) { - let document_id = group[0].0; - let start = docs_ranges.last().map(|(_, r, _)| r.end).unwrap_or(0); - let end = start + group.len(); + let matches = matches.linear_group_by(|(a, _), (b, _)| a == b); + let highlights = highlights.linear_group_by(|(a, _), (b, _)| a == b); - let highlights = group.iter().map(|(_, _, h)| *h).collect(); + for (mgroup, hgroup) in matches.zip(highlights) { + debug_assert_eq!(mgroup[0].0, hgroup[0].0); + + let document_id = mgroup[0].0; + let start = docs_ranges.last().map(|(_, r, _)| r.end).unwrap_or(0); + let end = start + mgroup.len(); + + let highlights = hgroup.iter().map(|(_, h)| *h).collect(); docs_ranges.push((document_id, Range { start, end }, highlights)); - matches2.extend_from_slice(group); + matches2.extend_from_slice(mgroup); } let matches = Arc::new(matches2); - docs_ranges.into_iter().map(|(i, range, highlights)| { + docs_ranges.into_iter().map(|(id, range, highlights)| { let matches = SharedMatches { range, matches: matches.clone() }; - RawDocument::new(i, matches, highlights) + RawDocument::new(id, matches, highlights) }).collect() } @@ -120,8 +129,8 @@ impl Matches { } } - fn extend_from_slice(&mut self, matches: &[(DocumentId, TmpMatch, Highlight)]) { - for (_, match_, _) in matches { + fn extend_from_slice(&mut self, matches: &[(DocumentId, TmpMatch)]) { + for (_, match_) in matches { self.query_index.push(match_.query_index); self.distance.push(match_.distance); self.attribute.push(match_.attribute);