diff --git a/Cargo.toml b/Cargo.toml index 5e7bba1fb..572cbf2aa 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,6 +21,7 @@ serde_derive = "1.0" serde_json = { version = "1.0", features = ["preserve_order"] } slice-group-by = "0.2" unidecode = "0.3" +rayon = "1.0" [dependencies.toml] git = "https://github.com/Kerollmops/toml-rs.git" diff --git a/examples/query-database.rs b/examples/query-database.rs index 0a8771a51..d1e6a0e17 100644 --- a/examples/query-database.rs +++ b/examples/query-database.rs @@ -70,12 +70,10 @@ fn create_highlight_areas(text: &str, matches: &[Match], attribute: SchemaAttr) let mut byte_indexes = BTreeMap::new(); for match_ in matches { - let match_attribute = match_.attribute.attribute(); + let match_attribute = match_.attribute; if SchemaAttr::new(match_attribute) == attribute { - let word_area = match_.word_area; - - let char_index = word_area.char_index() as usize; - let char_length = word_area.length() as usize; + let char_index = match_.char_index as usize; + let char_length = match_.char_length as usize; let (byte_index, byte_length) = char_to_byte_range(char_index, char_length, text); match byte_indexes.entry(byte_index) { @@ -151,7 +149,7 @@ fn main() -> Result<(), Box> { let mut matching_attributes = HashSet::new(); for _match in doc.matches { - let attr = SchemaAttr::new(_match.attribute.attribute()); + let attr = SchemaAttr::new(_match.attribute); let name = schema.attribute_name(attr); matching_attributes.insert(name); } diff --git a/src/attribute.rs b/src/attribute.rs deleted file mode 100644 index 4c075e475..000000000 --- a/src/attribute.rs +++ /dev/null @@ -1,105 +0,0 @@ -use std::fmt; - -/// Represent an attribute number along with the word index -/// according to the tokenizer used. -/// -/// It can accept up to 1024 attributes and word positions -/// can be maximum 2^22. -#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] -pub struct Attribute(u32); - -impl Attribute { - /// Construct an `Attribute` from an attribute number and - /// the word position of a match according to the tokenizer used. - pub(crate) fn new(attribute: u16, index: u32) -> Result { - if attribute & 0b1111_1100_0000_0000 != 0 { - return Err(AttributeError::AttributeTooBig) - } - - if index & 0b1111_1111_1100_0000_0000_0000_0000 != 0 { - return Err(AttributeError::IndexTooBig) - } - - let attribute = u32::from(attribute) << 22; - Ok(Attribute(attribute | index)) - } - - /// Construct an `Attribute` from an attribute number and - /// the word position of a match according to the tokenizer used. - /// - /// # Panics - /// - /// The attribute must not be greater than 1024 - /// and the word index not greater than 2^22. - pub(crate) fn new_faillible(attribute: u16, index: u32) -> Attribute { - match Attribute::new(attribute, index) { - Ok(attribute) => attribute, - Err(AttributeError::AttributeTooBig) => { - panic!("attribute must not be greater than 1024") - }, - Err(AttributeError::IndexTooBig) => { - panic!("attribute word index must not be greater than 2^22") - }, - } - } - - pub(crate) fn max_value() -> Attribute { - Attribute(u32::max_value()) - } - - #[inline] - pub fn attribute(self) -> u16 { - (self.0 >> 22) as u16 - } - - #[inline] - pub fn word_index(self) -> u32 { - self.0 & 0b0000_0000_0011_1111_1111_1111_1111 - } -} - -impl fmt::Debug for Attribute { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - f.debug_struct("Attribute") - .field("attribute", &self.attribute()) - .field("word_index", &self.word_index()) - .finish() - } -} - -pub enum AttributeError { - AttributeTooBig, - IndexTooBig, -} - -#[cfg(test)] -mod tests { - use super::*; - use quickcheck::{quickcheck, TestResult}; - - quickcheck! { - fn qc_attribute(gen_attr: u16, gen_index: u32) -> TestResult { - if gen_attr > 2_u16.pow(10) || gen_index > 2_u32.pow(22) { - return TestResult::discard() - } - - let attribute = Attribute::new_faillible(gen_attr, gen_index); - - let valid_attribute = attribute.attribute() == gen_attr; - let valid_index = attribute.word_index() == gen_index; - - TestResult::from_bool(valid_attribute && valid_index) - } - - fn qc_attribute_ord(gen_attr: u16, gen_index: u32) -> TestResult { - if gen_attr >= 2_u16.pow(10) || gen_index >= 2_u32.pow(22) { - return TestResult::discard() - } - - let a = Attribute::new_faillible(gen_attr, gen_index); - let b = Attribute::new_faillible(gen_attr + 1, gen_index + 1); - - TestResult::from_bool(a < b) - } - } -} diff --git a/src/data/doc_indexes.rs b/src/data/doc_indexes.rs index b760765bf..67106a948 100644 --- a/src/data/doc_indexes.rs +++ b/src/data/doc_indexes.rs @@ -147,29 +147,32 @@ impl DocIndexesBuilder { #[cfg(test)] mod tests { - use super::*; - use std::error::Error; - use crate::{Attribute, WordArea}; - use crate::DocumentId; + use super::*; #[test] fn builder_serialize_deserialize() -> Result<(), Box> { let a = DocIndex { document_id: DocumentId(0), - attribute: Attribute::new_faillible(3, 11), - word_area: WordArea::new_faillible(30, 4) + attribute: 3, + word_index: 11, + char_index: 30, + char_length: 4, }; let b = DocIndex { document_id: DocumentId(1), - attribute: Attribute::new_faillible(4, 21), - word_area: WordArea::new_faillible(35, 6) + attribute: 4, + word_index: 21, + char_index: 35, + char_length: 6, }; let c = DocIndex { document_id: DocumentId(2), - attribute: Attribute::new_faillible(8, 2), - word_area: WordArea::new_faillible(89, 6) + attribute: 8, + word_index: 2, + char_index: 89, + char_length: 6, }; let mut builder = DocIndexesBuilder::memory(); @@ -193,18 +196,24 @@ mod tests { fn serialize_deserialize() -> Result<(), Box> { let a = DocIndex { document_id: DocumentId(0), - attribute: Attribute::new_faillible(3, 11), - word_area: WordArea::new_faillible(30, 4) + attribute: 3, + word_index: 11, + char_index: 30, + char_length: 4, }; let b = DocIndex { document_id: DocumentId(1), - attribute: Attribute::new_faillible(4, 21), - word_area: WordArea::new_faillible(35, 6) + attribute: 4, + word_index: 21, + char_index: 35, + char_length: 6, }; let c = DocIndex { document_id: DocumentId(2), - attribute: Attribute::new_faillible(8, 2), - word_area: WordArea::new_faillible(89, 6) + attribute: 8, + word_index: 2, + char_index: 89, + char_length: 6, }; let mut builder = DocIndexesBuilder::memory(); diff --git a/src/database/serde/indexer_serializer.rs b/src/database/serde/indexer_serializer.rs index bdbfb281d..63bb016d8 100644 --- a/src/database/serde/indexer_serializer.rs +++ b/src/database/serde/indexer_serializer.rs @@ -3,7 +3,7 @@ use crate::database::serde::SerializerError; use crate::database::schema::SchemaAttr; use crate::tokenizer::TokenizerBuilder; use crate::tokenizer::Token; -use crate::{DocumentId, DocIndex, Attribute, WordArea}; +use crate::{DocumentId, DocIndex}; use hashbrown::HashSet; use serde::Serialize; @@ -54,10 +54,8 @@ where B: TokenizerBuilder let document_id = self.document_id; // FIXME must u32::try_from instead - let attribute = match Attribute::new(self.attribute.0, word_index as u32) { - Ok(attribute) => attribute, - Err(_) => return Ok(()), - }; + let attribute = self.attribute.0; + let word_index = word_index as u32; // insert the exact representation let word_lower = word.to_lowercase(); @@ -68,21 +66,17 @@ where B: TokenizerBuilder // and the unidecoded lowercased version let word_unidecoded = unidecode::unidecode(word).to_lowercase(); if word_lower != word_unidecoded { - let word_area = match WordArea::new(char_index as u32, length) { - Ok(word_area) => word_area, - Err(_) => return Ok(()), - }; + let char_index = char_index as u32; + let char_length = length; - let doc_index = DocIndex { document_id, attribute, word_area }; + let doc_index = DocIndex { document_id, attribute, word_index, char_index, char_length }; self.update.insert_doc_index(word_unidecoded.into_bytes(), doc_index); } - let word_area = match WordArea::new(char_index as u32, length) { - Ok(word_area) => word_area, - Err(_) => return Ok(()), - }; + let char_index = char_index as u32; + let char_length = length; - let doc_index = DocIndex { document_id, attribute, word_area }; + let doc_index = DocIndex { document_id, attribute, word_index, char_index, char_length }; self.update.insert_doc_index(word_lower.into_bytes(), doc_index); } Ok(()) diff --git a/src/lib.rs b/src/lib.rs index 19e451f63..bfa0b3cd9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -5,16 +5,12 @@ pub mod database; pub mod data; pub mod rank; pub mod tokenizer; -mod attribute; -mod word_area; mod common_words; pub use rocksdb; pub use self::tokenizer::Tokenizer; pub use self::common_words::CommonWords; -pub use self::attribute::{Attribute, AttributeError}; -pub use self::word_area::{WordArea, WordAreaError}; /// Represent an internally generated document unique identifier. /// @@ -36,14 +32,16 @@ pub struct DocIndex { /// The attribute in the document where the word was found /// along with the index in it. - pub attribute: Attribute, + pub attribute: u16, + pub word_index: u32, /// The position in bytes where the word was found /// along with the length of it. /// /// It informs on the original word area in the text indexed /// without needing to run the tokenizer again. - pub word_area: WordArea, + pub char_index: u32, + pub char_length: u16, } /// This structure represent a matching word with informations @@ -68,7 +66,8 @@ pub struct Match { /// The attribute in the document where the word was found /// along with the index in it. - pub attribute: Attribute, + pub attribute: u16, + pub word_index: u32, /// Whether the word that match is an exact match or a prefix. pub is_exact: bool, @@ -78,7 +77,8 @@ pub struct Match { /// /// It informs on the original word area in the text indexed /// without needing to run the tokenizer again. - pub word_area: WordArea, + pub char_index: u32, + pub char_length: u16, } impl Match { @@ -86,9 +86,11 @@ impl Match { Match { query_index: 0, distance: 0, - attribute: Attribute::new_faillible(0, 0), + attribute: 0, + word_index: 0, is_exact: false, - word_area: WordArea::new_faillible(0, 0), + char_index: 0, + char_length: 0, } } @@ -96,9 +98,11 @@ impl Match { Match { query_index: u32::max_value(), distance: u8::max_value(), - attribute: Attribute::max_value(), + attribute: u16::max_value(), + word_index: u32::max_value(), is_exact: true, - word_area: WordArea::max_value(), + char_index: u32::max_value(), + char_length: u16::max_value(), } } } @@ -110,6 +114,6 @@ mod tests { #[test] fn docindex_mem_size() { - assert_eq!(mem::size_of::(), 16); + assert_eq!(mem::size_of::(), 24); } } diff --git a/src/rank/criterion/document_id.rs b/src/rank/criterion/document_id.rs index 2d8ca34c2..8e4cf91b5 100644 --- a/src/rank/criterion/document_id.rs +++ b/src/rank/criterion/document_id.rs @@ -1,19 +1,13 @@ use std::cmp::Ordering; -use std::ops::Deref; - -use rocksdb::DB; use crate::rank::criterion::Criterion; -use crate::database::DatabaseView; -use crate::rank::Document; +use crate::rank::RawDocument; #[derive(Debug, Clone, Copy)] pub struct DocumentId; -impl Criterion for DocumentId -where D: Deref -{ - fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView) -> Ordering { +impl Criterion for DocumentId { + fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { lhs.id.cmp(&rhs.id) } } diff --git a/src/rank/criterion/exact.rs b/src/rank/criterion/exact.rs index 574649ed6..54b5b7b9f 100644 --- a/src/rank/criterion/exact.rs +++ b/src/rank/criterion/exact.rs @@ -1,33 +1,40 @@ use std::cmp::Ordering; -use std::ops::Deref; -use rocksdb::DB; use slice_group_by::GroupBy; -use crate::rank::{match_query_index, Document}; use crate::rank::criterion::Criterion; -use crate::database::DatabaseView; -use crate::Match; +use crate::rank::RawDocument; #[inline] -fn contains_exact(matches: &&[Match]) -> bool { - matches.iter().any(|m| m.is_exact) -} +fn number_exact_matches(query_index: &[u32], is_exact: &[bool]) -> usize { + let mut count = 0; + let mut index = 0; -#[inline] -fn number_exact_matches(matches: &[Match]) -> usize { - matches.linear_group_by(match_query_index).filter(contains_exact).count() + for group in query_index.linear_group_by(PartialEq::eq) { + let len = group.len(); + count += is_exact[index..index + len].contains(&true) as usize; + index += len; + } + + count } #[derive(Debug, Clone, Copy)] pub struct Exact; -impl Criterion for Exact -where D: Deref -{ - fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView) -> Ordering { - let lhs = number_exact_matches(&lhs.matches); - let rhs = number_exact_matches(&rhs.matches); +impl Criterion for Exact { + fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { + let lhs = { + let query_index = lhs.query_index(); + let is_exact = lhs.is_exact(); + number_exact_matches(query_index, is_exact) + }; + + let rhs = { + let query_index = rhs.query_index(); + let is_exact = rhs.is_exact(); + number_exact_matches(query_index, is_exact) + }; lhs.cmp(&rhs).reverse() } diff --git a/src/rank/criterion/mod.rs b/src/rank/criterion/mod.rs index a5dc7ab26..07c6a37e1 100644 --- a/src/rank/criterion/mod.rs +++ b/src/rank/criterion/mod.rs @@ -8,12 +8,7 @@ mod sort_by; mod document_id; use std::cmp::Ordering; -use std::ops::Deref; - -use rocksdb::DB; - -use crate::database::DatabaseView; -use crate::rank::Document; +use crate::rank::RawDocument; pub use self::{ sum_of_typos::SumOfTypos, @@ -26,56 +21,47 @@ pub use self::{ document_id::DocumentId, }; -pub trait Criterion -where D: Deref -{ - fn evaluate(&self, lhs: &Document, rhs: &Document, view: &DatabaseView) -> Ordering; +pub trait Criterion: Send + Sync { + fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering; #[inline] - fn eq(&self, lhs: &Document, rhs: &Document, view: &DatabaseView) -> bool { - self.evaluate(lhs, rhs, view) == Ordering::Equal + fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool { + self.evaluate(lhs, rhs) == Ordering::Equal } } -impl<'a, D, T: Criterion + ?Sized> Criterion for &'a T -where D: Deref -{ - fn evaluate(&self, lhs: &Document, rhs: &Document, view: &DatabaseView) -> Ordering { - (**self).evaluate(lhs, rhs, view) +impl<'a, T: Criterion + ?Sized + Send + Sync> Criterion for &'a T { + fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { + (**self).evaluate(lhs, rhs) } - fn eq(&self, lhs: &Document, rhs: &Document, view: &DatabaseView) -> bool { - (**self).eq(lhs, rhs, view) + fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool { + (**self).eq(lhs, rhs) } } -impl + ?Sized> Criterion for Box -where D: Deref -{ - fn evaluate(&self, lhs: &Document, rhs: &Document, view: &DatabaseView) -> Ordering { - (**self).evaluate(lhs, rhs, view) +impl Criterion for Box { + fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { + (**self).evaluate(lhs, rhs) } - fn eq(&self, lhs: &Document, rhs: &Document, view: &DatabaseView) -> bool { - (**self).eq(lhs, rhs, view) + fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool { + (**self).eq(lhs, rhs) } } #[derive(Default)] -pub struct CriteriaBuilder -where D: Deref -{ - inner: Vec>> +pub struct CriteriaBuilder { + inner: Vec> } -impl CriteriaBuilder -where D: Deref +impl CriteriaBuilder { - pub fn new() -> CriteriaBuilder { + pub fn new() -> CriteriaBuilder { CriteriaBuilder { inner: Vec::new() } } - pub fn with_capacity(capacity: usize) -> CriteriaBuilder { + pub fn with_capacity(capacity: usize) -> CriteriaBuilder { CriteriaBuilder { inner: Vec::with_capacity(capacity) } } @@ -83,33 +69,29 @@ where D: Deref self.inner.reserve(additional) } - pub fn add(mut self, criterion: C) -> CriteriaBuilder - where C: 'static + Criterion, + pub fn add(mut self, criterion: C) -> CriteriaBuilder + where C: 'static + Criterion, { self.push(criterion); self } pub fn push(&mut self, criterion: C) - where C: 'static + Criterion, + where C: 'static + Criterion, { self.inner.push(Box::new(criterion)); } - pub fn build(self) -> Criteria { + pub fn build(self) -> Criteria { Criteria { inner: self.inner } } } -pub struct Criteria -where D: Deref -{ - inner: Vec>>, +pub struct Criteria { + inner: Vec>, } -impl Default for Criteria -where D: Deref -{ +impl Default for Criteria { fn default() -> Self { CriteriaBuilder::with_capacity(7) .add(SumOfTypos) @@ -123,10 +105,8 @@ where D: Deref } } -impl AsRef<[Box>]> for Criteria -where D: Deref -{ - fn as_ref(&self) -> &[Box>] { +impl AsRef<[Box]> for Criteria { + fn as_ref(&self) -> &[Box] { &self.inner } } diff --git a/src/rank/criterion/number_of_words.rs b/src/rank/criterion/number_of_words.rs index ac9ef9858..c8dd1edb4 100644 --- a/src/rank/criterion/number_of_words.rs +++ b/src/rank/criterion/number_of_words.rs @@ -1,28 +1,28 @@ use std::cmp::Ordering; -use std::ops::Deref; -use rocksdb::DB; use slice_group_by::GroupBy; -use crate::rank::{match_query_index, Document}; use crate::rank::criterion::Criterion; -use crate::database::DatabaseView; -use crate::Match; +use crate::rank::RawDocument; #[inline] -fn number_of_query_words(matches: &[Match]) -> usize { - matches.linear_group_by(match_query_index).count() +fn number_of_query_words(query_index: &[u32]) -> usize { + query_index.linear_group_by(PartialEq::eq).count() } #[derive(Debug, Clone, Copy)] pub struct NumberOfWords; -impl Criterion for NumberOfWords -where D: Deref -{ - fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView) -> Ordering { - let lhs = number_of_query_words(&lhs.matches); - let rhs = number_of_query_words(&rhs.matches); +impl Criterion for NumberOfWords { + fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { + let lhs = { + let query_index = lhs.query_index(); + number_of_query_words(query_index) + }; + let rhs = { + let query_index = rhs.query_index(); + number_of_query_words(query_index) + }; lhs.cmp(&rhs).reverse() } diff --git a/src/rank/criterion/sort_by.rs b/src/rank/criterion/sort_by.rs index 8f1fef11c..d1c7abf8c 100644 --- a/src/rank/criterion/sort_by.rs +++ b/src/rank/criterion/sort_by.rs @@ -7,7 +7,7 @@ use serde::de::DeserializeOwned; use crate::rank::criterion::Criterion; use crate::database::DatabaseView; -use crate::rank::Document; +use crate::rank::RawDocument; /// An helper struct that permit to sort documents by /// some of their stored attributes. @@ -24,7 +24,7 @@ use crate::rank::Document; /// /// # Example /// -/// ```no-test +/// ```ignore /// use serde_derive::Deserialize; /// use meilidb::rank::criterion::*; /// @@ -40,39 +40,40 @@ use crate::rank::Document; /// .add(SumOfWordsAttribute) /// .add(SumOfWordsPosition) /// .add(Exact) -/// .add(SortBy::::new()) +/// .add(SortBy::::new(&view)) /// .add(DocumentId); /// /// let criterion = builder.build(); /// /// ``` -pub struct SortBy { +pub struct SortBy<'a, T, D> +where D: Deref + Send + Sync, + T: Send + Sync +{ + view: &'a DatabaseView, _phantom: marker::PhantomData, } -impl SortBy { - pub fn new() -> Self { - SortBy::default() - } -} - -impl Default for SortBy { - fn default() -> SortBy { - SortBy { _phantom: marker::PhantomData } - } -} - -impl Criterion for SortBy -where D: Deref, - T: DeserializeOwned + Ord, +impl<'a, T, D> SortBy<'a, T, D> +where D: Deref + Send + Sync, + T: Send + Sync { - fn evaluate(&self, lhs: &Document, rhs: &Document, view: &DatabaseView) -> Ordering { - let lhs = match view.document_by_id::(lhs.id) { + pub fn new(view: &'a DatabaseView) -> Self { + SortBy { view, _phantom: marker::PhantomData } + } +} + +impl<'a, T, D> Criterion for SortBy<'a, T, D> +where D: Deref + Send + Sync, + T: DeserializeOwned + Ord + Send + Sync, +{ + fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { + let lhs = match self.view.document_by_id::(lhs.id) { Ok(doc) => Some(doc), Err(e) => { eprintln!("{}", e); None }, }; - let rhs = match view.document_by_id::(rhs.id) { + let rhs = match self.view.document_by_id::(rhs.id) { Ok(doc) => Some(doc), Err(e) => { eprintln!("{}", e); None }, }; diff --git a/src/rank/criterion/sum_of_typos.rs b/src/rank/criterion/sum_of_typos.rs index be742e787..5d98a42e7 100644 --- a/src/rank/criterion/sum_of_typos.rs +++ b/src/rank/criterion/sum_of_typos.rs @@ -1,24 +1,20 @@ use std::cmp::Ordering; -use std::ops::Deref; -use rocksdb::DB; use slice_group_by::GroupBy; -use crate::rank::{match_query_index, Document}; use crate::rank::criterion::Criterion; -use crate::database::DatabaseView; -use crate::Match; +use crate::rank::RawDocument; #[inline] -fn sum_matches_typos(matches: &[Match]) -> isize { +fn sum_matches_typos(query_index: &[u32], distance: &[u8]) -> isize { let mut sum_typos = 0; let mut number_words = 0; + let mut index = 0; - // note that GroupBy will never return an empty group - // so we can do this assumption safely - for group in matches.linear_group_by(match_query_index) { - sum_typos += unsafe { group.get_unchecked(0).distance as isize }; + for group in query_index.linear_group_by(PartialEq::eq) { + sum_typos += distance[index] as isize; number_words += 1; + index += group.len(); } sum_typos - number_words @@ -27,78 +23,42 @@ fn sum_matches_typos(matches: &[Match]) -> isize { #[derive(Debug, Clone, Copy)] pub struct SumOfTypos; -impl Criterion for SumOfTypos -where D: Deref -{ - fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView) -> Ordering { - let lhs = sum_matches_typos(&lhs.matches); - let rhs = sum_matches_typos(&rhs.matches); +impl Criterion for SumOfTypos { + fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { + let lhs = { + let query_index = lhs.query_index(); + let distance = lhs.distance(); + sum_matches_typos(query_index, distance) + }; + + let rhs = { + let query_index = rhs.query_index(); + let distance = rhs.distance(); + sum_matches_typos(query_index, distance) + }; lhs.cmp(&rhs) } } - #[cfg(test)] mod tests { use super::*; - use crate::{DocumentId, Attribute, WordArea}; - // typing: "Geox CEO" // // doc0: "Geox SpA: CEO and Executive" // doc1: "Mt. Gox CEO Resigns From Bitcoin Foundation" #[test] fn one_typo_reference() { - let doc0 = { - let matches = vec![ - Match { - query_index: 0, - distance: 0, - attribute: Attribute::new_faillible(0, 0), - is_exact: false, - word_area: WordArea::new_faillible(0, 6) - }, - Match { - query_index: 1, - distance: 0, - attribute: Attribute::new_faillible(0, 2), - is_exact: false, - word_area: WordArea::new_faillible(0, 6) - }, - ]; - Document { - id: DocumentId(0), - matches: matches, - } - }; + let query_index0 = &[0, 1]; + let distance0 = &[0, 0]; - let doc1 = { - let matches = vec![ - Match { - query_index: 0, - distance: 1, - attribute: Attribute::new_faillible(0, 0), - is_exact: false, - word_area: WordArea::new_faillible(0, 6) - }, - Match { - query_index: 1, - distance: 0, - attribute: Attribute::new_faillible(0, 2), - is_exact: false, - word_area: WordArea::new_faillible(0, 6) - }, - ]; - Document { - id: DocumentId(1), - matches: matches, - } - }; + let query_index1 = &[0, 1]; + let distance1 = &[1, 0]; - let lhs = sum_matches_typos(&doc0.matches); - let rhs = sum_matches_typos(&doc1.matches); + let lhs = sum_matches_typos(query_index0, distance0); + let rhs = sum_matches_typos(query_index1, distance1); assert_eq!(lhs.cmp(&rhs), Ordering::Less); } @@ -108,47 +68,14 @@ mod tests { // doc1: "bouton" #[test] fn no_typo() { - let doc0 = { - let matches = vec![ - Match { - query_index: 0, - distance: 0, - attribute: Attribute::new_faillible(0, 0), - is_exact: false, - word_area: WordArea::new_faillible(0, 6) - }, - Match { - query_index: 1, - distance: 0, - attribute: Attribute::new_faillible(0, 1), - is_exact: false, - word_area: WordArea::new_faillible(0, 6) - }, - ]; - Document { - id: DocumentId(0), - matches: matches, - } - }; + let query_index0 = &[0, 1]; + let distance0 = &[0, 0]; - let doc1 = { - let matches = vec![ - Match { - query_index: 0, - distance: 0, - attribute: Attribute::new_faillible(0, 0), - is_exact: false, - word_area: WordArea::new_faillible(0, 6) - }, - ]; - Document { - id: DocumentId(1), - matches: matches, - } - }; + let query_index1 = &[0]; + let distance1 = &[0]; - let lhs = sum_matches_typos(&doc0.matches); - let rhs = sum_matches_typos(&doc1.matches); + let lhs = sum_matches_typos(query_index0, distance0); + let rhs = sum_matches_typos(query_index1, distance1); assert_eq!(lhs.cmp(&rhs), Ordering::Less); } @@ -158,47 +85,14 @@ mod tests { // doc1: "bouton" #[test] fn one_typo() { - let doc0 = { - let matches = vec![ - Match { - query_index: 0, - distance: 0, - attribute: Attribute::new_faillible(0, 0), - is_exact: false, - word_area: WordArea::new_faillible(0, 6) - }, - Match { - query_index: 1, - distance: 1, - attribute: Attribute::new_faillible(0, 1), - is_exact: false, - word_area: WordArea::new_faillible(0, 6) - }, - ]; - Document { - id: DocumentId(0), - matches: matches, - } - }; + let query_index0 = &[0, 1]; + let distance0 = &[0, 1]; - let doc1 = { - let matches = vec![ - Match { - query_index: 0, - distance: 0, - attribute: Attribute::new_faillible(0, 0), - is_exact: false, - word_area: WordArea::new_faillible(0, 6) - }, - ]; - Document { - id: DocumentId(1), - matches: matches, - } - }; + let query_index1 = &[0]; + let distance1 = &[0]; - let lhs = sum_matches_typos(&doc0.matches); - let rhs = sum_matches_typos(&doc1.matches); + let lhs = sum_matches_typos(query_index0, distance0); + let rhs = sum_matches_typos(query_index1, distance1); assert_eq!(lhs.cmp(&rhs), Ordering::Equal); } } diff --git a/src/rank/criterion/sum_of_words_attribute.rs b/src/rank/criterion/sum_of_words_attribute.rs index fb4910c51..5c42f8552 100644 --- a/src/rank/criterion/sum_of_words_attribute.rs +++ b/src/rank/criterion/sum_of_words_attribute.rs @@ -1,32 +1,39 @@ use std::cmp::Ordering; -use std::ops::Deref; -use rocksdb::DB; use slice_group_by::GroupBy; -use crate::database::DatabaseView; -use crate::rank::{match_query_index, Document}; use crate::rank::criterion::Criterion; -use crate::Match; +use crate::rank::RawDocument; #[inline] -fn sum_matches_attributes(matches: &[Match]) -> usize { - // note that GroupBy will never return an empty group - // so we can do this assumption safely - matches.linear_group_by(match_query_index).map(|group| { - unsafe { group.get_unchecked(0).attribute.attribute() as usize } - }).sum() +fn sum_matches_attributes(query_index: &[u32], attribute: &[u16]) -> usize { + let mut sum_attributes = 0; + let mut index = 0; + + for group in query_index.linear_group_by(PartialEq::eq) { + sum_attributes += attribute[index] as usize; + index += group.len(); + } + + sum_attributes } #[derive(Debug, Clone, Copy)] pub struct SumOfWordsAttribute; -impl Criterion for SumOfWordsAttribute -where D: Deref -{ - fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView) -> Ordering { - let lhs = sum_matches_attributes(&lhs.matches); - let rhs = sum_matches_attributes(&rhs.matches); +impl Criterion for SumOfWordsAttribute { + fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { + let lhs = { + let query_index = lhs.query_index(); + let attribute = lhs.attribute(); + sum_matches_attributes(query_index, attribute) + }; + + let rhs = { + let query_index = rhs.query_index(); + let attribute = rhs.attribute(); + sum_matches_attributes(query_index, attribute) + }; lhs.cmp(&rhs) } diff --git a/src/rank/criterion/sum_of_words_position.rs b/src/rank/criterion/sum_of_words_position.rs index 0978ac5fd..ad93dc4a8 100644 --- a/src/rank/criterion/sum_of_words_position.rs +++ b/src/rank/criterion/sum_of_words_position.rs @@ -1,32 +1,39 @@ use std::cmp::Ordering; -use std::ops::Deref; -use rocksdb::DB; use slice_group_by::GroupBy; -use crate::database::DatabaseView; -use crate::rank::{match_query_index, Document}; use crate::rank::criterion::Criterion; -use crate::Match; +use crate::rank::RawDocument; #[inline] -fn sum_matches_attribute_index(matches: &[Match]) -> usize { - // note that GroupBy will never return an empty group - // so we can do this assumption safely - matches.linear_group_by(match_query_index).map(|group| { - unsafe { group.get_unchecked(0).attribute.word_index() as usize } - }).sum() +fn sum_matches_attribute_index(query_index: &[u32], word_index: &[u32]) -> usize { + let mut sum_word_index = 0; + let mut index = 0; + + for group in query_index.linear_group_by(PartialEq::eq) { + sum_word_index += word_index[index] as usize; + index += group.len(); + } + + sum_word_index } #[derive(Debug, Clone, Copy)] pub struct SumOfWordsPosition; -impl Criterion for SumOfWordsPosition -where D: Deref -{ - fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView) -> Ordering { - let lhs = sum_matches_attribute_index(&lhs.matches); - let rhs = sum_matches_attribute_index(&rhs.matches); +impl Criterion for SumOfWordsPosition { + fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { + let lhs = { + let query_index = lhs.query_index(); + let word_index = lhs.word_index(); + sum_matches_attribute_index(query_index, word_index) + }; + + let rhs = { + let query_index = rhs.query_index(); + let word_index = rhs.word_index(); + sum_matches_attribute_index(query_index, word_index) + }; lhs.cmp(&rhs) } diff --git a/src/rank/criterion/words_proximity.rs b/src/rank/criterion/words_proximity.rs index a61de6b62..614d8f7ff 100644 --- a/src/rank/criterion/words_proximity.rs +++ b/src/rank/criterion/words_proximity.rs @@ -1,16 +1,17 @@ use std::cmp::{self, Ordering}; -use std::ops::Deref; -use rocksdb::DB; use slice_group_by::GroupBy; -use crate::rank::{match_query_index, Document}; use crate::rank::criterion::Criterion; -use crate::database::DatabaseView; -use crate::Match; +use crate::rank::RawDocument; const MAX_DISTANCE: u32 = 8; +#[inline] +fn clone_tuple((a, b): (&T, &U)) -> (T, U) { + (a.clone(), b.clone()) +} + fn index_proximity(lhs: u32, rhs: u32) -> u32 { if lhs < rhs { cmp::min(rhs - lhs, MAX_DISTANCE) @@ -19,30 +20,48 @@ fn index_proximity(lhs: u32, rhs: u32) -> u32 { } } -fn attribute_proximity(lhs: &Match, rhs: &Match) -> u32 { - if lhs.attribute.attribute() != rhs.attribute.attribute() { return MAX_DISTANCE } - index_proximity(lhs.attribute.word_index(), rhs.attribute.word_index()) +fn attribute_proximity((lattr, lwi): (u16, u32), (rattr, rwi): (u16, u32)) -> u32 { + if lattr != rattr { return MAX_DISTANCE } + index_proximity(lwi, rwi) } -fn min_proximity(lhs: &[Match], rhs: &[Match]) -> u32 { +fn min_proximity((lattr, lwi): (&[u16], &[u32]), (rattr, rwi): (&[u16], &[u32])) -> u32 { let mut min_prox = u32::max_value(); - for a in lhs { - for b in rhs { + for a in lattr.iter().zip(lwi) { + for b in rattr.iter().zip(rwi) { + let a = clone_tuple(a); + let b = clone_tuple(b); min_prox = cmp::min(min_prox, attribute_proximity(a, b)); } } min_prox } -fn matches_proximity(matches: &[Match]) -> u32 { +fn matches_proximity(query_index: &[u32], attribute: &[u16], word_index: &[u32]) -> u32 { let mut proximity = 0; - let mut iter = matches.linear_group_by(match_query_index); - // iterate over groups by windows of size 2 - let mut last = iter.next(); + let mut index = 0; + let mut iter = query_index.linear_group_by(PartialEq::eq); + let mut last = iter.next().map(|group| { + let len = group.len(); + + let rattr = &attribute[index..index + len]; + let rwi = &word_index[index..index + len]; + index += len; + + (rattr, rwi) + }); + while let (Some(lhs), Some(rhs)) = (last, iter.next()) { + let len = rhs.len(); + + let rattr = &attribute[index..index + len]; + let rwi = &word_index[index..index + len]; + let rhs = (rattr, rwi); + proximity += min_proximity(lhs, rhs); last = Some(rhs); + index += len; } proximity @@ -51,24 +70,30 @@ fn matches_proximity(matches: &[Match]) -> u32 { #[derive(Debug, Clone, Copy)] pub struct WordsProximity; -impl Criterion for WordsProximity -where D: Deref -{ - fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView) -> Ordering { - let lhs = matches_proximity(&lhs.matches); - let rhs = matches_proximity(&rhs.matches); +impl Criterion for WordsProximity { + fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { + let lhs = { + let query_index = lhs.query_index(); + let attribute = lhs.attribute(); + let word_index = lhs.word_index(); + matches_proximity(query_index, attribute, word_index) + }; + + let rhs = { + let query_index = rhs.query_index(); + let attribute = rhs.attribute(); + let word_index = rhs.word_index(); + matches_proximity(query_index, attribute, word_index) + }; lhs.cmp(&rhs) } } - #[cfg(test)] mod tests { use super::*; - use crate::Attribute; - #[test] fn three_different_attributes() { @@ -80,18 +105,14 @@ mod tests { // { id: 2, attr: 2, attr_index: 0 } // { id: 3, attr: 3, attr_index: 1 } - let matches = &[ - Match { query_index: 0, attribute: Attribute::new_faillible(0, 0), ..Match::zero() }, - Match { query_index: 1, attribute: Attribute::new_faillible(1, 0), ..Match::zero() }, - Match { query_index: 2, attribute: Attribute::new_faillible(1, 1), ..Match::zero() }, - Match { query_index: 2, attribute: Attribute::new_faillible(2, 0), ..Match::zero() }, - Match { query_index: 3, attribute: Attribute::new_faillible(3, 1), ..Match::zero() }, - ]; + let query_index = &[0, 1, 2, 2, 3]; + let attribute = &[0, 1, 1, 2, 3]; + let word_index = &[0, 0, 1, 0, 1]; // soup -> of = 8 // + of -> the = 1 // + the -> day = 8 (not 1) - assert_eq!(matches_proximity(matches), 17); + assert_eq!(matches_proximity(query_index, attribute, word_index), 17); } #[test] @@ -106,57 +127,13 @@ mod tests { // { id: 3, attr: 0, attr_index: 1 } // { id: 3, attr: 1, attr_index: 3 } - let matches = &[ - Match { query_index: 0, attribute: Attribute::new_faillible(0, 0), ..Match::zero() }, - Match { query_index: 0, attribute: Attribute::new_faillible(1, 0), ..Match::zero() }, - Match { query_index: 1, attribute: Attribute::new_faillible(1, 1), ..Match::zero() }, - Match { query_index: 2, attribute: Attribute::new_faillible(1, 2), ..Match::zero() }, - Match { query_index: 3, attribute: Attribute::new_faillible(0, 1), ..Match::zero() }, - Match { query_index: 3, attribute: Attribute::new_faillible(1, 3), ..Match::zero() }, - ]; + let query_index = &[0, 0, 1, 2, 3, 3]; + let attribute = &[0, 1, 1, 1, 0, 1]; + let word_index = &[0, 0, 1, 2, 1, 3]; // soup -> of = 1 // + of -> the = 1 // + the -> day = 1 - assert_eq!(matches_proximity(matches), 3); - } -} - -#[cfg(all(feature = "nightly", test))] -mod bench { - extern crate test; - - use super::*; - use std::error::Error; - use self::test::Bencher; - - use rand_xorshift::XorShiftRng; - use rand::{Rng, SeedableRng}; - - use crate::Attribute; - - #[bench] - fn evaluate_proximity(bench: &mut Bencher) -> Result<(), Box> { - let number_matches = 30_000; - let mut matches = Vec::with_capacity(number_matches); - let mut rng = XorShiftRng::seed_from_u64(42); - - for _ in 0..number_matches { - let query_index = rng.gen_range(0, 4); - - let attribute = rng.gen_range(0, 5); - let word_index = rng.gen_range(0, 15); - let attribute = Attribute::new_faillible(attribute, word_index); - - let match_ = Match { query_index, attribute, ..Match::zero() }; - matches.push(match_); - } - - bench.iter(|| { - let proximity = matches_proximity(&matches); - test::black_box(move || proximity) - }); - - Ok(()) + assert_eq!(matches_proximity(query_index, attribute, word_index), 3); } } diff --git a/src/rank/mod.rs b/src/rank/mod.rs index 4d1b6b1ea..2c5a4bfc3 100644 --- a/src/rank/mod.rs +++ b/src/rank/mod.rs @@ -2,32 +2,182 @@ pub mod criterion; mod query_builder; mod distinct_map; +use std::sync::Arc; + +use slice_group_by::GroupBy; +use rayon::slice::ParallelSliceMut; + use crate::{Match, DocumentId}; pub use self::query_builder::{FilterFunc, QueryBuilder, DistinctQueryBuilder}; -#[inline] -fn match_query_index(a: &Match, b: &Match) -> bool { - a.query_index == b.query_index -} - -#[derive(Debug, Clone)] +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] pub struct Document { pub id: DocumentId, pub matches: Vec, } impl Document { - pub fn new(doc: DocumentId, match_: Match) -> Self { - unsafe { Self::from_sorted_matches(doc, vec![match_]) } - } + pub fn from_raw(raw: &RawDocument) -> Document { + let len = raw.matches.range.len(); + let mut matches = Vec::with_capacity(len); - pub fn from_matches(doc: DocumentId, mut matches: Vec) -> Self { - matches.sort_unstable(); - unsafe { Self::from_sorted_matches(doc, matches) } - } + let query_index = raw.query_index(); + let distance = raw.distance(); + let attribute = raw.attribute(); + let word_index = raw.word_index(); + let is_exact = raw.is_exact(); + let char_index = raw.char_index(); + let char_length = raw.char_length(); - pub unsafe fn from_sorted_matches(id: DocumentId, matches: Vec) -> Self { - Self { id, matches } + for i in 0..len { + let match_ = Match { + query_index: query_index[i], + distance: distance[i], + attribute: attribute[i], + word_index: word_index[i], + is_exact: is_exact[i], + char_index: char_index[i], + char_length: char_length[i], + }; + matches.push(match_); + } + + Document { id: raw.id, matches } + } +} + +#[derive(Clone)] +pub struct RawDocument { + pub id: DocumentId, + pub matches: SharedMatches, +} + +impl RawDocument { + fn new(id: DocumentId, range: Range, matches: Arc) -> RawDocument { + RawDocument { id, matches: SharedMatches { range, matches } } + } + + pub fn query_index(&self) -> &[u32] { + let r = self.matches.range; + // it is safe because construction/modifications + // can only be done in this module + unsafe { &self.matches.matches.query_index.get_unchecked(r.start..r.end) } + } + + pub fn distance(&self) -> &[u8] { + let r = self.matches.range; + // it is safe because construction/modifications + // can only be done in this module + unsafe { &self.matches.matches.distance.get_unchecked(r.start..r.end) } + } + + pub fn attribute(&self) -> &[u16] { + let r = self.matches.range; + // it is safe because construction/modifications + // can only be done in this module + unsafe { &self.matches.matches.attribute.get_unchecked(r.start..r.end) } + } + + pub fn word_index(&self) -> &[u32] { + let r = self.matches.range; + // it is safe because construction/modifications + // can only be done in this module + unsafe { &self.matches.matches.word_index.get_unchecked(r.start..r.end) } + } + + pub fn is_exact(&self) -> &[bool] { + let r = self.matches.range; + // it is safe because construction/modifications + // can only be done in this module + unsafe { &self.matches.matches.is_exact.get_unchecked(r.start..r.end) } + } + + pub fn char_index(&self) -> &[u32] { + let r = self.matches.range; + // it is safe because construction/modifications + // can only be done in this module + unsafe { &self.matches.matches.char_index.get_unchecked(r.start..r.end) } + } + + pub fn char_length(&self) -> &[u16] { + let r = self.matches.range; + // it is safe because construction/modifications + // can only be done in this module + unsafe { &self.matches.matches.char_length.get_unchecked(r.start..r.end) } + } +} + +pub fn raw_documents_from_matches(mut matches: Vec<(DocumentId, Match)>) -> Vec { + let mut docs_ranges = Vec::<(DocumentId, Range)>::new(); + let mut matches2 = Matches::with_capacity(matches.len()); + + matches.par_sort_unstable(); + + for group in matches.linear_group_by(|(a, _), (b, _)| a == b) { + let id = group[0].0; + let start = docs_ranges.last().map(|(_, r)| r.end).unwrap_or(0); + let end = start + group.len(); + docs_ranges.push((id, Range { start, end })); + + matches2.extend_from_slice(group); + } + + let matches = Arc::new(matches2); + docs_ranges.into_iter().map(|(i, r)| RawDocument::new(i, r, matches.clone())).collect() +} + +#[derive(Debug, Copy, Clone)] +struct Range { + start: usize, + end: usize, +} + +impl Range { + fn len(self) -> usize { + self.end - self.start + } +} + +#[derive(Clone)] +pub struct SharedMatches { + range: Range, + matches: Arc, +} + +#[derive(Clone)] +struct Matches { + query_index: Vec, + distance: Vec, + attribute: Vec, + word_index: Vec, + is_exact: Vec, + char_index: Vec, + char_length: Vec, +} + +impl Matches { + fn with_capacity(cap: usize) -> Matches { + Matches { + query_index: Vec::with_capacity(cap), + distance: Vec::with_capacity(cap), + attribute: Vec::with_capacity(cap), + word_index: Vec::with_capacity(cap), + is_exact: Vec::with_capacity(cap), + char_index: Vec::with_capacity(cap), + char_length: Vec::with_capacity(cap), + } + } + + fn extend_from_slice(&mut self, matches: &[(DocumentId, Match)]) { + for (_, match_) in matches { + self.query_index.push(match_.query_index); + self.distance.push(match_.distance); + self.attribute.push(match_.attribute); + self.word_index.push(match_.word_index); + self.is_exact.push(match_.is_exact); + self.char_index.push(match_.char_index); + self.char_length.push(match_.char_length); + } } } diff --git a/src/rank/query_builder.rs b/src/rank/query_builder.rs index 8146fc7fa..e6c49be6d 100644 --- a/src/rank/query_builder.rs +++ b/src/rank/query_builder.rs @@ -4,7 +4,9 @@ use std::error::Error; use std::hash::Hash; use std::rc::Rc; +use rayon::slice::ParallelSliceMut; use slice_group_by::GroupByMut; +use elapsed::measure_time; use hashbrown::HashMap; use fst::Streamer; use rocksdb::DB; @@ -15,7 +17,7 @@ use crate::rank::distinct_map::{DistinctMap, BufferedDistinctMap}; use crate::rank::criterion::Criteria; use crate::database::DatabaseView; use crate::{Match, DocumentId}; -use crate::rank::Document; +use crate::rank::{raw_documents_from_matches, RawDocument, Document}; fn split_whitespace_automatons(query: &str) -> Vec { let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace); @@ -41,7 +43,7 @@ pub struct QueryBuilder<'a, D, FI> where D: Deref { view: &'a DatabaseView, - criteria: Criteria, + criteria: Criteria, filter: Option, } @@ -56,7 +58,7 @@ where D: Deref impl<'a, D, FI> QueryBuilder<'a, D, FI> where D: Deref, { - pub fn with_criteria(view: &'a DatabaseView, criteria: Criteria) -> Result> { + pub fn with_criteria(view: &'a DatabaseView, criteria: Criteria) -> Result> { Ok(QueryBuilder { view, criteria, filter: None }) } @@ -81,7 +83,7 @@ where D: Deref, } } - fn query_all(&self, query: &str) -> Vec { + fn query_all(&self, query: &str) -> Vec { let automatons = split_whitespace_automatons(query); let mut stream = { @@ -93,8 +95,7 @@ where D: Deref, op_builder.union() }; - let mut number_matches = 0; - let mut matches = HashMap::new(); + let mut matches = Vec::new(); while let Some((input, indexed_values)) = stream.next() { for iv in indexed_values { @@ -105,24 +106,28 @@ where D: Deref, let doc_indexes = &self.view.index().positive.indexes(); let doc_indexes = &doc_indexes[iv.value as usize]; - number_matches += doc_indexes.len(); for doc_index in doc_indexes { let match_ = Match { query_index: iv.index as u32, distance: distance, attribute: doc_index.attribute, + word_index: doc_index.word_index, is_exact: is_exact, - word_area: doc_index.word_area, + char_index: doc_index.char_index, + char_length: doc_index.char_length, }; - matches.entry(doc_index.document_id).or_insert_with(Vec::new).push(match_); + matches.push((doc_index.document_id, match_)); } } } - info!("{} total documents to classify", matches.len()); - info!("{} total matches to classify", number_matches); + let total_matches = matches.len(); + let raw_documents = raw_documents_from_matches(matches); - matches.into_iter().map(|(i, m)| Document::from_matches(i, m)).collect() + info!("{} total documents to classify", raw_documents.len()); + info!("{} total matches to classify", total_matches); + + raw_documents } } @@ -138,11 +143,10 @@ where D: Deref, return builder.query(query, range); } - let (elapsed, mut documents) = elapsed::measure_time(|| self.query_all(query)); + let (elapsed, mut documents) = measure_time(|| self.query_all(query)); info!("query_all took {}", elapsed); let mut groups = vec![documents.as_mut_slice()]; - let view = &self.view; 'criteria: for (ci, criterion) in self.criteria.as_ref().iter().enumerate() { let tmp_groups = mem::replace(&mut groups, Vec::new()); @@ -159,12 +163,12 @@ where D: Deref, continue; } - let (elapsed, ()) = elapsed::measure_time(|| { - group.sort_unstable_by(|a, b| criterion.evaluate(a, b, view)); + let (elapsed, _) = measure_time(|| { + group.par_sort_unstable_by(|a, b| criterion.evaluate(a, b)); }); info!("criterion {} sort took {}", ci, elapsed); - for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b, view)) { + for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b)) { documents_seen += group.len(); groups.push(group); @@ -175,12 +179,9 @@ where D: Deref, } } - // `drain` removes the documents efficiently using `ptr::copy` - // TODO it could be more efficient to have a custom iterator let offset = cmp::min(documents.len(), range.start); - documents.drain(0..offset); - documents.truncate(range.len()); - documents + let iter = documents.into_iter().skip(offset).take(range.len()); + iter.map(|d| Document::from_raw(&d)).collect() } } @@ -213,7 +214,9 @@ where D: Deref, K: Hash + Eq, { pub fn query(self, query: &str, range: Range) -> Vec { - let mut documents = self.inner.query_all(query); + let (elapsed, mut documents) = measure_time(|| self.inner.query_all(query)); + info!("query_all took {}", elapsed); + let mut groups = vec![documents.as_mut_slice()]; let mut key_cache = HashMap::new(); let view = &self.inner.view; @@ -225,12 +228,14 @@ where D: Deref, let mut distinct_map = DistinctMap::new(self.size); let mut distinct_raw_offset = 0; - 'criteria: for criterion in self.inner.criteria.as_ref() { + 'criteria: for (ci, criterion) in self.inner.criteria.as_ref().iter().enumerate() { let tmp_groups = mem::replace(&mut groups, Vec::new()); let mut buf_distinct = BufferedDistinctMap::new(&mut distinct_map); let mut documents_seen = 0; for group in tmp_groups { + info!("criterion {}, documents group of size {}", ci, group.len()); + // if this group does not overlap with the requested range, // push it without sorting and splitting it if documents_seen + group.len() < distinct_raw_offset { @@ -239,9 +244,12 @@ where D: Deref, continue; } - group.sort_unstable_by(|a, b| criterion.evaluate(a, b, view)); + let (elapsed, _) = measure_time(|| { + group.par_sort_unstable_by(|a, b| criterion.evaluate(a, b)); + }); + info!("criterion {} sort took {}", ci, elapsed); - for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b, view)) { + for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b)) { // we must compute the real distinguished len of this sub-group for document in group.iter() { let filter_accepted = match &self.inner.filter { @@ -300,7 +308,7 @@ where D: Deref, }; if distinct_accepted && seen.len() > range.start { - out_documents.push(document); + out_documents.push(Document::from_raw(&document)); if out_documents.len() == range.len() { break } } } diff --git a/src/word_area.rs b/src/word_area.rs deleted file mode 100644 index 593b462a6..000000000 --- a/src/word_area.rs +++ /dev/null @@ -1,102 +0,0 @@ -use std::fmt; - -/// Represent a word position in bytes along with the length of it. -/// -/// It can represent words byte index to maximum 2^22 and -/// up to words of length 1024. -#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] -pub struct WordArea(u32); - -impl WordArea { - /// Construct a `WordArea` from a word position in expresed as - /// a number of characters and the length of it. - /// - /// # Panics - /// - /// The char index must not be greater than 2^22 - /// and the length not greater than 1024. - pub(crate) fn new(char_index: u32, length: u16) -> Result { - if char_index & 0b1111_1111_1100_0000_0000_0000_0000 != 0 { - return Err(WordAreaError::ByteIndexTooBig) - } - - if length & 0b1111_1100_0000_0000 != 0 { - return Err(WordAreaError::LengthTooBig) - } - - let char_index = char_index << 10; - Ok(WordArea(char_index | u32::from(length))) - } - - pub(crate) fn new_faillible(char_index: u32, length: u16) -> WordArea { - match WordArea::new(char_index, length) { - Ok(word_area) => word_area, - Err(WordAreaError::ByteIndexTooBig) => { - panic!("word area byte index must not be greater than 2^22") - }, - Err(WordAreaError::LengthTooBig) => { - panic!("word area length must not be greater than 1024") - }, - } - } - - pub(crate) fn max_value() -> WordArea { - WordArea(u32::max_value()) - } - - #[inline] - pub fn char_index(self) -> u32 { - self.0 >> 10 - } - - #[inline] - pub fn length(self) -> u16 { - (self.0 & 0b0000_0000_0000_0000_0011_1111_1111) as u16 - } -} - -impl fmt::Debug for WordArea { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - f.debug_struct("WordArea") - .field("char_index", &self.char_index()) - .field("length", &self.length()) - .finish() - } -} - -pub enum WordAreaError { - ByteIndexTooBig, - LengthTooBig, -} - -#[cfg(test)] -mod tests { - use super::*; - use quickcheck::{quickcheck, TestResult}; - - quickcheck! { - fn qc_word_area(gen_char_index: u32, gen_length: u16) -> TestResult { - if gen_char_index > 2_u32.pow(22) || gen_length > 2_u16.pow(10) { - return TestResult::discard() - } - - let word_area = WordArea::new_faillible(gen_char_index, gen_length); - - let valid_char_index = word_area.char_index() == gen_char_index; - let valid_length = word_area.length() == gen_length; - - TestResult::from_bool(valid_char_index && valid_length) - } - - fn qc_word_area_ord(gen_char_index: u32, gen_length: u16) -> TestResult { - if gen_char_index >= 2_u32.pow(22) || gen_length >= 2_u16.pow(10) { - return TestResult::discard() - } - - let a = WordArea::new_faillible(gen_char_index, gen_length); - let b = WordArea::new_faillible(gen_char_index + 1, gen_length + 1); - - TestResult::from_bool(a < b) - } - } -}