From a3a28c56fad032cf832bc610b8051d0ffd7c76a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sat, 2 Feb 2019 14:17:50 +0100 Subject: [PATCH 1/6] feat: Replace compressed Match fields by uncompressed ones --- examples/query-database.rs | 10 +++---- src/data/doc_indexes.rs | 36 ++++++++++++++++-------- src/database/serde/indexer_serializer.rs | 22 ++++++--------- src/lib.rs | 26 +++++++++++------ src/rank/query_builder.rs | 4 ++- 5 files changed, 56 insertions(+), 42 deletions(-) diff --git a/examples/query-database.rs b/examples/query-database.rs index 0a8771a51..d1e6a0e17 100644 --- a/examples/query-database.rs +++ b/examples/query-database.rs @@ -70,12 +70,10 @@ fn create_highlight_areas(text: &str, matches: &[Match], attribute: SchemaAttr) let mut byte_indexes = BTreeMap::new(); for match_ in matches { - let match_attribute = match_.attribute.attribute(); + let match_attribute = match_.attribute; if SchemaAttr::new(match_attribute) == attribute { - let word_area = match_.word_area; - - let char_index = word_area.char_index() as usize; - let char_length = word_area.length() as usize; + let char_index = match_.char_index as usize; + let char_length = match_.char_length as usize; let (byte_index, byte_length) = char_to_byte_range(char_index, char_length, text); match byte_indexes.entry(byte_index) { @@ -151,7 +149,7 @@ fn main() -> Result<(), Box> { let mut matching_attributes = HashSet::new(); for _match in doc.matches { - let attr = SchemaAttr::new(_match.attribute.attribute()); + let attr = SchemaAttr::new(_match.attribute); let name = schema.attribute_name(attr); matching_attributes.insert(name); } diff --git a/src/data/doc_indexes.rs b/src/data/doc_indexes.rs index b760765bf..4919b9fa0 100644 --- a/src/data/doc_indexes.rs +++ b/src/data/doc_indexes.rs @@ -158,18 +158,24 @@ mod tests { fn builder_serialize_deserialize() -> Result<(), Box> { let a = DocIndex { document_id: DocumentId(0), - attribute: Attribute::new_faillible(3, 11), - word_area: WordArea::new_faillible(30, 4) + attribute: 3, + word_index: 11, + char_index: 30, + char_length: 4, }; let b = DocIndex { document_id: DocumentId(1), - attribute: Attribute::new_faillible(4, 21), - word_area: WordArea::new_faillible(35, 6) + attribute: 4, + word_index: 21, + char_index: 35, + char_length: 6, }; let c = DocIndex { document_id: DocumentId(2), - attribute: Attribute::new_faillible(8, 2), - word_area: WordArea::new_faillible(89, 6) + attribute: 8, + word_index: 2, + char_index: 89, + char_length: 6, }; let mut builder = DocIndexesBuilder::memory(); @@ -193,18 +199,24 @@ mod tests { fn serialize_deserialize() -> Result<(), Box> { let a = DocIndex { document_id: DocumentId(0), - attribute: Attribute::new_faillible(3, 11), - word_area: WordArea::new_faillible(30, 4) + attribute: 3, + word_index: 11, + char_index: 30, + char_length: 4, }; let b = DocIndex { document_id: DocumentId(1), - attribute: Attribute::new_faillible(4, 21), - word_area: WordArea::new_faillible(35, 6) + attribute: 4, + word_index: 21, + char_index: 35, + char_length: 6, }; let c = DocIndex { document_id: DocumentId(2), - attribute: Attribute::new_faillible(8, 2), - word_area: WordArea::new_faillible(89, 6) + attribute: 8, + word_index: 2, + char_index: 89, + char_length: 6, }; let mut builder = DocIndexesBuilder::memory(); diff --git a/src/database/serde/indexer_serializer.rs b/src/database/serde/indexer_serializer.rs index bdbfb281d..6271e1b7b 100644 --- a/src/database/serde/indexer_serializer.rs +++ 
b/src/database/serde/indexer_serializer.rs @@ -54,10 +54,8 @@ where B: TokenizerBuilder let document_id = self.document_id; // FIXME must u32::try_from instead - let attribute = match Attribute::new(self.attribute.0, word_index as u32) { - Ok(attribute) => attribute, - Err(_) => return Ok(()), - }; + let attribute = self.attribute.0; + let word_index = word_index as u32; // insert the exact representation let word_lower = word.to_lowercase(); @@ -68,21 +66,17 @@ where B: TokenizerBuilder // and the unidecoded lowercased version let word_unidecoded = unidecode::unidecode(word).to_lowercase(); if word_lower != word_unidecoded { - let word_area = match WordArea::new(char_index as u32, length) { - Ok(word_area) => word_area, - Err(_) => return Ok(()), - }; + let char_index = char_index as u32; + let char_length = length; - let doc_index = DocIndex { document_id, attribute, word_area }; + let doc_index = DocIndex { document_id, attribute, word_index, char_index, char_length }; self.update.insert_doc_index(word_unidecoded.into_bytes(), doc_index); } - let word_area = match WordArea::new(char_index as u32, length) { - Ok(word_area) => word_area, - Err(_) => return Ok(()), - }; + let char_index = char_index as u32; + let char_length = length; - let doc_index = DocIndex { document_id, attribute, word_area }; + let doc_index = DocIndex { document_id, attribute, word_index, char_index, char_length }; self.update.insert_doc_index(word_lower.into_bytes(), doc_index); } Ok(()) diff --git a/src/lib.rs b/src/lib.rs index 19e451f63..5f824b39a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -36,14 +36,16 @@ pub struct DocIndex { /// The attribute in the document where the word was found /// along with the index in it. - pub attribute: Attribute, + pub attribute: u16, + pub word_index: u32, /// The position in bytes where the word was found /// along with the length of it. /// /// It informs on the original word area in the text indexed /// without needing to run the tokenizer again. - pub word_area: WordArea, + pub char_index: u32, + pub char_length: u16, } /// This structure represent a matching word with informations @@ -68,7 +70,8 @@ pub struct Match { /// The attribute in the document where the word was found /// along with the index in it. - pub attribute: Attribute, + pub attribute: u16, + pub word_index: u32, /// Whether the word that match is an exact match or a prefix. pub is_exact: bool, @@ -78,7 +81,8 @@ pub struct Match { /// /// It informs on the original word area in the text indexed /// without needing to run the tokenizer again. 
- pub word_area: WordArea, + pub char_index: u32, + pub char_length: u16, } impl Match { @@ -86,9 +90,11 @@ impl Match { Match { query_index: 0, distance: 0, - attribute: Attribute::new_faillible(0, 0), + attribute: 0, + word_index: 0, is_exact: false, - word_area: WordArea::new_faillible(0, 0), + char_index: 0, + char_length: 0, } } @@ -96,9 +102,11 @@ impl Match { Match { query_index: u32::max_value(), distance: u8::max_value(), - attribute: Attribute::max_value(), + attribute: u16::max_value(), + word_index: u32::max_value(), is_exact: true, - word_area: WordArea::max_value(), + char_index: u32::max_value(), + char_length: u16::max_value(), } } } @@ -110,6 +118,6 @@ mod tests { #[test] fn docindex_mem_size() { - assert_eq!(mem::size_of::(), 16); + assert_eq!(mem::size_of::(), 24); } } diff --git a/src/rank/query_builder.rs b/src/rank/query_builder.rs index 8146fc7fa..91d645160 100644 --- a/src/rank/query_builder.rs +++ b/src/rank/query_builder.rs @@ -111,8 +111,10 @@ where D: Deref, query_index: iv.index as u32, distance: distance, attribute: doc_index.attribute, + word_index: doc_index.word_index, is_exact: is_exact, - word_area: doc_index.word_area, + char_index: doc_index.char_index, + char_length: doc_index.char_length, }; matches.entry(doc_index.document_id).or_insert_with(Vec::new).push(match_); } From 455cbf3bf46ad249da18b398086f52f728d513e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sat, 2 Feb 2019 14:22:31 +0100 Subject: [PATCH 2/6] feat: Make the search algorithm become fully data oriented --- Cargo.toml | 1 + src/rank/criterion/exact.rs | 41 +++-- src/rank/criterion/mod.rs | 7 +- src/rank/criterion/number_of_words.rs | 26 +-- src/rank/criterion/sort_by.rs | 2 +- src/rank/criterion/sum_of_typos.rs | 180 ++++--------------- src/rank/criterion/sum_of_words_attribute.rs | 41 +++-- src/rank/criterion/sum_of_words_position.rs | 41 +++-- src/rank/criterion/words_proximity.rs | 96 ++++++---- src/rank/mod.rs | 180 +++++++++++++++++-- src/rank/query_builder.rs | 46 +++-- 11 files changed, 375 insertions(+), 286 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 5e7bba1fb..572cbf2aa 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,6 +21,7 @@ serde_derive = "1.0" serde_json = { version = "1.0", features = ["preserve_order"] } slice-group-by = "0.2" unidecode = "0.3" +rayon = "1.0" [dependencies.toml] git = "https://github.com/Kerollmops/toml-rs.git" diff --git a/src/rank/criterion/exact.rs b/src/rank/criterion/exact.rs index 574649ed6..54b5b7b9f 100644 --- a/src/rank/criterion/exact.rs +++ b/src/rank/criterion/exact.rs @@ -1,33 +1,40 @@ use std::cmp::Ordering; -use std::ops::Deref; -use rocksdb::DB; use slice_group_by::GroupBy; -use crate::rank::{match_query_index, Document}; use crate::rank::criterion::Criterion; -use crate::database::DatabaseView; -use crate::Match; +use crate::rank::RawDocument; #[inline] -fn contains_exact(matches: &&[Match]) -> bool { - matches.iter().any(|m| m.is_exact) -} +fn number_exact_matches(query_index: &[u32], is_exact: &[bool]) -> usize { + let mut count = 0; + let mut index = 0; -#[inline] -fn number_exact_matches(matches: &[Match]) -> usize { - matches.linear_group_by(match_query_index).filter(contains_exact).count() + for group in query_index.linear_group_by(PartialEq::eq) { + let len = group.len(); + count += is_exact[index..index + len].contains(&true) as usize; + index += len; + } + + count } #[derive(Debug, Clone, Copy)] pub struct Exact; -impl Criterion for Exact -where D: Deref -{ - fn evaluate(&self, lhs: 
&Document, rhs: &Document, _: &DatabaseView) -> Ordering { - let lhs = number_exact_matches(&lhs.matches); - let rhs = number_exact_matches(&rhs.matches); +impl Criterion for Exact { + fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { + let lhs = { + let query_index = lhs.query_index(); + let is_exact = lhs.is_exact(); + number_exact_matches(query_index, is_exact) + }; + + let rhs = { + let query_index = rhs.query_index(); + let is_exact = rhs.is_exact(); + number_exact_matches(query_index, is_exact) + }; lhs.cmp(&rhs).reverse() } diff --git a/src/rank/criterion/mod.rs b/src/rank/criterion/mod.rs index a5dc7ab26..c7c547851 100644 --- a/src/rank/criterion/mod.rs +++ b/src/rank/criterion/mod.rs @@ -4,16 +4,13 @@ mod words_proximity; mod sum_of_words_attribute; mod sum_of_words_position; mod exact; -mod sort_by; +// mod sort_by; mod document_id; use std::cmp::Ordering; -use std::ops::Deref; - -use rocksdb::DB; use crate::database::DatabaseView; -use crate::rank::Document; +use crate::rank::RawDocument; pub use self::{ sum_of_typos::SumOfTypos, diff --git a/src/rank/criterion/number_of_words.rs b/src/rank/criterion/number_of_words.rs index ac9ef9858..c8dd1edb4 100644 --- a/src/rank/criterion/number_of_words.rs +++ b/src/rank/criterion/number_of_words.rs @@ -1,28 +1,28 @@ use std::cmp::Ordering; -use std::ops::Deref; -use rocksdb::DB; use slice_group_by::GroupBy; -use crate::rank::{match_query_index, Document}; use crate::rank::criterion::Criterion; -use crate::database::DatabaseView; -use crate::Match; +use crate::rank::RawDocument; #[inline] -fn number_of_query_words(matches: &[Match]) -> usize { - matches.linear_group_by(match_query_index).count() +fn number_of_query_words(query_index: &[u32]) -> usize { + query_index.linear_group_by(PartialEq::eq).count() } #[derive(Debug, Clone, Copy)] pub struct NumberOfWords; -impl Criterion for NumberOfWords -where D: Deref -{ - fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView) -> Ordering { - let lhs = number_of_query_words(&lhs.matches); - let rhs = number_of_query_words(&rhs.matches); +impl Criterion for NumberOfWords { + fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { + let lhs = { + let query_index = lhs.query_index(); + number_of_query_words(query_index) + }; + let rhs = { + let query_index = rhs.query_index(); + number_of_query_words(query_index) + }; lhs.cmp(&rhs).reverse() } diff --git a/src/rank/criterion/sort_by.rs b/src/rank/criterion/sort_by.rs index 8f1fef11c..53b8bcac1 100644 --- a/src/rank/criterion/sort_by.rs +++ b/src/rank/criterion/sort_by.rs @@ -7,7 +7,7 @@ use serde::de::DeserializeOwned; use crate::rank::criterion::Criterion; use crate::database::DatabaseView; -use crate::rank::Document; +use crate::rank::RawDocument; /// An helper struct that permit to sort documents by /// some of their stored attributes. 
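
All of the criteria rewritten in this patch follow the same data-oriented shape: instead of walking a `&[Match]`, they receive parallel column slices and use runs of equal `query_index` values as group boundaries. The snippet below restates the `number_exact_matches` helper from the exact.rs hunk above as a self-contained program, with a made-up input to show how the columns line up; the only dependency assumed is the slice-group-by 0.2 crate already listed in Cargo.toml.

    use slice_group_by::GroupBy;

    // Matches arrive as parallel column slices, sorted so that matches sharing
    // a query_index form consecutive runs. A criterion walks the query_index
    // column to find group boundaries, then reads the column it cares about
    // through a running offset.
    fn number_exact_matches(query_index: &[u32], is_exact: &[bool]) -> usize {
        let mut count = 0;
        let mut index = 0;

        for group in query_index.linear_group_by(PartialEq::eq) {
            let len = group.len();
            // a query word counts as exact if any of its matches is exact
            count += is_exact[index..index + len].contains(&true) as usize;
            index += len;
        }

        count
    }

    fn main() {
        // three query words; the middle one matched twice, but never exactly
        let query_index = [0u32, 1, 1, 2];
        let is_exact = [true, false, false, true];
        assert_eq!(number_exact_matches(&query_index, &is_exact), 2);
    }

The same offset-plus-group-length walk reappears below in sum_of_typos, sum_of_words_attribute, sum_of_words_position and words_proximity.
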
diff --git a/src/rank/criterion/sum_of_typos.rs b/src/rank/criterion/sum_of_typos.rs index be742e787..5d98a42e7 100644 --- a/src/rank/criterion/sum_of_typos.rs +++ b/src/rank/criterion/sum_of_typos.rs @@ -1,24 +1,20 @@ use std::cmp::Ordering; -use std::ops::Deref; -use rocksdb::DB; use slice_group_by::GroupBy; -use crate::rank::{match_query_index, Document}; use crate::rank::criterion::Criterion; -use crate::database::DatabaseView; -use crate::Match; +use crate::rank::RawDocument; #[inline] -fn sum_matches_typos(matches: &[Match]) -> isize { +fn sum_matches_typos(query_index: &[u32], distance: &[u8]) -> isize { let mut sum_typos = 0; let mut number_words = 0; + let mut index = 0; - // note that GroupBy will never return an empty group - // so we can do this assumption safely - for group in matches.linear_group_by(match_query_index) { - sum_typos += unsafe { group.get_unchecked(0).distance as isize }; + for group in query_index.linear_group_by(PartialEq::eq) { + sum_typos += distance[index] as isize; number_words += 1; + index += group.len(); } sum_typos - number_words @@ -27,78 +23,42 @@ fn sum_matches_typos(matches: &[Match]) -> isize { #[derive(Debug, Clone, Copy)] pub struct SumOfTypos; -impl Criterion for SumOfTypos -where D: Deref -{ - fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView) -> Ordering { - let lhs = sum_matches_typos(&lhs.matches); - let rhs = sum_matches_typos(&rhs.matches); +impl Criterion for SumOfTypos { + fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { + let lhs = { + let query_index = lhs.query_index(); + let distance = lhs.distance(); + sum_matches_typos(query_index, distance) + }; + + let rhs = { + let query_index = rhs.query_index(); + let distance = rhs.distance(); + sum_matches_typos(query_index, distance) + }; lhs.cmp(&rhs) } } - #[cfg(test)] mod tests { use super::*; - use crate::{DocumentId, Attribute, WordArea}; - // typing: "Geox CEO" // // doc0: "Geox SpA: CEO and Executive" // doc1: "Mt. 
Gox CEO Resigns From Bitcoin Foundation" #[test] fn one_typo_reference() { - let doc0 = { - let matches = vec![ - Match { - query_index: 0, - distance: 0, - attribute: Attribute::new_faillible(0, 0), - is_exact: false, - word_area: WordArea::new_faillible(0, 6) - }, - Match { - query_index: 1, - distance: 0, - attribute: Attribute::new_faillible(0, 2), - is_exact: false, - word_area: WordArea::new_faillible(0, 6) - }, - ]; - Document { - id: DocumentId(0), - matches: matches, - } - }; + let query_index0 = &[0, 1]; + let distance0 = &[0, 0]; - let doc1 = { - let matches = vec![ - Match { - query_index: 0, - distance: 1, - attribute: Attribute::new_faillible(0, 0), - is_exact: false, - word_area: WordArea::new_faillible(0, 6) - }, - Match { - query_index: 1, - distance: 0, - attribute: Attribute::new_faillible(0, 2), - is_exact: false, - word_area: WordArea::new_faillible(0, 6) - }, - ]; - Document { - id: DocumentId(1), - matches: matches, - } - }; + let query_index1 = &[0, 1]; + let distance1 = &[1, 0]; - let lhs = sum_matches_typos(&doc0.matches); - let rhs = sum_matches_typos(&doc1.matches); + let lhs = sum_matches_typos(query_index0, distance0); + let rhs = sum_matches_typos(query_index1, distance1); assert_eq!(lhs.cmp(&rhs), Ordering::Less); } @@ -108,47 +68,14 @@ mod tests { // doc1: "bouton" #[test] fn no_typo() { - let doc0 = { - let matches = vec![ - Match { - query_index: 0, - distance: 0, - attribute: Attribute::new_faillible(0, 0), - is_exact: false, - word_area: WordArea::new_faillible(0, 6) - }, - Match { - query_index: 1, - distance: 0, - attribute: Attribute::new_faillible(0, 1), - is_exact: false, - word_area: WordArea::new_faillible(0, 6) - }, - ]; - Document { - id: DocumentId(0), - matches: matches, - } - }; + let query_index0 = &[0, 1]; + let distance0 = &[0, 0]; - let doc1 = { - let matches = vec![ - Match { - query_index: 0, - distance: 0, - attribute: Attribute::new_faillible(0, 0), - is_exact: false, - word_area: WordArea::new_faillible(0, 6) - }, - ]; - Document { - id: DocumentId(1), - matches: matches, - } - }; + let query_index1 = &[0]; + let distance1 = &[0]; - let lhs = sum_matches_typos(&doc0.matches); - let rhs = sum_matches_typos(&doc1.matches); + let lhs = sum_matches_typos(query_index0, distance0); + let rhs = sum_matches_typos(query_index1, distance1); assert_eq!(lhs.cmp(&rhs), Ordering::Less); } @@ -158,47 +85,14 @@ mod tests { // doc1: "bouton" #[test] fn one_typo() { - let doc0 = { - let matches = vec![ - Match { - query_index: 0, - distance: 0, - attribute: Attribute::new_faillible(0, 0), - is_exact: false, - word_area: WordArea::new_faillible(0, 6) - }, - Match { - query_index: 1, - distance: 1, - attribute: Attribute::new_faillible(0, 1), - is_exact: false, - word_area: WordArea::new_faillible(0, 6) - }, - ]; - Document { - id: DocumentId(0), - matches: matches, - } - }; + let query_index0 = &[0, 1]; + let distance0 = &[0, 1]; - let doc1 = { - let matches = vec![ - Match { - query_index: 0, - distance: 0, - attribute: Attribute::new_faillible(0, 0), - is_exact: false, - word_area: WordArea::new_faillible(0, 6) - }, - ]; - Document { - id: DocumentId(1), - matches: matches, - } - }; + let query_index1 = &[0]; + let distance1 = &[0]; - let lhs = sum_matches_typos(&doc0.matches); - let rhs = sum_matches_typos(&doc1.matches); + let lhs = sum_matches_typos(query_index0, distance0); + let rhs = sum_matches_typos(query_index1, distance1); assert_eq!(lhs.cmp(&rhs), Ordering::Equal); } } diff --git a/src/rank/criterion/sum_of_words_attribute.rs 
b/src/rank/criterion/sum_of_words_attribute.rs index fb4910c51..5c42f8552 100644 --- a/src/rank/criterion/sum_of_words_attribute.rs +++ b/src/rank/criterion/sum_of_words_attribute.rs @@ -1,32 +1,39 @@ use std::cmp::Ordering; -use std::ops::Deref; -use rocksdb::DB; use slice_group_by::GroupBy; -use crate::database::DatabaseView; -use crate::rank::{match_query_index, Document}; use crate::rank::criterion::Criterion; -use crate::Match; +use crate::rank::RawDocument; #[inline] -fn sum_matches_attributes(matches: &[Match]) -> usize { - // note that GroupBy will never return an empty group - // so we can do this assumption safely - matches.linear_group_by(match_query_index).map(|group| { - unsafe { group.get_unchecked(0).attribute.attribute() as usize } - }).sum() +fn sum_matches_attributes(query_index: &[u32], attribute: &[u16]) -> usize { + let mut sum_attributes = 0; + let mut index = 0; + + for group in query_index.linear_group_by(PartialEq::eq) { + sum_attributes += attribute[index] as usize; + index += group.len(); + } + + sum_attributes } #[derive(Debug, Clone, Copy)] pub struct SumOfWordsAttribute; -impl Criterion for SumOfWordsAttribute -where D: Deref -{ - fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView) -> Ordering { - let lhs = sum_matches_attributes(&lhs.matches); - let rhs = sum_matches_attributes(&rhs.matches); +impl Criterion for SumOfWordsAttribute { + fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { + let lhs = { + let query_index = lhs.query_index(); + let attribute = lhs.attribute(); + sum_matches_attributes(query_index, attribute) + }; + + let rhs = { + let query_index = rhs.query_index(); + let attribute = rhs.attribute(); + sum_matches_attributes(query_index, attribute) + }; lhs.cmp(&rhs) } diff --git a/src/rank/criterion/sum_of_words_position.rs b/src/rank/criterion/sum_of_words_position.rs index 0978ac5fd..ad93dc4a8 100644 --- a/src/rank/criterion/sum_of_words_position.rs +++ b/src/rank/criterion/sum_of_words_position.rs @@ -1,32 +1,39 @@ use std::cmp::Ordering; -use std::ops::Deref; -use rocksdb::DB; use slice_group_by::GroupBy; -use crate::database::DatabaseView; -use crate::rank::{match_query_index, Document}; use crate::rank::criterion::Criterion; -use crate::Match; +use crate::rank::RawDocument; #[inline] -fn sum_matches_attribute_index(matches: &[Match]) -> usize { - // note that GroupBy will never return an empty group - // so we can do this assumption safely - matches.linear_group_by(match_query_index).map(|group| { - unsafe { group.get_unchecked(0).attribute.word_index() as usize } - }).sum() +fn sum_matches_attribute_index(query_index: &[u32], word_index: &[u32]) -> usize { + let mut sum_word_index = 0; + let mut index = 0; + + for group in query_index.linear_group_by(PartialEq::eq) { + sum_word_index += word_index[index] as usize; + index += group.len(); + } + + sum_word_index } #[derive(Debug, Clone, Copy)] pub struct SumOfWordsPosition; -impl Criterion for SumOfWordsPosition -where D: Deref -{ - fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView) -> Ordering { - let lhs = sum_matches_attribute_index(&lhs.matches); - let rhs = sum_matches_attribute_index(&rhs.matches); +impl Criterion for SumOfWordsPosition { + fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { + let lhs = { + let query_index = lhs.query_index(); + let word_index = lhs.word_index(); + sum_matches_attribute_index(query_index, word_index) + }; + + let rhs = { + let query_index = rhs.query_index(); + let word_index = 
rhs.word_index(); + sum_matches_attribute_index(query_index, word_index) + }; lhs.cmp(&rhs) } diff --git a/src/rank/criterion/words_proximity.rs b/src/rank/criterion/words_proximity.rs index a61de6b62..6f101d4d0 100644 --- a/src/rank/criterion/words_proximity.rs +++ b/src/rank/criterion/words_proximity.rs @@ -1,16 +1,17 @@ use std::cmp::{self, Ordering}; -use std::ops::Deref; -use rocksdb::DB; use slice_group_by::GroupBy; -use crate::rank::{match_query_index, Document}; use crate::rank::criterion::Criterion; -use crate::database::DatabaseView; -use crate::Match; +use crate::rank::RawDocument; const MAX_DISTANCE: u32 = 8; +#[inline] +fn clone_tuple((a, b): (&T, &U)) -> (T, U) { + (a.clone(), b.clone()) +} + fn index_proximity(lhs: u32, rhs: u32) -> u32 { if lhs < rhs { cmp::min(rhs - lhs, MAX_DISTANCE) @@ -19,30 +20,48 @@ fn index_proximity(lhs: u32, rhs: u32) -> u32 { } } -fn attribute_proximity(lhs: &Match, rhs: &Match) -> u32 { - if lhs.attribute.attribute() != rhs.attribute.attribute() { return MAX_DISTANCE } - index_proximity(lhs.attribute.word_index(), rhs.attribute.word_index()) +fn attribute_proximity((lattr, lwi): (u16, u32), (rattr, rwi): (u16, u32)) -> u32 { + if lattr != rattr { return MAX_DISTANCE } + index_proximity(lwi, rwi) } -fn min_proximity(lhs: &[Match], rhs: &[Match]) -> u32 { +fn min_proximity((lattr, lwi): (&[u16], &[u32]), (rattr, rwi): (&[u16], &[u32])) -> u32 { let mut min_prox = u32::max_value(); - for a in lhs { - for b in rhs { + for a in lattr.iter().zip(lwi) { + for b in rattr.iter().zip(rwi) { + let a = clone_tuple(a); + let b = clone_tuple(b); min_prox = cmp::min(min_prox, attribute_proximity(a, b)); } } min_prox } -fn matches_proximity(matches: &[Match]) -> u32 { +fn matches_proximity(query_index: &[u32], attribute: &[u16], word_index: &[u32]) -> u32 { let mut proximity = 0; - let mut iter = matches.linear_group_by(match_query_index); - // iterate over groups by windows of size 2 - let mut last = iter.next(); + let mut index = 0; + let mut iter = query_index.linear_group_by(PartialEq::eq); + let mut last = iter.next().map(|group| { + let len = group.len(); + + let rattr = &attribute[index..index + len]; + let rwi = &word_index[index..index + len]; + index += len; + + (rattr, rwi) + }); + while let (Some(lhs), Some(rhs)) = (last, iter.next()) { + let len = rhs.len(); + + let rattr = &attribute[index..index + len]; + let rwi = &word_index[index..index + len]; + let rhs = (rattr, rwi); + proximity += min_proximity(lhs, rhs); last = Some(rhs); + index += len; } proximity @@ -51,18 +70,26 @@ fn matches_proximity(matches: &[Match]) -> u32 { #[derive(Debug, Clone, Copy)] pub struct WordsProximity; -impl Criterion for WordsProximity -where D: Deref -{ - fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView) -> Ordering { - let lhs = matches_proximity(&lhs.matches); - let rhs = matches_proximity(&rhs.matches); +impl Criterion for WordsProximity { + fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { + let lhs = { + let query_index = lhs.query_index(); + let attribute = lhs.attribute(); + let word_index = lhs.word_index(); + matches_proximity(query_index, attribute, word_index) + }; + + let rhs = { + let query_index = rhs.query_index(); + let attribute = rhs.attribute(); + let word_index = rhs.word_index(); + matches_proximity(query_index, attribute, word_index) + }; lhs.cmp(&rhs) } } - #[cfg(test)] mod tests { use super::*; @@ -80,18 +107,14 @@ mod tests { // { id: 2, attr: 2, attr_index: 0 } // { id: 3, attr: 3, attr_index: 1 } - 
let matches = &[ - Match { query_index: 0, attribute: Attribute::new_faillible(0, 0), ..Match::zero() }, - Match { query_index: 1, attribute: Attribute::new_faillible(1, 0), ..Match::zero() }, - Match { query_index: 2, attribute: Attribute::new_faillible(1, 1), ..Match::zero() }, - Match { query_index: 2, attribute: Attribute::new_faillible(2, 0), ..Match::zero() }, - Match { query_index: 3, attribute: Attribute::new_faillible(3, 1), ..Match::zero() }, - ]; + let query_index = &[0, 1, 2, 2, 3]; + let attribute = &[0, 1, 1, 2, 3]; + let word_index = &[0, 0, 1, 0, 1]; // soup -> of = 8 // + of -> the = 1 // + the -> day = 8 (not 1) - assert_eq!(matches_proximity(matches), 17); + assert_eq!(matches_proximity(query_index, attribute, word_index), 17); } #[test] @@ -106,19 +129,14 @@ mod tests { // { id: 3, attr: 0, attr_index: 1 } // { id: 3, attr: 1, attr_index: 3 } - let matches = &[ - Match { query_index: 0, attribute: Attribute::new_faillible(0, 0), ..Match::zero() }, - Match { query_index: 0, attribute: Attribute::new_faillible(1, 0), ..Match::zero() }, - Match { query_index: 1, attribute: Attribute::new_faillible(1, 1), ..Match::zero() }, - Match { query_index: 2, attribute: Attribute::new_faillible(1, 2), ..Match::zero() }, - Match { query_index: 3, attribute: Attribute::new_faillible(0, 1), ..Match::zero() }, - Match { query_index: 3, attribute: Attribute::new_faillible(1, 3), ..Match::zero() }, - ]; + let query_index = &[0, 0, 1, 2, 3, 3]; + let attribute = &[0, 1, 1, 1, 0, 1]; + let word_index = &[0, 0, 1, 2, 1, 3]; // soup -> of = 1 // + of -> the = 1 // + the -> day = 1 - assert_eq!(matches_proximity(matches), 3); + assert_eq!(matches_proximity(query_index, attribute, word_index), 3); } } diff --git a/src/rank/mod.rs b/src/rank/mod.rs index 4d1b6b1ea..2c5a4bfc3 100644 --- a/src/rank/mod.rs +++ b/src/rank/mod.rs @@ -2,32 +2,182 @@ pub mod criterion; mod query_builder; mod distinct_map; +use std::sync::Arc; + +use slice_group_by::GroupBy; +use rayon::slice::ParallelSliceMut; + use crate::{Match, DocumentId}; pub use self::query_builder::{FilterFunc, QueryBuilder, DistinctQueryBuilder}; -#[inline] -fn match_query_index(a: &Match, b: &Match) -> bool { - a.query_index == b.query_index -} - -#[derive(Debug, Clone)] +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] pub struct Document { pub id: DocumentId, pub matches: Vec, } impl Document { - pub fn new(doc: DocumentId, match_: Match) -> Self { - unsafe { Self::from_sorted_matches(doc, vec![match_]) } - } + pub fn from_raw(raw: &RawDocument) -> Document { + let len = raw.matches.range.len(); + let mut matches = Vec::with_capacity(len); - pub fn from_matches(doc: DocumentId, mut matches: Vec) -> Self { - matches.sort_unstable(); - unsafe { Self::from_sorted_matches(doc, matches) } - } + let query_index = raw.query_index(); + let distance = raw.distance(); + let attribute = raw.attribute(); + let word_index = raw.word_index(); + let is_exact = raw.is_exact(); + let char_index = raw.char_index(); + let char_length = raw.char_length(); - pub unsafe fn from_sorted_matches(id: DocumentId, matches: Vec) -> Self { - Self { id, matches } + for i in 0..len { + let match_ = Match { + query_index: query_index[i], + distance: distance[i], + attribute: attribute[i], + word_index: word_index[i], + is_exact: is_exact[i], + char_index: char_index[i], + char_length: char_length[i], + }; + matches.push(match_); + } + + Document { id: raw.id, matches } + } +} + +#[derive(Clone)] +pub struct RawDocument { + pub id: DocumentId, + pub matches: 
SharedMatches, +} + +impl RawDocument { + fn new(id: DocumentId, range: Range, matches: Arc) -> RawDocument { + RawDocument { id, matches: SharedMatches { range, matches } } + } + + pub fn query_index(&self) -> &[u32] { + let r = self.matches.range; + // it is safe because construction/modifications + // can only be done in this module + unsafe { &self.matches.matches.query_index.get_unchecked(r.start..r.end) } + } + + pub fn distance(&self) -> &[u8] { + let r = self.matches.range; + // it is safe because construction/modifications + // can only be done in this module + unsafe { &self.matches.matches.distance.get_unchecked(r.start..r.end) } + } + + pub fn attribute(&self) -> &[u16] { + let r = self.matches.range; + // it is safe because construction/modifications + // can only be done in this module + unsafe { &self.matches.matches.attribute.get_unchecked(r.start..r.end) } + } + + pub fn word_index(&self) -> &[u32] { + let r = self.matches.range; + // it is safe because construction/modifications + // can only be done in this module + unsafe { &self.matches.matches.word_index.get_unchecked(r.start..r.end) } + } + + pub fn is_exact(&self) -> &[bool] { + let r = self.matches.range; + // it is safe because construction/modifications + // can only be done in this module + unsafe { &self.matches.matches.is_exact.get_unchecked(r.start..r.end) } + } + + pub fn char_index(&self) -> &[u32] { + let r = self.matches.range; + // it is safe because construction/modifications + // can only be done in this module + unsafe { &self.matches.matches.char_index.get_unchecked(r.start..r.end) } + } + + pub fn char_length(&self) -> &[u16] { + let r = self.matches.range; + // it is safe because construction/modifications + // can only be done in this module + unsafe { &self.matches.matches.char_length.get_unchecked(r.start..r.end) } + } +} + +pub fn raw_documents_from_matches(mut matches: Vec<(DocumentId, Match)>) -> Vec { + let mut docs_ranges = Vec::<(DocumentId, Range)>::new(); + let mut matches2 = Matches::with_capacity(matches.len()); + + matches.par_sort_unstable(); + + for group in matches.linear_group_by(|(a, _), (b, _)| a == b) { + let id = group[0].0; + let start = docs_ranges.last().map(|(_, r)| r.end).unwrap_or(0); + let end = start + group.len(); + docs_ranges.push((id, Range { start, end })); + + matches2.extend_from_slice(group); + } + + let matches = Arc::new(matches2); + docs_ranges.into_iter().map(|(i, r)| RawDocument::new(i, r, matches.clone())).collect() +} + +#[derive(Debug, Copy, Clone)] +struct Range { + start: usize, + end: usize, +} + +impl Range { + fn len(self) -> usize { + self.end - self.start + } +} + +#[derive(Clone)] +pub struct SharedMatches { + range: Range, + matches: Arc, +} + +#[derive(Clone)] +struct Matches { + query_index: Vec, + distance: Vec, + attribute: Vec, + word_index: Vec, + is_exact: Vec, + char_index: Vec, + char_length: Vec, +} + +impl Matches { + fn with_capacity(cap: usize) -> Matches { + Matches { + query_index: Vec::with_capacity(cap), + distance: Vec::with_capacity(cap), + attribute: Vec::with_capacity(cap), + word_index: Vec::with_capacity(cap), + is_exact: Vec::with_capacity(cap), + char_index: Vec::with_capacity(cap), + char_length: Vec::with_capacity(cap), + } + } + + fn extend_from_slice(&mut self, matches: &[(DocumentId, Match)]) { + for (_, match_) in matches { + self.query_index.push(match_.query_index); + self.distance.push(match_.distance); + self.attribute.push(match_.attribute); + self.word_index.push(match_.word_index); + 
self.is_exact.push(match_.is_exact); + self.char_index.push(match_.char_index); + self.char_length.push(match_.char_length); + } } } diff --git a/src/rank/query_builder.rs b/src/rank/query_builder.rs index 91d645160..11fc75498 100644 --- a/src/rank/query_builder.rs +++ b/src/rank/query_builder.rs @@ -4,7 +4,9 @@ use std::error::Error; use std::hash::Hash; use std::rc::Rc; +use rayon::slice::ParallelSliceMut; use slice_group_by::GroupByMut; +use elapsed::measure_time; use hashbrown::HashMap; use fst::Streamer; use rocksdb::DB; @@ -15,7 +17,7 @@ use crate::rank::distinct_map::{DistinctMap, BufferedDistinctMap}; use crate::rank::criterion::Criteria; use crate::database::DatabaseView; use crate::{Match, DocumentId}; -use crate::rank::Document; +use crate::rank::{raw_documents_from_matches, RawDocument, Document}; fn split_whitespace_automatons(query: &str) -> Vec { let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace); @@ -81,7 +83,7 @@ where D: Deref, } } - fn query_all(&self, query: &str) -> Vec { + fn query_all(&self, query: &str) -> Vec { let automatons = split_whitespace_automatons(query); let mut stream = { @@ -94,7 +96,7 @@ where D: Deref, }; let mut number_matches = 0; - let mut matches = HashMap::new(); + let mut matches = Vec::new(); while let Some((input, indexed_values)) = stream.next() { for iv in indexed_values { @@ -105,7 +107,6 @@ where D: Deref, let doc_indexes = &self.view.index().positive.indexes(); let doc_indexes = &doc_indexes[iv.value as usize]; - number_matches += doc_indexes.len(); for doc_index in doc_indexes { let match_ = Match { query_index: iv.index as u32, @@ -116,15 +117,18 @@ where D: Deref, char_index: doc_index.char_index, char_length: doc_index.char_length, }; - matches.entry(doc_index.document_id).or_insert_with(Vec::new).push(match_); + matches.push((doc_index.document_id, match_)); } } } - info!("{} total documents to classify", matches.len()); - info!("{} total matches to classify", number_matches); + let total_matches = matches.len(); + let raw_documents = raw_documents_from_matches(matches); - matches.into_iter().map(|(i, m)| Document::from_matches(i, m)).collect() + info!("{} total documents to classify", raw_documents.len()); + info!("{} total matches to classify", total_matches); + + raw_documents } } @@ -140,7 +144,7 @@ where D: Deref, return builder.query(query, range); } - let (elapsed, mut documents) = elapsed::measure_time(|| self.query_all(query)); + let (elapsed, mut documents) = measure_time(|| self.query_all(query)); info!("query_all took {}", elapsed); let mut groups = vec![documents.as_mut_slice()]; @@ -177,12 +181,9 @@ where D: Deref, } } - // `drain` removes the documents efficiently using `ptr::copy` - // TODO it could be more efficient to have a custom iterator let offset = cmp::min(documents.len(), range.start); - documents.drain(0..offset); - documents.truncate(range.len()); - documents + let iter = documents.into_iter().skip(offset).take(range.len()); + iter.map(|d| Document::from_raw(&d)).collect() } } @@ -215,7 +216,9 @@ where D: Deref, K: Hash + Eq, { pub fn query(self, query: &str, range: Range) -> Vec { - let mut documents = self.inner.query_all(query); + let (elapsed, mut documents) = measure_time(|| self.inner.query_all(query)); + info!("query_all took {}", elapsed); + let mut groups = vec![documents.as_mut_slice()]; let mut key_cache = HashMap::new(); let view = &self.inner.view; @@ -227,12 +230,14 @@ where D: Deref, let mut distinct_map = DistinctMap::new(self.size); let mut distinct_raw_offset = 
0; - 'criteria: for criterion in self.inner.criteria.as_ref() { + 'criteria: for (ci, criterion) in self.inner.criteria.as_ref().iter().enumerate() { let tmp_groups = mem::replace(&mut groups, Vec::new()); let mut buf_distinct = BufferedDistinctMap::new(&mut distinct_map); let mut documents_seen = 0; for group in tmp_groups { + info!("criterion {}, documents group of size {}", ci, group.len()); + // if this group does not overlap with the requested range, // push it without sorting and splitting it if documents_seen + group.len() < distinct_raw_offset { @@ -241,9 +246,12 @@ where D: Deref, continue; } - group.sort_unstable_by(|a, b| criterion.evaluate(a, b, view)); + let (elapsed, _) = measure_time(|| { + group.par_sort_unstable_by(|a, b| criterion.evaluate(a, b)); + }); + info!("criterion {} sort took {}", ci, elapsed); - for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b, view)) { + for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b)) { // we must compute the real distinguished len of this sub-group for document in group.iter() { let filter_accepted = match &self.inner.filter { @@ -302,7 +310,7 @@ where D: Deref, }; if distinct_accepted && seen.len() > range.start { - out_documents.push(document); + out_documents.push(Document::from_raw(&document)); if out_documents.len() == range.len() { break } } } From 4c0ad5f96460aa5f9482b0651e356582a426748f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sat, 2 Feb 2019 14:23:18 +0100 Subject: [PATCH 3/6] feat: Simplify the Criterion Trait by removing the DatabaseView param --- src/rank/criterion/document_id.rs | 11 ++--- src/rank/criterion/mod.rs | 73 ++++++++++++------------------- src/rank/criterion/sort_by.rs | 2 +- src/rank/query_builder.rs | 10 ++--- 4 files changed, 38 insertions(+), 58 deletions(-) diff --git a/src/rank/criterion/document_id.rs b/src/rank/criterion/document_id.rs index 2d8ca34c2..a388cf2de 100644 --- a/src/rank/criterion/document_id.rs +++ b/src/rank/criterion/document_id.rs @@ -1,19 +1,14 @@ use std::cmp::Ordering; use std::ops::Deref; -use rocksdb::DB; - use crate::rank::criterion::Criterion; -use crate::database::DatabaseView; -use crate::rank::Document; +use crate::rank::RawDocument; #[derive(Debug, Clone, Copy)] pub struct DocumentId; -impl Criterion for DocumentId -where D: Deref -{ - fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView) -> Ordering { +impl Criterion for DocumentId { + fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { lhs.id.cmp(&rhs.id) } } diff --git a/src/rank/criterion/mod.rs b/src/rank/criterion/mod.rs index c7c547851..46b41ea0f 100644 --- a/src/rank/criterion/mod.rs +++ b/src/rank/criterion/mod.rs @@ -19,60 +19,51 @@ pub use self::{ sum_of_words_attribute::SumOfWordsAttribute, sum_of_words_position::SumOfWordsPosition, exact::Exact, - sort_by::SortBy, + // sort_by::SortBy, document_id::DocumentId, }; -pub trait Criterion -where D: Deref -{ - fn evaluate(&self, lhs: &Document, rhs: &Document, view: &DatabaseView) -> Ordering; +pub trait Criterion: Send + Sync { + fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering; #[inline] - fn eq(&self, lhs: &Document, rhs: &Document, view: &DatabaseView) -> bool { - self.evaluate(lhs, rhs, view) == Ordering::Equal + fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool { + self.evaluate(lhs, rhs) == Ordering::Equal } } -impl<'a, D, T: Criterion + ?Sized> Criterion for &'a T -where D: Deref -{ - fn evaluate(&self, lhs: &Document, rhs: &Document, view: 
&DatabaseView) -> Ordering { - (**self).evaluate(lhs, rhs, view) +impl<'a, T: Criterion + ?Sized + Send + Sync> Criterion for &'a T { + fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { + (**self).evaluate(lhs, rhs) } - fn eq(&self, lhs: &Document, rhs: &Document, view: &DatabaseView) -> bool { - (**self).eq(lhs, rhs, view) + fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool { + (**self).eq(lhs, rhs) } } -impl + ?Sized> Criterion for Box -where D: Deref -{ - fn evaluate(&self, lhs: &Document, rhs: &Document, view: &DatabaseView) -> Ordering { - (**self).evaluate(lhs, rhs, view) +impl Criterion for Box { + fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { + (**self).evaluate(lhs, rhs) } - fn eq(&self, lhs: &Document, rhs: &Document, view: &DatabaseView) -> bool { - (**self).eq(lhs, rhs, view) + fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool { + (**self).eq(lhs, rhs) } } #[derive(Default)] -pub struct CriteriaBuilder -where D: Deref -{ - inner: Vec>> +pub struct CriteriaBuilder { + inner: Vec> } -impl CriteriaBuilder -where D: Deref +impl CriteriaBuilder { - pub fn new() -> CriteriaBuilder { + pub fn new() -> CriteriaBuilder { CriteriaBuilder { inner: Vec::new() } } - pub fn with_capacity(capacity: usize) -> CriteriaBuilder { + pub fn with_capacity(capacity: usize) -> CriteriaBuilder { CriteriaBuilder { inner: Vec::with_capacity(capacity) } } @@ -80,33 +71,29 @@ where D: Deref self.inner.reserve(additional) } - pub fn add(mut self, criterion: C) -> CriteriaBuilder - where C: 'static + Criterion, + pub fn add(mut self, criterion: C) -> CriteriaBuilder + where C: 'static + Criterion, { self.push(criterion); self } pub fn push(&mut self, criterion: C) - where C: 'static + Criterion, + where C: 'static + Criterion, { self.inner.push(Box::new(criterion)); } - pub fn build(self) -> Criteria { + pub fn build(self) -> Criteria { Criteria { inner: self.inner } } } -pub struct Criteria -where D: Deref -{ - inner: Vec>>, +pub struct Criteria { + inner: Vec>, } -impl Default for Criteria -where D: Deref -{ +impl Default for Criteria { fn default() -> Self { CriteriaBuilder::with_capacity(7) .add(SumOfTypos) @@ -120,10 +107,8 @@ where D: Deref } } -impl AsRef<[Box>]> for Criteria -where D: Deref -{ - fn as_ref(&self) -> &[Box>] { +impl AsRef<[Box]> for Criteria { + fn as_ref(&self) -> &[Box] { &self.inner } } diff --git a/src/rank/criterion/sort_by.rs b/src/rank/criterion/sort_by.rs index 53b8bcac1..1604a492a 100644 --- a/src/rank/criterion/sort_by.rs +++ b/src/rank/criterion/sort_by.rs @@ -66,7 +66,7 @@ impl Criterion for SortBy where D: Deref, T: DeserializeOwned + Ord, { - fn evaluate(&self, lhs: &Document, rhs: &Document, view: &DatabaseView) -> Ordering { + fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument, view: &DatabaseView) -> Ordering { let lhs = match view.document_by_id::(lhs.id) { Ok(doc) => Some(doc), Err(e) => { eprintln!("{}", e); None }, diff --git a/src/rank/query_builder.rs b/src/rank/query_builder.rs index 11fc75498..ff160da7f 100644 --- a/src/rank/query_builder.rs +++ b/src/rank/query_builder.rs @@ -43,7 +43,7 @@ pub struct QueryBuilder<'a, D, FI> where D: Deref { view: &'a DatabaseView, - criteria: Criteria, + criteria: Criteria, filter: Option, } @@ -58,7 +58,7 @@ where D: Deref impl<'a, D, FI> QueryBuilder<'a, D, FI> where D: Deref, { - pub fn with_criteria(view: &'a DatabaseView, criteria: Criteria) -> Result> { + pub fn with_criteria(view: &'a DatabaseView, criteria: Criteria) -> Result> { Ok(QueryBuilder { 
view, criteria, filter: None }) } @@ -165,12 +165,12 @@ where D: Deref, continue; } - let (elapsed, ()) = elapsed::measure_time(|| { - group.sort_unstable_by(|a, b| criterion.evaluate(a, b, view)); + let (elapsed, _) = measure_time(|| { + group.par_sort_unstable_by(|a, b| criterion.evaluate(a, b)); }); info!("criterion {} sort took {}", ci, elapsed); - for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b, view)) { + for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b)) { documents_seen += group.len(); groups.push(group); From 2e905bac089b2f1cc0a2ed3d136446171c6bc6b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sat, 2 Feb 2019 14:28:14 +0100 Subject: [PATCH 4/6] chore: Remove Attribute and WordArea structures --- src/attribute.rs | 105 ----------------------- src/data/doc_indexes.rs | 5 +- src/database/serde/indexer_serializer.rs | 2 +- src/lib.rs | 4 - src/rank/criterion/document_id.rs | 1 - src/rank/criterion/mod.rs | 2 - src/rank/criterion/words_proximity.rs | 2 - src/rank/query_builder.rs | 2 - src/word_area.rs | 102 ---------------------- 9 files changed, 2 insertions(+), 223 deletions(-) delete mode 100644 src/attribute.rs delete mode 100644 src/word_area.rs diff --git a/src/attribute.rs b/src/attribute.rs deleted file mode 100644 index 4c075e475..000000000 --- a/src/attribute.rs +++ /dev/null @@ -1,105 +0,0 @@ -use std::fmt; - -/// Represent an attribute number along with the word index -/// according to the tokenizer used. -/// -/// It can accept up to 1024 attributes and word positions -/// can be maximum 2^22. -#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] -pub struct Attribute(u32); - -impl Attribute { - /// Construct an `Attribute` from an attribute number and - /// the word position of a match according to the tokenizer used. - pub(crate) fn new(attribute: u16, index: u32) -> Result { - if attribute & 0b1111_1100_0000_0000 != 0 { - return Err(AttributeError::AttributeTooBig) - } - - if index & 0b1111_1111_1100_0000_0000_0000_0000 != 0 { - return Err(AttributeError::IndexTooBig) - } - - let attribute = u32::from(attribute) << 22; - Ok(Attribute(attribute | index)) - } - - /// Construct an `Attribute` from an attribute number and - /// the word position of a match according to the tokenizer used. - /// - /// # Panics - /// - /// The attribute must not be greater than 1024 - /// and the word index not greater than 2^22. - pub(crate) fn new_faillible(attribute: u16, index: u32) -> Attribute { - match Attribute::new(attribute, index) { - Ok(attribute) => attribute, - Err(AttributeError::AttributeTooBig) => { - panic!("attribute must not be greater than 1024") - }, - Err(AttributeError::IndexTooBig) => { - panic!("attribute word index must not be greater than 2^22") - }, - } - } - - pub(crate) fn max_value() -> Attribute { - Attribute(u32::max_value()) - } - - #[inline] - pub fn attribute(self) -> u16 { - (self.0 >> 22) as u16 - } - - #[inline] - pub fn word_index(self) -> u32 { - self.0 & 0b0000_0000_0011_1111_1111_1111_1111 - } -} - -impl fmt::Debug for Attribute { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - f.debug_struct("Attribute") - .field("attribute", &self.attribute()) - .field("word_index", &self.word_index()) - .finish() - } -} - -pub enum AttributeError { - AttributeTooBig, - IndexTooBig, -} - -#[cfg(test)] -mod tests { - use super::*; - use quickcheck::{quickcheck, TestResult}; - - quickcheck! 
{ - fn qc_attribute(gen_attr: u16, gen_index: u32) -> TestResult { - if gen_attr > 2_u16.pow(10) || gen_index > 2_u32.pow(22) { - return TestResult::discard() - } - - let attribute = Attribute::new_faillible(gen_attr, gen_index); - - let valid_attribute = attribute.attribute() == gen_attr; - let valid_index = attribute.word_index() == gen_index; - - TestResult::from_bool(valid_attribute && valid_index) - } - - fn qc_attribute_ord(gen_attr: u16, gen_index: u32) -> TestResult { - if gen_attr >= 2_u16.pow(10) || gen_index >= 2_u32.pow(22) { - return TestResult::discard() - } - - let a = Attribute::new_faillible(gen_attr, gen_index); - let b = Attribute::new_faillible(gen_attr + 1, gen_index + 1); - - TestResult::from_bool(a < b) - } - } -} diff --git a/src/data/doc_indexes.rs b/src/data/doc_indexes.rs index 4919b9fa0..67106a948 100644 --- a/src/data/doc_indexes.rs +++ b/src/data/doc_indexes.rs @@ -147,12 +147,9 @@ impl DocIndexesBuilder { #[cfg(test)] mod tests { - use super::*; - use std::error::Error; - use crate::{Attribute, WordArea}; - use crate::DocumentId; + use super::*; #[test] fn builder_serialize_deserialize() -> Result<(), Box> { diff --git a/src/database/serde/indexer_serializer.rs b/src/database/serde/indexer_serializer.rs index 6271e1b7b..63bb016d8 100644 --- a/src/database/serde/indexer_serializer.rs +++ b/src/database/serde/indexer_serializer.rs @@ -3,7 +3,7 @@ use crate::database::serde::SerializerError; use crate::database::schema::SchemaAttr; use crate::tokenizer::TokenizerBuilder; use crate::tokenizer::Token; -use crate::{DocumentId, DocIndex, Attribute, WordArea}; +use crate::{DocumentId, DocIndex}; use hashbrown::HashSet; use serde::Serialize; diff --git a/src/lib.rs b/src/lib.rs index 5f824b39a..bfa0b3cd9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -5,16 +5,12 @@ pub mod database; pub mod data; pub mod rank; pub mod tokenizer; -mod attribute; -mod word_area; mod common_words; pub use rocksdb; pub use self::tokenizer::Tokenizer; pub use self::common_words::CommonWords; -pub use self::attribute::{Attribute, AttributeError}; -pub use self::word_area::{WordArea, WordAreaError}; /// Represent an internally generated document unique identifier. 
/// diff --git a/src/rank/criterion/document_id.rs b/src/rank/criterion/document_id.rs index a388cf2de..8e4cf91b5 100644 --- a/src/rank/criterion/document_id.rs +++ b/src/rank/criterion/document_id.rs @@ -1,5 +1,4 @@ use std::cmp::Ordering; -use std::ops::Deref; use crate::rank::criterion::Criterion; use crate::rank::RawDocument; diff --git a/src/rank/criterion/mod.rs b/src/rank/criterion/mod.rs index 46b41ea0f..6272cf89d 100644 --- a/src/rank/criterion/mod.rs +++ b/src/rank/criterion/mod.rs @@ -8,8 +8,6 @@ mod exact; mod document_id; use std::cmp::Ordering; - -use crate::database::DatabaseView; use crate::rank::RawDocument; pub use self::{ diff --git a/src/rank/criterion/words_proximity.rs b/src/rank/criterion/words_proximity.rs index 6f101d4d0..b5d98e147 100644 --- a/src/rank/criterion/words_proximity.rs +++ b/src/rank/criterion/words_proximity.rs @@ -94,8 +94,6 @@ impl Criterion for WordsProximity { mod tests { use super::*; - use crate::Attribute; - #[test] fn three_different_attributes() { diff --git a/src/rank/query_builder.rs b/src/rank/query_builder.rs index ff160da7f..e6c49be6d 100644 --- a/src/rank/query_builder.rs +++ b/src/rank/query_builder.rs @@ -95,7 +95,6 @@ where D: Deref, op_builder.union() }; - let mut number_matches = 0; let mut matches = Vec::new(); while let Some((input, indexed_values)) = stream.next() { @@ -148,7 +147,6 @@ where D: Deref, info!("query_all took {}", elapsed); let mut groups = vec![documents.as_mut_slice()]; - let view = &self.view; 'criteria: for (ci, criterion) in self.criteria.as_ref().iter().enumerate() { let tmp_groups = mem::replace(&mut groups, Vec::new()); diff --git a/src/word_area.rs b/src/word_area.rs deleted file mode 100644 index 593b462a6..000000000 --- a/src/word_area.rs +++ /dev/null @@ -1,102 +0,0 @@ -use std::fmt; - -/// Represent a word position in bytes along with the length of it. -/// -/// It can represent words byte index to maximum 2^22 and -/// up to words of length 1024. -#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] -pub struct WordArea(u32); - -impl WordArea { - /// Construct a `WordArea` from a word position in expresed as - /// a number of characters and the length of it. - /// - /// # Panics - /// - /// The char index must not be greater than 2^22 - /// and the length not greater than 1024. 
- pub(crate) fn new(char_index: u32, length: u16) -> Result { - if char_index & 0b1111_1111_1100_0000_0000_0000_0000 != 0 { - return Err(WordAreaError::ByteIndexTooBig) - } - - if length & 0b1111_1100_0000_0000 != 0 { - return Err(WordAreaError::LengthTooBig) - } - - let char_index = char_index << 10; - Ok(WordArea(char_index | u32::from(length))) - } - - pub(crate) fn new_faillible(char_index: u32, length: u16) -> WordArea { - match WordArea::new(char_index, length) { - Ok(word_area) => word_area, - Err(WordAreaError::ByteIndexTooBig) => { - panic!("word area byte index must not be greater than 2^22") - }, - Err(WordAreaError::LengthTooBig) => { - panic!("word area length must not be greater than 1024") - }, - } - } - - pub(crate) fn max_value() -> WordArea { - WordArea(u32::max_value()) - } - - #[inline] - pub fn char_index(self) -> u32 { - self.0 >> 10 - } - - #[inline] - pub fn length(self) -> u16 { - (self.0 & 0b0000_0000_0000_0000_0011_1111_1111) as u16 - } -} - -impl fmt::Debug for WordArea { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - f.debug_struct("WordArea") - .field("char_index", &self.char_index()) - .field("length", &self.length()) - .finish() - } -} - -pub enum WordAreaError { - ByteIndexTooBig, - LengthTooBig, -} - -#[cfg(test)] -mod tests { - use super::*; - use quickcheck::{quickcheck, TestResult}; - - quickcheck! { - fn qc_word_area(gen_char_index: u32, gen_length: u16) -> TestResult { - if gen_char_index > 2_u32.pow(22) || gen_length > 2_u16.pow(10) { - return TestResult::discard() - } - - let word_area = WordArea::new_faillible(gen_char_index, gen_length); - - let valid_char_index = word_area.char_index() == gen_char_index; - let valid_length = word_area.length() == gen_length; - - TestResult::from_bool(valid_char_index && valid_length) - } - - fn qc_word_area_ord(gen_char_index: u32, gen_length: u16) -> TestResult { - if gen_char_index >= 2_u32.pow(22) || gen_length >= 2_u16.pow(10) { - return TestResult::discard() - } - - let a = WordArea::new_faillible(gen_char_index, gen_length); - let b = WordArea::new_faillible(gen_char_index + 1, gen_length + 1); - - TestResult::from_bool(a < b) - } - } -} From 5efbc5ceb3fc4f95064d3f4848a8bce4839d4a80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sat, 2 Feb 2019 14:42:12 +0100 Subject: [PATCH 5/6] feat: Introduce the revisited SortBy criterion --- src/rank/criterion/mod.rs | 4 ++-- src/rank/criterion/sort_by.rs | 43 ++++++++++++++++++----------------- 2 files changed, 24 insertions(+), 23 deletions(-) diff --git a/src/rank/criterion/mod.rs b/src/rank/criterion/mod.rs index 6272cf89d..07c6a37e1 100644 --- a/src/rank/criterion/mod.rs +++ b/src/rank/criterion/mod.rs @@ -4,7 +4,7 @@ mod words_proximity; mod sum_of_words_attribute; mod sum_of_words_position; mod exact; -// mod sort_by; +mod sort_by; mod document_id; use std::cmp::Ordering; @@ -17,7 +17,7 @@ pub use self::{ sum_of_words_attribute::SumOfWordsAttribute, sum_of_words_position::SumOfWordsPosition, exact::Exact, - // sort_by::SortBy, + sort_by::SortBy, document_id::DocumentId, }; diff --git a/src/rank/criterion/sort_by.rs b/src/rank/criterion/sort_by.rs index 1604a492a..d1c7abf8c 100644 --- a/src/rank/criterion/sort_by.rs +++ b/src/rank/criterion/sort_by.rs @@ -24,7 +24,7 @@ use crate::rank::RawDocument; /// /// # Example /// -/// ```no-test +/// ```ignore /// use serde_derive::Deserialize; /// use meilidb::rank::criterion::*; /// @@ -40,39 +40,40 @@ use crate::rank::RawDocument; /// .add(SumOfWordsAttribute) /// 
.add(SumOfWordsPosition) /// .add(Exact) -/// .add(SortBy::::new()) +/// .add(SortBy::::new(&view)) /// .add(DocumentId); /// /// let criterion = builder.build(); /// /// ``` -pub struct SortBy { +pub struct SortBy<'a, T, D> +where D: Deref + Send + Sync, + T: Send + Sync +{ + view: &'a DatabaseView, _phantom: marker::PhantomData, } -impl SortBy { - pub fn new() -> Self { - SortBy::default() - } -} - -impl Default for SortBy { - fn default() -> SortBy { - SortBy { _phantom: marker::PhantomData } - } -} - -impl Criterion for SortBy -where D: Deref, - T: DeserializeOwned + Ord, +impl<'a, T, D> SortBy<'a, T, D> +where D: Deref + Send + Sync, + T: Send + Sync { - fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument, view: &DatabaseView) -> Ordering { - let lhs = match view.document_by_id::(lhs.id) { + pub fn new(view: &'a DatabaseView) -> Self { + SortBy { view, _phantom: marker::PhantomData } + } +} + +impl<'a, T, D> Criterion for SortBy<'a, T, D> +where D: Deref + Send + Sync, + T: DeserializeOwned + Ord + Send + Sync, +{ + fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { + let lhs = match self.view.document_by_id::(lhs.id) { Ok(doc) => Some(doc), Err(e) => { eprintln!("{}", e); None }, }; - let rhs = match view.document_by_id::(rhs.id) { + let rhs = match self.view.document_by_id::(rhs.id) { Ok(doc) => Some(doc), Err(e) => { eprintln!("{}", e); None }, }; From 2bd5b4ab86875986733e1bad0af2700db95c0981 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sat, 2 Feb 2019 15:12:54 +0100 Subject: [PATCH 6/6] feat: Remove useless WordsProximity criterion benchmark --- src/rank/criterion/words_proximity.rs | 39 --------------------------- 1 file changed, 39 deletions(-) diff --git a/src/rank/criterion/words_proximity.rs b/src/rank/criterion/words_proximity.rs index b5d98e147..614d8f7ff 100644 --- a/src/rank/criterion/words_proximity.rs +++ b/src/rank/criterion/words_proximity.rs @@ -137,42 +137,3 @@ mod tests { assert_eq!(matches_proximity(query_index, attribute, word_index), 3); } } - -#[cfg(all(feature = "nightly", test))] -mod bench { - extern crate test; - - use super::*; - use std::error::Error; - use self::test::Bencher; - - use rand_xorshift::XorShiftRng; - use rand::{Rng, SeedableRng}; - - use crate::Attribute; - - #[bench] - fn evaluate_proximity(bench: &mut Bencher) -> Result<(), Box> { - let number_matches = 30_000; - let mut matches = Vec::with_capacity(number_matches); - let mut rng = XorShiftRng::seed_from_u64(42); - - for _ in 0..number_matches { - let query_index = rng.gen_range(0, 4); - - let attribute = rng.gen_range(0, 5); - let word_index = rng.gen_range(0, 15); - let attribute = Attribute::new_faillible(attribute, word_index); - - let match_ = Match { query_index, attribute, ..Match::zero() }; - matches.push(match_); - } - - bench.iter(|| { - let proximity = matches_proximity(&matches); - test::black_box(move || proximity) - }); - - Ok(()) - } -}
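
Taken together with patch 4, which deletes the `Attribute` and `WordArea` types, this series trades bit-packing for plain fields. Each removed type squeezed a pair of values into a single u32 (10 bits of attribute with 22 bits of word index, and 22 bits of char index with 10 bits of length), which is where the 1024 and 2^22 limits and the fallible constructors came from. The sketch below is illustrative only, not code from the patches; it reproduces the packing arithmetic being given up. With plain fields (8 + 2 + 4 + 4 + 2 = 20 bytes of payload, aligned up to 24), `DocIndex` grows from 16 to 24 bytes, which is exactly what the updated `docindex_mem_size` test asserts, in exchange for direct field access and no artificial bounds.

    fn pack_attribute(attribute: u16, word_index: u32) -> u32 {
        // 10 bits of attribute in the high part, 22 bits of word index below
        (u32::from(attribute) << 22) | word_index
    }

    fn pack_word_area(char_index: u32, length: u16) -> u32 {
        // 22 bits of char index in the high part, 10 bits of length below
        (char_index << 10) | u32::from(length)
    }

    fn main() {
        let packed = pack_attribute(3, 11);
        assert_eq!(packed >> 22, 3);          // what Attribute::attribute() returned
        assert_eq!(packed & 0x003F_FFFF, 11); // what Attribute::word_index() returned

        let packed = pack_word_area(30, 4);
        assert_eq!(packed >> 10, 30);         // what WordArea::char_index() returned
        assert_eq!(packed & 0x0000_03FF, 4);  // what WordArea::length() returned
    }
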
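
The core of patch 2 is the `RawDocument`/`Matches` structure-of-arrays introduced in src/rank/mod.rs: every match column is stored once behind an `Arc`, and each document only carries a range into it, so sorting documents by a criterion moves small handles rather than per-document `Vec<Match>` buffers, and the column slices can be handed straight to the criteria and to rayon's `par_sort_unstable_by` (which is why `Criterion` now requires `Send + Sync`). The program below is a deliberately simplified model of `raw_documents_from_matches`, keeping only two columns and using manual grouping; the real code keeps seven columns, groups with `linear_group_by`, and first sorts the flat match list in parallel.

    use std::sync::Arc;

    struct Columns {
        query_index: Vec<u32>,
        distance: Vec<u8>,
    }

    struct RawDoc {
        id: u64,
        range: (usize, usize),
        columns: Arc<Columns>,
    }

    impl RawDoc {
        fn distance(&self) -> &[u8] {
            &self.columns.distance[self.range.0..self.range.1]
        }
    }

    fn main() {
        // (document_id, query_index, distance) triples, already sorted by document id
        let matches = vec![(1u64, 0u32, 0u8), (1, 1, 1), (2, 0, 0)];

        let mut columns = Columns { query_index: Vec::new(), distance: Vec::new() };
        let mut ranges: Vec<(u64, (usize, usize))> = Vec::new();

        // group consecutive matches sharing a document id and record the
        // (start, end) range of the rows they occupy in the shared columns
        let mut i = 0;
        while i < matches.len() {
            let id = matches[i].0;
            let start = columns.query_index.len();
            while i < matches.len() && matches[i].0 == id {
                columns.query_index.push(matches[i].1);
                columns.distance.push(matches[i].2);
                i += 1;
            }
            ranges.push((id, (start, columns.query_index.len())));
        }

        // one Arc'ed copy of the columns, shared by every document handle
        let columns = Arc::new(columns);
        let docs: Vec<RawDoc> = ranges
            .into_iter()
            .map(|(id, range)| RawDoc { id, range, columns: columns.clone() })
            .collect();

        assert_eq!(docs[0].distance(), &[0u8, 1]);
        assert_eq!(docs[1].id, 2);
    }
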