diff --git a/Cargo.toml b/Cargo.toml index 5e7bba1fb..572cbf2aa 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,6 +21,7 @@ serde_derive = "1.0" serde_json = { version = "1.0", features = ["preserve_order"] } slice-group-by = "0.2" unidecode = "0.3" +rayon = "1.0" [dependencies.toml] git = "https://github.com/Kerollmops/toml-rs.git" diff --git a/src/rank/criterion/exact.rs b/src/rank/criterion/exact.rs index 574649ed6..54b5b7b9f 100644 --- a/src/rank/criterion/exact.rs +++ b/src/rank/criterion/exact.rs @@ -1,33 +1,40 @@ use std::cmp::Ordering; -use std::ops::Deref; -use rocksdb::DB; use slice_group_by::GroupBy; -use crate::rank::{match_query_index, Document}; use crate::rank::criterion::Criterion; -use crate::database::DatabaseView; -use crate::Match; +use crate::rank::RawDocument; #[inline] -fn contains_exact(matches: &&[Match]) -> bool { - matches.iter().any(|m| m.is_exact) -} +fn number_exact_matches(query_index: &[u32], is_exact: &[bool]) -> usize { + let mut count = 0; + let mut index = 0; -#[inline] -fn number_exact_matches(matches: &[Match]) -> usize { - matches.linear_group_by(match_query_index).filter(contains_exact).count() + for group in query_index.linear_group_by(PartialEq::eq) { + let len = group.len(); + count += is_exact[index..index + len].contains(&true) as usize; + index += len; + } + + count } #[derive(Debug, Clone, Copy)] pub struct Exact; -impl Criterion for Exact -where D: Deref -{ - fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView) -> Ordering { - let lhs = number_exact_matches(&lhs.matches); - let rhs = number_exact_matches(&rhs.matches); +impl Criterion for Exact { + fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { + let lhs = { + let query_index = lhs.query_index(); + let is_exact = lhs.is_exact(); + number_exact_matches(query_index, is_exact) + }; + + let rhs = { + let query_index = rhs.query_index(); + let is_exact = rhs.is_exact(); + number_exact_matches(query_index, is_exact) + }; lhs.cmp(&rhs).reverse() } diff --git a/src/rank/criterion/mod.rs b/src/rank/criterion/mod.rs index a5dc7ab26..c7c547851 100644 --- a/src/rank/criterion/mod.rs +++ b/src/rank/criterion/mod.rs @@ -4,16 +4,13 @@ mod words_proximity; mod sum_of_words_attribute; mod sum_of_words_position; mod exact; -mod sort_by; +// mod sort_by; mod document_id; use std::cmp::Ordering; -use std::ops::Deref; - -use rocksdb::DB; use crate::database::DatabaseView; -use crate::rank::Document; +use crate::rank::RawDocument; pub use self::{ sum_of_typos::SumOfTypos, diff --git a/src/rank/criterion/number_of_words.rs b/src/rank/criterion/number_of_words.rs index ac9ef9858..c8dd1edb4 100644 --- a/src/rank/criterion/number_of_words.rs +++ b/src/rank/criterion/number_of_words.rs @@ -1,28 +1,28 @@ use std::cmp::Ordering; -use std::ops::Deref; -use rocksdb::DB; use slice_group_by::GroupBy; -use crate::rank::{match_query_index, Document}; use crate::rank::criterion::Criterion; -use crate::database::DatabaseView; -use crate::Match; +use crate::rank::RawDocument; #[inline] -fn number_of_query_words(matches: &[Match]) -> usize { - matches.linear_group_by(match_query_index).count() +fn number_of_query_words(query_index: &[u32]) -> usize { + query_index.linear_group_by(PartialEq::eq).count() } #[derive(Debug, Clone, Copy)] pub struct NumberOfWords; -impl Criterion for NumberOfWords -where D: Deref -{ - fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView) -> Ordering { - let lhs = number_of_query_words(&lhs.matches); - let rhs = number_of_query_words(&rhs.matches); +impl Criterion for NumberOfWords { + fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { + let lhs = { + let query_index = lhs.query_index(); + number_of_query_words(query_index) + }; + let rhs = { + let query_index = rhs.query_index(); + number_of_query_words(query_index) + }; lhs.cmp(&rhs).reverse() } diff --git a/src/rank/criterion/sort_by.rs b/src/rank/criterion/sort_by.rs index 8f1fef11c..53b8bcac1 100644 --- a/src/rank/criterion/sort_by.rs +++ b/src/rank/criterion/sort_by.rs @@ -7,7 +7,7 @@ use serde::de::DeserializeOwned; use crate::rank::criterion::Criterion; use crate::database::DatabaseView; -use crate::rank::Document; +use crate::rank::RawDocument; /// An helper struct that permit to sort documents by /// some of their stored attributes. diff --git a/src/rank/criterion/sum_of_typos.rs b/src/rank/criterion/sum_of_typos.rs index be742e787..5d98a42e7 100644 --- a/src/rank/criterion/sum_of_typos.rs +++ b/src/rank/criterion/sum_of_typos.rs @@ -1,24 +1,20 @@ use std::cmp::Ordering; -use std::ops::Deref; -use rocksdb::DB; use slice_group_by::GroupBy; -use crate::rank::{match_query_index, Document}; use crate::rank::criterion::Criterion; -use crate::database::DatabaseView; -use crate::Match; +use crate::rank::RawDocument; #[inline] -fn sum_matches_typos(matches: &[Match]) -> isize { +fn sum_matches_typos(query_index: &[u32], distance: &[u8]) -> isize { let mut sum_typos = 0; let mut number_words = 0; + let mut index = 0; - // note that GroupBy will never return an empty group - // so we can do this assumption safely - for group in matches.linear_group_by(match_query_index) { - sum_typos += unsafe { group.get_unchecked(0).distance as isize }; + for group in query_index.linear_group_by(PartialEq::eq) { + sum_typos += distance[index] as isize; number_words += 1; + index += group.len(); } sum_typos - number_words @@ -27,78 +23,42 @@ fn sum_matches_typos(matches: &[Match]) -> isize { #[derive(Debug, Clone, Copy)] pub struct SumOfTypos; -impl Criterion for SumOfTypos -where D: Deref -{ - fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView) -> Ordering { - let lhs = sum_matches_typos(&lhs.matches); - let rhs = sum_matches_typos(&rhs.matches); +impl Criterion for SumOfTypos { + fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { + let lhs = { + let query_index = lhs.query_index(); + let distance = lhs.distance(); + sum_matches_typos(query_index, distance) + }; + + let rhs = { + let query_index = rhs.query_index(); + let distance = rhs.distance(); + sum_matches_typos(query_index, distance) + }; lhs.cmp(&rhs) } } - #[cfg(test)] mod tests { use super::*; - use crate::{DocumentId, Attribute, WordArea}; - // typing: "Geox CEO" // // doc0: "Geox SpA: CEO and Executive" // doc1: "Mt. Gox CEO Resigns From Bitcoin Foundation" #[test] fn one_typo_reference() { - let doc0 = { - let matches = vec![ - Match { - query_index: 0, - distance: 0, - attribute: Attribute::new_faillible(0, 0), - is_exact: false, - word_area: WordArea::new_faillible(0, 6) - }, - Match { - query_index: 1, - distance: 0, - attribute: Attribute::new_faillible(0, 2), - is_exact: false, - word_area: WordArea::new_faillible(0, 6) - }, - ]; - Document { - id: DocumentId(0), - matches: matches, - } - }; + let query_index0 = &[0, 1]; + let distance0 = &[0, 0]; - let doc1 = { - let matches = vec![ - Match { - query_index: 0, - distance: 1, - attribute: Attribute::new_faillible(0, 0), - is_exact: false, - word_area: WordArea::new_faillible(0, 6) - }, - Match { - query_index: 1, - distance: 0, - attribute: Attribute::new_faillible(0, 2), - is_exact: false, - word_area: WordArea::new_faillible(0, 6) - }, - ]; - Document { - id: DocumentId(1), - matches: matches, - } - }; + let query_index1 = &[0, 1]; + let distance1 = &[1, 0]; - let lhs = sum_matches_typos(&doc0.matches); - let rhs = sum_matches_typos(&doc1.matches); + let lhs = sum_matches_typos(query_index0, distance0); + let rhs = sum_matches_typos(query_index1, distance1); assert_eq!(lhs.cmp(&rhs), Ordering::Less); } @@ -108,47 +68,14 @@ mod tests { // doc1: "bouton" #[test] fn no_typo() { - let doc0 = { - let matches = vec![ - Match { - query_index: 0, - distance: 0, - attribute: Attribute::new_faillible(0, 0), - is_exact: false, - word_area: WordArea::new_faillible(0, 6) - }, - Match { - query_index: 1, - distance: 0, - attribute: Attribute::new_faillible(0, 1), - is_exact: false, - word_area: WordArea::new_faillible(0, 6) - }, - ]; - Document { - id: DocumentId(0), - matches: matches, - } - }; + let query_index0 = &[0, 1]; + let distance0 = &[0, 0]; - let doc1 = { - let matches = vec![ - Match { - query_index: 0, - distance: 0, - attribute: Attribute::new_faillible(0, 0), - is_exact: false, - word_area: WordArea::new_faillible(0, 6) - }, - ]; - Document { - id: DocumentId(1), - matches: matches, - } - }; + let query_index1 = &[0]; + let distance1 = &[0]; - let lhs = sum_matches_typos(&doc0.matches); - let rhs = sum_matches_typos(&doc1.matches); + let lhs = sum_matches_typos(query_index0, distance0); + let rhs = sum_matches_typos(query_index1, distance1); assert_eq!(lhs.cmp(&rhs), Ordering::Less); } @@ -158,47 +85,14 @@ mod tests { // doc1: "bouton" #[test] fn one_typo() { - let doc0 = { - let matches = vec![ - Match { - query_index: 0, - distance: 0, - attribute: Attribute::new_faillible(0, 0), - is_exact: false, - word_area: WordArea::new_faillible(0, 6) - }, - Match { - query_index: 1, - distance: 1, - attribute: Attribute::new_faillible(0, 1), - is_exact: false, - word_area: WordArea::new_faillible(0, 6) - }, - ]; - Document { - id: DocumentId(0), - matches: matches, - } - }; + let query_index0 = &[0, 1]; + let distance0 = &[0, 1]; - let doc1 = { - let matches = vec![ - Match { - query_index: 0, - distance: 0, - attribute: Attribute::new_faillible(0, 0), - is_exact: false, - word_area: WordArea::new_faillible(0, 6) - }, - ]; - Document { - id: DocumentId(1), - matches: matches, - } - }; + let query_index1 = &[0]; + let distance1 = &[0]; - let lhs = sum_matches_typos(&doc0.matches); - let rhs = sum_matches_typos(&doc1.matches); + let lhs = sum_matches_typos(query_index0, distance0); + let rhs = sum_matches_typos(query_index1, distance1); assert_eq!(lhs.cmp(&rhs), Ordering::Equal); } } diff --git a/src/rank/criterion/sum_of_words_attribute.rs b/src/rank/criterion/sum_of_words_attribute.rs index fb4910c51..5c42f8552 100644 --- a/src/rank/criterion/sum_of_words_attribute.rs +++ b/src/rank/criterion/sum_of_words_attribute.rs @@ -1,32 +1,39 @@ use std::cmp::Ordering; -use std::ops::Deref; -use rocksdb::DB; use slice_group_by::GroupBy; -use crate::database::DatabaseView; -use crate::rank::{match_query_index, Document}; use crate::rank::criterion::Criterion; -use crate::Match; +use crate::rank::RawDocument; #[inline] -fn sum_matches_attributes(matches: &[Match]) -> usize { - // note that GroupBy will never return an empty group - // so we can do this assumption safely - matches.linear_group_by(match_query_index).map(|group| { - unsafe { group.get_unchecked(0).attribute.attribute() as usize } - }).sum() +fn sum_matches_attributes(query_index: &[u32], attribute: &[u16]) -> usize { + let mut sum_attributes = 0; + let mut index = 0; + + for group in query_index.linear_group_by(PartialEq::eq) { + sum_attributes += attribute[index] as usize; + index += group.len(); + } + + sum_attributes } #[derive(Debug, Clone, Copy)] pub struct SumOfWordsAttribute; -impl Criterion for SumOfWordsAttribute -where D: Deref -{ - fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView) -> Ordering { - let lhs = sum_matches_attributes(&lhs.matches); - let rhs = sum_matches_attributes(&rhs.matches); +impl Criterion for SumOfWordsAttribute { + fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { + let lhs = { + let query_index = lhs.query_index(); + let attribute = lhs.attribute(); + sum_matches_attributes(query_index, attribute) + }; + + let rhs = { + let query_index = rhs.query_index(); + let attribute = rhs.attribute(); + sum_matches_attributes(query_index, attribute) + }; lhs.cmp(&rhs) } diff --git a/src/rank/criterion/sum_of_words_position.rs b/src/rank/criterion/sum_of_words_position.rs index 0978ac5fd..ad93dc4a8 100644 --- a/src/rank/criterion/sum_of_words_position.rs +++ b/src/rank/criterion/sum_of_words_position.rs @@ -1,32 +1,39 @@ use std::cmp::Ordering; -use std::ops::Deref; -use rocksdb::DB; use slice_group_by::GroupBy; -use crate::database::DatabaseView; -use crate::rank::{match_query_index, Document}; use crate::rank::criterion::Criterion; -use crate::Match; +use crate::rank::RawDocument; #[inline] -fn sum_matches_attribute_index(matches: &[Match]) -> usize { - // note that GroupBy will never return an empty group - // so we can do this assumption safely - matches.linear_group_by(match_query_index).map(|group| { - unsafe { group.get_unchecked(0).attribute.word_index() as usize } - }).sum() +fn sum_matches_attribute_index(query_index: &[u32], word_index: &[u32]) -> usize { + let mut sum_word_index = 0; + let mut index = 0; + + for group in query_index.linear_group_by(PartialEq::eq) { + sum_word_index += word_index[index] as usize; + index += group.len(); + } + + sum_word_index } #[derive(Debug, Clone, Copy)] pub struct SumOfWordsPosition; -impl Criterion for SumOfWordsPosition -where D: Deref -{ - fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView) -> Ordering { - let lhs = sum_matches_attribute_index(&lhs.matches); - let rhs = sum_matches_attribute_index(&rhs.matches); +impl Criterion for SumOfWordsPosition { + fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { + let lhs = { + let query_index = lhs.query_index(); + let word_index = lhs.word_index(); + sum_matches_attribute_index(query_index, word_index) + }; + + let rhs = { + let query_index = rhs.query_index(); + let word_index = rhs.word_index(); + sum_matches_attribute_index(query_index, word_index) + }; lhs.cmp(&rhs) } diff --git a/src/rank/criterion/words_proximity.rs b/src/rank/criterion/words_proximity.rs index a61de6b62..6f101d4d0 100644 --- a/src/rank/criterion/words_proximity.rs +++ b/src/rank/criterion/words_proximity.rs @@ -1,16 +1,17 @@ use std::cmp::{self, Ordering}; -use std::ops::Deref; -use rocksdb::DB; use slice_group_by::GroupBy; -use crate::rank::{match_query_index, Document}; use crate::rank::criterion::Criterion; -use crate::database::DatabaseView; -use crate::Match; +use crate::rank::RawDocument; const MAX_DISTANCE: u32 = 8; +#[inline] +fn clone_tuple((a, b): (&T, &U)) -> (T, U) { + (a.clone(), b.clone()) +} + fn index_proximity(lhs: u32, rhs: u32) -> u32 { if lhs < rhs { cmp::min(rhs - lhs, MAX_DISTANCE) @@ -19,30 +20,48 @@ fn index_proximity(lhs: u32, rhs: u32) -> u32 { } } -fn attribute_proximity(lhs: &Match, rhs: &Match) -> u32 { - if lhs.attribute.attribute() != rhs.attribute.attribute() { return MAX_DISTANCE } - index_proximity(lhs.attribute.word_index(), rhs.attribute.word_index()) +fn attribute_proximity((lattr, lwi): (u16, u32), (rattr, rwi): (u16, u32)) -> u32 { + if lattr != rattr { return MAX_DISTANCE } + index_proximity(lwi, rwi) } -fn min_proximity(lhs: &[Match], rhs: &[Match]) -> u32 { +fn min_proximity((lattr, lwi): (&[u16], &[u32]), (rattr, rwi): (&[u16], &[u32])) -> u32 { let mut min_prox = u32::max_value(); - for a in lhs { - for b in rhs { + for a in lattr.iter().zip(lwi) { + for b in rattr.iter().zip(rwi) { + let a = clone_tuple(a); + let b = clone_tuple(b); min_prox = cmp::min(min_prox, attribute_proximity(a, b)); } } min_prox } -fn matches_proximity(matches: &[Match]) -> u32 { +fn matches_proximity(query_index: &[u32], attribute: &[u16], word_index: &[u32]) -> u32 { let mut proximity = 0; - let mut iter = matches.linear_group_by(match_query_index); - // iterate over groups by windows of size 2 - let mut last = iter.next(); + let mut index = 0; + let mut iter = query_index.linear_group_by(PartialEq::eq); + let mut last = iter.next().map(|group| { + let len = group.len(); + + let rattr = &attribute[index..index + len]; + let rwi = &word_index[index..index + len]; + index += len; + + (rattr, rwi) + }); + while let (Some(lhs), Some(rhs)) = (last, iter.next()) { + let len = rhs.len(); + + let rattr = &attribute[index..index + len]; + let rwi = &word_index[index..index + len]; + let rhs = (rattr, rwi); + proximity += min_proximity(lhs, rhs); last = Some(rhs); + index += len; } proximity @@ -51,18 +70,26 @@ fn matches_proximity(matches: &[Match]) -> u32 { #[derive(Debug, Clone, Copy)] pub struct WordsProximity; -impl Criterion for WordsProximity -where D: Deref -{ - fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView) -> Ordering { - let lhs = matches_proximity(&lhs.matches); - let rhs = matches_proximity(&rhs.matches); +impl Criterion for WordsProximity { + fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { + let lhs = { + let query_index = lhs.query_index(); + let attribute = lhs.attribute(); + let word_index = lhs.word_index(); + matches_proximity(query_index, attribute, word_index) + }; + + let rhs = { + let query_index = rhs.query_index(); + let attribute = rhs.attribute(); + let word_index = rhs.word_index(); + matches_proximity(query_index, attribute, word_index) + }; lhs.cmp(&rhs) } } - #[cfg(test)] mod tests { use super::*; @@ -80,18 +107,14 @@ mod tests { // { id: 2, attr: 2, attr_index: 0 } // { id: 3, attr: 3, attr_index: 1 } - let matches = &[ - Match { query_index: 0, attribute: Attribute::new_faillible(0, 0), ..Match::zero() }, - Match { query_index: 1, attribute: Attribute::new_faillible(1, 0), ..Match::zero() }, - Match { query_index: 2, attribute: Attribute::new_faillible(1, 1), ..Match::zero() }, - Match { query_index: 2, attribute: Attribute::new_faillible(2, 0), ..Match::zero() }, - Match { query_index: 3, attribute: Attribute::new_faillible(3, 1), ..Match::zero() }, - ]; + let query_index = &[0, 1, 2, 2, 3]; + let attribute = &[0, 1, 1, 2, 3]; + let word_index = &[0, 0, 1, 0, 1]; // soup -> of = 8 // + of -> the = 1 // + the -> day = 8 (not 1) - assert_eq!(matches_proximity(matches), 17); + assert_eq!(matches_proximity(query_index, attribute, word_index), 17); } #[test] @@ -106,19 +129,14 @@ mod tests { // { id: 3, attr: 0, attr_index: 1 } // { id: 3, attr: 1, attr_index: 3 } - let matches = &[ - Match { query_index: 0, attribute: Attribute::new_faillible(0, 0), ..Match::zero() }, - Match { query_index: 0, attribute: Attribute::new_faillible(1, 0), ..Match::zero() }, - Match { query_index: 1, attribute: Attribute::new_faillible(1, 1), ..Match::zero() }, - Match { query_index: 2, attribute: Attribute::new_faillible(1, 2), ..Match::zero() }, - Match { query_index: 3, attribute: Attribute::new_faillible(0, 1), ..Match::zero() }, - Match { query_index: 3, attribute: Attribute::new_faillible(1, 3), ..Match::zero() }, - ]; + let query_index = &[0, 0, 1, 2, 3, 3]; + let attribute = &[0, 1, 1, 1, 0, 1]; + let word_index = &[0, 0, 1, 2, 1, 3]; // soup -> of = 1 // + of -> the = 1 // + the -> day = 1 - assert_eq!(matches_proximity(matches), 3); + assert_eq!(matches_proximity(query_index, attribute, word_index), 3); } } diff --git a/src/rank/mod.rs b/src/rank/mod.rs index 4d1b6b1ea..2c5a4bfc3 100644 --- a/src/rank/mod.rs +++ b/src/rank/mod.rs @@ -2,32 +2,182 @@ pub mod criterion; mod query_builder; mod distinct_map; +use std::sync::Arc; + +use slice_group_by::GroupBy; +use rayon::slice::ParallelSliceMut; + use crate::{Match, DocumentId}; pub use self::query_builder::{FilterFunc, QueryBuilder, DistinctQueryBuilder}; -#[inline] -fn match_query_index(a: &Match, b: &Match) -> bool { - a.query_index == b.query_index -} - -#[derive(Debug, Clone)] +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] pub struct Document { pub id: DocumentId, pub matches: Vec, } impl Document { - pub fn new(doc: DocumentId, match_: Match) -> Self { - unsafe { Self::from_sorted_matches(doc, vec![match_]) } - } + pub fn from_raw(raw: &RawDocument) -> Document { + let len = raw.matches.range.len(); + let mut matches = Vec::with_capacity(len); - pub fn from_matches(doc: DocumentId, mut matches: Vec) -> Self { - matches.sort_unstable(); - unsafe { Self::from_sorted_matches(doc, matches) } - } + let query_index = raw.query_index(); + let distance = raw.distance(); + let attribute = raw.attribute(); + let word_index = raw.word_index(); + let is_exact = raw.is_exact(); + let char_index = raw.char_index(); + let char_length = raw.char_length(); - pub unsafe fn from_sorted_matches(id: DocumentId, matches: Vec) -> Self { - Self { id, matches } + for i in 0..len { + let match_ = Match { + query_index: query_index[i], + distance: distance[i], + attribute: attribute[i], + word_index: word_index[i], + is_exact: is_exact[i], + char_index: char_index[i], + char_length: char_length[i], + }; + matches.push(match_); + } + + Document { id: raw.id, matches } + } +} + +#[derive(Clone)] +pub struct RawDocument { + pub id: DocumentId, + pub matches: SharedMatches, +} + +impl RawDocument { + fn new(id: DocumentId, range: Range, matches: Arc) -> RawDocument { + RawDocument { id, matches: SharedMatches { range, matches } } + } + + pub fn query_index(&self) -> &[u32] { + let r = self.matches.range; + // it is safe because construction/modifications + // can only be done in this module + unsafe { &self.matches.matches.query_index.get_unchecked(r.start..r.end) } + } + + pub fn distance(&self) -> &[u8] { + let r = self.matches.range; + // it is safe because construction/modifications + // can only be done in this module + unsafe { &self.matches.matches.distance.get_unchecked(r.start..r.end) } + } + + pub fn attribute(&self) -> &[u16] { + let r = self.matches.range; + // it is safe because construction/modifications + // can only be done in this module + unsafe { &self.matches.matches.attribute.get_unchecked(r.start..r.end) } + } + + pub fn word_index(&self) -> &[u32] { + let r = self.matches.range; + // it is safe because construction/modifications + // can only be done in this module + unsafe { &self.matches.matches.word_index.get_unchecked(r.start..r.end) } + } + + pub fn is_exact(&self) -> &[bool] { + let r = self.matches.range; + // it is safe because construction/modifications + // can only be done in this module + unsafe { &self.matches.matches.is_exact.get_unchecked(r.start..r.end) } + } + + pub fn char_index(&self) -> &[u32] { + let r = self.matches.range; + // it is safe because construction/modifications + // can only be done in this module + unsafe { &self.matches.matches.char_index.get_unchecked(r.start..r.end) } + } + + pub fn char_length(&self) -> &[u16] { + let r = self.matches.range; + // it is safe because construction/modifications + // can only be done in this module + unsafe { &self.matches.matches.char_length.get_unchecked(r.start..r.end) } + } +} + +pub fn raw_documents_from_matches(mut matches: Vec<(DocumentId, Match)>) -> Vec { + let mut docs_ranges = Vec::<(DocumentId, Range)>::new(); + let mut matches2 = Matches::with_capacity(matches.len()); + + matches.par_sort_unstable(); + + for group in matches.linear_group_by(|(a, _), (b, _)| a == b) { + let id = group[0].0; + let start = docs_ranges.last().map(|(_, r)| r.end).unwrap_or(0); + let end = start + group.len(); + docs_ranges.push((id, Range { start, end })); + + matches2.extend_from_slice(group); + } + + let matches = Arc::new(matches2); + docs_ranges.into_iter().map(|(i, r)| RawDocument::new(i, r, matches.clone())).collect() +} + +#[derive(Debug, Copy, Clone)] +struct Range { + start: usize, + end: usize, +} + +impl Range { + fn len(self) -> usize { + self.end - self.start + } +} + +#[derive(Clone)] +pub struct SharedMatches { + range: Range, + matches: Arc, +} + +#[derive(Clone)] +struct Matches { + query_index: Vec, + distance: Vec, + attribute: Vec, + word_index: Vec, + is_exact: Vec, + char_index: Vec, + char_length: Vec, +} + +impl Matches { + fn with_capacity(cap: usize) -> Matches { + Matches { + query_index: Vec::with_capacity(cap), + distance: Vec::with_capacity(cap), + attribute: Vec::with_capacity(cap), + word_index: Vec::with_capacity(cap), + is_exact: Vec::with_capacity(cap), + char_index: Vec::with_capacity(cap), + char_length: Vec::with_capacity(cap), + } + } + + fn extend_from_slice(&mut self, matches: &[(DocumentId, Match)]) { + for (_, match_) in matches { + self.query_index.push(match_.query_index); + self.distance.push(match_.distance); + self.attribute.push(match_.attribute); + self.word_index.push(match_.word_index); + self.is_exact.push(match_.is_exact); + self.char_index.push(match_.char_index); + self.char_length.push(match_.char_length); + } } } diff --git a/src/rank/query_builder.rs b/src/rank/query_builder.rs index 91d645160..11fc75498 100644 --- a/src/rank/query_builder.rs +++ b/src/rank/query_builder.rs @@ -4,7 +4,9 @@ use std::error::Error; use std::hash::Hash; use std::rc::Rc; +use rayon::slice::ParallelSliceMut; use slice_group_by::GroupByMut; +use elapsed::measure_time; use hashbrown::HashMap; use fst::Streamer; use rocksdb::DB; @@ -15,7 +17,7 @@ use crate::rank::distinct_map::{DistinctMap, BufferedDistinctMap}; use crate::rank::criterion::Criteria; use crate::database::DatabaseView; use crate::{Match, DocumentId}; -use crate::rank::Document; +use crate::rank::{raw_documents_from_matches, RawDocument, Document}; fn split_whitespace_automatons(query: &str) -> Vec { let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace); @@ -81,7 +83,7 @@ where D: Deref, } } - fn query_all(&self, query: &str) -> Vec { + fn query_all(&self, query: &str) -> Vec { let automatons = split_whitespace_automatons(query); let mut stream = { @@ -94,7 +96,7 @@ where D: Deref, }; let mut number_matches = 0; - let mut matches = HashMap::new(); + let mut matches = Vec::new(); while let Some((input, indexed_values)) = stream.next() { for iv in indexed_values { @@ -105,7 +107,6 @@ where D: Deref, let doc_indexes = &self.view.index().positive.indexes(); let doc_indexes = &doc_indexes[iv.value as usize]; - number_matches += doc_indexes.len(); for doc_index in doc_indexes { let match_ = Match { query_index: iv.index as u32, @@ -116,15 +117,18 @@ where D: Deref, char_index: doc_index.char_index, char_length: doc_index.char_length, }; - matches.entry(doc_index.document_id).or_insert_with(Vec::new).push(match_); + matches.push((doc_index.document_id, match_)); } } } - info!("{} total documents to classify", matches.len()); - info!("{} total matches to classify", number_matches); + let total_matches = matches.len(); + let raw_documents = raw_documents_from_matches(matches); - matches.into_iter().map(|(i, m)| Document::from_matches(i, m)).collect() + info!("{} total documents to classify", raw_documents.len()); + info!("{} total matches to classify", total_matches); + + raw_documents } } @@ -140,7 +144,7 @@ where D: Deref, return builder.query(query, range); } - let (elapsed, mut documents) = elapsed::measure_time(|| self.query_all(query)); + let (elapsed, mut documents) = measure_time(|| self.query_all(query)); info!("query_all took {}", elapsed); let mut groups = vec![documents.as_mut_slice()]; @@ -177,12 +181,9 @@ where D: Deref, } } - // `drain` removes the documents efficiently using `ptr::copy` - // TODO it could be more efficient to have a custom iterator let offset = cmp::min(documents.len(), range.start); - documents.drain(0..offset); - documents.truncate(range.len()); - documents + let iter = documents.into_iter().skip(offset).take(range.len()); + iter.map(|d| Document::from_raw(&d)).collect() } } @@ -215,7 +216,9 @@ where D: Deref, K: Hash + Eq, { pub fn query(self, query: &str, range: Range) -> Vec { - let mut documents = self.inner.query_all(query); + let (elapsed, mut documents) = measure_time(|| self.inner.query_all(query)); + info!("query_all took {}", elapsed); + let mut groups = vec![documents.as_mut_slice()]; let mut key_cache = HashMap::new(); let view = &self.inner.view; @@ -227,12 +230,14 @@ where D: Deref, let mut distinct_map = DistinctMap::new(self.size); let mut distinct_raw_offset = 0; - 'criteria: for criterion in self.inner.criteria.as_ref() { + 'criteria: for (ci, criterion) in self.inner.criteria.as_ref().iter().enumerate() { let tmp_groups = mem::replace(&mut groups, Vec::new()); let mut buf_distinct = BufferedDistinctMap::new(&mut distinct_map); let mut documents_seen = 0; for group in tmp_groups { + info!("criterion {}, documents group of size {}", ci, group.len()); + // if this group does not overlap with the requested range, // push it without sorting and splitting it if documents_seen + group.len() < distinct_raw_offset { @@ -241,9 +246,12 @@ where D: Deref, continue; } - group.sort_unstable_by(|a, b| criterion.evaluate(a, b, view)); + let (elapsed, _) = measure_time(|| { + group.par_sort_unstable_by(|a, b| criterion.evaluate(a, b)); + }); + info!("criterion {} sort took {}", ci, elapsed); - for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b, view)) { + for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b)) { // we must compute the real distinguished len of this sub-group for document in group.iter() { let filter_accepted = match &self.inner.filter { @@ -302,7 +310,7 @@ where D: Deref, }; if distinct_accepted && seen.len() > range.start { - out_documents.push(document); + out_documents.push(Document::from_raw(&document)); if out_documents.len() == range.len() { break } } }