Merge pull request #94 from Kerollmops/data-oriented

Introduce Data Oriented design into the search algorithm
This commit is contained in:
Clément Renault 2019-02-02 15:40:10 +01:00 committed by GitHub
commit d46fa4b215
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
18 changed files with 490 additions and 666 deletions

View File

@ -21,6 +21,7 @@ serde_derive = "1.0"
serde_json = { version = "1.0", features = ["preserve_order"] } serde_json = { version = "1.0", features = ["preserve_order"] }
slice-group-by = "0.2" slice-group-by = "0.2"
unidecode = "0.3" unidecode = "0.3"
rayon = "1.0"
[dependencies.toml] [dependencies.toml]
git = "https://github.com/Kerollmops/toml-rs.git" git = "https://github.com/Kerollmops/toml-rs.git"

View File

@ -70,12 +70,10 @@ fn create_highlight_areas(text: &str, matches: &[Match], attribute: SchemaAttr)
let mut byte_indexes = BTreeMap::new(); let mut byte_indexes = BTreeMap::new();
for match_ in matches { for match_ in matches {
let match_attribute = match_.attribute.attribute(); let match_attribute = match_.attribute;
if SchemaAttr::new(match_attribute) == attribute { if SchemaAttr::new(match_attribute) == attribute {
let word_area = match_.word_area; let char_index = match_.char_index as usize;
let char_length = match_.char_length as usize;
let char_index = word_area.char_index() as usize;
let char_length = word_area.length() as usize;
let (byte_index, byte_length) = char_to_byte_range(char_index, char_length, text); let (byte_index, byte_length) = char_to_byte_range(char_index, char_length, text);
match byte_indexes.entry(byte_index) { match byte_indexes.entry(byte_index) {
@ -151,7 +149,7 @@ fn main() -> Result<(), Box<Error>> {
let mut matching_attributes = HashSet::new(); let mut matching_attributes = HashSet::new();
for _match in doc.matches { for _match in doc.matches {
let attr = SchemaAttr::new(_match.attribute.attribute()); let attr = SchemaAttr::new(_match.attribute);
let name = schema.attribute_name(attr); let name = schema.attribute_name(attr);
matching_attributes.insert(name); matching_attributes.insert(name);
} }

View File

@ -1,105 +0,0 @@
use std::fmt;
/// Represent an attribute number along with the word index
/// according to the tokenizer used.
///
/// Both values are packed into a single `u32`: the attribute number
/// lives in the 10 most significant bits and the word index in the
/// 22 least significant ones. It can therefore hold attribute numbers
/// in `0..1024` and word indexes in `0..2^22`.
///
/// Because the attribute number occupies the high bits, the derived
/// ordering sorts by attribute number first, then by word index.
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Attribute(u32);

impl Attribute {
    /// Number of low bits of the packed value that store the word index.
    const WORD_INDEX_BITS: u32 = 22;

    /// Mask selecting the word index bits; also the biggest valid word index.
    const WORD_INDEX_MASK: u32 = (1 << Self::WORD_INDEX_BITS) - 1;

    /// Number of distinct attribute numbers (10 bits worth of values).
    const ATTRIBUTE_COUNT: u16 = 1 << 10;

    /// Construct an `Attribute` from an attribute number and
    /// the word position of a match according to the tokenizer used.
    ///
    /// # Errors
    ///
    /// Returns `AttributeError::AttributeTooBig` when `attribute` is not
    /// lower than 1024 and `AttributeError::IndexTooBig` when `index` is
    /// not lower than 2^22. The attribute number is checked first.
    pub(crate) fn new(attribute: u16, index: u32) -> Result<Attribute, AttributeError> {
        if attribute >= Self::ATTRIBUTE_COUNT {
            return Err(AttributeError::AttributeTooBig);
        }

        // Any bit above the 22 reserved ones would bleed into the
        // attribute bits once both values are OR-ed together.
        if index > Self::WORD_INDEX_MASK {
            return Err(AttributeError::IndexTooBig);
        }

        let attribute = u32::from(attribute) << Self::WORD_INDEX_BITS;
        Ok(Attribute(attribute | index))
    }

    /// Construct an `Attribute` from an attribute number and
    /// the word position of a match according to the tokenizer used.
    ///
    /// # Panics
    ///
    /// Panics when the attribute number is not lower than 1024
    /// or when the word index is not lower than 2^22.
    pub(crate) fn new_faillible(attribute: u16, index: u32) -> Attribute {
        // The panic messages are kept as they were; note that both
        // limits are exclusive (1024 and 2^22 themselves are rejected).
        match Attribute::new(attribute, index) {
            Ok(attribute) => attribute,
            Err(AttributeError::AttributeTooBig) => {
                panic!("attribute must not be greater than 1024")
            },
            Err(AttributeError::IndexTooBig) => {
                panic!("attribute word index must not be greater than 2^22")
            },
        }
    }

    /// Returns the biggest representable `Attribute`: every bit set,
    /// i.e. attribute number 1023 and word index 2^22 - 1.
    pub(crate) fn max_value() -> Attribute {
        Attribute(u32::max_value())
    }

    /// Returns the attribute number stored in the 10 high bits.
    #[inline]
    pub fn attribute(self) -> u16 {
        (self.0 >> Self::WORD_INDEX_BITS) as u16
    }

    /// Returns the word index stored in the 22 low bits.
    #[inline]
    pub fn word_index(self) -> u32 {
        self.0 & Self::WORD_INDEX_MASK
    }
}

impl fmt::Debug for Attribute {
    /// Shows the two unpacked values rather than the raw packed integer.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.debug_struct("Attribute")
            .field("attribute", &self.attribute())
            .field("word_index", &self.word_index())
            .finish()
    }
}

/// The reason why an `Attribute` could not be constructed.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AttributeError {
    /// The attribute number does not fit in 10 bits.
    AttributeTooBig,
    /// The word index does not fit in 22 bits.
    IndexTooBig,
}

impl fmt::Display for AttributeError {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            AttributeError::AttributeTooBig => {
                f.write_str("attribute number must be lower than 1024")
            },
            AttributeError::IndexTooBig => {
                f.write_str("word index must be lower than 2^22")
            },
        }
    }
}

impl std::error::Error for AttributeError {}
#[cfg(test)]
mod tests {
    use super::*;
    use quickcheck::{quickcheck, TestResult};

    quickcheck! {
        // Packing an in-range attribute number and word index and then
        // unpacking them must give back the original values.
        fn qc_attribute(gen_attr: u16, gen_index: u32) -> TestResult {
            // 2^10 and 2^22 are themselves out of range (the limits are
            // exclusive), so they must be discarded too: `new_faillible`
            // would panic on them instead of failing the property.
            if gen_attr >= 2_u16.pow(10) || gen_index >= 2_u32.pow(22) {
                return TestResult::discard()
            }

            let attribute = Attribute::new_faillible(gen_attr, gen_index);

            let valid_attribute = attribute.attribute() == gen_attr;
            let valid_index = attribute.word_index() == gen_index;

            TestResult::from_bool(valid_attribute && valid_index)
        }

        // An `Attribute` must compare lower than the one built from
        // the same values plus one.
        fn qc_attribute_ord(gen_attr: u16, gen_index: u32) -> TestResult {
            // `b` is built from `gen_attr + 1` and `gen_index + 1`, so it is
            // the successors that must stay in range; comparing against
            // `limit - 1` also avoids overflowing on the addition.
            if gen_attr >= 2_u16.pow(10) - 1 || gen_index >= 2_u32.pow(22) - 1 {
                return TestResult::discard()
            }

            let a = Attribute::new_faillible(gen_attr, gen_index);
            let b = Attribute::new_faillible(gen_attr + 1, gen_index + 1);

            TestResult::from_bool(a < b)
        }
    }
}

View File

@ -147,29 +147,32 @@ impl<W: Write> DocIndexesBuilder<W> {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*;
use std::error::Error; use std::error::Error;
use crate::{Attribute, WordArea};
use crate::DocumentId; use crate::DocumentId;
use super::*;
#[test] #[test]
fn builder_serialize_deserialize() -> Result<(), Box<Error>> { fn builder_serialize_deserialize() -> Result<(), Box<Error>> {
let a = DocIndex { let a = DocIndex {
document_id: DocumentId(0), document_id: DocumentId(0),
attribute: Attribute::new_faillible(3, 11), attribute: 3,
word_area: WordArea::new_faillible(30, 4) word_index: 11,
char_index: 30,
char_length: 4,
}; };
let b = DocIndex { let b = DocIndex {
document_id: DocumentId(1), document_id: DocumentId(1),
attribute: Attribute::new_faillible(4, 21), attribute: 4,
word_area: WordArea::new_faillible(35, 6) word_index: 21,
char_index: 35,
char_length: 6,
}; };
let c = DocIndex { let c = DocIndex {
document_id: DocumentId(2), document_id: DocumentId(2),
attribute: Attribute::new_faillible(8, 2), attribute: 8,
word_area: WordArea::new_faillible(89, 6) word_index: 2,
char_index: 89,
char_length: 6,
}; };
let mut builder = DocIndexesBuilder::memory(); let mut builder = DocIndexesBuilder::memory();
@ -193,18 +196,24 @@ mod tests {
fn serialize_deserialize() -> Result<(), Box<Error>> { fn serialize_deserialize() -> Result<(), Box<Error>> {
let a = DocIndex { let a = DocIndex {
document_id: DocumentId(0), document_id: DocumentId(0),
attribute: Attribute::new_faillible(3, 11), attribute: 3,
word_area: WordArea::new_faillible(30, 4) word_index: 11,
char_index: 30,
char_length: 4,
}; };
let b = DocIndex { let b = DocIndex {
document_id: DocumentId(1), document_id: DocumentId(1),
attribute: Attribute::new_faillible(4, 21), attribute: 4,
word_area: WordArea::new_faillible(35, 6) word_index: 21,
char_index: 35,
char_length: 6,
}; };
let c = DocIndex { let c = DocIndex {
document_id: DocumentId(2), document_id: DocumentId(2),
attribute: Attribute::new_faillible(8, 2), attribute: 8,
word_area: WordArea::new_faillible(89, 6) word_index: 2,
char_index: 89,
char_length: 6,
}; };
let mut builder = DocIndexesBuilder::memory(); let mut builder = DocIndexesBuilder::memory();

View File

@ -3,7 +3,7 @@ use crate::database::serde::SerializerError;
use crate::database::schema::SchemaAttr; use crate::database::schema::SchemaAttr;
use crate::tokenizer::TokenizerBuilder; use crate::tokenizer::TokenizerBuilder;
use crate::tokenizer::Token; use crate::tokenizer::Token;
use crate::{DocumentId, DocIndex, Attribute, WordArea}; use crate::{DocumentId, DocIndex};
use hashbrown::HashSet; use hashbrown::HashSet;
use serde::Serialize; use serde::Serialize;
@ -54,10 +54,8 @@ where B: TokenizerBuilder
let document_id = self.document_id; let document_id = self.document_id;
// FIXME must u32::try_from instead // FIXME must u32::try_from instead
let attribute = match Attribute::new(self.attribute.0, word_index as u32) { let attribute = self.attribute.0;
Ok(attribute) => attribute, let word_index = word_index as u32;
Err(_) => return Ok(()),
};
// insert the exact representation // insert the exact representation
let word_lower = word.to_lowercase(); let word_lower = word.to_lowercase();
@ -68,21 +66,17 @@ where B: TokenizerBuilder
// and the unidecoded lowercased version // and the unidecoded lowercased version
let word_unidecoded = unidecode::unidecode(word).to_lowercase(); let word_unidecoded = unidecode::unidecode(word).to_lowercase();
if word_lower != word_unidecoded { if word_lower != word_unidecoded {
let word_area = match WordArea::new(char_index as u32, length) { let char_index = char_index as u32;
Ok(word_area) => word_area, let char_length = length;
Err(_) => return Ok(()),
};
let doc_index = DocIndex { document_id, attribute, word_area }; let doc_index = DocIndex { document_id, attribute, word_index, char_index, char_length };
self.update.insert_doc_index(word_unidecoded.into_bytes(), doc_index); self.update.insert_doc_index(word_unidecoded.into_bytes(), doc_index);
} }
let word_area = match WordArea::new(char_index as u32, length) { let char_index = char_index as u32;
Ok(word_area) => word_area, let char_length = length;
Err(_) => return Ok(()),
};
let doc_index = DocIndex { document_id, attribute, word_area }; let doc_index = DocIndex { document_id, attribute, word_index, char_index, char_length };
self.update.insert_doc_index(word_lower.into_bytes(), doc_index); self.update.insert_doc_index(word_lower.into_bytes(), doc_index);
} }
Ok(()) Ok(())

View File

@ -5,16 +5,12 @@ pub mod database;
pub mod data; pub mod data;
pub mod rank; pub mod rank;
pub mod tokenizer; pub mod tokenizer;
mod attribute;
mod word_area;
mod common_words; mod common_words;
pub use rocksdb; pub use rocksdb;
pub use self::tokenizer::Tokenizer; pub use self::tokenizer::Tokenizer;
pub use self::common_words::CommonWords; pub use self::common_words::CommonWords;
pub use self::attribute::{Attribute, AttributeError};
pub use self::word_area::{WordArea, WordAreaError};
/// Represent an internally generated document unique identifier. /// Represent an internally generated document unique identifier.
/// ///
@ -36,14 +32,16 @@ pub struct DocIndex {
/// The attribute in the document where the word was found /// The attribute in the document where the word was found
/// along with the index in it. /// along with the index in it.
pub attribute: Attribute, pub attribute: u16,
pub word_index: u32,
/// The position in bytes where the word was found /// The position in bytes where the word was found
/// along with the length of it. /// along with the length of it.
/// ///
/// It informs on the original word area in the text indexed /// It informs on the original word area in the text indexed
/// without needing to run the tokenizer again. /// without needing to run the tokenizer again.
pub word_area: WordArea, pub char_index: u32,
pub char_length: u16,
} }
/// This structure represents a matching word with information /// This structure represents a matching word with information
@ -68,7 +66,8 @@ pub struct Match {
/// The attribute in the document where the word was found /// The attribute in the document where the word was found
/// along with the index in it. /// along with the index in it.
pub attribute: Attribute, pub attribute: u16,
pub word_index: u32,
/// Whether the word that match is an exact match or a prefix. /// Whether the word that match is an exact match or a prefix.
pub is_exact: bool, pub is_exact: bool,
@ -78,7 +77,8 @@ pub struct Match {
/// ///
/// It informs on the original word area in the text indexed /// It informs on the original word area in the text indexed
/// without needing to run the tokenizer again. /// without needing to run the tokenizer again.
pub word_area: WordArea, pub char_index: u32,
pub char_length: u16,
} }
impl Match { impl Match {
@ -86,9 +86,11 @@ impl Match {
Match { Match {
query_index: 0, query_index: 0,
distance: 0, distance: 0,
attribute: Attribute::new_faillible(0, 0), attribute: 0,
word_index: 0,
is_exact: false, is_exact: false,
word_area: WordArea::new_faillible(0, 0), char_index: 0,
char_length: 0,
} }
} }
@ -96,9 +98,11 @@ impl Match {
Match { Match {
query_index: u32::max_value(), query_index: u32::max_value(),
distance: u8::max_value(), distance: u8::max_value(),
attribute: Attribute::max_value(), attribute: u16::max_value(),
word_index: u32::max_value(),
is_exact: true, is_exact: true,
word_area: WordArea::max_value(), char_index: u32::max_value(),
char_length: u16::max_value(),
} }
} }
} }
@ -110,6 +114,6 @@ mod tests {
#[test] #[test]
fn docindex_mem_size() { fn docindex_mem_size() {
assert_eq!(mem::size_of::<DocIndex>(), 16); assert_eq!(mem::size_of::<DocIndex>(), 24);
} }
} }

View File

@ -1,19 +1,13 @@
use std::cmp::Ordering; use std::cmp::Ordering;
use std::ops::Deref;
use rocksdb::DB;
use crate::rank::criterion::Criterion; use crate::rank::criterion::Criterion;
use crate::database::DatabaseView; use crate::rank::RawDocument;
use crate::rank::Document;
#[derive(Debug, Clone, Copy)] #[derive(Debug, Clone, Copy)]
pub struct DocumentId; pub struct DocumentId;
impl<D> Criterion<D> for DocumentId impl Criterion for DocumentId {
where D: Deref<Target=DB> fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
{
fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView<D>) -> Ordering {
lhs.id.cmp(&rhs.id) lhs.id.cmp(&rhs.id)
} }
} }

View File

@ -1,33 +1,40 @@
use std::cmp::Ordering; use std::cmp::Ordering;
use std::ops::Deref;
use rocksdb::DB;
use slice_group_by::GroupBy; use slice_group_by::GroupBy;
use crate::rank::{match_query_index, Document};
use crate::rank::criterion::Criterion; use crate::rank::criterion::Criterion;
use crate::database::DatabaseView; use crate::rank::RawDocument;
use crate::Match;
#[inline] #[inline]
fn contains_exact(matches: &&[Match]) -> bool { fn number_exact_matches(query_index: &[u32], is_exact: &[bool]) -> usize {
matches.iter().any(|m| m.is_exact) let mut count = 0;
let mut index = 0;
for group in query_index.linear_group_by(PartialEq::eq) {
let len = group.len();
count += is_exact[index..index + len].contains(&true) as usize;
index += len;
} }
#[inline] count
fn number_exact_matches(matches: &[Match]) -> usize {
matches.linear_group_by(match_query_index).filter(contains_exact).count()
} }
#[derive(Debug, Clone, Copy)] #[derive(Debug, Clone, Copy)]
pub struct Exact; pub struct Exact;
impl<D> Criterion<D> for Exact impl Criterion for Exact {
where D: Deref<Target=DB> fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
{ let lhs = {
fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView<D>) -> Ordering { let query_index = lhs.query_index();
let lhs = number_exact_matches(&lhs.matches); let is_exact = lhs.is_exact();
let rhs = number_exact_matches(&rhs.matches); number_exact_matches(query_index, is_exact)
};
let rhs = {
let query_index = rhs.query_index();
let is_exact = rhs.is_exact();
number_exact_matches(query_index, is_exact)
};
lhs.cmp(&rhs).reverse() lhs.cmp(&rhs).reverse()
} }

View File

@ -8,12 +8,7 @@ mod sort_by;
mod document_id; mod document_id;
use std::cmp::Ordering; use std::cmp::Ordering;
use std::ops::Deref; use crate::rank::RawDocument;
use rocksdb::DB;
use crate::database::DatabaseView;
use crate::rank::Document;
pub use self::{ pub use self::{
sum_of_typos::SumOfTypos, sum_of_typos::SumOfTypos,
@ -26,56 +21,47 @@ pub use self::{
document_id::DocumentId, document_id::DocumentId,
}; };
pub trait Criterion<D> pub trait Criterion: Send + Sync {
where D: Deref<Target=DB> fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering;
{
fn evaluate(&self, lhs: &Document, rhs: &Document, view: &DatabaseView<D>) -> Ordering;
#[inline] #[inline]
fn eq(&self, lhs: &Document, rhs: &Document, view: &DatabaseView<D>) -> bool { fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool {
self.evaluate(lhs, rhs, view) == Ordering::Equal self.evaluate(lhs, rhs) == Ordering::Equal
} }
} }
impl<'a, D, T: Criterion<D> + ?Sized> Criterion<D> for &'a T impl<'a, T: Criterion + ?Sized + Send + Sync> Criterion for &'a T {
where D: Deref<Target=DB> fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
{ (**self).evaluate(lhs, rhs)
fn evaluate(&self, lhs: &Document, rhs: &Document, view: &DatabaseView<D>) -> Ordering {
(**self).evaluate(lhs, rhs, view)
} }
fn eq(&self, lhs: &Document, rhs: &Document, view: &DatabaseView<D>) -> bool { fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool {
(**self).eq(lhs, rhs, view) (**self).eq(lhs, rhs)
} }
} }
impl<D, T: Criterion<D> + ?Sized> Criterion<D> for Box<T> impl<T: Criterion + ?Sized> Criterion for Box<T> {
where D: Deref<Target=DB> fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
{ (**self).evaluate(lhs, rhs)
fn evaluate(&self, lhs: &Document, rhs: &Document, view: &DatabaseView<D>) -> Ordering {
(**self).evaluate(lhs, rhs, view)
} }
fn eq(&self, lhs: &Document, rhs: &Document, view: &DatabaseView<D>) -> bool { fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool {
(**self).eq(lhs, rhs, view) (**self).eq(lhs, rhs)
} }
} }
#[derive(Default)] #[derive(Default)]
pub struct CriteriaBuilder<D> pub struct CriteriaBuilder {
where D: Deref<Target=DB> inner: Vec<Box<dyn Criterion>>
{
inner: Vec<Box<dyn Criterion<D>>>
} }
impl<D> CriteriaBuilder<D> impl CriteriaBuilder
where D: Deref<Target=DB>
{ {
pub fn new() -> CriteriaBuilder<D> { pub fn new() -> CriteriaBuilder {
CriteriaBuilder { inner: Vec::new() } CriteriaBuilder { inner: Vec::new() }
} }
pub fn with_capacity(capacity: usize) -> CriteriaBuilder<D> { pub fn with_capacity(capacity: usize) -> CriteriaBuilder {
CriteriaBuilder { inner: Vec::with_capacity(capacity) } CriteriaBuilder { inner: Vec::with_capacity(capacity) }
} }
@ -83,33 +69,29 @@ where D: Deref<Target=DB>
self.inner.reserve(additional) self.inner.reserve(additional)
} }
pub fn add<C>(mut self, criterion: C) -> CriteriaBuilder<D> pub fn add<C>(mut self, criterion: C) -> CriteriaBuilder
where C: 'static + Criterion<D>, where C: 'static + Criterion,
{ {
self.push(criterion); self.push(criterion);
self self
} }
pub fn push<C>(&mut self, criterion: C) pub fn push<C>(&mut self, criterion: C)
where C: 'static + Criterion<D>, where C: 'static + Criterion,
{ {
self.inner.push(Box::new(criterion)); self.inner.push(Box::new(criterion));
} }
pub fn build(self) -> Criteria<D> { pub fn build(self) -> Criteria {
Criteria { inner: self.inner } Criteria { inner: self.inner }
} }
} }
pub struct Criteria<D> pub struct Criteria {
where D: Deref<Target=DB> inner: Vec<Box<dyn Criterion>>,
{
inner: Vec<Box<dyn Criterion<D>>>,
} }
impl<D> Default for Criteria<D> impl Default for Criteria {
where D: Deref<Target=DB>
{
fn default() -> Self { fn default() -> Self {
CriteriaBuilder::with_capacity(7) CriteriaBuilder::with_capacity(7)
.add(SumOfTypos) .add(SumOfTypos)
@ -123,10 +105,8 @@ where D: Deref<Target=DB>
} }
} }
impl<D> AsRef<[Box<dyn Criterion<D>>]> for Criteria<D> impl AsRef<[Box<dyn Criterion>]> for Criteria {
where D: Deref<Target=DB> fn as_ref(&self) -> &[Box<dyn Criterion>] {
{
fn as_ref(&self) -> &[Box<dyn Criterion<D>>] {
&self.inner &self.inner
} }
} }

View File

@ -1,28 +1,28 @@
use std::cmp::Ordering; use std::cmp::Ordering;
use std::ops::Deref;
use rocksdb::DB;
use slice_group_by::GroupBy; use slice_group_by::GroupBy;
use crate::rank::{match_query_index, Document};
use crate::rank::criterion::Criterion; use crate::rank::criterion::Criterion;
use crate::database::DatabaseView; use crate::rank::RawDocument;
use crate::Match;
#[inline] #[inline]
fn number_of_query_words(matches: &[Match]) -> usize { fn number_of_query_words(query_index: &[u32]) -> usize {
matches.linear_group_by(match_query_index).count() query_index.linear_group_by(PartialEq::eq).count()
} }
#[derive(Debug, Clone, Copy)] #[derive(Debug, Clone, Copy)]
pub struct NumberOfWords; pub struct NumberOfWords;
impl<D> Criterion<D> for NumberOfWords impl Criterion for NumberOfWords {
where D: Deref<Target=DB> fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
{ let lhs = {
fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView<D>) -> Ordering { let query_index = lhs.query_index();
let lhs = number_of_query_words(&lhs.matches); number_of_query_words(query_index)
let rhs = number_of_query_words(&rhs.matches); };
let rhs = {
let query_index = rhs.query_index();
number_of_query_words(query_index)
};
lhs.cmp(&rhs).reverse() lhs.cmp(&rhs).reverse()
} }

View File

@ -7,7 +7,7 @@ use serde::de::DeserializeOwned;
use crate::rank::criterion::Criterion; use crate::rank::criterion::Criterion;
use crate::database::DatabaseView; use crate::database::DatabaseView;
use crate::rank::Document; use crate::rank::RawDocument;
/// A helper struct that permits sorting documents by /// A helper struct that permits sorting documents by
/// some of their stored attributes. /// some of their stored attributes.
@ -24,7 +24,7 @@ use crate::rank::Document;
/// ///
/// # Example /// # Example
/// ///
/// ```no-test /// ```ignore
/// use serde_derive::Deserialize; /// use serde_derive::Deserialize;
/// use meilidb::rank::criterion::*; /// use meilidb::rank::criterion::*;
/// ///
@ -40,39 +40,40 @@ use crate::rank::Document;
/// .add(SumOfWordsAttribute) /// .add(SumOfWordsAttribute)
/// .add(SumOfWordsPosition) /// .add(SumOfWordsPosition)
/// .add(Exact) /// .add(Exact)
/// .add(SortBy::<TimeOnly>::new()) /// .add(SortBy::<TimeOnly>::new(&view))
/// .add(DocumentId); /// .add(DocumentId);
/// ///
/// let criterion = builder.build(); /// let criterion = builder.build();
/// ///
/// ``` /// ```
pub struct SortBy<T> { pub struct SortBy<'a, T, D>
where D: Deref<Target=DB> + Send + Sync,
T: Send + Sync
{
view: &'a DatabaseView<D>,
_phantom: marker::PhantomData<T>, _phantom: marker::PhantomData<T>,
} }
impl<T> SortBy<T> { impl<'a, T, D> SortBy<'a, T, D>
pub fn new() -> Self { where D: Deref<Target=DB> + Send + Sync,
SortBy::default() T: Send + Sync
}
}
impl<T> Default for SortBy<T> {
fn default() -> SortBy<T> {
SortBy { _phantom: marker::PhantomData }
}
}
impl<T, D> Criterion<D> for SortBy<T>
where D: Deref<Target=DB>,
T: DeserializeOwned + Ord,
{ {
fn evaluate(&self, lhs: &Document, rhs: &Document, view: &DatabaseView<D>) -> Ordering { pub fn new(view: &'a DatabaseView<D>) -> Self {
let lhs = match view.document_by_id::<T>(lhs.id) { SortBy { view, _phantom: marker::PhantomData }
}
}
impl<'a, T, D> Criterion for SortBy<'a, T, D>
where D: Deref<Target=DB> + Send + Sync,
T: DeserializeOwned + Ord + Send + Sync,
{
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
let lhs = match self.view.document_by_id::<T>(lhs.id) {
Ok(doc) => Some(doc), Ok(doc) => Some(doc),
Err(e) => { eprintln!("{}", e); None }, Err(e) => { eprintln!("{}", e); None },
}; };
let rhs = match view.document_by_id::<T>(rhs.id) { let rhs = match self.view.document_by_id::<T>(rhs.id) {
Ok(doc) => Some(doc), Ok(doc) => Some(doc),
Err(e) => { eprintln!("{}", e); None }, Err(e) => { eprintln!("{}", e); None },
}; };

View File

@ -1,24 +1,20 @@
use std::cmp::Ordering; use std::cmp::Ordering;
use std::ops::Deref;
use rocksdb::DB;
use slice_group_by::GroupBy; use slice_group_by::GroupBy;
use crate::rank::{match_query_index, Document};
use crate::rank::criterion::Criterion; use crate::rank::criterion::Criterion;
use crate::database::DatabaseView; use crate::rank::RawDocument;
use crate::Match;
#[inline] #[inline]
fn sum_matches_typos(matches: &[Match]) -> isize { fn sum_matches_typos(query_index: &[u32], distance: &[u8]) -> isize {
let mut sum_typos = 0; let mut sum_typos = 0;
let mut number_words = 0; let mut number_words = 0;
let mut index = 0;
// note that GroupBy will never return an empty group for group in query_index.linear_group_by(PartialEq::eq) {
// so we can do this assumption safely sum_typos += distance[index] as isize;
for group in matches.linear_group_by(match_query_index) {
sum_typos += unsafe { group.get_unchecked(0).distance as isize };
number_words += 1; number_words += 1;
index += group.len();
} }
sum_typos - number_words sum_typos - number_words
@ -27,78 +23,42 @@ fn sum_matches_typos(matches: &[Match]) -> isize {
#[derive(Debug, Clone, Copy)] #[derive(Debug, Clone, Copy)]
pub struct SumOfTypos; pub struct SumOfTypos;
impl<D> Criterion<D> for SumOfTypos impl Criterion for SumOfTypos {
where D: Deref<Target=DB> fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
{ let lhs = {
fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView<D>) -> Ordering { let query_index = lhs.query_index();
let lhs = sum_matches_typos(&lhs.matches); let distance = lhs.distance();
let rhs = sum_matches_typos(&rhs.matches); sum_matches_typos(query_index, distance)
};
let rhs = {
let query_index = rhs.query_index();
let distance = rhs.distance();
sum_matches_typos(query_index, distance)
};
lhs.cmp(&rhs) lhs.cmp(&rhs)
} }
} }
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
use crate::{DocumentId, Attribute, WordArea};
// typing: "Geox CEO" // typing: "Geox CEO"
// //
// doc0: "Geox SpA: CEO and Executive" // doc0: "Geox SpA: CEO and Executive"
// doc1: "Mt. Gox CEO Resigns From Bitcoin Foundation" // doc1: "Mt. Gox CEO Resigns From Bitcoin Foundation"
#[test] #[test]
fn one_typo_reference() { fn one_typo_reference() {
let doc0 = { let query_index0 = &[0, 1];
let matches = vec![ let distance0 = &[0, 0];
Match {
query_index: 0,
distance: 0,
attribute: Attribute::new_faillible(0, 0),
is_exact: false,
word_area: WordArea::new_faillible(0, 6)
},
Match {
query_index: 1,
distance: 0,
attribute: Attribute::new_faillible(0, 2),
is_exact: false,
word_area: WordArea::new_faillible(0, 6)
},
];
Document {
id: DocumentId(0),
matches: matches,
}
};
let doc1 = { let query_index1 = &[0, 1];
let matches = vec![ let distance1 = &[1, 0];
Match {
query_index: 0,
distance: 1,
attribute: Attribute::new_faillible(0, 0),
is_exact: false,
word_area: WordArea::new_faillible(0, 6)
},
Match {
query_index: 1,
distance: 0,
attribute: Attribute::new_faillible(0, 2),
is_exact: false,
word_area: WordArea::new_faillible(0, 6)
},
];
Document {
id: DocumentId(1),
matches: matches,
}
};
let lhs = sum_matches_typos(&doc0.matches); let lhs = sum_matches_typos(query_index0, distance0);
let rhs = sum_matches_typos(&doc1.matches); let rhs = sum_matches_typos(query_index1, distance1);
assert_eq!(lhs.cmp(&rhs), Ordering::Less); assert_eq!(lhs.cmp(&rhs), Ordering::Less);
} }
@ -108,47 +68,14 @@ mod tests {
// doc1: "bouton" // doc1: "bouton"
#[test] #[test]
fn no_typo() { fn no_typo() {
let doc0 = { let query_index0 = &[0, 1];
let matches = vec![ let distance0 = &[0, 0];
Match {
query_index: 0,
distance: 0,
attribute: Attribute::new_faillible(0, 0),
is_exact: false,
word_area: WordArea::new_faillible(0, 6)
},
Match {
query_index: 1,
distance: 0,
attribute: Attribute::new_faillible(0, 1),
is_exact: false,
word_area: WordArea::new_faillible(0, 6)
},
];
Document {
id: DocumentId(0),
matches: matches,
}
};
let doc1 = { let query_index1 = &[0];
let matches = vec![ let distance1 = &[0];
Match {
query_index: 0,
distance: 0,
attribute: Attribute::new_faillible(0, 0),
is_exact: false,
word_area: WordArea::new_faillible(0, 6)
},
];
Document {
id: DocumentId(1),
matches: matches,
}
};
let lhs = sum_matches_typos(&doc0.matches); let lhs = sum_matches_typos(query_index0, distance0);
let rhs = sum_matches_typos(&doc1.matches); let rhs = sum_matches_typos(query_index1, distance1);
assert_eq!(lhs.cmp(&rhs), Ordering::Less); assert_eq!(lhs.cmp(&rhs), Ordering::Less);
} }
@ -158,47 +85,14 @@ mod tests {
// doc1: "bouton" // doc1: "bouton"
#[test] #[test]
fn one_typo() { fn one_typo() {
let doc0 = { let query_index0 = &[0, 1];
let matches = vec![ let distance0 = &[0, 1];
Match {
query_index: 0,
distance: 0,
attribute: Attribute::new_faillible(0, 0),
is_exact: false,
word_area: WordArea::new_faillible(0, 6)
},
Match {
query_index: 1,
distance: 1,
attribute: Attribute::new_faillible(0, 1),
is_exact: false,
word_area: WordArea::new_faillible(0, 6)
},
];
Document {
id: DocumentId(0),
matches: matches,
}
};
let doc1 = { let query_index1 = &[0];
let matches = vec![ let distance1 = &[0];
Match {
query_index: 0,
distance: 0,
attribute: Attribute::new_faillible(0, 0),
is_exact: false,
word_area: WordArea::new_faillible(0, 6)
},
];
Document {
id: DocumentId(1),
matches: matches,
}
};
let lhs = sum_matches_typos(&doc0.matches); let lhs = sum_matches_typos(query_index0, distance0);
let rhs = sum_matches_typos(&doc1.matches); let rhs = sum_matches_typos(query_index1, distance1);
assert_eq!(lhs.cmp(&rhs), Ordering::Equal); assert_eq!(lhs.cmp(&rhs), Ordering::Equal);
} }
} }

View File

@ -1,32 +1,39 @@
use std::cmp::Ordering; use std::cmp::Ordering;
use std::ops::Deref;
use rocksdb::DB;
use slice_group_by::GroupBy; use slice_group_by::GroupBy;
use crate::database::DatabaseView;
use crate::rank::{match_query_index, Document};
use crate::rank::criterion::Criterion; use crate::rank::criterion::Criterion;
use crate::Match; use crate::rank::RawDocument;
#[inline] #[inline]
fn sum_matches_attributes(matches: &[Match]) -> usize { fn sum_matches_attributes(query_index: &[u32], attribute: &[u16]) -> usize {
// note that GroupBy will never return an empty group let mut sum_attributes = 0;
// so we can do this assumption safely let mut index = 0;
matches.linear_group_by(match_query_index).map(|group| {
unsafe { group.get_unchecked(0).attribute.attribute() as usize } for group in query_index.linear_group_by(PartialEq::eq) {
}).sum() sum_attributes += attribute[index] as usize;
index += group.len();
}
sum_attributes
} }
#[derive(Debug, Clone, Copy)] #[derive(Debug, Clone, Copy)]
pub struct SumOfWordsAttribute; pub struct SumOfWordsAttribute;
impl<D> Criterion<D> for SumOfWordsAttribute impl Criterion for SumOfWordsAttribute {
where D: Deref<Target=DB> fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
{ let lhs = {
fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView<D>) -> Ordering { let query_index = lhs.query_index();
let lhs = sum_matches_attributes(&lhs.matches); let attribute = lhs.attribute();
let rhs = sum_matches_attributes(&rhs.matches); sum_matches_attributes(query_index, attribute)
};
let rhs = {
let query_index = rhs.query_index();
let attribute = rhs.attribute();
sum_matches_attributes(query_index, attribute)
};
lhs.cmp(&rhs) lhs.cmp(&rhs)
} }

View File

@ -1,32 +1,39 @@
use std::cmp::Ordering; use std::cmp::Ordering;
use std::ops::Deref;
use rocksdb::DB;
use slice_group_by::GroupBy; use slice_group_by::GroupBy;
use crate::database::DatabaseView;
use crate::rank::{match_query_index, Document};
use crate::rank::criterion::Criterion; use crate::rank::criterion::Criterion;
use crate::Match; use crate::rank::RawDocument;
#[inline] #[inline]
fn sum_matches_attribute_index(matches: &[Match]) -> usize { fn sum_matches_attribute_index(query_index: &[u32], word_index: &[u32]) -> usize {
// note that GroupBy will never return an empty group let mut sum_word_index = 0;
// so we can do this assumption safely let mut index = 0;
matches.linear_group_by(match_query_index).map(|group| {
unsafe { group.get_unchecked(0).attribute.word_index() as usize } for group in query_index.linear_group_by(PartialEq::eq) {
}).sum() sum_word_index += word_index[index] as usize;
index += group.len();
}
sum_word_index
} }
#[derive(Debug, Clone, Copy)] #[derive(Debug, Clone, Copy)]
pub struct SumOfWordsPosition; pub struct SumOfWordsPosition;
impl<D> Criterion<D> for SumOfWordsPosition impl Criterion for SumOfWordsPosition {
where D: Deref<Target=DB> fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
{ let lhs = {
fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView<D>) -> Ordering { let query_index = lhs.query_index();
let lhs = sum_matches_attribute_index(&lhs.matches); let word_index = lhs.word_index();
let rhs = sum_matches_attribute_index(&rhs.matches); sum_matches_attribute_index(query_index, word_index)
};
let rhs = {
let query_index = rhs.query_index();
let word_index = rhs.word_index();
sum_matches_attribute_index(query_index, word_index)
};
lhs.cmp(&rhs) lhs.cmp(&rhs)
} }

View File

@ -1,16 +1,17 @@
use std::cmp::{self, Ordering}; use std::cmp::{self, Ordering};
use std::ops::Deref;
use rocksdb::DB;
use slice_group_by::GroupBy; use slice_group_by::GroupBy;
use crate::rank::{match_query_index, Document};
use crate::rank::criterion::Criterion; use crate::rank::criterion::Criterion;
use crate::database::DatabaseView; use crate::rank::RawDocument;
use crate::Match;
const MAX_DISTANCE: u32 = 8; const MAX_DISTANCE: u32 = 8;
/// Builds an owned pair out of a pair of references by cloning each side.
#[inline]
fn clone_tuple<T: Clone, U: Clone>(pair: (&T, &U)) -> (T, U) {
    let (left, right) = pair;
    (T::clone(left), U::clone(right))
}
fn index_proximity(lhs: u32, rhs: u32) -> u32 { fn index_proximity(lhs: u32, rhs: u32) -> u32 {
if lhs < rhs { if lhs < rhs {
cmp::min(rhs - lhs, MAX_DISTANCE) cmp::min(rhs - lhs, MAX_DISTANCE)
@ -19,30 +20,48 @@ fn index_proximity(lhs: u32, rhs: u32) -> u32 {
} }
} }
fn attribute_proximity(lhs: &Match, rhs: &Match) -> u32 { fn attribute_proximity((lattr, lwi): (u16, u32), (rattr, rwi): (u16, u32)) -> u32 {
if lhs.attribute.attribute() != rhs.attribute.attribute() { return MAX_DISTANCE } if lattr != rattr { return MAX_DISTANCE }
index_proximity(lhs.attribute.word_index(), rhs.attribute.word_index()) index_proximity(lwi, rwi)
} }
fn min_proximity(lhs: &[Match], rhs: &[Match]) -> u32 { fn min_proximity((lattr, lwi): (&[u16], &[u32]), (rattr, rwi): (&[u16], &[u32])) -> u32 {
let mut min_prox = u32::max_value(); let mut min_prox = u32::max_value();
for a in lhs { for a in lattr.iter().zip(lwi) {
for b in rhs { for b in rattr.iter().zip(rwi) {
let a = clone_tuple(a);
let b = clone_tuple(b);
min_prox = cmp::min(min_prox, attribute_proximity(a, b)); min_prox = cmp::min(min_prox, attribute_proximity(a, b));
} }
} }
min_prox min_prox
} }
fn matches_proximity(matches: &[Match]) -> u32 { fn matches_proximity(query_index: &[u32], attribute: &[u16], word_index: &[u32]) -> u32 {
let mut proximity = 0; let mut proximity = 0;
let mut iter = matches.linear_group_by(match_query_index);
// iterate over groups by windows of size 2 let mut index = 0;
let mut last = iter.next(); let mut iter = query_index.linear_group_by(PartialEq::eq);
let mut last = iter.next().map(|group| {
let len = group.len();
let rattr = &attribute[index..index + len];
let rwi = &word_index[index..index + len];
index += len;
(rattr, rwi)
});
while let (Some(lhs), Some(rhs)) = (last, iter.next()) { while let (Some(lhs), Some(rhs)) = (last, iter.next()) {
let len = rhs.len();
let rattr = &attribute[index..index + len];
let rwi = &word_index[index..index + len];
let rhs = (rattr, rwi);
proximity += min_proximity(lhs, rhs); proximity += min_proximity(lhs, rhs);
last = Some(rhs); last = Some(rhs);
index += len;
} }
proximity proximity
@ -51,24 +70,30 @@ fn matches_proximity(matches: &[Match]) -> u32 {
#[derive(Debug, Clone, Copy)] #[derive(Debug, Clone, Copy)]
pub struct WordsProximity; pub struct WordsProximity;
impl<D> Criterion<D> for WordsProximity impl Criterion for WordsProximity {
where D: Deref<Target=DB> fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
{ let lhs = {
fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView<D>) -> Ordering { let query_index = lhs.query_index();
let lhs = matches_proximity(&lhs.matches); let attribute = lhs.attribute();
let rhs = matches_proximity(&rhs.matches); let word_index = lhs.word_index();
matches_proximity(query_index, attribute, word_index)
};
let rhs = {
let query_index = rhs.query_index();
let attribute = rhs.attribute();
let word_index = rhs.word_index();
matches_proximity(query_index, attribute, word_index)
};
lhs.cmp(&rhs) lhs.cmp(&rhs)
} }
} }
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
use crate::Attribute;
#[test] #[test]
fn three_different_attributes() { fn three_different_attributes() {
@ -80,18 +105,14 @@ mod tests {
// { id: 2, attr: 2, attr_index: 0 } // { id: 2, attr: 2, attr_index: 0 }
// { id: 3, attr: 3, attr_index: 1 } // { id: 3, attr: 3, attr_index: 1 }
let matches = &[ let query_index = &[0, 1, 2, 2, 3];
Match { query_index: 0, attribute: Attribute::new_faillible(0, 0), ..Match::zero() }, let attribute = &[0, 1, 1, 2, 3];
Match { query_index: 1, attribute: Attribute::new_faillible(1, 0), ..Match::zero() }, let word_index = &[0, 0, 1, 0, 1];
Match { query_index: 2, attribute: Attribute::new_faillible(1, 1), ..Match::zero() },
Match { query_index: 2, attribute: Attribute::new_faillible(2, 0), ..Match::zero() },
Match { query_index: 3, attribute: Attribute::new_faillible(3, 1), ..Match::zero() },
];
// soup -> of = 8 // soup -> of = 8
// + of -> the = 1 // + of -> the = 1
// + the -> day = 8 (not 1) // + the -> day = 8 (not 1)
assert_eq!(matches_proximity(matches), 17); assert_eq!(matches_proximity(query_index, attribute, word_index), 17);
} }
#[test] #[test]
@ -106,57 +127,13 @@ mod tests {
// { id: 3, attr: 0, attr_index: 1 } // { id: 3, attr: 0, attr_index: 1 }
// { id: 3, attr: 1, attr_index: 3 } // { id: 3, attr: 1, attr_index: 3 }
let matches = &[ let query_index = &[0, 0, 1, 2, 3, 3];
Match { query_index: 0, attribute: Attribute::new_faillible(0, 0), ..Match::zero() }, let attribute = &[0, 1, 1, 1, 0, 1];
Match { query_index: 0, attribute: Attribute::new_faillible(1, 0), ..Match::zero() }, let word_index = &[0, 0, 1, 2, 1, 3];
Match { query_index: 1, attribute: Attribute::new_faillible(1, 1), ..Match::zero() },
Match { query_index: 2, attribute: Attribute::new_faillible(1, 2), ..Match::zero() },
Match { query_index: 3, attribute: Attribute::new_faillible(0, 1), ..Match::zero() },
Match { query_index: 3, attribute: Attribute::new_faillible(1, 3), ..Match::zero() },
];
// soup -> of = 1 // soup -> of = 1
// + of -> the = 1 // + of -> the = 1
// + the -> day = 1 // + the -> day = 1
assert_eq!(matches_proximity(matches), 3); assert_eq!(matches_proximity(query_index, attribute, word_index), 3);
}
}
// Benchmark module, compiled only for test builds with the `nightly` feature
// enabled (it relies on the `test` crate and `#[bench]`).
#[cfg(all(feature = "nightly", test))]
mod bench {
extern crate test;
use super::*;
use std::error::Error;
use self::test::Bencher;
use rand_xorshift::XorShiftRng;
use rand::{Rng, SeedableRng};
use crate::Attribute;
// Times `matches_proximity` over 30_000 generated matches.
#[bench]
fn evaluate_proximity(bench: &mut Bencher) -> Result<(), Box<Error>> {
let number_matches = 30_000;
let mut matches = Vec::with_capacity(number_matches);
// fixed seed: every run generates the same sequence of matches
let mut rng = XorShiftRng::seed_from_u64(42);
for _ in 0..number_matches {
let query_index = rng.gen_range(0, 4);
let attribute = rng.gen_range(0, 5);
let word_index = rng.gen_range(0, 15);
let attribute = Attribute::new_faillible(attribute, word_index);
let match_ = Match { query_index, attribute, ..Match::zero() };
matches.push(match_);
}
bench.iter(|| {
let proximity = matches_proximity(&matches);
// NOTE(review): what is handed to `black_box` is a closure capturing the
// result, not the result itself — confirm this is intended.
test::black_box(move || proximity)
});
Ok(())
} }
} }

View File

@ -2,32 +2,182 @@ pub mod criterion;
mod query_builder; mod query_builder;
mod distinct_map; mod distinct_map;
use std::sync::Arc;
use slice_group_by::GroupBy;
use rayon::slice::ParallelSliceMut;
use crate::{Match, DocumentId}; use crate::{Match, DocumentId};
pub use self::query_builder::{FilterFunc, QueryBuilder, DistinctQueryBuilder}; pub use self::query_builder::{FilterFunc, QueryBuilder, DistinctQueryBuilder};
#[inline] #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
fn match_query_index(a: &Match, b: &Match) -> bool {
a.query_index == b.query_index
}
#[derive(Debug, Clone)]
pub struct Document { pub struct Document {
pub id: DocumentId, pub id: DocumentId,
pub matches: Vec<Match>, pub matches: Vec<Match>,
} }
impl Document { impl Document {
pub fn new(doc: DocumentId, match_: Match) -> Self { pub fn from_raw(raw: &RawDocument) -> Document {
unsafe { Self::from_sorted_matches(doc, vec![match_]) } let len = raw.matches.range.len();
let mut matches = Vec::with_capacity(len);
let query_index = raw.query_index();
let distance = raw.distance();
let attribute = raw.attribute();
let word_index = raw.word_index();
let is_exact = raw.is_exact();
let char_index = raw.char_index();
let char_length = raw.char_length();
for i in 0..len {
let match_ = Match {
query_index: query_index[i],
distance: distance[i],
attribute: attribute[i],
word_index: word_index[i],
is_exact: is_exact[i],
char_index: char_index[i],
char_length: char_length[i],
};
matches.push(match_);
} }
pub fn from_matches(doc: DocumentId, mut matches: Vec<Match>) -> Self { Document { id: raw.id, matches }
matches.sort_unstable(); }
unsafe { Self::from_sorted_matches(doc, matches) }
} }
pub unsafe fn from_sorted_matches(id: DocumentId, matches: Vec<Match>) -> Self { #[derive(Clone)]
Self { id, matches } pub struct RawDocument {
pub id: DocumentId,
pub matches: SharedMatches,
}
impl RawDocument {
fn new(id: DocumentId, range: Range, matches: Arc<Matches>) -> RawDocument {
RawDocument { id, matches: SharedMatches { range, matches } }
}
pub fn query_index(&self) -> &[u32] {
let r = self.matches.range;
// it is safe because construction/modifications
// can only be done in this module
unsafe { &self.matches.matches.query_index.get_unchecked(r.start..r.end) }
}
pub fn distance(&self) -> &[u8] {
let r = self.matches.range;
// it is safe because construction/modifications
// can only be done in this module
unsafe { &self.matches.matches.distance.get_unchecked(r.start..r.end) }
}
pub fn attribute(&self) -> &[u16] {
let r = self.matches.range;
// it is safe because construction/modifications
// can only be done in this module
unsafe { &self.matches.matches.attribute.get_unchecked(r.start..r.end) }
}
pub fn word_index(&self) -> &[u32] {
let r = self.matches.range;
// it is safe because construction/modifications
// can only be done in this module
unsafe { &self.matches.matches.word_index.get_unchecked(r.start..r.end) }
}
pub fn is_exact(&self) -> &[bool] {
let r = self.matches.range;
// it is safe because construction/modifications
// can only be done in this module
unsafe { &self.matches.matches.is_exact.get_unchecked(r.start..r.end) }
}
pub fn char_index(&self) -> &[u32] {
let r = self.matches.range;
// it is safe because construction/modifications
// can only be done in this module
unsafe { &self.matches.matches.char_index.get_unchecked(r.start..r.end) }
}
pub fn char_length(&self) -> &[u16] {
let r = self.matches.range;
// it is safe because construction/modifications
// can only be done in this module
unsafe { &self.matches.matches.char_length.get_unchecked(r.start..r.end) }
}
}
/// Turns a flat list of `(document id, match)` pairs into one `RawDocument`
/// per document id.
///
/// The pairs are sorted, then every run of consecutive pairs sharing a
/// document id is appended to a single `Matches` value; each document keeps
/// the range its run occupies there, and all documents share that one value
/// through an `Arc`.
pub fn raw_documents_from_matches(mut matches: Vec<(DocumentId, Match)>) -> Vec<RawDocument> {
    let mut columns = Matches::with_capacity(matches.len());
    let mut docs_ranges: Vec<(DocumentId, Range)> = Vec::new();

    matches.par_sort_unstable();

    // where the next group starts inside the columns
    let mut start = 0;
    for group in matches.linear_group_by(|(a, _), (b, _)| a == b) {
        let id = group[0].0;
        let end = start + group.len();

        docs_ranges.push((id, Range { start, end }));
        columns.extend_from_slice(group);

        start = end;
    }

    let shared = Arc::new(columns);
    docs_ranges
        .into_iter()
        .map(|(id, range)| RawDocument::new(id, range, Arc::clone(&shared)))
        .collect()
}
/// A `start..end` window (end excluded) into the match columns.
#[derive(Debug, Copy, Clone)]
struct Range {
    start: usize,
    end: usize,
}

impl Range {
    /// Number of positions covered by the window.
    fn len(self) -> usize {
        let Range { start, end } = self;
        end - start
    }
}
/// A range paired with a reference-counted handle to the `Matches` it
/// points into.
///
/// Cloning copies the range and clones the `Arc`; the `Matches` value
/// itself is shared.
#[derive(Clone)]
pub struct SharedMatches {
range: Range,
matches: Arc<Matches>,
}
/// Matches stored field by field: one vector per `Match` field instead of
/// one vector of `Match` values.
///
/// `extend_from_slice` pushes onto all seven vectors for every match, so the
/// same index across the vectors describes a single match.
#[derive(Clone)]
struct Matches {
query_index: Vec<u32>,
distance: Vec<u8>,
attribute: Vec<u16>,
word_index: Vec<u32>,
is_exact: Vec<bool>,
char_index: Vec<u32>,
char_length: Vec<u16>,
}
impl Matches {
/// Creates empty columns, each with room for at least `cap` entries.
fn with_capacity(cap: usize) -> Matches {
    let query_index = Vec::with_capacity(cap);
    let distance = Vec::with_capacity(cap);
    let attribute = Vec::with_capacity(cap);
    let word_index = Vec::with_capacity(cap);
    let is_exact = Vec::with_capacity(cap);
    let char_index = Vec::with_capacity(cap);
    let char_length = Vec::with_capacity(cap);

    Matches {
        query_index,
        distance,
        attribute,
        word_index,
        is_exact,
        char_index,
        char_length,
    }
}
/// Appends every match of `matches` to the columns, one field per column,
/// in the order of the slice.
fn extend_from_slice(&mut self, matches: &[(DocumentId, Match)]) {
// the document id of each pair is not stored here
for (_, match_) in matches {
// all seven columns grow by one entry per match
self.query_index.push(match_.query_index);
self.distance.push(match_.distance);
self.attribute.push(match_.attribute);
self.word_index.push(match_.word_index);
self.is_exact.push(match_.is_exact);
self.char_index.push(match_.char_index);
self.char_length.push(match_.char_length);
}
} }
} }

View File

@ -4,7 +4,9 @@ use std::error::Error;
use std::hash::Hash; use std::hash::Hash;
use std::rc::Rc; use std::rc::Rc;
use rayon::slice::ParallelSliceMut;
use slice_group_by::GroupByMut; use slice_group_by::GroupByMut;
use elapsed::measure_time;
use hashbrown::HashMap; use hashbrown::HashMap;
use fst::Streamer; use fst::Streamer;
use rocksdb::DB; use rocksdb::DB;
@ -15,7 +17,7 @@ use crate::rank::distinct_map::{DistinctMap, BufferedDistinctMap};
use crate::rank::criterion::Criteria; use crate::rank::criterion::Criteria;
use crate::database::DatabaseView; use crate::database::DatabaseView;
use crate::{Match, DocumentId}; use crate::{Match, DocumentId};
use crate::rank::Document; use crate::rank::{raw_documents_from_matches, RawDocument, Document};
fn split_whitespace_automatons(query: &str) -> Vec<DfaExt> { fn split_whitespace_automatons(query: &str) -> Vec<DfaExt> {
let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace); let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
@ -41,7 +43,7 @@ pub struct QueryBuilder<'a, D, FI>
where D: Deref<Target=DB> where D: Deref<Target=DB>
{ {
view: &'a DatabaseView<D>, view: &'a DatabaseView<D>,
criteria: Criteria<D>, criteria: Criteria,
filter: Option<FI>, filter: Option<FI>,
} }
@ -56,7 +58,7 @@ where D: Deref<Target=DB>
impl<'a, D, FI> QueryBuilder<'a, D, FI> impl<'a, D, FI> QueryBuilder<'a, D, FI>
where D: Deref<Target=DB>, where D: Deref<Target=DB>,
{ {
pub fn with_criteria(view: &'a DatabaseView<D>, criteria: Criteria<D>) -> Result<Self, Box<Error>> { pub fn with_criteria(view: &'a DatabaseView<D>, criteria: Criteria) -> Result<Self, Box<Error>> {
Ok(QueryBuilder { view, criteria, filter: None }) Ok(QueryBuilder { view, criteria, filter: None })
} }
@ -81,7 +83,7 @@ where D: Deref<Target=DB>,
} }
} }
fn query_all(&self, query: &str) -> Vec<Document> { fn query_all(&self, query: &str) -> Vec<RawDocument> {
let automatons = split_whitespace_automatons(query); let automatons = split_whitespace_automatons(query);
let mut stream = { let mut stream = {
@ -93,8 +95,7 @@ where D: Deref<Target=DB>,
op_builder.union() op_builder.union()
}; };
let mut number_matches = 0; let mut matches = Vec::new();
let mut matches = HashMap::new();
while let Some((input, indexed_values)) = stream.next() { while let Some((input, indexed_values)) = stream.next() {
for iv in indexed_values { for iv in indexed_values {
@ -105,24 +106,28 @@ where D: Deref<Target=DB>,
let doc_indexes = &self.view.index().positive.indexes(); let doc_indexes = &self.view.index().positive.indexes();
let doc_indexes = &doc_indexes[iv.value as usize]; let doc_indexes = &doc_indexes[iv.value as usize];
number_matches += doc_indexes.len();
for doc_index in doc_indexes { for doc_index in doc_indexes {
let match_ = Match { let match_ = Match {
query_index: iv.index as u32, query_index: iv.index as u32,
distance: distance, distance: distance,
attribute: doc_index.attribute, attribute: doc_index.attribute,
word_index: doc_index.word_index,
is_exact: is_exact, is_exact: is_exact,
word_area: doc_index.word_area, char_index: doc_index.char_index,
char_length: doc_index.char_length,
}; };
matches.entry(doc_index.document_id).or_insert_with(Vec::new).push(match_); matches.push((doc_index.document_id, match_));
} }
} }
} }
info!("{} total documents to classify", matches.len()); let total_matches = matches.len();
info!("{} total matches to classify", number_matches); let raw_documents = raw_documents_from_matches(matches);
matches.into_iter().map(|(i, m)| Document::from_matches(i, m)).collect() info!("{} total documents to classify", raw_documents.len());
info!("{} total matches to classify", total_matches);
raw_documents
} }
} }
@ -138,11 +143,10 @@ where D: Deref<Target=DB>,
return builder.query(query, range); return builder.query(query, range);
} }
let (elapsed, mut documents) = elapsed::measure_time(|| self.query_all(query)); let (elapsed, mut documents) = measure_time(|| self.query_all(query));
info!("query_all took {}", elapsed); info!("query_all took {}", elapsed);
let mut groups = vec![documents.as_mut_slice()]; let mut groups = vec![documents.as_mut_slice()];
let view = &self.view;
'criteria: for (ci, criterion) in self.criteria.as_ref().iter().enumerate() { 'criteria: for (ci, criterion) in self.criteria.as_ref().iter().enumerate() {
let tmp_groups = mem::replace(&mut groups, Vec::new()); let tmp_groups = mem::replace(&mut groups, Vec::new());
@ -159,12 +163,12 @@ where D: Deref<Target=DB>,
continue; continue;
} }
let (elapsed, ()) = elapsed::measure_time(|| { let (elapsed, _) = measure_time(|| {
group.sort_unstable_by(|a, b| criterion.evaluate(a, b, view)); group.par_sort_unstable_by(|a, b| criterion.evaluate(a, b));
}); });
info!("criterion {} sort took {}", ci, elapsed); info!("criterion {} sort took {}", ci, elapsed);
for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b, view)) { for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b)) {
documents_seen += group.len(); documents_seen += group.len();
groups.push(group); groups.push(group);
@ -175,12 +179,9 @@ where D: Deref<Target=DB>,
} }
} }
// `drain` removes the documents efficiently using `ptr::copy`
// TODO it could be more efficient to have a custom iterator
let offset = cmp::min(documents.len(), range.start); let offset = cmp::min(documents.len(), range.start);
documents.drain(0..offset); let iter = documents.into_iter().skip(offset).take(range.len());
documents.truncate(range.len()); iter.map(|d| Document::from_raw(&d)).collect()
documents
} }
} }
@ -213,7 +214,9 @@ where D: Deref<Target=DB>,
K: Hash + Eq, K: Hash + Eq,
{ {
pub fn query(self, query: &str, range: Range<usize>) -> Vec<Document> { pub fn query(self, query: &str, range: Range<usize>) -> Vec<Document> {
let mut documents = self.inner.query_all(query); let (elapsed, mut documents) = measure_time(|| self.inner.query_all(query));
info!("query_all took {}", elapsed);
let mut groups = vec![documents.as_mut_slice()]; let mut groups = vec![documents.as_mut_slice()];
let mut key_cache = HashMap::new(); let mut key_cache = HashMap::new();
let view = &self.inner.view; let view = &self.inner.view;
@ -225,12 +228,14 @@ where D: Deref<Target=DB>,
let mut distinct_map = DistinctMap::new(self.size); let mut distinct_map = DistinctMap::new(self.size);
let mut distinct_raw_offset = 0; let mut distinct_raw_offset = 0;
'criteria: for criterion in self.inner.criteria.as_ref() { 'criteria: for (ci, criterion) in self.inner.criteria.as_ref().iter().enumerate() {
let tmp_groups = mem::replace(&mut groups, Vec::new()); let tmp_groups = mem::replace(&mut groups, Vec::new());
let mut buf_distinct = BufferedDistinctMap::new(&mut distinct_map); let mut buf_distinct = BufferedDistinctMap::new(&mut distinct_map);
let mut documents_seen = 0; let mut documents_seen = 0;
for group in tmp_groups { for group in tmp_groups {
info!("criterion {}, documents group of size {}", ci, group.len());
// if this group does not overlap with the requested range, // if this group does not overlap with the requested range,
// push it without sorting and splitting it // push it without sorting and splitting it
if documents_seen + group.len() < distinct_raw_offset { if documents_seen + group.len() < distinct_raw_offset {
@ -239,9 +244,12 @@ where D: Deref<Target=DB>,
continue; continue;
} }
group.sort_unstable_by(|a, b| criterion.evaluate(a, b, view)); let (elapsed, _) = measure_time(|| {
group.par_sort_unstable_by(|a, b| criterion.evaluate(a, b));
});
info!("criterion {} sort took {}", ci, elapsed);
for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b, view)) { for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b)) {
// we must compute the real distinguished len of this sub-group // we must compute the real distinguished len of this sub-group
for document in group.iter() { for document in group.iter() {
let filter_accepted = match &self.inner.filter { let filter_accepted = match &self.inner.filter {
@ -300,7 +308,7 @@ where D: Deref<Target=DB>,
}; };
if distinct_accepted && seen.len() > range.start { if distinct_accepted && seen.len() > range.start {
out_documents.push(document); out_documents.push(Document::from_raw(&document));
if out_documents.len() == range.len() { break } if out_documents.len() == range.len() { break }
} }
} }

View File

@ -1,102 +0,0 @@
use std::fmt;
/// Represents the position of a word, expressed in characters, along with
/// the length of that word.
///
/// Both values are packed into a single `u32`: the char index is stored in
/// the upper 22 bits and the length in the lower 10 bits. The char index
/// must therefore be lower than 2^22 and the length lower than 1024.
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct WordArea(u32);
impl WordArea {
    /// Number of low bits of the packed value that hold the length.
    const LENGTH_BITS: u32 = 10;

    /// Number of high bits of the packed value that hold the char index.
    const CHAR_INDEX_BITS: u32 = 32 - Self::LENGTH_BITS;

    /// Mask selecting the length bits of the packed value.
    const LENGTH_MASK: u32 = (1 << Self::LENGTH_BITS) - 1;

    /// Constructs a `WordArea` from a word position expressed as
    /// a number of characters and the length of the word.
    ///
    /// # Errors
    ///
    /// Returns `WordAreaError::ByteIndexTooBig` when the char index does not
    /// fit in 22 bits (it must be lower than 2^22) and
    /// `WordAreaError::LengthTooBig` when the length does not fit in 10 bits
    /// (it must be lower than 1024).
    pub(crate) fn new(char_index: u32, length: u16) -> Result<WordArea, WordAreaError> {
        // Any bit above the 22 reserved ones would be lost by the shift below.
        if char_index >> Self::CHAR_INDEX_BITS != 0 {
            return Err(WordAreaError::ByteIndexTooBig)
        }

        let length = u32::from(length);
        if length >> Self::LENGTH_BITS != 0 {
            return Err(WordAreaError::LengthTooBig)
        }

        Ok(WordArea((char_index << Self::LENGTH_BITS) | length))
    }

    /// Same as [`WordArea::new`] but panics instead of returning an error.
    ///
    /// # Panics
    ///
    /// Panics when the char index is not lower than 2^22
    /// or when the length is not lower than 1024.
    pub(crate) fn new_faillible(char_index: u32, length: u16) -> WordArea {
        match WordArea::new(char_index, length) {
            Ok(word_area) => word_area,
            Err(WordAreaError::ByteIndexTooBig) => {
                panic!("word area char index must be lower than 2^22")
            },
            Err(WordAreaError::LengthTooBig) => {
                panic!("word area length must be lower than 1024")
            },
        }
    }

    /// The area with every bit set: the largest char index with
    /// the largest length.
    pub(crate) fn max_value() -> WordArea {
        WordArea(u32::max_value())
    }

    /// Position of the word, in characters.
    #[inline]
    pub fn char_index(self) -> u32 {
        self.0 >> Self::LENGTH_BITS
    }

    /// Length of the word.
    #[inline]
    pub fn length(self) -> u16 {
        // the mask keeps 10 bits only, the value always fits in a `u16`
        (self.0 & Self::LENGTH_MASK) as u16
    }
}
/// Shows the two unpacked values rather than the raw packed integer.
impl fmt::Debug for WordArea {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let mut debug = f.debug_struct("WordArea");
        debug.field("char_index", &self.char_index());
        debug.field("length", &self.length());
        debug.finish()
    }
}
/// Reason why a char index and a length cannot be packed into a `WordArea`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum WordAreaError {
    /// The char index needs more than the 22 bits reserved for it.
    ByteIndexTooBig,
    /// The length needs more than the 10 bits reserved for it.
    LengthTooBig,
}

impl fmt::Display for WordAreaError {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let message = match self {
            WordAreaError::ByteIndexTooBig => "word area char index does not fit in 22 bits",
            WordAreaError::LengthTooBig => "word area length does not fit in 10 bits",
        };
        f.write_str(message)
    }
}

impl std::error::Error for WordAreaError {}
#[cfg(test)]
mod tests {
    use super::*;
    use quickcheck::{quickcheck, TestResult};

    quickcheck! {
        /// Packing then unpacking gives back the original values.
        fn qc_word_area(gen_char_index: u32, gen_length: u16) -> TestResult {
            // 2^22 and 2^10 themselves do not fit in 22 and 10 bits
            if gen_char_index >= 2_u32.pow(22) || gen_length >= 2_u16.pow(10) {
                return TestResult::discard()
            }

            let word_area = WordArea::new_faillible(gen_char_index, gen_length);

            let valid_char_index = word_area.char_index() == gen_char_index;
            let valid_length = word_area.length() == gen_length;

            TestResult::from_bool(valid_char_index && valid_length)
        }

        /// An area built from strictly bigger values compares as greater.
        fn qc_word_area_ord(gen_char_index: u32, gen_length: u16) -> TestResult {
            // `b` is built from both values plus one, which must still fit
            if gen_char_index >= 2_u32.pow(22) - 1 || gen_length >= 2_u16.pow(10) - 1 {
                return TestResult::discard()
            }

            let a = WordArea::new_faillible(gen_char_index, gen_length);
            let b = WordArea::new_faillible(gen_char_index + 1, gen_length + 1);

            TestResult::from_bool(a < b)
        }
    }
}