From 34b43d40028832ba05fe75df596bc62edc48b5dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sun, 28 Oct 2018 14:24:04 +0100 Subject: [PATCH] test: Add some more tests --- examples/serve-console.rs | 2 +- examples/serve-http.rs | 2 +- src/blob/merge.rs | 509 ++++++++++++++++++++++++++++++++++ src/blob/mod.rs | 56 ++++ src/blob/negative_blob.rs | 87 ++++++ src/blob/ops.rs | 323 +++++++++++++++++++++ src/blob/ops_indexed_value.rs | 203 ++++++++++++++ src/blob/positive_blob.rs | 87 ++++++ src/doc_indexes.rs | 200 +++++++++++++ src/index.rs | 24 ++ src/lib.rs | 5 + src/metadata/difference.rs | 16 +- src/metadata/mod.rs | 6 +- src/metadata/ops.rs | 22 +- src/pentium.rs | 28 ++ src/rank/ranked_stream.rs | 16 +- src/vec_read_only.rs | 9 +- 17 files changed, 1561 insertions(+), 34 deletions(-) create mode 100644 src/blob/merge.rs create mode 100644 src/blob/mod.rs create mode 100644 src/blob/negative_blob.rs create mode 100644 src/blob/ops.rs create mode 100644 src/blob/ops_indexed_value.rs create mode 100644 src/blob/positive_blob.rs create mode 100644 src/doc_indexes.rs create mode 100644 src/index.rs create mode 100644 src/pentium.rs diff --git a/examples/serve-console.rs b/examples/serve-console.rs index 5196bc8b0..1836442ec 100644 --- a/examples/serve-console.rs +++ b/examples/serve-console.rs @@ -77,7 +77,7 @@ fn search(metadata: &Metadata, database: &DB, query: &str) { // "Sony" "PlayStation 4 500GB" let config = Config { - metadata: metadata, + index: unimplemented!(), automatons: automatons, criteria: criterion::default(), distinct: (distinct_by_title_first_four_chars, 1), diff --git a/examples/serve-http.rs b/examples/serve-http.rs index 3ef5244da..53f5dd456 100644 --- a/examples/serve-http.rs +++ b/examples/serve-http.rs @@ -89,7 +89,7 @@ where M: AsRef, } let config = Config { - metadata: metadata.as_ref(), + index: unimplemented!(), automatons: automatons, criteria: criterion::default(), distinct: ((), 1), diff --git 
a/src/blob/merge.rs b/src/blob/merge.rs new file mode 100644 index 000000000..5a9c88d6f --- /dev/null +++ b/src/blob/merge.rs @@ -0,0 +1,509 @@ +use crate::vec_read_only::VecReadOnly; +use std::collections::BinaryHeap; +use std::{mem, cmp}; +use std::rc::Rc; + +use fst::{Automaton, Streamer}; +use fst::automaton::AlwaysMatch; +use sdset::{Set, SetBuf, SetOperation}; +use sdset::duo::OpBuilder as SdOpBuilder; +use group_by::GroupBy; + +use crate::blob::{Blob, Sign}; +use crate::blob::ops::{OpBuilder, Union, IndexedDocIndexes}; +use crate::DocIndex; + +fn group_is_negative(blobs: &&[Blob]) -> bool { + blobs[0].sign() == Sign::Negative +} + +fn blob_same_sign(a: &Blob, b: &Blob) -> bool { + a.sign() == b.sign() +} + +fn sign_from_group_index(group: usize) -> Sign { + if group % 2 == 0 { + Sign::Positive + } else { + Sign::Negative + } +} + +pub struct Merge<'b> { + heap: GroupHeap<'b>, + outs: Vec, + cur_slot: Option, +} + +impl<'b> Merge<'b> { + pub fn always_match(blobs: &'b [Blob]) -> Self { + Self::with_automatons(vec![AlwaysMatch], blobs) + } +} + +impl<'b> Merge<'b> { + pub fn with_automatons(automatons: Vec, blobs: &'b [Blob]) -> Self + where A: 'b + Automaton + Clone + { + let mut groups = Vec::new(); + // We can skip blobs that are negative: they didn't remove anything at the start + for blobs in GroupBy::new(blobs, blob_same_sign).skip_while(group_is_negative) { + let mut builder = OpBuilder::with_automatons(automatons.clone()); + for blob in blobs { + builder.push(blob); + } + groups.push(builder.union()); + } + + let mut heap = GroupHeap::new(groups); + heap.refill(); + + Merge { + heap: heap, + outs: Vec::new(), + cur_slot: None, + } + } +} + +impl<'b, 'a> Streamer<'a> for Merge<'b> { + type Item = (&'a [u8], &'a [IndexedDocIndexes]); + + fn next(&'a mut self) -> Option { + self.outs.clear(); + loop { + if let Some(slot) = self.cur_slot.take() { + self.heap.refill(); + } + let slot = match self.heap.pop() { + None => return None, + Some(slot) => { + 
self.cur_slot = Some(slot); + self.cur_slot.as_ref().unwrap() + } + }; + + let mut doc_indexes = Vec::new(); + let mut doc_indexes_slots = Vec::with_capacity(self.heap.num_groups()); + + let len = match sign_from_group_index(slot.grp_index) { + Sign::Positive => { + doc_indexes.extend_from_slice(&slot.output); + slot.output.len() + }, + Sign::Negative => 0, + }; + + let mut slotidi = SlotIndexedDocIndexes { + index: slot.aut_index, + start: 0, + len: len, + }; + + let mut buffer = Vec::new(); + while let Some(slot2) = self.heap.pop_if_equal(slot.input()) { + if slotidi.index == slot2.aut_index { + buffer.clear(); + buffer.extend(doc_indexes.drain(slotidi.start..)); + + let a = Set::new_unchecked(&buffer); + let b = Set::new_unchecked(&slot2.output); + match sign_from_group_index(slot2.grp_index) { + Sign::Positive => { SdOpBuilder::new(a, b).union().extend_vec(&mut doc_indexes) }, + Sign::Negative => SdOpBuilder::new(a, b).difference().extend_vec(&mut doc_indexes), + } + slotidi.len = doc_indexes.len() - slotidi.start; + + } else { + if slotidi.len != 0 { + doc_indexes_slots.push(slotidi); + } + slotidi = SlotIndexedDocIndexes { + index: slot2.aut_index, + start: doc_indexes.len(), + len: slot2.output.len(), + }; + buffer.extend_from_slice(&slot2.output); + } + } + + if slotidi.len != 0 { + doc_indexes_slots.push(slotidi); + } + + let read_only = VecReadOnly::new(doc_indexes); + self.outs.reserve(doc_indexes_slots.len()); + for slot in doc_indexes_slots { + let indexes = IndexedDocIndexes { + index: slot.index, + doc_indexes: read_only.range(slot.start, slot.len), + }; + self.outs.push(indexes); + } + + if !self.outs.is_empty() { + let slot = self.cur_slot.as_ref().unwrap(); // FIXME + return Some((slot.input(), &self.outs)) + } + } + } +} + +struct SlotIndexedDocIndexes { + index: usize, + start: usize, + len: usize, +} + +#[derive(Debug, Eq, PartialEq)] +struct Slot { + grp_index: usize, + aut_index: usize, + input: Rc>, + output: VecReadOnly, +} + +impl Slot { + 
fn input(&self) -> &[u8] { + &self.input + } +} + +impl PartialOrd for Slot { + fn partial_cmp(&self, other: &Slot) -> Option { + (&self.input, self.aut_index, self.grp_index, &self.output) + .partial_cmp(&(&other.input, other.aut_index, other.grp_index, &other.output)) + .map(|ord| ord.reverse()) + } +} + +impl Ord for Slot { + fn cmp(&self, other: &Slot) -> cmp::Ordering { + self.partial_cmp(other).unwrap() + } +} + +struct GroupHeap<'b> { + groups: Vec>, + heap: BinaryHeap, +} + +impl<'b> GroupHeap<'b> { + fn new(groups: Vec>) -> GroupHeap<'b> { + GroupHeap { + groups: groups, + heap: BinaryHeap::new(), + } + } + + fn num_groups(&self) -> usize { + self.groups.len() + } + + fn pop(&mut self) -> Option { + self.heap.pop() + } + + fn peek_is_duplicate(&self, key: &[u8]) -> bool { + self.heap.peek().map(|s| *s.input == key).unwrap_or(false) + } + + fn pop_if_equal(&mut self, key: &[u8]) -> Option { + if self.peek_is_duplicate(key) { self.pop() } else { None } + } + + fn refill(&mut self) { + for (i, group) in self.groups.iter_mut().enumerate() { + if let Some((input, doc_indexes)) = group.next() { + let input = Rc::new(input.to_vec()); + for doc_index in doc_indexes { + let slot = Slot { + input: input.clone(), + grp_index: i, + aut_index: doc_index.index, + output: doc_index.doc_indexes.clone(), + }; + self.heap.push(slot); + } + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::blob::{PositiveBlobBuilder, NegativeBlobBuilder}; + use crate::DocIndex; + + fn get_all<'m, I, S>(stream: I) -> Vec<(String, VecReadOnly)> + where + I: for<'a> fst::IntoStreamer<'a, Into=S, Item=(&'a [u8], &'a [IndexedDocIndexes])>, + S: 'm + for<'a> fst::Streamer<'a, Item=(&'a [u8], &'a [IndexedDocIndexes])>, + { + let mut result = Vec::new(); + + let mut stream = stream.into_stream(); + while let Some((string, indexes)) = stream.next() { + let string = String::from_utf8(string.to_owned()).unwrap(); + result.push((string, indexes[0].doc_indexes.clone())) + } + + 
result + } + + #[test] + fn single_positive_blob() { + let doc1 = DocIndex{ document_id: 0, attribute: 0, attribute_index: 0 }; + let doc2 = DocIndex{ document_id: 12, attribute: 0, attribute_index: 2 }; + let doc3 = DocIndex{ document_id: 0, attribute: 0, attribute_index: 1 }; + let doc4 = DocIndex{ document_id: 0, attribute: 0, attribute_index: 2 }; + + let a = { + let mut builder = PositiveBlobBuilder::new(Vec::new(), Vec::new()); + + builder.insert("hell", doc1); + builder.insert("hell", doc2); + builder.insert("hello", doc3); + builder.insert("wor", doc4); + + Blob::Positive(builder.build().unwrap()) + }; + + let blobs = &[a]; + let merge = Merge::always_match(blobs); + + let value = get_all(merge); + assert_eq!(value.len(), 3); + + assert_eq!(value[0].0, "hell"); + assert_eq!(&*value[0].1, &[doc1, doc2][..]); + + assert_eq!(value[1].0, "hello"); + assert_eq!(&*value[1].1, &[doc3][..]); + + assert_eq!(value[2].0, "wor"); + assert_eq!(&*value[2].1, &[doc4][..]); + } + + #[test] + fn single_negative_blob() { + let doc1 = DocIndex{ document_id: 0, attribute: 0, attribute_index: 0 }; + let doc2 = DocIndex{ document_id: 12, attribute: 0, attribute_index: 2 }; + let doc3 = DocIndex{ document_id: 0, attribute: 0, attribute_index: 1 }; + let doc4 = DocIndex{ document_id: 0, attribute: 0, attribute_index: 2 }; + + let a = { + let mut builder = NegativeBlobBuilder::new(Vec::new(), Vec::new()); + + builder.insert("hell", doc1); + builder.insert("hell", doc2); + builder.insert("hello", doc3); + builder.insert("wor", doc4); + + Blob::Negative(builder.build().unwrap()) + }; + + let blobs = &[a]; + let merge = Merge::always_match(blobs); + + let value = get_all(merge); + assert_eq!(value.len(), 0); + } + + #[test] + fn two_positive_blobs() { + let doc1 = DocIndex{ document_id: 0, attribute: 0, attribute_index: 0 }; + let doc2 = DocIndex{ document_id: 12, attribute: 0, attribute_index: 2 }; + let doc3 = DocIndex{ document_id: 0, attribute: 0, attribute_index: 1 }; + let doc4 
= DocIndex{ document_id: 0, attribute: 0, attribute_index: 2 }; + + let a = { + let mut builder = PositiveBlobBuilder::new(Vec::new(), Vec::new()); + + builder.insert("hell", doc1); + builder.insert("wor", doc4); + + Blob::Positive(builder.build().unwrap()) + }; + + let b = { + let mut builder = PositiveBlobBuilder::new(Vec::new(), Vec::new()); + + builder.insert("hell", doc2); + builder.insert("hello", doc3); + + Blob::Positive(builder.build().unwrap()) + }; + + let blobs = &[a, b]; + let merge = Merge::always_match(blobs); + + let value = get_all(merge); + assert_eq!(value.len(), 3); + + assert_eq!(value[0].0, "hell"); + assert_eq!(&*value[0].1, &[doc1, doc2][..]); + + assert_eq!(value[1].0, "hello"); + assert_eq!(&*value[1].1, &[doc3][..]); + + assert_eq!(value[2].0, "wor"); + assert_eq!(&*value[2].1, &[doc4][..]); + } + + #[test] + fn one_positive_one_negative_blobs() { + let doc1 = DocIndex{ document_id: 0, attribute: 0, attribute_index: 0 }; + let doc2 = DocIndex{ document_id: 12, attribute: 0, attribute_index: 2 }; + let doc3 = DocIndex{ document_id: 0, attribute: 0, attribute_index: 1 }; + let doc4 = DocIndex{ document_id: 0, attribute: 0, attribute_index: 2 }; + + let a = { + let mut builder = PositiveBlobBuilder::new(Vec::new(), Vec::new()); + + builder.insert("hell", doc1); + builder.insert("hell", doc2); + builder.insert("hello", doc3); + builder.insert("wor", doc4); + + Blob::Positive(builder.build().unwrap()) + }; + + let b = { + let mut builder = NegativeBlobBuilder::new(Vec::new(), Vec::new()); + + builder.insert("hell", doc2); + builder.insert("hello", doc3); + + Blob::Negative(builder.build().unwrap()) + }; + + let blobs = &[a, b]; + let merge = Merge::always_match(blobs); + + let value = get_all(merge); + assert_eq!(value.len(), 2); + + assert_eq!(value[0].0, "hell"); + assert_eq!(&*value[0].1, &[doc1][..]); + + assert_eq!(value[1].0, "wor"); + assert_eq!(&*value[1].1, &[doc4][..]); + } + + #[test] + fn alternate_positive_negative_blobs() { + let 
doc1 = DocIndex{ document_id: 0, attribute: 0, attribute_index: 0 }; + let doc2 = DocIndex{ document_id: 12, attribute: 0, attribute_index: 2 }; + let doc3 = DocIndex{ document_id: 0, attribute: 0, attribute_index: 1 }; + let doc4 = DocIndex{ document_id: 0, attribute: 0, attribute_index: 2 }; + + let a = { + let mut builder = PositiveBlobBuilder::new(Vec::new(), Vec::new()); + + builder.insert("hell", doc1); + builder.insert("hell", doc2); + builder.insert("hello", doc3); + + Blob::Positive(builder.build().unwrap()) + }; + + let b = { + let mut builder = NegativeBlobBuilder::new(Vec::new(), Vec::new()); + + builder.insert("hell", doc1); + builder.insert("wor", doc4); + + Blob::Negative(builder.build().unwrap()) + }; + + let c = { + let mut builder = PositiveBlobBuilder::new(Vec::new(), Vec::new()); + + builder.insert("hell", doc1); + builder.insert("wor", doc4); + + Blob::Positive(builder.build().unwrap()) + }; + + let d = { + let mut builder = NegativeBlobBuilder::new(Vec::new(), Vec::new()); + + builder.insert("hell", doc1); + + Blob::Negative(builder.build().unwrap()) + }; + + let blobs = &[a, b, c, d]; + let merge = Merge::always_match(blobs); + + let value = get_all(merge); + assert_eq!(value.len(), 3); + + assert_eq!(value[0].0, "hell"); + assert_eq!(&*value[0].1, &[doc2][..]); + + assert_eq!(value[1].0, "hello"); + assert_eq!(&*value[1].1, &[doc3][..]); + + assert_eq!(value[2].0, "wor"); + assert_eq!(&*value[2].1, &[doc4][..]); + } + + #[test] + fn alternate_multiple_positive_negative_blobs() { + let doc1 = DocIndex{ document_id: 0, attribute: 0, attribute_index: 0 }; + let doc2 = DocIndex{ document_id: 12, attribute: 0, attribute_index: 2 }; + let doc3 = DocIndex{ document_id: 0, attribute: 0, attribute_index: 1 }; + let doc4 = DocIndex{ document_id: 0, attribute: 0, attribute_index: 2 }; + + let a = { + let mut builder = PositiveBlobBuilder::new(Vec::new(), Vec::new()); + + builder.insert("hell", doc1); + builder.insert("hell", doc2); + 
builder.insert("hello", doc3); + + Blob::Positive(builder.build().unwrap()) + }; + + let b = { + let mut builder = PositiveBlobBuilder::new(Vec::new(), Vec::new()); + + builder.insert("hell", doc1); + builder.insert("wor", doc4); + + Blob::Positive(builder.build().unwrap()) + }; + + let c = { + let mut builder = NegativeBlobBuilder::new(Vec::new(), Vec::new()); + + builder.insert("hell", doc1); + builder.insert("wor", doc4); + + Blob::Negative(builder.build().unwrap()) + }; + + let d = { + let mut builder = NegativeBlobBuilder::new(Vec::new(), Vec::new()); + + builder.insert("hell", doc1); + + Blob::Negative(builder.build().unwrap()) + }; + + let blobs = &[a, b, c, d]; + let merge = Merge::always_match(blobs); + + let value = get_all(merge); + assert_eq!(value.len(), 2); + + assert_eq!(value[0].0, "hell"); + assert_eq!(&*value[0].1, &[doc2][..]); + + assert_eq!(value[1].0, "hello"); + assert_eq!(&*value[1].1, &[doc3][..]); + } +} diff --git a/src/blob/mod.rs b/src/blob/mod.rs new file mode 100644 index 000000000..00a36281f --- /dev/null +++ b/src/blob/mod.rs @@ -0,0 +1,56 @@ +mod merge; +mod ops; +mod ops_indexed_value; +mod positive_blob; +mod negative_blob; + +pub use self::merge::Merge; +pub use self::positive_blob::{PositiveBlob, PositiveBlobBuilder}; +pub use self::negative_blob::{NegativeBlob, NegativeBlobBuilder}; + +use fst::Map; + +use crate::doc_indexes::DocIndexes; + +pub enum Blob { + Positive(PositiveBlob), + Negative(NegativeBlob), +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum Sign { + Positive, + Negative, +} + +impl Sign { + pub fn alternate(self) -> Sign { + match self { + Sign::Positive => Sign::Negative, + Sign::Negative => Sign::Positive, + } + } +} + +impl Blob { + pub fn sign(&self) -> Sign { + match self { + Blob::Positive(_) => Sign::Positive, + Blob::Negative(_) => Sign::Negative, + } + } + + pub fn as_map(&self) -> &Map { + match self { + Blob::Positive(blob) => blob.as_map(), + Blob::Negative(blob) => blob.as_map(), + 
} + } + + pub fn as_indexes(&self) -> &DocIndexes { + match self { + Blob::Positive(blob) => blob.as_indexes(), + Blob::Negative(blob) => blob.as_indexes(), + } + } +} diff --git a/src/blob/negative_blob.rs b/src/blob/negative_blob.rs new file mode 100644 index 000000000..2ae411984 --- /dev/null +++ b/src/blob/negative_blob.rs @@ -0,0 +1,87 @@ +use std::error::Error; +use std::path::Path; +use std::io::Write; + +use fst::{Map, MapBuilder}; + +use crate::DocIndex; +use crate::doc_indexes::{DocIndexes, DocIndexesBuilder}; + +pub struct NegativeBlob { + map: Map, + indexes: DocIndexes, +} + +impl NegativeBlob { + pub unsafe fn from_paths(map: P, indexes: Q) -> Result> + where P: AsRef, + Q: AsRef, + { + let map = Map::from_path(map)?; + let indexes = DocIndexes::from_path(indexes)?; + Ok(NegativeBlob { map, indexes }) + } + + pub fn from_bytes(map: Vec, indexes: Vec) -> Result> { + let map = Map::from_bytes(map)?; + let indexes = DocIndexes::from_bytes(indexes)?; + Ok(NegativeBlob { map, indexes }) + } + + pub fn get>(&self, key: K) -> Option<&[DocIndex]> { + self.map.get(key).and_then(|index| self.indexes.get(index)) + } + + pub fn as_map(&self) -> &Map { + &self.map + } + + pub fn as_indexes(&self) -> &DocIndexes { + &self.indexes + } + + pub fn explode(self) -> (Map, DocIndexes) { + (self.map, self.indexes) + } +} + +pub struct NegativeBlobBuilder { + map: W, + indexes: DocIndexesBuilder, +} + +impl NegativeBlobBuilder { + pub fn new(map: W, indexes: X) -> Self { + Self { map, indexes: DocIndexesBuilder::new(indexes) } + } + + pub fn insert>(&mut self, key: S, index: DocIndex) { + self.indexes.insert(key.into(), index) + } + + pub fn finish(self) -> Result<(), Box> { + self.into_inner().map(|_| ()) + } + + pub fn into_inner(self) -> Result<(W, X), Box> { + // FIXME insert a magic number that indicates if the endianess + // of the input is the same as the machine that is reading it. 
+ + let map = { + let mut keys_builder = MapBuilder::new(self.map)?; + let keys = self.indexes.keys().map(|(s, v)| (s, *v)); + keys_builder.extend_iter(keys)?; + keys_builder.into_inner()? + }; + + let indexes = self.indexes.into_inner()?; + + Ok((map, indexes)) + } +} + +impl NegativeBlobBuilder, Vec> { + pub fn build(self) -> Result> { + self.into_inner().and_then(|(m, i)| NegativeBlob::from_bytes(m, i)) + } +} diff --git a/src/blob/ops.rs b/src/blob/ops.rs new file mode 100644 index 000000000..dbd143076 --- /dev/null +++ b/src/blob/ops.rs @@ -0,0 +1,323 @@ +use std::collections::BTreeMap; + +use fst::{map, Streamer, Automaton}; +use fst::automaton::AlwaysMatch; +use sdset::multi::OpBuilder as SdOpBuilder; +use sdset::{SetOperation, Set}; + +use crate::blob::ops_indexed_value::{ + OpIndexedValueBuilder, UnionIndexedValue, +}; +use crate::blob::Blob; +use crate::doc_indexes::DocIndexes; +use crate::vec_read_only::VecReadOnly; +use crate::DocIndex; + +pub struct OpBuilder<'m, A: Automaton> { + // the operation on the maps is always an union. + maps: OpIndexedValueBuilder<'m>, + automatons: Vec, + indexes: Vec<&'m DocIndexes>, +} + +impl<'m> OpBuilder<'m, AlwaysMatch> { + pub fn new() -> Self { + Self { + maps: OpIndexedValueBuilder::new(), + automatons: vec![AlwaysMatch], + indexes: Vec::new(), + } + } +} + +/// Do a set operation on multiple maps with the same automatons. 
+impl<'m, A: 'm + Automaton> OpBuilder<'m, A> { + pub fn with_automatons(automatons: Vec) -> Self { + Self { + maps: OpIndexedValueBuilder::new(), + automatons: automatons, + indexes: Vec::new(), + } + } + + pub fn add(mut self, blob: &'m Blob) -> Self where A: Clone { + self.push(blob); + self + } + + pub fn push(&mut self, blob: &'m Blob) where A: Clone { + let mut op = map::OpBuilder::new(); + for automaton in self.automatons.iter().cloned() { + let stream = blob.as_map().search(automaton); + op.push(stream); + } + + let stream = op.union(); + let indexes = blob.as_indexes(); + + self.maps.push(stream); + self.indexes.push(indexes); + } + + pub fn union(self) -> Union<'m> { + Union::new(self.maps, self.indexes, self.automatons.len()) + } + + pub fn intersection(self) -> Intersection<'m> { + Intersection::new(self.maps, self.indexes, self.automatons.len()) + } + + pub fn difference(self) -> Difference<'m> { + Difference::new(self.maps, self.indexes, self.automatons.len()) + } + + pub fn symmetric_difference(self) -> SymmetricDifference<'m> { + SymmetricDifference::new(self.maps, self.indexes, self.automatons.len()) + } +} + +#[derive(Debug, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)] +pub struct IndexedDocIndexes { + pub index: usize, + pub doc_indexes: VecReadOnly, +} + +struct SlotIndexedDocIndexes { + index: usize, + start: usize, + len: usize, +} + +macro_rules! 
logical_operation { + (struct $name:ident, $operation:ident) => { + +pub struct $name<'m> { + maps: UnionIndexedValue<'m>, + indexes: Vec<&'m DocIndexes>, + number_automatons: usize, + outs: Vec, +} + +impl<'m> $name<'m> { + fn new(maps: OpIndexedValueBuilder<'m>, indexes: Vec<&'m DocIndexes>, number_automatons: usize) -> Self { + $name { + maps: maps.union(), + indexes: indexes, + number_automatons: number_automatons, + outs: Vec::new(), + } + } +} + +impl<'m, 'a> fst::Streamer<'a> for $name<'m> { + type Item = (&'a [u8], &'a [IndexedDocIndexes]); + + fn next(&'a mut self) -> Option { + match self.maps.next() { + Some((input, ivalues)) => { + self.outs.clear(); + + let mut builders = vec![BTreeMap::new(); self.number_automatons]; + for iv in ivalues { + let builder = &mut builders[iv.aut_index]; + builder.insert(iv.rdr_index, iv.value); + } + + let mut doc_indexes = Vec::new(); + let mut doc_indexes_slots = Vec::with_capacity(builders.len()); + for (aut_index, values) in builders.into_iter().enumerate() { + let mut builder = SdOpBuilder::with_capacity(values.len()); + for (rdr_index, value) in values { + let indexes = self.indexes[rdr_index].get(value).expect("could not find indexes"); + let indexes = Set::new_unchecked(indexes); + builder.push(indexes); + } + + let start = doc_indexes.len(); + builder.$operation().extend_vec(&mut doc_indexes); + let len = doc_indexes.len() - start; + if len != 0 { + let slot = SlotIndexedDocIndexes { + index: aut_index, + start: start, + len: len, + }; + doc_indexes_slots.push(slot); + } + } + + let read_only = VecReadOnly::new(doc_indexes); + self.outs.reserve(doc_indexes_slots.len()); + for slot in doc_indexes_slots { + let indexes = IndexedDocIndexes { + index: slot.index, + doc_indexes: read_only.range(slot.start, slot.len), + }; + self.outs.push(indexes); + } + + if self.outs.is_empty() { return None } + Some((input, &self.outs)) + }, + None => None, + } + } +} +}} + +logical_operation!(struct Union, union); 
+logical_operation!(struct Intersection, intersection); +logical_operation!(struct Difference, difference); +logical_operation!(struct SymmetricDifference, symmetric_difference); + +#[cfg(test)] +mod tests { + use super::*; + use crate::blob::PositiveBlobBuilder; + + fn get_exact_key<'m, I, S>(stream: I, key: &[u8]) -> Option> + where + I: for<'a> fst::IntoStreamer<'a, Into=S, Item=(&'a [u8], &'a [IndexedDocIndexes])>, + S: 'm + for<'a> fst::Streamer<'a, Item=(&'a [u8], &'a [IndexedDocIndexes])>, + { + let mut stream = stream.into_stream(); + while let Some((string, indexes)) = stream.next() { + if string == key { + return Some(indexes[0].doc_indexes.clone()) + } + } + None + } + + #[test] + fn union_two_blobs() { + let doc1 = DocIndex { document_id: 12, attribute: 1, attribute_index: 22 }; + let doc2 = DocIndex { document_id: 31, attribute: 0, attribute_index: 1 }; + + let meta1 = { + let mapw = Vec::new(); + let indexesw = Vec::new(); + let mut builder = PositiveBlobBuilder::new(mapw, indexesw); + + builder.insert("chameau", doc1); + + Blob::Positive(builder.build().unwrap()) + }; + + let meta2 = { + let mapw = Vec::new(); + let indexesw = Vec::new(); + let mut builder = PositiveBlobBuilder::new(mapw, indexesw); + + builder.insert("chameau", doc2); + + Blob::Positive(builder.build().unwrap()) + }; + + let metas = OpBuilder::new().add(&meta1).add(&meta2).union(); + let value = get_exact_key(metas, b"chameau"); + + assert_eq!(&*value.unwrap(), &[doc1, doc2][..]); + } + + #[test] + fn intersection_two_blobs() { + let doc1 = DocIndex { document_id: 31, attribute: 0, attribute_index: 1 }; + let doc2 = DocIndex { document_id: 31, attribute: 0, attribute_index: 1 }; + + let meta1 = { + let mapw = Vec::new(); + let indexesw = Vec::new(); + let mut builder = PositiveBlobBuilder::new(mapw, indexesw); + + builder.insert("chameau", doc1); + + Blob::Positive(builder.build().unwrap()) + }; + + let meta2 = { + let mapw = Vec::new(); + let indexesw = Vec::new(); + let mut 
builder = PositiveBlobBuilder::new(mapw, indexesw); + + builder.insert("chameau", doc2); + + Blob::Positive(builder.build().unwrap()) + }; + + let metas = OpBuilder::new().add(&meta1).add(&meta2).intersection(); + let value = get_exact_key(metas, b"chameau"); + + assert_eq!(&*value.unwrap(), &[doc1][..]); + } + + #[test] + fn difference_two_blobs() { + let doc1 = DocIndex { document_id: 12, attribute: 1, attribute_index: 22 }; + let doc2 = DocIndex { document_id: 31, attribute: 0, attribute_index: 1 }; + let doc3 = DocIndex { document_id: 31, attribute: 0, attribute_index: 1 }; + + let meta1 = { + let mapw = Vec::new(); + let indexesw = Vec::new(); + let mut builder = PositiveBlobBuilder::new(mapw, indexesw); + + builder.insert("chameau", doc1); + builder.insert("chameau", doc2); + + Blob::Positive(builder.build().unwrap()) + }; + + let meta2 = { + let mapw = Vec::new(); + let indexesw = Vec::new(); + let mut builder = PositiveBlobBuilder::new(mapw, indexesw); + + builder.insert("chameau", doc3); + + Blob::Positive(builder.build().unwrap()) + }; + + let metas = OpBuilder::new().add(&meta1).add(&meta2).difference(); + let value = get_exact_key(metas, b"chameau"); + + assert_eq!(&*value.unwrap(), &[doc1][..]); + } + + #[test] + fn symmetric_difference_two_blobs() { + let doc1 = DocIndex { document_id: 12, attribute: 1, attribute_index: 22 }; + let doc2 = DocIndex { document_id: 31, attribute: 0, attribute_index: 1 }; + let doc3 = DocIndex { document_id: 32, attribute: 0, attribute_index: 1 }; + let doc4 = DocIndex { document_id: 34, attribute: 12, attribute_index: 1 }; + + let meta1 = { + let mapw = Vec::new(); + let indexesw = Vec::new(); + let mut builder = PositiveBlobBuilder::new(mapw, indexesw); + + builder.insert("chameau", doc1); + builder.insert("chameau", doc2); + builder.insert("chameau", doc3); + + Blob::Positive(builder.build().unwrap()) + }; + + let meta2 = { + let mapw = Vec::new(); + let indexesw = Vec::new(); + let mut builder = 
PositiveBlobBuilder::new(mapw, indexesw); + + builder.insert("chameau", doc2); + builder.insert("chameau", doc3); + builder.insert("chameau", doc4); + + Blob::Positive(builder.build().unwrap()) + }; + + let metas = OpBuilder::new().add(&meta1).add(&meta2).symmetric_difference(); + let value = get_exact_key(metas, b"chameau"); + + assert_eq!(&*value.unwrap(), &[doc1, doc4][..]); + } +} diff --git a/src/blob/ops_indexed_value.rs b/src/blob/ops_indexed_value.rs new file mode 100644 index 000000000..2c557f61c --- /dev/null +++ b/src/blob/ops_indexed_value.rs @@ -0,0 +1,203 @@ +use std::collections::BinaryHeap; +use std::rc::Rc; +use std::cmp; +use fst::raw::{self, Output}; +use fst::{self, IntoStreamer, Streamer}; + +type BoxedStream<'f> = Box Streamer<'a, Item=(&'a [u8], &'a [raw::IndexedValue])> + 'f>; + +pub struct OpIndexedValueBuilder<'f> { + streams: Vec>, +} + +impl<'f> OpIndexedValueBuilder<'f> { + pub fn new() -> Self { + Self { streams: Vec::new() } + } + + pub fn push(&mut self, stream: I) + where + I: for<'a> IntoStreamer<'a, Into=S, Item=(&'a [u8], &'a [raw::IndexedValue])>, + S: 'f + for<'a> Streamer<'a, Item=(&'a [u8], &'a [raw::IndexedValue])>, + { + self.streams.push(Box::new(stream.into_stream())); + } + + pub fn union(self) -> UnionIndexedValue<'f> { + UnionIndexedValue { + heap: StreamIndexedValueHeap::new(self.streams), + outs: Vec::new(), + cur_slot: None, + } + } +} + +pub struct UnionIndexedValue<'f> { + heap: StreamIndexedValueHeap<'f>, + outs: Vec, + cur_slot: Option, +} + +impl<'f> UnionIndexedValue<'f> { + pub fn len(&self) -> usize { + self.heap.num_slots() + } +} + +impl<'a, 'm> fst::Streamer<'a> for UnionIndexedValue<'m> { + type Item = (&'a [u8], &'a [IndexedValue]); + + fn next(&'a mut self) -> Option { + if let Some(slot) = self.cur_slot.take() { + self.heap.refill(slot); + } + let slot = match self.heap.pop() { + None => return None, + Some(slot) => { + self.cur_slot = Some(slot); + self.cur_slot.as_mut().unwrap() + } + }; + 
self.outs.clear(); + self.outs.push(slot.indexed_value()); + while let Some(slot2) = self.heap.pop_if_equal(slot.input()) { + self.outs.push(slot2.indexed_value()); + self.heap.refill(slot2); + } + Some((slot.input(), &self.outs)) + } +} + +struct StreamIndexedValueHeap<'f> { + rdrs: Vec>, + heap: BinaryHeap, +} + +impl<'f> StreamIndexedValueHeap<'f> { + fn new(streams: Vec>) -> StreamIndexedValueHeap<'f> { + let mut u = StreamIndexedValueHeap { + rdrs: streams, + heap: BinaryHeap::new(), + }; + for i in 0..u.rdrs.len() { + u.refill(SlotIndexedValue::new(i)); + } + u + } + + fn pop(&mut self) -> Option { + self.heap.pop() + } + + fn peek_is_duplicate(&self, key: &[u8]) -> bool { + self.heap.peek().map(|s| s.input() == key).unwrap_or(false) + } + + fn pop_if_equal(&mut self, key: &[u8]) -> Option { + if self.peek_is_duplicate(key) { + self.pop() + } else { + None + } + } + + fn pop_if_le(&mut self, key: &[u8]) -> Option { + if self.heap.peek().map(|s| s.input() <= key).unwrap_or(false) { + self.pop() + } else { + None + } + } + + fn num_slots(&self) -> usize { + self.rdrs.len() + } + + fn refill(&mut self, mut slot: SlotIndexedValue) { + if let Some((input, ivalues)) = self.rdrs[slot.rdr_index].next() { + slot.set_input(input); + for values in ivalues { + slot.set_aut_index(values.index); + slot.set_output(values.value); + self.heap.push(slot.clone()); + } + } + } +} + +#[derive(Debug, Clone)] +struct SlotIndexedValue { + rdr_index: usize, + aut_index: usize, + input: Rc>, + output: Output, +} + +#[derive(Debug)] +pub struct IndexedValue { + pub rdr_index: usize, + pub aut_index: usize, + pub value: u64, +} + +impl PartialEq for SlotIndexedValue { + fn eq(&self, other: &Self) -> bool { + (&self.input, self.rdr_index, self.aut_index, self.output) + .eq(&(&other.input, other.rdr_index, other.aut_index, other.output)) + } +} + +impl Eq for SlotIndexedValue { } + +impl PartialOrd for SlotIndexedValue { + fn partial_cmp(&self, other: &Self) -> Option { + (&self.input, 
self.rdr_index, self.aut_index, self.output) + .partial_cmp(&(&other.input, other.rdr_index, other.aut_index, other.output)) + .map(|ord| ord.reverse()) + } +} + +impl Ord for SlotIndexedValue { + fn cmp(&self, other: &Self) -> cmp::Ordering { + self.partial_cmp(other).unwrap() + } +} + +impl SlotIndexedValue { + fn new(rdr_index: usize) -> SlotIndexedValue { + SlotIndexedValue { + rdr_index: rdr_index, + aut_index: 0, + input: Rc::new(Vec::with_capacity(64)), + output: Output::zero(), + } + } + + fn indexed_value(&self) -> IndexedValue { + IndexedValue { + rdr_index: self.rdr_index, + aut_index: self.aut_index, + value: self.output.value(), + } + } + + fn input(&self) -> &[u8] { + &self.input + } + + fn set_aut_index(&mut self, aut_index: usize) { + self.aut_index = aut_index; + } + + fn set_input(&mut self, input: &[u8]) { + if *self.input != input { + let inner = Rc::make_mut(&mut self.input); + inner.clear(); + inner.extend(input); + } + } + + fn set_output(&mut self, output: u64) { + self.output = Output::new(output); + } +} diff --git a/src/blob/positive_blob.rs b/src/blob/positive_blob.rs new file mode 100644 index 000000000..0d0b74c59 --- /dev/null +++ b/src/blob/positive_blob.rs @@ -0,0 +1,87 @@ +use std::error::Error; +use std::path::Path; +use std::io::Write; + +use fst::{Map, MapBuilder}; + +use crate::DocIndex; +use crate::doc_indexes::{DocIndexes, DocIndexesBuilder}; + +pub struct PositiveBlob { + map: Map, + indexes: DocIndexes, +} + +impl PositiveBlob { + pub unsafe fn from_paths(map: P, indexes: Q) -> Result> + where P: AsRef, + Q: AsRef, + { + let map = Map::from_path(map)?; + let indexes = DocIndexes::from_path(indexes)?; + Ok(PositiveBlob { map, indexes }) + } + + pub fn from_bytes(map: Vec, indexes: Vec) -> Result> { + let map = Map::from_bytes(map)?; + let indexes = DocIndexes::from_bytes(indexes)?; + Ok(PositiveBlob { map, indexes }) + } + + pub fn get>(&self, key: K) -> Option<&[DocIndex]> { + self.map.get(key).and_then(|index| 
self.indexes.get(index)) + } + + pub fn as_map(&self) -> &Map { + &self.map + } + + pub fn as_indexes(&self) -> &DocIndexes { + &self.indexes + } + + pub fn explode(self) -> (Map, DocIndexes) { + (self.map, self.indexes) + } +} + +pub struct PositiveBlobBuilder { + map: W, + indexes: DocIndexesBuilder, +} + +impl PositiveBlobBuilder { + pub fn new(map: W, indexes: X) -> Self { + Self { map, indexes: DocIndexesBuilder::new(indexes) } + } + + pub fn insert>(&mut self, key: S, index: DocIndex) { + self.indexes.insert(key.into(), index) + } + + pub fn finish(self) -> Result<(), Box> { + self.into_inner().map(|_| ()) + } + + pub fn into_inner(self) -> Result<(W, X), Box> { + // FIXME insert a magic number that indicates if the endianness + // of the input is the same as the machine that is reading it. + + let map = { + let mut keys_builder = MapBuilder::new(self.map)?; + let keys = self.indexes.keys().map(|(s, v)| (s, *v)); + keys_builder.extend_iter(keys)?; + keys_builder.into_inner()? + }; + + let indexes = self.indexes.into_inner()?; + + Ok((map, indexes)) + } +} + +impl PositiveBlobBuilder, Vec> { + pub fn build(self) -> Result> { + self.into_inner().and_then(|(m, i)| PositiveBlob::from_bytes(m, i)) + } +} diff --git a/src/doc_indexes.rs b/src/doc_indexes.rs new file mode 100644 index 000000000..5aef15baa --- /dev/null +++ b/src/doc_indexes.rs @@ -0,0 +1,200 @@ +use std::collections::btree_map::{BTreeMap, Iter, Entry}; +use std::slice::from_raw_parts; +use std::io::{self, Write}; +use std::path::Path; +use std::ops::Deref; +use std::sync::Arc; +use std::mem; +use fst::raw::MmapReadOnly; +use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; +use crate::DocIndex; + +#[repr(C)] +struct Range { + start: u64, + end: u64, +} + +#[derive(Clone)] +enum DocIndexesData { + Shared { + vec: Arc>, + offset: usize, + len: usize, + }, + Mmap(MmapReadOnly), +} + +impl Deref for DocIndexesData { + type Target = [u8]; + + fn deref(&self) -> &Self::Target { + match self { + 
DocIndexesData::Shared { vec, offset, len } => { + &vec[*offset..offset + len] + }, + DocIndexesData::Mmap(m) => m.as_slice(), + } + } +} + +#[derive(Clone)] +pub struct DocIndexes { + ranges: DocIndexesData, + indexes: DocIndexesData, +} + +impl DocIndexes { + pub unsafe fn from_path>(path: P) -> io::Result { + let mmap = MmapReadOnly::open_path(path)?; + + let range_len = mmap.as_slice().read_u64::()?; + let range_len = range_len as usize * mem::size_of::(); + + let offset = mem::size_of::() as usize; + let ranges = DocIndexesData::Mmap(mmap.range(offset, range_len)); + + let len = mmap.len() - range_len - offset; + let offset = offset + range_len; + let indexes = DocIndexesData::Mmap(mmap.range(offset, len)); + + Ok(DocIndexes { ranges, indexes }) + } + + pub fn from_bytes(vec: Vec) -> io::Result { + let vec = Arc::new(vec); + + let range_len = vec.as_slice().read_u64::()?; + let range_len = range_len as usize * mem::size_of::(); + + let offset = mem::size_of::() as usize; + let ranges = DocIndexesData::Shared { + vec: vec.clone(), + offset, + len: range_len + }; + + let len = vec.len() - range_len - offset; + let offset = offset + range_len; + let indexes = DocIndexesData::Shared { vec, offset, len }; + + Ok(DocIndexes { ranges, indexes }) + } + + pub fn get(&self, index: u64) -> Option<&[DocIndex]> { + self.ranges().get(index as usize).map(|Range { start, end }| { + let start = *start as usize; + let end = *end as usize; + &self.indexes()[start..end] + }) + } + + fn ranges(&self) -> &[Range] { + let slice = &self.ranges; + let ptr = slice.as_ptr() as *const Range; + let len = slice.len() / mem::size_of::(); + unsafe { from_raw_parts(ptr, len) } + } + + fn indexes(&self) -> &[DocIndex] { + let slice = &self.indexes; + let ptr = slice.as_ptr() as *const DocIndex; + let len = slice.len() / mem::size_of::(); + unsafe { from_raw_parts(ptr, len) } + } +} + +pub struct DocIndexesBuilder { + keys: BTreeMap, + indexes: Vec>, + number_docs: usize, + wtr: W, +} + +impl 
DocIndexesBuilder { + pub fn new(wtr: W) -> Self { + Self { + keys: BTreeMap::new(), + indexes: Vec::new(), + number_docs: 0, + wtr: wtr, + } + } + + pub fn number_doc_indexes(&self) -> usize { + self.number_docs + } + + pub fn insert(&mut self, key: String, value: DocIndex) { + match self.keys.entry(key) { + Entry::Vacant(e) => { + let index = self.indexes.len() as u64; + self.indexes.push(vec![value]); + e.insert(index); + }, + Entry::Occupied(e) => { + let index = *e.get(); + let vec = &mut self.indexes[index as usize]; + vec.push(value); + }, + } + self.number_docs += 1; + } + + pub fn keys(&self) -> Iter { + self.keys.iter() + } + + pub fn finish(self) -> io::Result<()> { + self.into_inner().map(|_| ()) + } + + pub fn into_inner(mut self) -> io::Result { + + for vec in &mut self.indexes { + vec.sort_unstable(); + } + + let (ranges, values) = into_sliced_ranges(self.indexes, self.number_docs); + let len = ranges.len() as u64; + + // TODO check if this is correct + self.wtr.write_u64::(len)?; + unsafe { + // write Ranges first + let slice = into_u8_slice(ranges.as_slice()); + self.wtr.write_all(slice)?; + + // write Values after + let slice = into_u8_slice(values.as_slice()); + self.wtr.write_all(slice)?; + } + + self.wtr.flush()?; + Ok(self.wtr) + } +} + +fn into_sliced_ranges(vecs: Vec>, number_docs: usize) -> (Vec, Vec) { + let cap = vecs.len(); + let mut ranges = Vec::with_capacity(cap); + let mut values = Vec::with_capacity(number_docs); + + for v in &vecs { + let len = v.len() as u64; + let start = ranges.last().map(|&Range { end, .. 
}| end).unwrap_or(0); + + let range = Range { start, end: start + len }; + ranges.push(range); + } + + values.extend(vecs.into_iter().flatten()); + + (ranges, values) +} + +unsafe fn into_u8_slice(slice: &[T]) -> &[u8] { + let ptr = slice.as_ptr() as *const u8; + let len = slice.len() * mem::size_of::(); + from_raw_parts(ptr, len) +} diff --git a/src/index.rs b/src/index.rs new file mode 100644 index 000000000..e3431e5fe --- /dev/null +++ b/src/index.rs @@ -0,0 +1,24 @@ +use std::path::{Path, PathBuf}; +use std::error::Error; + +use crate::rank::Document; +use crate::blob::Blob; + +pub struct Index { + path: PathBuf, + blobs: Vec, +} + +impl Index { + pub fn open(path: &Path) -> Result> { + unimplemented!() + } + + pub fn create(path: &Path) -> Result> { + unimplemented!() + } + + pub fn blobs(&self) -> &[Blob] { + &self.blobs + } +} diff --git a/src/lib.rs b/src/lib.rs index e8fb9ab98..26dc886f8 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2,6 +2,11 @@ #[macro_use] extern crate lazy_static; +pub mod index; +pub mod pentium; +pub mod blob; +pub mod doc_indexes; + pub mod rank; pub mod metadata; pub mod vec_read_only; diff --git a/src/metadata/difference.rs b/src/metadata/difference.rs index 50c648459..6e71d57d1 100644 --- a/src/metadata/difference.rs +++ b/src/metadata/difference.rs @@ -64,13 +64,13 @@ mod tests { #[test] fn empty() { let positive_metas = construct_metadata(vec![ - ("chameau".into(), DocIndex{ document: 12, attribute: 1, attribute_index: 22 }), - ("chameau".into(), DocIndex{ document: 31, attribute: 0, attribute_index: 1 }), + ("chameau".into(), DocIndex{ document_id: 12, attribute: 1, attribute_index: 22 }), + ("chameau".into(), DocIndex{ document_id: 31, attribute: 0, attribute_index: 1 }), ]); let negative_metas = construct_metadata(vec![ - ("chameau".into(), DocIndex{ document: 12, attribute: 1, attribute_index: 22 }), - ("chameau".into(), DocIndex{ document: 31, attribute: 0, attribute_index: 1 }), + ("chameau".into(), DocIndex{ document_id: 
12, attribute: 1, attribute_index: 22 }), + ("chameau".into(), DocIndex{ document_id: 31, attribute: 0, attribute_index: 1 }), ]); let positives = &[positive_metas]; @@ -82,8 +82,8 @@ mod tests { #[test] fn one_positive() { - let di1 = DocIndex{ document: 12, attribute: 1, attribute_index: 22 }; - let di2 = DocIndex{ document: 31, attribute: 0, attribute_index: 1 }; + let di1 = DocIndex{ document_id: 12, attribute: 1, attribute_index: 22 }; + let di2 = DocIndex{ document_id: 31, attribute: 0, attribute_index: 1 }; let positive_metas = construct_metadata(vec![ ("chameau".into(), di1), @@ -105,8 +105,8 @@ mod tests { #[test] fn more_negative_than_positive() { - let di1 = DocIndex{ document: 12, attribute: 1, attribute_index: 22 }; - let di2 = DocIndex{ document: 31, attribute: 0, attribute_index: 1 }; + let di1 = DocIndex{ document_id: 12, attribute: 1, attribute_index: 22 }; + let di2 = DocIndex{ document_id: 31, attribute: 0, attribute_index: 1 }; let positive_metas = construct_metadata(vec![ ("chameau".into(), di1), diff --git a/src/metadata/mod.rs b/src/metadata/mod.rs index 9e594d49b..a01d48bd7 100644 --- a/src/metadata/mod.rs +++ b/src/metadata/mod.rs @@ -107,7 +107,7 @@ mod tests { let mut builder = MetadataBuilder::new(mapw, indexesw); - let doc = DocIndex { document: 12, attribute: 1, attribute_index: 22 }; + let doc = DocIndex { document_id: 12, attribute: 1, attribute_index: 22 }; builder.insert("chameau".into(), doc); let (map, indexes) = builder.into_inner().unwrap(); @@ -123,8 +123,8 @@ mod tests { let mut builder = MetadataBuilder::new(mapw, indexesw); - let doc1 = DocIndex { document: 12, attribute: 1, attribute_index: 22 }; - let doc2 = DocIndex { document: 31, attribute: 0, attribute_index: 1 }; + let doc1 = DocIndex { document_id: 12, attribute: 1, attribute_index: 22 }; + let doc2 = DocIndex { document_id: 31, attribute: 0, attribute_index: 1 }; builder.insert("chameau".into(), doc1); builder.insert("chameau".into(), doc2); diff --git 
a/src/metadata/ops.rs b/src/metadata/ops.rs index 666a533b3..a0d48773b 100644 --- a/src/metadata/ops.rs +++ b/src/metadata/ops.rs @@ -189,8 +189,8 @@ mod tests { #[test] fn union_two_metadata() { - let doc1 = DocIndex { document: 12, attribute: 1, attribute_index: 22 }; - let doc2 = DocIndex { document: 31, attribute: 0, attribute_index: 1 }; + let doc1 = DocIndex { document_id: 12, attribute: 1, attribute_index: 22 }; + let doc2 = DocIndex { document_id: 31, attribute: 0, attribute_index: 1 }; let meta1 = { let mapw = Vec::new(); @@ -222,8 +222,8 @@ mod tests { #[test] fn intersection_two_metadata() { - let doc1 = DocIndex { document: 31, attribute: 0, attribute_index: 1 }; - let doc2 = DocIndex { document: 31, attribute: 0, attribute_index: 1 }; + let doc1 = DocIndex { document_id: 31, attribute: 0, attribute_index: 1 }; + let doc2 = DocIndex { document_id: 31, attribute: 0, attribute_index: 1 }; let meta1 = { let mapw = Vec::new(); @@ -255,9 +255,9 @@ mod tests { #[test] fn difference_two_metadata() { - let doc1 = DocIndex { document: 12, attribute: 1, attribute_index: 22 }; - let doc2 = DocIndex { document: 31, attribute: 0, attribute_index: 1 }; - let doc3 = DocIndex { document: 31, attribute: 0, attribute_index: 1 }; + let doc1 = DocIndex { document_id: 12, attribute: 1, attribute_index: 22 }; + let doc2 = DocIndex { document_id: 31, attribute: 0, attribute_index: 1 }; + let doc3 = DocIndex { document_id: 31, attribute: 0, attribute_index: 1 }; let meta1 = { let mapw = Vec::new(); @@ -290,10 +290,10 @@ mod tests { #[test] fn symmetric_difference_two_metadata() { - let doc1 = DocIndex { document: 12, attribute: 1, attribute_index: 22 }; - let doc2 = DocIndex { document: 31, attribute: 0, attribute_index: 1 }; - let doc3 = DocIndex { document: 32, attribute: 0, attribute_index: 1 }; - let doc4 = DocIndex { document: 34, attribute: 12, attribute_index: 1 }; + let doc1 = DocIndex { document_id: 12, attribute: 1, attribute_index: 22 }; + let doc2 = DocIndex { 
document_id: 31, attribute: 0, attribute_index: 1 }; + let doc3 = DocIndex { document_id: 32, attribute: 0, attribute_index: 1 }; + let doc4 = DocIndex { document_id: 34, attribute: 12, attribute_index: 1 }; let meta1 = { let mapw = Vec::new(); diff --git a/src/pentium.rs b/src/pentium.rs new file mode 100644 index 000000000..c9421ca46 --- /dev/null +++ b/src/pentium.rs @@ -0,0 +1,28 @@ +use std::error::Error; + +use crate::automaton; +use crate::rank::Document; +use crate::index::Index; + +pub struct Pentium { + index: Index, +} + +impl Pentium { + pub fn from_index(index: Index) -> Result> { + unimplemented!() + } + + pub fn search(&self, query: &str) -> Vec { + + let mut automatons = Vec::new(); + for word in query.split_whitespace().map(str::to_lowercase) { + let dfa = automaton::build_prefix_dfa(&word); + automatons.push(dfa); + } + + let stream = unimplemented!(); + + unimplemented!() + } +} diff --git a/src/rank/ranked_stream.rs b/src/rank/ranked_stream.rs index d7c6c2dee..9f014b56e 100644 --- a/src/rank/ranked_stream.rs +++ b/src/rank/ranked_stream.rs @@ -9,8 +9,8 @@ use fst::Streamer; use group_by::GroupByMut; use crate::automaton::{DfaExt, AutomatonExt}; -use crate::metadata::Metadata; -use crate::metadata::ops::OpBuilder; +use crate::index::Index; +use crate::blob::{Blob, Merge}; use crate::rank::criterion::Criterion; use crate::rank::Document; use crate::{Match, DocumentId}; @@ -22,28 +22,26 @@ fn clamp_range(range: Range, big: Range) -> Range { } } -pub struct Config<'m, C, F> { - pub metadata: &'m Metadata, +pub struct Config { + pub index: Index, pub automatons: Vec, pub criteria: Vec, pub distinct: (F, usize), } pub struct RankedStream<'m, C, F> { - stream: crate::metadata::ops::Union<'m>, + stream: crate::blob::Merge<'m>, automatons: Vec>, criteria: Vec, distinct: (F, usize), } impl<'m, C, F> RankedStream<'m, C, F> { - pub fn new(config: Config<'m, C, F>) -> Self { + pub fn new(config: Config) -> Self { let automatons: Vec<_> = 
config.automatons.into_iter().map(Rc::new).collect(); - let mut builder = OpBuilder::with_automatons(automatons.clone()); - builder.push(config.metadata); RankedStream { - stream: builder.union(), + stream: Merge::with_automatons(automatons.clone(), unimplemented!()), automatons: automatons, criteria: config.criteria, distinct: config.distinct, diff --git a/src/vec_read_only.rs b/src/vec_read_only.rs index c0d5b6403..efb3317ee 100644 --- a/src/vec_read_only.rs +++ b/src/vec_read_only.rs @@ -1,7 +1,8 @@ use std::ops::Deref; use std::sync::Arc; +use std::fmt; -#[derive(Debug, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)] +#[derive(Clone, PartialOrd, Ord, PartialEq, Eq, Hash)] pub struct VecReadOnly { inner: Arc>, offset: usize, @@ -42,3 +43,9 @@ impl Deref for VecReadOnly { self.as_slice() } } + +impl fmt::Debug for VecReadOnly { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + self.inner.fmt(f) + } +}