feat: Working on ops for Positive and Negative blobs

This commit is contained in:
Clément Renault 2018-11-08 12:05:59 +01:00
parent 34b43d4002
commit cc52d5dda5
No known key found for this signature in database
GPG Key ID: 0151CDAB43460DAE
18 changed files with 213 additions and 1479 deletions

View File

@ -8,6 +8,7 @@ authors = ["Kerollmops <renault.cle@gmail.com>"]
byteorder = "1.2"
lazy_static = "1.1"
sdset = "0.2"
fs2 = "0.4"
fnv = "1.0"
[dependencies.fst]

View File

@ -288,18 +288,13 @@ mod tests {
#[test]
fn single_negative_blob() {
let doc1 = DocIndex{ document_id: 0, attribute: 0, attribute_index: 0 };
let doc2 = DocIndex{ document_id: 12, attribute: 0, attribute_index: 2 };
let doc3 = DocIndex{ document_id: 0, attribute: 0, attribute_index: 1 };
let doc4 = DocIndex{ document_id: 0, attribute: 0, attribute_index: 2 };
let a = {
let mut builder = NegativeBlobBuilder::new(Vec::new(), Vec::new());
let mut builder = NegativeBlobBuilder::new(Vec::new());
builder.insert("hell", doc1);
builder.insert("hell", doc2);
builder.insert("hello", doc3);
builder.insert("wor", doc4);
builder.insert(1);
builder.insert(2);
builder.insert(3);
builder.insert(4);
Blob::Negative(builder.build().unwrap())
};
@ -371,10 +366,10 @@ mod tests {
};
let b = {
let mut builder = NegativeBlobBuilder::new(Vec::new(), Vec::new());
let mut builder = NegativeBlobBuilder::new(Vec::new());
builder.insert("hell", doc2);
builder.insert("hello", doc3);
builder.insert(2);
builder.insert(3);
Blob::Negative(builder.build().unwrap())
};
@ -410,10 +405,10 @@ mod tests {
};
let b = {
let mut builder = NegativeBlobBuilder::new(Vec::new(), Vec::new());
let mut builder = NegativeBlobBuilder::new(Vec::new());
builder.insert("hell", doc1);
builder.insert("wor", doc4);
builder.insert(1);
builder.insert(4);
Blob::Negative(builder.build().unwrap())
};
@ -428,9 +423,9 @@ mod tests {
};
let d = {
let mut builder = NegativeBlobBuilder::new(Vec::new(), Vec::new());
let mut builder = NegativeBlobBuilder::new(Vec::new());
builder.insert("hell", doc1);
builder.insert(1);
Blob::Negative(builder.build().unwrap())
};
@ -478,18 +473,18 @@ mod tests {
};
let c = {
let mut builder = NegativeBlobBuilder::new(Vec::new(), Vec::new());
let mut builder = NegativeBlobBuilder::new(Vec::new());
builder.insert("hell", doc1);
builder.insert("wor", doc4);
builder.insert(1);
builder.insert(4);
Blob::Negative(builder.build().unwrap())
};
let d = {
let mut builder = NegativeBlobBuilder::new(Vec::new(), Vec::new());
let mut builder = NegativeBlobBuilder::new(Vec::new());
builder.insert("hell", doc1);
builder.insert(1);
Blob::Negative(builder.build().unwrap())
};

View File

@ -10,13 +10,22 @@ pub use self::negative_blob::{NegativeBlob, NegativeBlobBuilder};
use fst::Map;
use crate::doc_indexes::DocIndexes;
use crate::data::DocIndexes;
pub enum Blob {
Positive(PositiveBlob),
Negative(NegativeBlob),
}
impl Blob {
pub fn sign(&self) -> Sign {
match self {
Blob::Positive(_) => Sign::Positive,
Blob::Negative(_) => Sign::Negative,
}
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Sign {
Positive,
@ -31,26 +40,3 @@ impl Sign {
}
}
}
impl Blob {
pub fn sign(&self) -> Sign {
match self {
Blob::Positive(_) => Sign::Positive,
Blob::Negative(_) => Sign::Negative,
}
}
pub fn as_map(&self) -> &Map {
match self {
Blob::Positive(blob) => blob.as_map(),
Blob::Negative(blob) => blob.as_map(),
}
}
pub fn as_indexes(&self) -> &DocIndexes {
match self {
Blob::Positive(blob) => blob.as_indexes(),
Blob::Negative(blob) => blob.as_indexes(),
}
}
}

View File

@ -2,86 +2,61 @@ use std::error::Error;
use std::path::Path;
use std::io::Write;
use fst::{Map, MapBuilder};
use crate::DocIndex;
use crate::doc_indexes::{DocIndexes, DocIndexesBuilder};
use crate::DocumentId;
use crate::data::{DocIds, DocIdsBuilder};
pub struct NegativeBlob {
map: Map,
indexes: DocIndexes,
doc_ids: DocIds,
}
impl NegativeBlob {
pub unsafe fn from_paths<P, Q>(map: P, indexes: Q) -> Result<Self, Box<Error>>
pub unsafe fn from_path<P>(doc_ids: P) -> Result<Self, Box<Error>>
where P: AsRef<Path>,
Q: AsRef<Path>,
{
let map = Map::from_path(map)?;
let indexes = DocIndexes::from_path(indexes)?;
Ok(NegativeBlob { map, indexes })
let doc_ids = DocIds::from_path(doc_ids)?;
Ok(NegativeBlob { doc_ids })
}
pub fn from_bytes(map: Vec<u8>, indexes: Vec<u8>) -> Result<Self, Box<Error>> {
let map = Map::from_bytes(map)?;
let indexes = DocIndexes::from_bytes(indexes)?;
Ok(NegativeBlob { map, indexes })
pub fn from_bytes(doc_ids: Vec<u8>) -> Result<Self, Box<Error>> {
let doc_ids = DocIds::from_bytes(doc_ids)?;
Ok(NegativeBlob { doc_ids })
}
pub fn get<K: AsRef<[u8]>>(&self, key: K) -> Option<&[DocIndex]> {
self.map.get(key).and_then(|index| self.indexes.get(index))
pub fn as_ids(&self) -> &DocIds {
&self.doc_ids
}
pub fn as_map(&self) -> &Map {
&self.map
}
pub fn as_indexes(&self) -> &DocIndexes {
&self.indexes
}
pub fn explode(self) -> (Map, DocIndexes) {
(self.map, self.indexes)
pub fn into_doc_ids(self) -> DocIds {
self.doc_ids
}
}
pub struct NegativeBlobBuilder<W, X> {
map: W,
indexes: DocIndexesBuilder<X>,
pub struct NegativeBlobBuilder<W> {
doc_ids: DocIdsBuilder<W>,
}
impl<W: Write, X: Write> NegativeBlobBuilder<W, X> {
pub fn new(map: W, indexes: X) -> Self {
Self { map, indexes: DocIndexesBuilder::new(indexes) }
impl<W: Write> NegativeBlobBuilder<W> {
pub fn new(wrt: W) -> Self {
Self { doc_ids: DocIdsBuilder::new(wrt) }
}
pub fn insert<S: Into<String>>(&mut self, key: S, index: DocIndex) {
self.indexes.insert(key.into(), index)
pub fn insert(&mut self, doc: DocumentId) {
self.doc_ids.insert(doc)
}
pub fn finish(self) -> Result<(), Box<Error>> {
self.into_inner().map(|_| ())
}
pub fn into_inner(self) -> Result<(W, X), Box<Error>> {
pub fn into_inner(self) -> Result<W, Box<Error>> {
// FIXME insert a magic number that indicates if the endianess
// of the input is the same as the machine that is reading it.
let map = {
let mut keys_builder = MapBuilder::new(self.map)?;
let keys = self.indexes.keys().map(|(s, v)| (s, *v));
keys_builder.extend_iter(keys)?;
keys_builder.into_inner()?
};
let indexes = self.indexes.into_inner()?;
Ok((map, indexes))
Ok(self.doc_ids.into_inner()?)
}
}
impl NegativeBlobBuilder<Vec<u8>, Vec<u8>> {
impl NegativeBlobBuilder<Vec<u8>> {
pub fn build(self) -> Result<NegativeBlob, Box<Error>> {
self.into_inner().and_then(|(m, i)| NegativeBlob::from_bytes(m, i))
self.into_inner().and_then(|ids| NegativeBlob::from_bytes(ids))
}
}

View File

@ -9,7 +9,7 @@ use crate::blob::ops_indexed_value::{
OpIndexedValueBuilder, UnionIndexedValue,
};
use crate::blob::Blob;
use crate::doc_indexes::DocIndexes;
use crate::data::DocIndexes;
use crate::vec_read_only::VecReadOnly;
use crate::DocIndex;
@ -40,23 +40,34 @@ impl<'m, A: 'm + Automaton> OpBuilder<'m, A> {
}
}
pub fn add(mut self, blob: &'m Blob) -> Self where A: Clone {
pub fn add(mut self, blob: &'m Blob) -> Self
where A: Clone
{
self.push(blob);
self
}
pub fn push(&mut self, blob: &'m Blob) where A: Clone {
let mut op = map::OpBuilder::new();
for automaton in self.automatons.iter().cloned() {
let stream = blob.as_map().search(automaton);
op.push(stream);
pub fn push(&mut self, blob: &'m Blob)
where A: Clone
{
match blob {
Blob::Positive(blob) => {
let mut op = map::OpBuilder::new();
for automaton in self.automatons.iter().cloned() {
let stream = blob.as_map().search(automaton);
op.push(stream);
}
let stream = op.union();
let indexes = blob.as_indexes();
self.maps.push(stream);
self.indexes.push(indexes);
},
Blob::Negative(blob) => {
unimplemented!()
},
}
let stream = op.union();
let indexes = blob.as_indexes();
self.maps.push(stream);
self.indexes.push(indexes);
}
pub fn union(self) -> Union<'m> {

View File

@ -5,7 +5,7 @@ use std::io::Write;
use fst::{Map, MapBuilder};
use crate::DocIndex;
use crate::doc_indexes::{DocIndexes, DocIndexesBuilder};
use crate::data::{DocIndexes, DocIndexesBuilder};
pub struct PositiveBlob {
map: Map,

72
src/data/doc_ids.rs Normal file
View File

@ -0,0 +1,72 @@
use std::collections::BTreeSet;
use std::slice::from_raw_parts;
use std::error::Error;
use std::path::Path;
use std::sync::Arc;
use std::{io, mem};
use byteorder::{NativeEndian, WriteBytesExt};
use fst::raw::MmapReadOnly;
use crate::DocumentId;
use crate::data::Data;
/// Read-only list of document ids stored as raw bytes.
///
/// The bytes live either in a shared in-memory buffer or in a memory-mapped
/// file and are reinterpreted in place as `DocumentId`s by [`DocIds::doc_ids`].
/// `contains` performs a binary search, so the ids are expected to be stored
/// in ascending order.
#[derive(Clone)]
pub struct DocIds {
    doc_ids: Data,
}

impl DocIds {
    /// Memory-maps the file at `path` and exposes its content as document ids.
    ///
    /// # Safety
    ///
    /// NOTE(review): presumably the mapped file must not be modified while the
    /// map is alive; confirm against `MmapReadOnly::open_path`.
    ///
    /// # Errors
    ///
    /// Returns an error if the file cannot be mapped, or `InvalidData` if its
    /// length is not a whole number of document ids.
    pub unsafe fn from_path<P: AsRef<Path>>(path: P) -> io::Result<Self> {
        let mmap = MmapReadOnly::open_path(path)?;
        Self::check_len(mmap.len())?;
        let doc_ids = Data::Mmap(mmap);
        Ok(DocIds { doc_ids })
    }

    /// Wraps an in-memory buffer holding serialized document ids.
    ///
    /// # Errors
    ///
    /// Returns `InvalidData` if the buffer length is not a whole number of
    /// document ids (trailing bytes would otherwise be silently ignored).
    pub fn from_bytes(vec: Vec<u8>) -> io::Result<Self> {
        let len = vec.len();
        Self::check_len(len)?;
        let doc_ids = Data::Shared {
            vec: Arc::new(vec),
            offset: 0,
            len,
        };
        Ok(DocIds { doc_ids })
    }

    /// Returns `true` if `doc` is one of the stored ids.
    pub fn contains(&self, doc: DocumentId) -> bool {
        // FIXME prefer using the sdset::exponential_search function
        self.doc_ids().binary_search(&doc).is_ok()
    }

    /// Views the underlying bytes as a slice of document ids.
    pub fn doc_ids(&self) -> &[DocumentId] {
        let bytes: &[u8] = &self.doc_ids;
        let ptr = bytes.as_ptr() as *const DocumentId;
        let len = bytes.len() / mem::size_of::<DocumentId>();
        // SAFETY: `len` whole ids fit inside `bytes` thanks to the integer
        // division above, and the returned slice borrows `self`.
        // NOTE(review): nothing here guarantees that `ptr` is aligned for
        // `DocumentId`; a byte buffer is only byte-aligned. Confirm that
        // callers always provide suitably aligned storage.
        unsafe { from_raw_parts(ptr, len) }
    }

    /// Rejects byte lengths that cannot hold a whole number of document ids.
    fn check_len(len: usize) -> io::Result<()> {
        if len % mem::size_of::<DocumentId>() == 0 {
            Ok(())
        } else {
            Err(io::Error::new(
                io::ErrorKind::InvalidData,
                "doc ids byte length is not a multiple of the document id size",
            ))
        }
    }
}
/// Collects document ids and serializes them, in ascending order and without
/// duplicates, into the wrapped writer.
pub struct DocIdsBuilder<W> {
    // A `BTreeSet` keeps the ids sorted and deduplicated until they are written.
    doc_ids: BTreeSet<DocumentId>,
    wrt: W,
}

impl<W: io::Write> DocIdsBuilder<W> {
    /// Creates an empty builder that will serialize into `wrt`.
    pub fn new(wrt: W) -> Self {
        Self {
            doc_ids: BTreeSet::new(),
            wrt,
        }
    }

    /// Registers `doc`; inserting the same id twice has no further effect.
    pub fn insert(&mut self, doc: DocumentId) {
        self.doc_ids.insert(doc);
    }

    /// Writes every id as a native-endian `u64`, flushes the writer and
    /// returns it.
    ///
    /// # Errors
    ///
    /// Returns the first I/O error reported while writing or flushing.
    pub fn into_inner(mut self) -> io::Result<W> {
        for id in self.doc_ids {
            self.wrt.write_u64::<NativeEndian>(id)?;
        }
        // Flush explicitly so that buffered writers report their errors here
        // instead of swallowing them when the writer is later dropped.
        io::Write::flush(&mut self.wrt)?;
        Ok(self.wrt)
    }
}

View File

@ -5,9 +5,12 @@ use std::path::Path;
use std::ops::Deref;
use std::sync::Arc;
use std::mem;
use fst::raw::MmapReadOnly;
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use crate::DocIndex;
use crate::data::Data;
#[repr(C)]
struct Range {
@ -15,33 +18,10 @@ struct Range {
end: u64,
}
#[derive(Clone)]
enum DocIndexesData {
Shared {
vec: Arc<Vec<u8>>,
offset: usize,
len: usize,
},
Mmap(MmapReadOnly),
}
impl Deref for DocIndexesData {
type Target = [u8];
fn deref(&self) -> &Self::Target {
match self {
DocIndexesData::Shared { vec, offset, len } => {
&vec[*offset..offset + len]
},
DocIndexesData::Mmap(m) => m.as_slice(),
}
}
}
#[derive(Clone)]
pub struct DocIndexes {
ranges: DocIndexesData,
indexes: DocIndexesData,
ranges: Data,
indexes: Data,
}
impl DocIndexes {
@ -52,11 +32,11 @@ impl DocIndexes {
let range_len = range_len as usize * mem::size_of::<Range>();
let offset = mem::size_of::<u64>() as usize;
let ranges = DocIndexesData::Mmap(mmap.range(offset, range_len));
let ranges = Data::Mmap(mmap.range(offset, range_len));
let len = mmap.len() - range_len - offset;
let offset = offset + range_len;
let indexes = DocIndexesData::Mmap(mmap.range(offset, len));
let indexes = Data::Mmap(mmap.range(offset, len));
Ok(DocIndexes { ranges, indexes })
}
@ -68,7 +48,7 @@ impl DocIndexes {
let range_len = range_len as usize * mem::size_of::<Range>();
let offset = mem::size_of::<u64>() as usize;
let ranges = DocIndexesData::Shared {
let ranges = Data::Shared {
vec: vec.clone(),
offset,
len: range_len
@ -76,7 +56,7 @@ impl DocIndexes {
let len = vec.len() - range_len - offset;
let offset = offset + range_len;
let indexes = DocIndexesData::Shared { vec, offset, len };
let indexes = Data::Shared { vec, offset, len };
Ok(DocIndexes { ranges, indexes })
}

33
src/data/mod.rs Normal file
View File

@ -0,0 +1,33 @@
mod doc_ids;
mod doc_indexes;
use std::ops::Deref;
use std::sync::Arc;
use fst::raw::MmapReadOnly;
pub use self::doc_ids::{DocIds, DocIdsBuilder};
pub use self::doc_indexes::{DocIndexes, DocIndexesBuilder};
/// Byte storage shared by the readers of this module: either a window into a
/// reference-counted buffer or a read-only memory map.
#[derive(Clone)]
enum Data {
    /// `len` bytes of `vec`, starting at `offset`.
    Shared {
        vec: Arc<Vec<u8>>,
        offset: usize,
        len: usize,
    },
    /// A read-only memory-mapped region.
    Mmap(MmapReadOnly),
}

impl Deref for Data {
    type Target = [u8];

    /// Exposes the stored bytes, whatever their backing storage is.
    fn deref(&self) -> &Self::Target {
        match *self {
            Data::Mmap(ref mmap) => mmap.as_slice(),
            Data::Shared { ref vec, offset, len } => &vec[offset..offset + len],
        }
    }
}

View File

@ -1,21 +1,37 @@
use std::path::{Path, PathBuf};
use std::error::Error;
use std::fs::{self, File};
use fs2::FileExt;
use crate::rank::Document;
use crate::blob::Blob;
pub struct Index {
path: PathBuf,
lock_file: File,
blobs: Vec<Blob>,
}
impl Index {
pub fn open(path: &Path) -> Result<Self, Box<Error>> {
unimplemented!()
pub fn open<P: Into<PathBuf>>(path: P) -> Result<Self, Box<Error>> {
let path = path.into();
let lock_file = File::create(path.join(".lock"))?;
lock_file.try_lock_exclusive()?;
let blobs = Vec::new();
Ok(Self { path, lock_file, blobs })
}
pub fn create(path: &Path) -> Result<Self, Box<Error>> {
unimplemented!()
pub fn create<P: Into<PathBuf>>(path: P) -> Result<Self, Box<Error>> {
let path = path.into();
fs::create_dir_all(&path)?;
File::create(path.join(".lock"))?;
Self::open(path)
}
pub fn blobs(&self) -> &[Blob] {

View File

@ -3,18 +3,14 @@
#[macro_use] extern crate lazy_static;
pub mod index;
pub mod pentium;
pub mod blob;
pub mod doc_indexes;
pub mod data;
pub mod rank;
pub mod metadata;
pub mod vec_read_only;
pub mod automaton;
pub mod tokenizer;
mod common_words;
pub use self::metadata::{Metadata, MetadataBuilder};
pub use self::tokenizer::Tokenizer;
pub use self::common_words::CommonWords;

View File

@ -1,126 +0,0 @@
use fst::{Streamer, Automaton};
use crate::metadata::ops::{self, IndexedDocIndexes};
use crate::metadata::{stream_ops, Metadata};
/// Builds the union of all `metas`, each one searched with every automaton
/// of `autos`.
fn union_with_automatons<'a, A>(metas: &'a [Metadata], autos: Vec<A>) -> ops::Union
where A: 'a + Automaton + Clone,
{
    metas
        .iter()
        .fold(ops::OpBuilder::with_automatons(autos), |builder, meta| builder.add(meta))
        .union()
}
pub struct Difference<'f> {
inner: stream_ops::Difference<'f>,
}
impl<'f> Difference<'f> {
pub fn new<A>(positives: &'f [Metadata], negatives: &'f [Metadata], automatons: Vec<A>) -> Self
where A: 'f + Automaton + Clone
{
let positives = union_with_automatons(positives, automatons.clone());
let negatives = union_with_automatons(negatives, automatons);
let mut builder = stream_ops::OpBuilder::new();
builder.push(positives);
builder.push(negatives);
Difference { inner: builder.difference() }
}
}
impl<'a, 'f> Streamer<'a> for Difference<'f> {
type Item = (&'a [u8], &'a [IndexedDocIndexes]);
fn next(&'a mut self) -> Option<Self::Item> {
self.inner.next()
}
}
#[cfg(test)]
mod tests {
    use super::*;

    use fst::automaton::AlwaysMatch;

    use crate::metadata::{Metadata, MetadataBuilder};
    use crate::vec_read_only::VecReadOnly;
    use crate::DocIndex;

    /// Serializes `documents` with a `MetadataBuilder` and reads them back.
    fn construct_metadata(documents: Vec<(String, DocIndex)>) -> Metadata {
        let mut builder = MetadataBuilder::new(Vec::new(), Vec::new());
        documents
            .into_iter()
            .for_each(|(word, doc_index)| builder.insert(word, doc_index));

        let (map, indexes) = builder.into_inner().unwrap();
        Metadata::from_bytes(map, indexes).unwrap()
    }

    // Identical positive and negative sets leave nothing to stream.
    #[test]
    fn empty() {
        let di1 = DocIndex{ document_id: 12, attribute: 1, attribute_index: 22 };
        let di2 = DocIndex{ document_id: 31, attribute: 0, attribute_index: 1 };

        let positives = &[construct_metadata(vec![
            ("chameau".into(), di1),
            ("chameau".into(), di2),
        ])];
        let negatives = &[construct_metadata(vec![
            ("chameau".into(), di1),
            ("chameau".into(), di2),
        ])];

        let mut diff = Difference::new(positives, negatives, vec![AlwaysMatch]);
        assert_eq!(diff.next(), None);
    }

    // Only the doc index missing from the negative set survives.
    #[test]
    fn one_positive() {
        let di1 = DocIndex{ document_id: 12, attribute: 1, attribute_index: 22 };
        let di2 = DocIndex{ document_id: 31, attribute: 0, attribute_index: 1 };

        let positives = &[construct_metadata(vec![
            ("chameau".into(), di1),
            ("chameau".into(), di2),
        ])];
        let negatives = &[construct_metadata(vec![
            ("chameau".into(), di1),
        ])];

        let mut diff = Difference::new(positives, negatives, vec![AlwaysMatch]);
        let expected = IndexedDocIndexes{ index: 0, doc_indexes: VecReadOnly::new(vec![di2]) };
        assert_eq!(diff.next(), Some(("chameau".as_bytes(), &[expected][..])));
        assert_eq!(diff.next(), None);
    }

    // A negative set larger than the positive one still yields nothing.
    #[test]
    fn more_negative_than_positive() {
        let di1 = DocIndex{ document_id: 12, attribute: 1, attribute_index: 22 };
        let di2 = DocIndex{ document_id: 31, attribute: 0, attribute_index: 1 };

        let positives = &[construct_metadata(vec![
            ("chameau".into(), di1),
        ])];
        let negatives = &[construct_metadata(vec![
            ("chameau".into(), di1),
            ("chameau".into(), di2),
        ])];

        let mut diff = Difference::new(positives, negatives, vec![AlwaysMatch]);
        assert_eq!(diff.next(), None);
    }
}

View File

@ -1,200 +0,0 @@
use std::collections::btree_map::{BTreeMap, Iter, Entry};
use std::slice::from_raw_parts;
use std::io::{self, Write};
use std::path::Path;
use std::ops::Deref;
use std::sync::Arc;
use std::mem;
use fst::raw::MmapReadOnly;
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use crate::DocIndex;
/// Half-open span (`start..end`) into the flat array of doc indexes.
///
/// `#[repr(C)]` fixes the field layout: values of this type are written to and
/// read back from raw bytes elsewhere in this file.
#[repr(C)]
struct Range {
/// Position of the first doc index of the span.
start: u64,
/// Position one past the last doc index of the span.
end: u64,
}
/// Byte storage behind `DocIndexes`: either a window into a reference-counted
/// buffer or a read-only memory map.
#[derive(Clone)]
enum DocIndexesData {
    /// `len` bytes of `vec`, starting at `offset`.
    Shared {
        vec: Arc<Vec<u8>>,
        offset: usize,
        len: usize,
    },
    /// A read-only memory-mapped region.
    Mmap(MmapReadOnly),
}

impl Deref for DocIndexesData {
    type Target = [u8];

    /// Exposes the stored bytes, whatever their backing storage is.
    fn deref(&self) -> &Self::Target {
        match *self {
            DocIndexesData::Mmap(ref mmap) => mmap.as_slice(),
            DocIndexesData::Shared { ref vec, offset, len } => &vec[offset..offset + len],
        }
    }
}
#[derive(Clone)]
pub struct DocIndexes {
ranges: DocIndexesData,
indexes: DocIndexesData,
}
impl DocIndexes {
pub unsafe fn from_path<P: AsRef<Path>>(path: P) -> io::Result<Self> {
let mmap = MmapReadOnly::open_path(path)?;
let range_len = mmap.as_slice().read_u64::<LittleEndian>()?;
let range_len = range_len as usize * mem::size_of::<Range>();
let offset = mem::size_of::<u64>() as usize;
let ranges = DocIndexesData::Mmap(mmap.range(offset, range_len));
let len = mmap.len() - range_len - offset;
let offset = offset + range_len;
let indexes = DocIndexesData::Mmap(mmap.range(offset, len));
Ok(DocIndexes { ranges, indexes })
}
pub fn from_bytes(vec: Vec<u8>) -> io::Result<Self> {
let vec = Arc::new(vec);
let range_len = vec.as_slice().read_u64::<LittleEndian>()?;
let range_len = range_len as usize * mem::size_of::<Range>();
let offset = mem::size_of::<u64>() as usize;
let ranges = DocIndexesData::Shared {
vec: vec.clone(),
offset,
len: range_len
};
let len = vec.len() - range_len - offset;
let offset = offset + range_len;
let indexes = DocIndexesData::Shared { vec, offset, len };
Ok(DocIndexes { ranges, indexes })
}
pub fn get(&self, index: u64) -> Option<&[DocIndex]> {
self.ranges().get(index as usize).map(|Range { start, end }| {
let start = *start as usize;
let end = *end as usize;
&self.indexes()[start..end]
})
}
fn ranges(&self) -> &[Range] {
let slice = &self.ranges;
let ptr = slice.as_ptr() as *const Range;
let len = slice.len() / mem::size_of::<Range>();
unsafe { from_raw_parts(ptr, len) }
}
fn indexes(&self) -> &[DocIndex] {
let slice = &self.indexes;
let ptr = slice.as_ptr() as *const DocIndex;
let len = slice.len() / mem::size_of::<DocIndex>();
unsafe { from_raw_parts(ptr, len) }
}
}
/// Accumulates `(key, DocIndex)` pairs and serializes them into a writer.
pub struct DocIndexesBuilder<W> {
    // Maps every key to the position of its bucket in `indexes`.
    keys: BTreeMap<String, u64>,
    // One bucket of doc indexes per key, in order of first insertion.
    indexes: Vec<Vec<DocIndex>>,
    // Total number of doc indexes inserted so far.
    number_docs: usize,
    wtr: W,
}

impl<W: Write> DocIndexesBuilder<W> {
    /// Creates an empty builder that will serialize into `wtr`.
    pub fn new(wtr: W) -> Self {
        DocIndexesBuilder {
            keys: BTreeMap::new(),
            indexes: Vec::new(),
            number_docs: 0,
            wtr,
        }
    }

    /// Number of doc indexes inserted so far, over all keys.
    pub fn number_doc_indexes(&self) -> usize {
        self.number_docs
    }

    /// Appends `value` to the bucket of `key`, creating the bucket on first use.
    pub fn insert(&mut self, key: String, value: DocIndex) {
        match self.keys.entry(key) {
            Entry::Occupied(slot) => {
                let bucket = *slot.get() as usize;
                self.indexes[bucket].push(value);
            },
            Entry::Vacant(slot) => {
                let bucket = self.indexes.len() as u64;
                self.indexes.push(vec![value]);
                slot.insert(bucket);
            },
        }
        self.number_docs += 1;
    }

    /// Iterates over the keys and their bucket positions, in key order.
    pub fn keys(&self) -> Iter<String, u64> {
        self.keys.iter()
    }

    /// Serializes everything and drops the writer.
    pub fn finish(self) -> io::Result<()> {
        self.into_inner()?;
        Ok(())
    }

    /// Sorts every bucket, writes the range count, the ranges and the
    /// flattened doc indexes, flushes the writer and returns it.
    pub fn into_inner(mut self) -> io::Result<W> {
        self.indexes.iter_mut().for_each(|bucket| bucket.sort_unstable());

        let (ranges, values) = into_sliced_ranges(self.indexes, self.number_docs);
        let range_count = ranges.len() as u64;

        // TODO check if this is correct
        self.wtr.write_u64::<LittleEndian>(range_count)?;

        unsafe {
            // write Ranges first
            let range_bytes = into_u8_slice(ranges.as_slice());
            self.wtr.write_all(range_bytes)?;

            // write Values after
            let value_bytes = into_u8_slice(values.as_slice());
            self.wtr.write_all(value_bytes)?;
        }

        self.wtr.flush()?;
        Ok(self.wtr)
    }
}
/// Flattens `vecs` into a single vector and returns, alongside it, one
/// `Range` per input vector giving its position inside the flattened vector.
///
/// `number_docs` is only used to pre-allocate the flattened vector.
fn into_sliced_ranges<T>(vecs: Vec<Vec<T>>, number_docs: usize) -> (Vec<Range>, Vec<T>) {
    let mut ranges = Vec::with_capacity(vecs.len());
    let mut values = Vec::with_capacity(number_docs);

    // Each range starts where the previous one ended.
    let mut start = 0u64;
    for bucket in &vecs {
        let end = start + bucket.len() as u64;
        ranges.push(Range { start, end });
        start = end;
    }

    values.extend(vecs.into_iter().flatten());

    (ranges, values)
}
/// Reinterprets `slice` as the raw bytes backing its elements.
///
/// # Safety
///
/// The caller must make sure that reading every byte of `T` is acceptable;
/// padding bytes inside `T`, if any, are exposed as well.
unsafe fn into_u8_slice<T>(slice: &[T]) -> &[u8] {
    let byte_len = mem::size_of_val(slice);
    from_raw_parts(slice.as_ptr() as *const u8, byte_len)
}

View File

@ -1,136 +0,0 @@
pub mod ops;
pub mod stream_ops;
pub mod doc_indexes;
pub mod difference;
pub mod ops_indexed_value;
use fst::{Map, MapBuilder};
use std::error::Error;
use std::path::Path;
use std::io::Write;
use crate::DocIndex;
use self::doc_indexes::{DocIndexes, DocIndexesBuilder};
/// A word map paired with the doc indexes its values point into.
pub struct Metadata {
    map: Map,
    indexes: DocIndexes,
}

impl Metadata {
    /// Loads the map from the `map` path and the doc indexes from the
    /// `indexes` path.
    pub unsafe fn from_paths<P, Q>(map: P, indexes: Q) -> Result<Self, Box<Error>>
    where P: AsRef<Path>,
          Q: AsRef<Path>,
    {
        Ok(Metadata {
            map: Map::from_path(map)?,
            indexes: DocIndexes::from_path(indexes)?,
        })
    }

    /// Builds the metadata from the serialized map and doc indexes.
    pub fn from_bytes(map: Vec<u8>, indexes: Vec<u8>) -> Result<Self, Box<Error>> {
        Ok(Metadata {
            map: Map::from_bytes(map)?,
            indexes: DocIndexes::from_bytes(indexes)?,
        })
    }

    /// Looks `key` up in the map and returns the doc indexes it points to.
    pub fn get<K: AsRef<[u8]>>(&self, key: K) -> Option<&[DocIndex]> {
        let index = self.map.get(key)?;
        self.indexes.get(index)
    }

    /// Borrows the underlying map.
    pub fn as_map(&self) -> &Map {
        &self.map
    }

    /// Borrows the underlying doc indexes.
    pub fn as_indexes(&self) -> &DocIndexes {
        &self.indexes
    }

    /// Splits the metadata into its map and its doc indexes.
    pub fn explode(self) -> (Map, DocIndexes) {
        let Metadata { map, indexes } = self;
        (map, indexes)
    }
}
/// Builds a `Metadata`: the map is written to `W`, the doc indexes to `X`.
pub struct MetadataBuilder<W, X> {
    map: W,
    indexes: DocIndexesBuilder<X>,
}

impl<W: Write, X: Write> MetadataBuilder<W, X> {
    /// Creates a builder writing the map into `map` and the doc indexes into
    /// `indexes`.
    pub fn new(map: W, indexes: X) -> Self {
        let indexes = DocIndexesBuilder::new(indexes);
        MetadataBuilder { map, indexes }
    }

    /// Records `index` under `key`.
    pub fn insert(&mut self, key: String, index: DocIndex) {
        self.indexes.insert(key, index)
    }

    /// Serializes everything and drops both writers.
    pub fn finish(self) -> Result<(), Box<Error>> {
        self.into_inner()?;
        Ok(())
    }

    /// Serializes the map, then the doc indexes, and returns both writers.
    pub fn into_inner(self) -> Result<(W, X), Box<Error>> {
        // FIXME insert a magic number that indicates if the endianness
        //       of the input is the same as the machine that is reading it.

        let MetadataBuilder { map, indexes } = self;

        let mut keys_builder = MapBuilder::new(map)?;
        keys_builder.extend_iter(indexes.keys().map(|(key, index)| (key, *index)))?;
        let map = keys_builder.into_inner()?;

        let indexes = indexes.into_inner()?;

        Ok((map, indexes))
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Inserts `docs` under the key "chameau", serializes the builder and
    /// reads the result back.
    fn roundtrip(docs: &[DocIndex]) -> Metadata {
        let mut builder = MetadataBuilder::new(Vec::new(), Vec::new());
        for doc in docs {
            builder.insert("chameau".into(), *doc);
        }

        let (map, indexes) = builder.into_inner().unwrap();
        Metadata::from_bytes(map, indexes).unwrap()
    }

    #[test]
    fn empty_serialize_deserialize() {
        let metas = roundtrip(&[]);
        assert_eq!(metas.get("chameau"), None);
    }

    #[test]
    fn one_doc_serialize_deserialize() {
        let doc = DocIndex { document_id: 12, attribute: 1, attribute_index: 22 };

        let metas = roundtrip(&[doc]);
        assert_eq!(metas.get("chameau"), Some(&[doc][..]));
    }

    #[test]
    fn multiple_docs_serialize_deserialize() {
        let doc1 = DocIndex { document_id: 12, attribute: 1, attribute_index: 22 };
        let doc2 = DocIndex { document_id: 31, attribute: 0, attribute_index: 1 };

        let metas = roundtrip(&[doc1, doc2]);
        assert_eq!(metas.get("chameau"), Some(&[doc1, doc2][..]));
    }
}

View File

@ -1,329 +0,0 @@
use std::collections::BTreeMap;
use fst::{map, Streamer, Automaton};
use fst::automaton::AlwaysMatch;
use sdset::multi::OpBuilder as SdOpBuilder;
use sdset::{SetOperation, Set};
use crate::metadata::ops_indexed_value::{
OpIndexedValueBuilder, UnionIndexedValue,
};
use crate::metadata::doc_indexes::DocIndexes;
use crate::metadata::Metadata;
use crate::vec_read_only::VecReadOnly;
use crate::DocIndex;
/// Collects metadata to combine with a set operation, each one searched with
/// the same automatons.
pub struct OpBuilder<'m, A: Automaton> {
    // the operation on the maps is always a union.
    maps: OpIndexedValueBuilder<'m>,
    automatons: Vec<A>,
    indexes: Vec<&'m DocIndexes>,
}

impl<'m> OpBuilder<'m, AlwaysMatch> {
    /// Creates a builder using a single `AlwaysMatch` automaton.
    pub fn new() -> Self {
        Self::with_automatons(vec![AlwaysMatch])
    }
}
/// Do a set operation on multiple maps with the same automatons.
impl<'m, A: 'm + Automaton> OpBuilder<'m, A> {
pub fn with_automatons(automatons: Vec<A>) -> Self {
Self {
maps: OpIndexedValueBuilder::new(),
automatons: automatons,
indexes: Vec::new(),
}
}
pub fn add(mut self, metadata: &'m Metadata) -> Self where A: Clone {
self.push(metadata);
self
}
pub fn push(&mut self, metadata: &'m Metadata) where A: Clone {
let mut op = map::OpBuilder::new();
for automaton in self.automatons.iter().cloned() {
let stream = metadata.as_map().search(automaton);
op.push(stream);
}
let stream = op.union();
let indexes = metadata.as_indexes();
self.maps.push(stream);
self.indexes.push(indexes);
}
pub fn union(self) -> Union<'m> {
Union::new(self.maps, self.indexes, self.automatons.len())
}
pub fn intersection(self) -> Intersection<'m> {
Intersection::new(self.maps, self.indexes, self.automatons.len())
}
pub fn difference(self) -> Difference<'m> {
Difference::new(self.maps, self.indexes, self.automatons.len())
}
pub fn symmetric_difference(self) -> SymmetricDifference<'m> {
SymmetricDifference::new(self.maps, self.indexes, self.automatons.len())
}
}
/// Doc indexes produced for one key by one automaton.
#[derive(Debug, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)]
pub struct IndexedDocIndexes {
/// Position, in the builder's automaton list, of the automaton that matched.
pub index: usize,
/// Result of the set operation for that automaton.
pub doc_indexes: VecReadOnly<DocIndex>,
}
/// Location of one automaton's results inside the shared output buffer
/// built while streaming an operation.
struct SlotIndexedDocIndexes {
// Position of the automaton in the builder's automaton list.
index: usize,
// Offset of the first result in the shared buffer.
start: usize,
// Number of results.
len: usize,
}
// Generates a streamer named `$name` that walks the union of all pushed maps
// and, for every key, combines the matching doc indexes with the sdset
// multi-operation `$operation`, separately for each automaton.
macro_rules! logical_operation {
(struct $name:ident, $operation:ident) => {
pub struct $name<'m> {
maps: UnionIndexedValue<'m>,
indexes: Vec<&'m DocIndexes>,
number_automatons: usize,
outs: Vec<IndexedDocIndexes>,
}
impl<'m> $name<'m> {
fn new(maps: OpIndexedValueBuilder<'m>, indexes: Vec<&'m DocIndexes>, number_automatons: usize) -> Self {
$name {
maps: maps.union(),
indexes: indexes,
number_automatons: number_automatons,
outs: Vec::new(),
}
}
}
impl<'m, 'a> fst::Streamer<'a> for $name<'m> {
type Item = (&'a [u8], &'a [IndexedDocIndexes]);
fn next(&'a mut self) -> Option<Self::Item> {
match self.maps.next() {
Some((input, ivalues)) => {
self.outs.clear();
// One map per automaton, associating a reader index with the value
// found for the current key.
let mut builders = vec![BTreeMap::new(); self.number_automatons];
for iv in ivalues {
let builder = &mut builders[iv.aut_index];
builder.insert(iv.rdr_index, iv.value);
}
// All results are appended to one buffer; the slots remember which
// part of it belongs to which automaton.
let mut doc_indexes = Vec::new();
let mut doc_indexes_slots = Vec::with_capacity(builders.len());
for (aut_index, values) in builders.into_iter().enumerate() {
let mut builder = SdOpBuilder::with_capacity(values.len());
for (rdr_index, value) in values {
let indexes = self.indexes[rdr_index].get(value).expect("could not find indexes");
// NOTE(review): `new_unchecked` skips validation of the slice;
// presumably the stored doc indexes are sorted and deduplicated,
// to be confirmed against the builder that wrote them.
let indexes = Set::new_unchecked(indexes);
builder.push(indexes);
}
let start = doc_indexes.len();
builder.$operation().extend_vec(&mut doc_indexes);
let len = doc_indexes.len() - start;
// Automatons without any result get no slot.
if len != 0 {
let slot = SlotIndexedDocIndexes {
index: aut_index,
start: start,
len: len,
};
doc_indexes_slots.push(slot);
}
}
let read_only = VecReadOnly::new(doc_indexes);
self.outs.reserve(doc_indexes_slots.len());
for slot in doc_indexes_slots {
let indexes = IndexedDocIndexes {
index: slot.index,
doc_indexes: read_only.range(slot.start, slot.len),
};
self.outs.push(indexes);
}
// NOTE(review): returning `None` here ends the stream as soon as one
// key produces no result, even if later keys of the union would
// produce some; confirm this is intended rather than skipping to the
// next key.
if self.outs.is_empty() { return None }
Some((input, &self.outs))
},
None => None,
}
}
}
}}
// One streamer per supported set operation.
logical_operation!(struct Union, union);
logical_operation!(struct Intersection, intersection);
logical_operation!(struct Difference, difference);
logical_operation!(struct SymmetricDifference, symmetric_difference);
#[cfg(test)]
mod tests {
    use super::*;
    use crate::metadata::MetadataBuilder;

    /// Scans `stream` for `key` and returns the doc indexes of its first entry.
    fn get_exact_key<'m, I, S>(stream: I, key: &[u8]) -> Option<VecReadOnly<DocIndex>>
    where
        I: for<'a> fst::IntoStreamer<'a, Into=S, Item=(&'a [u8], &'a [IndexedDocIndexes])>,
        S: 'm + for<'a> fst::Streamer<'a, Item=(&'a [u8], &'a [IndexedDocIndexes])>,
    {
        let mut stream = stream.into_stream();
        loop {
            match stream.next() {
                Some((string, indexes)) if string == key => {
                    return Some(indexes[0].doc_indexes.clone())
                },
                Some(_) => continue,
                None => return None,
            }
        }
    }

    /// Builds a metadata holding `docs` under the single key "chameau".
    fn chameau_metadata(docs: &[DocIndex]) -> Metadata {
        let mut builder = MetadataBuilder::new(Vec::new(), Vec::new());
        for doc in docs {
            builder.insert("chameau".into(), *doc);
        }

        let (map, indexes) = builder.into_inner().unwrap();
        Metadata::from_bytes(map, indexes).unwrap()
    }

    #[test]
    fn union_two_metadata() {
        let doc1 = DocIndex { document_id: 12, attribute: 1, attribute_index: 22 };
        let doc2 = DocIndex { document_id: 31, attribute: 0, attribute_index: 1 };

        let meta1 = chameau_metadata(&[doc1]);
        let meta2 = chameau_metadata(&[doc2]);

        let metas = OpBuilder::new().add(&meta1).add(&meta2).union();
        let value = get_exact_key(metas, b"chameau");

        assert_eq!(&*value.unwrap(), &[doc1, doc2][..]);
    }

    #[test]
    fn intersection_two_metadata() {
        let doc1 = DocIndex { document_id: 31, attribute: 0, attribute_index: 1 };
        let doc2 = DocIndex { document_id: 31, attribute: 0, attribute_index: 1 };

        let meta1 = chameau_metadata(&[doc1]);
        let meta2 = chameau_metadata(&[doc2]);

        let metas = OpBuilder::new().add(&meta1).add(&meta2).intersection();
        let value = get_exact_key(metas, b"chameau");

        assert_eq!(&*value.unwrap(), &[doc1][..]);
    }

    #[test]
    fn difference_two_metadata() {
        let doc1 = DocIndex { document_id: 12, attribute: 1, attribute_index: 22 };
        let doc2 = DocIndex { document_id: 31, attribute: 0, attribute_index: 1 };
        let doc3 = DocIndex { document_id: 31, attribute: 0, attribute_index: 1 };

        let meta1 = chameau_metadata(&[doc1, doc2]);
        let meta2 = chameau_metadata(&[doc3]);

        let metas = OpBuilder::new().add(&meta1).add(&meta2).difference();
        let value = get_exact_key(metas, b"chameau");

        assert_eq!(&*value.unwrap(), &[doc1][..]);
    }

    #[test]
    fn symmetric_difference_two_metadata() {
        let doc1 = DocIndex { document_id: 12, attribute: 1, attribute_index: 22 };
        let doc2 = DocIndex { document_id: 31, attribute: 0, attribute_index: 1 };
        let doc3 = DocIndex { document_id: 32, attribute: 0, attribute_index: 1 };
        let doc4 = DocIndex { document_id: 34, attribute: 12, attribute_index: 1 };

        let meta1 = chameau_metadata(&[doc1, doc2, doc3]);
        let meta2 = chameau_metadata(&[doc2, doc3, doc4]);

        let metas = OpBuilder::new().add(&meta1).add(&meta2).symmetric_difference();
        let value = get_exact_key(metas, b"chameau");

        assert_eq!(&*value.unwrap(), &[doc1, doc4][..]);
    }
}

View File

@ -1,203 +0,0 @@
use std::collections::BinaryHeap;
use std::rc::Rc;
use std::cmp;
use fst::raw::{self, Output};
use fst::{self, IntoStreamer, Streamer};
/// Boxed, type-erased stream whose items pair a byte-string input with the
/// slice of `raw::IndexedValue` attached to it; the boxed stream must be
/// valid for `'f`.
type BoxedStream<'f> = Box<for<'a> Streamer<'a, Item=(&'a [u8], &'a [raw::IndexedValue])> + 'f>;
/// Collects streams of indexed values so that they can be merged into a
/// single stream (see [`OpIndexedValueBuilder::union`]).
///
/// `Default` is derived so that the zero-argument constructor also has its
/// conventional trait counterpart; both produce a builder without streams.
#[derive(Default)]
pub struct OpIndexedValueBuilder<'f> {
    /// Streams pushed so far, in push order. The position of a stream in
    /// this vector is what the merged output reports as `rdr_index`.
    streams: Vec<BoxedStream<'f>>,
}

impl<'f> OpIndexedValueBuilder<'f> {
    /// Creates a builder that holds no stream yet.
    pub fn new() -> Self {
        Self::default()
    }

    /// Registers one more stream to merge.
    ///
    /// `stream` is converted with `into_stream` and boxed, so any type that
    /// yields `(&[u8], &[raw::IndexedValue])` items can be pushed.
    pub fn push<I, S>(&mut self, stream: I)
    where
        I: for<'a> IntoStreamer<'a, Into=S, Item=(&'a [u8], &'a [raw::IndexedValue])>,
        S: 'f + for<'a> Streamer<'a, Item=(&'a [u8], &'a [raw::IndexedValue])>,
    {
        let boxed: BoxedStream<'f> = Box::new(stream.into_stream());
        self.streams.push(boxed);
    }

    /// Consumes the builder and returns the union of all pushed streams.
    pub fn union(self) -> UnionIndexedValue<'f> {
        UnionIndexedValue {
            heap: StreamIndexedValueHeap::new(self.streams),
            outs: Vec::new(),
            cur_slot: None,
        }
    }
}
/// Stream over the union of several indexed-value streams.
///
/// Each item is an input together with every `IndexedValue` found for that
/// input across the merged streams.
pub struct UnionIndexedValue<'f> {
    /// Heap holding the pending entry of every underlying stream.
    heap: StreamIndexedValueHeap<'f>,
    /// Output buffer, cleared and refilled on every call to `next`.
    outs: Vec<IndexedValue>,
    /// Entry whose input was handed out by the previous call to `next`; its
    /// stream is only advanced on the following call because the returned
    /// item borrows from it.
    cur_slot: Option<SlotIndexedValue>,
}

impl<'f> UnionIndexedValue<'f> {
    /// Number of streams taking part in the union.
    pub fn len(&self) -> usize {
        self.heap.num_slots()
    }
}

impl<'a, 'm> fst::Streamer<'a> for UnionIndexedValue<'m> {
    type Item = (&'a [u8], &'a [IndexedValue]);

    fn next(&'a mut self) -> Option<Self::Item> {
        // Advance the stream behind the entry returned last time.
        if let Some(previous) = self.cur_slot.take() {
            self.heap.refill(previous);
        }

        // Park the next entry in `self` so that the returned input can
        // borrow from it; an exhausted heap ends the stream.
        self.cur_slot = Some(self.heap.pop()?);
        let current = self.cur_slot.as_ref()?;

        self.outs.clear();
        self.outs.push(current.indexed_value());

        // Gather every other pending entry that carries the same input.
        while let Some(duplicate) = self.heap.pop_if_equal(current.input()) {
            self.outs.push(duplicate.indexed_value());
            self.heap.refill(duplicate);
        }

        Some((current.input(), &self.outs))
    }
}
/// Merges several streams by keeping their pending entries in a heap.
struct StreamIndexedValueHeap<'f> {
    /// The merged streams; an entry's `rdr_index` is an index into this vector.
    rdrs: Vec<BoxedStream<'f>>,
    /// Pending entries; `SlotIndexedValue` reverses its ordering so that the
    /// smallest input is popped first.
    heap: BinaryHeap<SlotIndexedValue>,
}

impl<'f> StreamIndexedValueHeap<'f> {
    /// Takes ownership of `streams` and loads the first item of each one.
    fn new(streams: Vec<BoxedStream<'f>>) -> StreamIndexedValueHeap<'f> {
        let reader_count = streams.len();
        let mut merged = StreamIndexedValueHeap {
            rdrs: streams,
            heap: BinaryHeap::new(),
        };
        for rdr_index in 0..reader_count {
            merged.refill(SlotIndexedValue::new(rdr_index));
        }
        merged
    }

    /// Removes and returns the entry at the top of the heap, if any.
    fn pop(&mut self) -> Option<SlotIndexedValue> {
        self.heap.pop()
    }

    /// Tells whether the entry at the top of the heap has exactly `key` as input.
    fn peek_is_duplicate(&self, key: &[u8]) -> bool {
        match self.heap.peek() {
            Some(top) => top.input() == key,
            None => false,
        }
    }

    /// Pops the top entry only when its input equals `key`.
    fn pop_if_equal(&mut self, key: &[u8]) -> Option<SlotIndexedValue> {
        if !self.peek_is_duplicate(key) {
            return None;
        }
        self.pop()
    }

    /// Pops the top entry only when its input is lower than or equal to `key`.
    fn pop_if_le(&mut self, key: &[u8]) -> Option<SlotIndexedValue> {
        let top_is_le = match self.heap.peek() {
            Some(top) => top.input() <= key,
            None => false,
        };
        if top_is_le {
            self.pop()
        } else {
            None
        }
    }

    /// Number of merged streams.
    fn num_slots(&self) -> usize {
        self.rdrs.len()
    }

    /// Reads the next item of the stream `slot` came from and pushes one
    /// heap entry per indexed value of that item.
    ///
    /// Nothing is pushed when the stream is exhausted.
    fn refill(&mut self, mut slot: SlotIndexedValue) {
        let (input, ivalues) = match self.rdrs[slot.rdr_index].next() {
            Some(item) => item,
            None => return,
        };

        slot.set_input(input);
        for ivalue in ivalues {
            slot.set_aut_index(ivalue.index);
            slot.set_output(ivalue.value);
            // `slot` is reused as a template: every value gets its own copy.
            self.heap.push(slot.clone());
        }
    }
}
/// Heap entry of `StreamIndexedValueHeap`: one indexed value read from one
/// of the merged streams, together with the input it was read under.
#[derive(Debug, Clone)]
struct SlotIndexedValue {
// Position of the originating stream in `StreamIndexedValueHeap::rdrs`.
rdr_index: usize,
// Copied from the `index` field of the value read from the stream.
aut_index: usize,
// Input bytes the value was read under; reference-counted, so cloning an
// entry shares the buffer until `set_input` has to modify it.
input: Rc<Vec<u8>>,
// The `value` field of the value read from the stream, wrapped in `Output`.
output: Output,
}
/// One value yielded by `UnionIndexedValue`, tagged with where it comes from.
#[derive(Debug)]
pub struct IndexedValue {
// Position, in push order, of the stream that produced the value.
pub rdr_index: usize,
// The `index` that was attached to the value by the stream that produced it.
pub aut_index: usize,
// The raw value itself.
pub value: u64,
}
/// Two entries are equal when input, stream, automaton index and output all match.
impl PartialEq for SlotIndexedValue {
    fn eq(&self, other: &Self) -> bool {
        self.input == other.input
            && self.rdr_index == other.rdr_index
            && self.aut_index == other.aut_index
            && self.output == other.output
    }
}

impl Eq for SlotIndexedValue {}

impl PartialOrd for SlotIndexedValue {
    fn partial_cmp(&self, other: &Self) -> Option<cmp::Ordering> {
        Some(self.cmp(other))
    }
}

/// Entries are ordered by `(input, rdr_index, aut_index, output)`, reversed:
/// `BinaryHeap` is a max-heap, so reversing makes it pop the smallest first.
impl Ord for SlotIndexedValue {
    fn cmp(&self, other: &Self) -> cmp::Ordering {
        let mine = (&self.input, self.rdr_index, self.aut_index, self.output);
        let theirs = (&other.input, other.rdr_index, other.aut_index, other.output);
        // Operands are swapped on purpose: this is the reversed comparison.
        theirs.cmp(&mine)
    }
}
impl SlotIndexedValue {
    /// Creates an empty entry bound to the stream at position `rdr_index`.
    fn new(rdr_index: usize) -> SlotIndexedValue {
        // Pre-sized input buffer, reused across refills.
        let input = Rc::new(Vec::with_capacity(64));
        SlotIndexedValue {
            rdr_index,
            aut_index: 0,
            input,
            output: Output::zero(),
        }
    }

    /// Public view of this entry: where it comes from and its raw value.
    fn indexed_value(&self) -> IndexedValue {
        let value = self.output.value();
        IndexedValue {
            rdr_index: self.rdr_index,
            aut_index: self.aut_index,
            value,
        }
    }

    /// Input bytes this entry was read under.
    fn input(&self) -> &[u8] {
        &self.input
    }

    fn set_aut_index(&mut self, aut_index: usize) {
        self.aut_index = aut_index;
    }

    /// Replaces the stored input with `input`.
    ///
    /// The buffer is left untouched when it already holds `input`, which
    /// avoids un-sharing a reference-counted buffer for nothing.
    fn set_input(&mut self, input: &[u8]) {
        if *self.input == input {
            return;
        }
        let buffer = Rc::make_mut(&mut self.input);
        buffer.clear();
        buffer.extend_from_slice(input);
    }

    fn set_output(&mut self, output: u64) {
        self.output = Output::new(output);
    }
}

View File

@ -1,309 +0,0 @@
use std::rc::Rc;
use std::collections::{BinaryHeap, HashMap, BTreeMap};
use std::cmp;
use fst::{IntoStreamer, Streamer};
use sdset::multi::OpBuilder as SdOpBuilder;
use sdset::{SetOperation, Set};
use crate::metadata::ops::IndexedDocIndexes;
use crate::vec_read_only::VecReadOnly;
use crate::DocIndex;
/// Boxed, type-erased stream whose items pair a byte-string input with the
/// slice of `IndexedDocIndexes` attached to it; the boxed stream must be
/// valid for `'f`.
type BoxedStream<'f> = Box<for<'a> Streamer<'a, Item=(&'a [u8], &'a [IndexedDocIndexes])> + 'f>;
/// Collects streams of `IndexedDocIndexes` and turns them into a single
/// stream that applies a set operation to the document indexes found under
/// each input.
///
/// `Default` is derived so that the zero-argument constructor also has its
/// conventional trait counterpart; both produce a builder without streams.
#[derive(Default)]
pub struct OpBuilder<'f> {
    /// Streams pushed so far, in push order. The position of a stream in
    /// this vector is the `rdr_index` used to order the operands of the
    /// set operation.
    streams: Vec<BoxedStream<'f>>,
}

impl<'f> OpBuilder<'f> {
    /// Creates a builder that holds no stream yet.
    pub fn new() -> Self {
        Self::default()
    }

    /// Push a stream of `IndexedDocIndexes`.
    ///
    /// # Warning
    ///
    /// You must ensure yourself that the automatons are
    /// all the same in the same order for each stream you push.
    pub fn push<I, S>(&mut self, stream: I)
    where
        I: for<'a> IntoStreamer<'a, Into=S, Item=(&'a [u8], &'a [IndexedDocIndexes])>,
        S: 'f + for<'a> Streamer<'a, Item=(&'a [u8], &'a [IndexedDocIndexes])>,
    {
        let boxed: BoxedStream<'f> = Box::new(stream.into_stream());
        self.streams.push(boxed);
    }

    /// Consumes the builder and streams the union of the pushed streams.
    pub fn union(self) -> Union<'f> {
        Union {
            heap: StreamHeap::new(self.streams),
            outs: Vec::new(),
            cur_slot: None,
        }
    }

    /// Consumes the builder and streams the intersection of the pushed streams.
    pub fn intersection(self) -> Intersection<'f> {
        Intersection {
            heap: StreamHeap::new(self.streams),
            outs: Vec::new(),
            cur_slot: None,
        }
    }

    /// Consumes the builder and streams the difference of the pushed streams,
    /// the streams being taken in push order.
    pub fn difference(self) -> Difference<'f> {
        Difference {
            heap: StreamHeap::new(self.streams),
            outs: Vec::new(),
            cur_slot: None,
        }
    }

    /// Consumes the builder and streams the symmetric difference of the
    /// pushed streams.
    pub fn symmetric_difference(self) -> SymmetricDifference<'f> {
        SymmetricDifference {
            heap: StreamHeap::new(self.streams),
            outs: Vec::new(),
            cur_slot: None,
        }
    }
}
// FIXME reuse it from metadata::ops
/// Locates the result computed for one automaton inside the shared
/// `doc_indexes` buffer built by `logical_operation!`.
struct SlotIndexedDocIndexes {
// Automaton index the result belongs to.
aut_index: usize,
// Offset of the first document index of the result in the shared buffer.
start: usize,
// Number of document indexes in the result.
len: usize,
}
/// Generates a stream type named `$name` that merges several streams of
/// `IndexedDocIndexes` and, for every input, combines the document indexes
/// of the streams with the `sdset` multi-set operation `$operation`.
macro_rules! logical_operation {
    (struct $name:ident, $operation:ident) => {
        /// Stream produced by the `OpBuilder` method of the same operation.
        pub struct $name<'f> {
            /// Heap holding the pending entries of every underlying stream.
            heap: StreamHeap<'f>,
            /// Output buffer, cleared and refilled for every input.
            outs: Vec<IndexedDocIndexes>,
            /// Entry whose input was handed out by the previous call to
            /// `next`; its stream is only advanced on the following call
            /// because the returned item borrows from it.
            cur_slot: Option<Slot>,
        }

        impl<'a, 'f> Streamer<'a> for $name<'f> {
            type Item = (&'a [u8], &'a [IndexedDocIndexes]);

            // The Metadata could be types as "key-values present" and "key-values possibly not present"
            // in other words Metadata that "needs" to have key-values and other that doesn't needs.
            //
            // We could probably allow the user to define in Metadata some Document
            // that needs to be deleted and only declare the DocumentId, and not every DocIndex of each words.

            /// Returns the next input for which the operation yields at
            /// least one document index, or `None` once every underlying
            /// stream is exhausted.
            fn next(&'a mut self) -> Option<Self::Item> {
                // An input whose result is empty must be skipped, not reported
                // as the end of the stream: `None` means "no more items", so
                // returning it early would drop all the remaining inputs.
                loop {
                    // Advance the stream behind the entry handled last time.
                    if let Some(slot) = self.cur_slot.take() {
                        self.heap.refill(slot);
                    }

                    // The heap is empty only when all the streams are exhausted.
                    let mut slot = self.heap.pop()?;

                    self.outs.clear();

                    // retrieve all the doc_indexes of all the streams,
                    // store them in an HashMap which the key is
                    // the aut_index (associated with the state that is ignored),
                    // the doc_indexes must be stored in another BTreeMap which the key
                    // is the rdr_index.
                    //
                    // This will permit us to do set operations on readers (using the rdr_index)
                    // the BTreeMap will gives the rdr_index in order and the final result
                    // will be aggregated in a Vec of IndexedDocIndexes which the aut_index and state
                    // are the key of the first HashMap

                    // TODO use the fnv Hasher!
                    let mut builders = HashMap::new();

                    let iv = slot.indexed_value();
                    let builder = builders.entry(iv.index).or_insert_with(BTreeMap::new);
                    builder.insert(slot.rdr_index, iv.doc_indexes);

                    // Gather every other pending entry that carries the same input.
                    while let Some(mut other) = self.heap.pop_if_equal(slot.input()) {
                        let iv = other.indexed_value();
                        let builder = builders.entry(iv.index).or_insert_with(BTreeMap::new);
                        builder.insert(other.rdr_index, iv.doc_indexes);
                        self.heap.refill(other);
                    }

                    // now that we have accumulated all the doc_indexes like so:
                    // HashMap<(aut_index, state*), BtreeMap<rdr_index, doc_indexes>>
                    // we will be able to retrieve, for each aut_index, the doc_indexes
                    // that are needed to do the set operation
                    //
                    // NOTE(review): a reader that does not contain this input
                    // contributes no set at all rather than an empty one, so the
                    // operation only sees the readers that have the input.
                    // Confirm this is intended for intersection and difference.

                    let mut doc_indexes = Vec::new();
                    let mut doc_indexes_slots = Vec::with_capacity(builders.len());
                    for (aut_index, values) in builders {
                        let sets = values.iter().map(|(_, v)| Set::new_unchecked(v.as_slice())).collect();
                        let builder = SdOpBuilder::from_vec(sets);

                        let start = doc_indexes.len();
                        builder.$operation().extend_vec(&mut doc_indexes);
                        let len = doc_indexes.len() - start;
                        // An automaton without any result is left out of the output.
                        if len == 0 { continue }

                        let slot = SlotIndexedDocIndexes {
                            aut_index: aut_index,
                            start: start,
                            len: len,
                        };
                        doc_indexes_slots.push(slot);
                    }

                    // All the results share one read-only buffer; each output
                    // only refers to its own range of it.
                    let read_only = VecReadOnly::new(doc_indexes);
                    self.outs.reserve(doc_indexes_slots.len());
                    for slot in doc_indexes_slots {
                        let indexes = IndexedDocIndexes {
                            index: slot.aut_index,
                            doc_indexes: read_only.range(slot.start, slot.len),
                        };
                        self.outs.push(indexes);
                    }

                    // Park the entry in `self`: the returned input borrows from
                    // it, and its stream is advanced at the top of the loop.
                    self.cur_slot = Some(slot);

                    if !self.outs.is_empty() { break }
                }

                let slot = self.cur_slot.as_ref()?;
                Some((slot.input(), &self.outs))
            }
        }
    }
}
// One stream type per set operation; the second argument is the name of the
// method called on the `sdset` multi-set `OpBuilder` to combine the sets.
logical_operation!(struct Union, union);
logical_operation!(struct Intersection, intersection);
logical_operation!(struct Difference, difference);
logical_operation!(struct SymmetricDifference, symmetric_difference);
/// Merges several streams by keeping their pending entries in a heap.
struct StreamHeap<'f> {
    /// The merged streams; an entry's `rdr_index` is an index into this vector.
    rdrs: Vec<BoxedStream<'f>>,
    /// Pending entries; `Slot` reverses its ordering so that the smallest
    /// input is popped first.
    heap: BinaryHeap<Slot>,
}

impl<'f> StreamHeap<'f> {
    /// Takes ownership of `streams` and loads the first item of each one.
    fn new(streams: Vec<BoxedStream<'f>>) -> StreamHeap<'f> {
        let reader_count = streams.len();
        let mut merged = StreamHeap {
            rdrs: streams,
            heap: BinaryHeap::new(),
        };
        for rdr_index in 0..reader_count {
            merged.refill(Slot::new(rdr_index));
        }
        merged
    }

    /// Removes and returns the entry at the top of the heap, if any.
    fn pop(&mut self) -> Option<Slot> {
        self.heap.pop()
    }

    /// Tells whether the entry at the top of the heap has exactly `key` as input.
    fn peek_is_duplicate(&self, key: &[u8]) -> bool {
        match self.heap.peek() {
            Some(top) => top.input() == key,
            None => false,
        }
    }

    /// Pops the top entry only when its input equals `key`.
    fn pop_if_equal(&mut self, key: &[u8]) -> Option<Slot> {
        if !self.peek_is_duplicate(key) {
            return None;
        }
        self.pop()
    }

    /// Pops the top entry only when its input is lower than or equal to `key`.
    fn pop_if_le(&mut self, key: &[u8]) -> Option<Slot> {
        let top_is_le = match self.heap.peek() {
            Some(top) => top.input() <= key,
            None => false,
        };
        if top_is_le {
            self.pop()
        } else {
            None
        }
    }

    /// Number of merged streams.
    fn num_slots(&self) -> usize {
        self.rdrs.len()
    }

    /// Reads the next item of the stream `slot` came from and pushes one
    /// heap entry per `IndexedDocIndexes` of that item.
    ///
    /// Nothing is pushed when the stream is exhausted.
    fn refill(&mut self, mut slot: Slot) {
        let (input, outputs) = match self.rdrs[slot.rdr_index].next() {
            Some(item) => item,
            None => return,
        };

        slot.set_input(input);
        for output in outputs {
            slot.set_aut_index(output.index);
            slot.set_output(output.doc_indexes.clone());
            // `slot` is reused as a template: every output gets its own copy.
            self.heap.push(slot.clone());
        }
    }
}
/// Heap entry of `StreamHeap`: the document indexes read from one of the
/// merged streams for one automaton, together with the input they were
/// read under.
#[derive(Debug, Clone)]
struct Slot {
// Position of the originating stream in `StreamHeap::rdrs`.
rdr_index: usize,
// Copied from the `index` field of the item read from the stream.
aut_index: usize,
// Input bytes the item was read under; reference-counted, so cloning an
// entry shares the buffer until `set_input` has to modify it.
input: Rc<Vec<u8>>,
// Document indexes of the item; `None` until `set_output` is called and
// again after `indexed_value` has taken them.
output: Option<VecReadOnly<DocIndex>>,
}
/// Two entries are equal when input, stream and automaton index all match;
/// the attached document indexes are not compared.
impl PartialEq for Slot {
    fn eq(&self, other: &Self) -> bool {
        self.input == other.input
            && self.rdr_index == other.rdr_index
            && self.aut_index == other.aut_index
    }
}

impl Eq for Slot {}

impl PartialOrd for Slot {
    fn partial_cmp(&self, other: &Self) -> Option<cmp::Ordering> {
        Some(self.cmp(other))
    }
}

/// Entries are ordered by `(input, rdr_index, aut_index)`, reversed:
/// `BinaryHeap` is a max-heap, so reversing makes it pop the smallest first.
impl Ord for Slot {
    fn cmp(&self, other: &Self) -> cmp::Ordering {
        let mine = (&self.input, self.rdr_index, self.aut_index);
        let theirs = (&other.input, other.rdr_index, other.aut_index);
        // Operands are swapped on purpose: this is the reversed comparison.
        theirs.cmp(&mine)
    }
}
impl Slot {
    /// Creates an empty entry bound to the stream at position `rdr_index`.
    fn new(rdr_index: usize) -> Self {
        // Pre-sized input buffer, reused across refills.
        let input = Rc::new(Vec::with_capacity(64));
        Slot {
            rdr_index,
            aut_index: 0,
            input,
            output: None,
        }
    }

    /// Moves the document indexes out of this entry, tagged with its
    /// automaton index.
    ///
    /// # Panics
    ///
    /// Panics when the entry holds no output, that is before `set_output`
    /// was called or when the output has already been taken.
    fn indexed_value(&mut self) -> IndexedDocIndexes {
        let doc_indexes = self.output.take().unwrap();
        IndexedDocIndexes {
            index: self.aut_index,
            doc_indexes,
        }
    }

    /// Input bytes this entry was read under.
    fn input(&self) -> &[u8] {
        &self.input
    }

    /// Replaces the stored input with `input`.
    ///
    /// The buffer is left untouched when it already holds `input`, which
    /// avoids un-sharing a reference-counted buffer for nothing.
    fn set_input(&mut self, input: &[u8]) {
        if *self.input == input {
            return;
        }
        let buffer = Rc::make_mut(&mut self.input);
        buffer.clear();
        buffer.extend_from_slice(input);
    }

    fn set_aut_index(&mut self, aut_index: usize) {
        self.aut_index = aut_index;
    }

    fn set_output(&mut self, output: VecReadOnly<DocIndex>) {
        self.output = Some(output);
    }
}
/// Plain pair of an index and a raw value.
// NOTE(review): nothing in the visible part of this file uses this type and,
// despite its name, it carries no state field — confirm it is still needed.
#[derive(Copy, Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub struct IndexedValueWithState {
pub index: usize,
pub value: u64,
}

View File

@ -1,28 +0,0 @@
use std::error::Error;
use crate::automaton;
use crate::rank::Document;
use crate::index::Index;
pub struct Pentium {
index: Index,
}
impl Pentium {
pub fn from_index(index: Index) -> Result<Self, Box<Error>> {
unimplemented!()
}
pub fn search(&self, query: &str) -> Vec<Document> {
let mut automatons = Vec::new();
for word in query.split_whitespace().map(str::to_lowercase) {
let dfa = automaton::build_prefix_dfa(&word);
automatons.push(dfa);
}
let stream = unimplemented!();
unimplemented!()
}
}