From a49a21ac15ef5ab46e62458fd6cba63fa5c016f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sun, 9 Sep 2018 13:35:12 +0200 Subject: [PATCH] feat: Remove the State from most of the code --- Cargo.lock | 9 +- raptor-indexer/src/main.rs | 38 +- raptor-search/src/main.rs | 5 +- raptor/Cargo.toml | 1 + raptor/src/automaton.rs | 43 ++- raptor/src/lib.rs | 7 +- raptor/src/metadata.rs | 425 ----------------------- raptor/src/metadata/doc_indexes.rs | 203 +++++++++++ raptor/src/metadata/mod.rs | 138 ++++++++ raptor/src/metadata/ops.rs | 332 ++++++++++++++++++ raptor/src/metadata/ops_indexed_value.rs | 197 +++++++++++ raptor/src/rank/mod.rs | 32 +- raptor/src/vec_read_only.rs | 44 +++ 13 files changed, 983 insertions(+), 491 deletions(-) delete mode 100644 raptor/src/metadata.rs create mode 100644 raptor/src/metadata/doc_indexes.rs create mode 100644 raptor/src/metadata/mod.rs create mode 100644 raptor/src/metadata/ops.rs create mode 100644 raptor/src/metadata/ops_indexed_value.rs create mode 100644 raptor/src/vec_read_only.rs diff --git a/Cargo.lock b/Cargo.lock index 7e9ff0d16..c1c512331 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -73,7 +73,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] name = "fst" version = "0.3.2" -source = "git+https://github.com/Kerollmops/fst.git?branch=automaton-for-deref#ca3a1ebb60a6f9123f1284de380c7a5fc05d16bb" +source = "git+https://github.com/Kerollmops/fst.git?branch=automaton-for-deref#6897dbe3b97772b7056279dd5a5d7088831b4cf0" dependencies = [ "byteorder 1.2.4 (registry+https://github.com/rust-lang/crates.io-index)", "memmap 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)", @@ -238,6 +238,7 @@ dependencies = [ "lazy_static 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", "levenshtein_automata 0.1.1 (git+https://github.com/Kerollmops/levenshtein-automata.git?branch=new-custom-fst)", "rocksdb 0.3.0 (git+https://github.com/pingcap/rust-rocksdb.git)", + "sdset 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -278,6 +279,11 @@ name = "ryu" version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "sdset" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "serde" version = "1.0.75" @@ -407,6 +413,7 @@ dependencies = [ "checksum rand 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)" = "8356f47b32624fef5b3301c1be97e5944ecdd595409cc5da11d05f211db6cfbd" "checksum rocksdb 0.3.0 (git+https://github.com/pingcap/rust-rocksdb.git)" = "" "checksum ryu 0.2.5 (registry+https://github.com/rust-lang/crates.io-index)" = "e7c066b8e2923f05d4718a06d2622f189ff362bc642bfade6c6629f0440f3827" +"checksum sdset 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "d51ad726aa3a9c4d777b35be3a4d6d5f9d6cbc0978e81c7d690d31192f263843" "checksum serde 1.0.75 (registry+https://github.com/rust-lang/crates.io-index)" = "22d340507cea0b7e6632900a176101fea959c7065d93ba555072da90aaaafc87" "checksum serde_derive 1.0.75 (registry+https://github.com/rust-lang/crates.io-index)" = "234fc8b737737b148ccd625175fc6390f5e4dacfdaa543cb93a3430d984a9119" "checksum serde_json 1.0.26 (registry+https://github.com/rust-lang/crates.io-index)" = "44dd2cfde475037451fa99b7e5df77aa3cfd1536575fa8e7a538ab36dcde49ae" diff --git a/raptor-indexer/src/main.rs b/raptor-indexer/src/main.rs index f99d68383..ad6304341 100644 --- a/raptor-indexer/src/main.rs +++ b/raptor-indexer/src/main.rs @@ -3,13 
+3,12 @@ #[macro_use] extern crate serde_derive; -use std::path::Path; use std::collections::{HashSet, BTreeMap}; -use std::fs::{self, File}; -use std::io::{self, BufReader, BufRead}; +use std::fs::File; +use std::io::{BufReader, BufRead}; use std::iter; -use raptor::{MetadataBuilder, Metadata, DocIndex}; +use raptor::{MetadataBuilder, DocIndex}; use rocksdb::{SstFileWriter, EnvOptions, ColumnFamilyOptions}; use serde_json::from_str; use unidecode::unidecode; @@ -21,18 +20,6 @@ struct Product { ft: String, } -fn set_readonly<P>(path: P, readonly: bool) -> io::Result<()> -where P: AsRef<Path> -{ - let mut perms = fs::metadata(&path)?.permissions(); - perms.set_readonly(readonly); - fs::set_permissions(&path, perms) -} - -fn is_readonly<P: AsRef<Path>>(path: P) -> io::Result<bool> { - fs::metadata(&path).map(|m| m.permissions().readonly()) -} - fn main() { let data = File::open("products.json_lines").unwrap(); let data = BufReader::new(data); @@ -62,15 +49,6 @@ fn main() { let idx_file = format!("{}.idx", random_name); let sst_file = format!("{}.sst", random_name); - for file in &[&map_file, &idx_file, &sst_file] { - match is_readonly(file) { - Ok(true) => panic!("the {:?} file is readonly, please make it writeable", file), - Err(ref e) if e.kind() == io::ErrorKind::NotFound => (), - Err(e) => panic!("{:?}", e), - Ok(false) => (), - } - } - let env_options = EnvOptions::new(); let cf_options = ColumnFamilyOptions::new(); let mut sst_file_writer = SstFileWriter::new(env_options, cf_options); @@ -128,13 +106,5 @@ fn main() { builder.finish().unwrap(); - println!("Successfully created files: {}, {}, {}", map_file, idx_file, sst_file); - - set_readonly(&map_file, true).unwrap(); - set_readonly(&idx_file, true).unwrap(); - set_readonly(&sst_file, true).unwrap(); - - println!("Checking the dump consistency..."); - unsafe { Metadata::from_paths(map_file, idx_file).unwrap() }; - // TODO do it better! + println!("Successfully created {:?} dump.", random_name); }
diff --git a/raptor-search/src/main.rs b/raptor-search/src/main.rs index 1262d1c62..9ab993c60 100644 --- a/raptor-search/src/main.rs +++ b/raptor-search/src/main.rs @@ -13,10 +13,7 @@ fn search(metadata: &Metadata, database: &DB, query: &str) { automatons.push(lev); } - let map = metadata.as_map(); - let indexes = metadata.as_indexes(); - - let mut stream = RankedStream::new(&map, &indexes, automatons, 20); + let mut stream = RankedStream::new(&metadata, automatons, 20); while let Some(document) = stream.next() { print!("{:?}", document.document_id);
diff --git a/raptor/Cargo.toml b/raptor/Cargo.toml index c1b83a6d9..74c2b4551 100644 --- a/raptor/Cargo.toml +++ b/raptor/Cargo.toml @@ -9,6 +9,7 @@ authors = ["Kerollmops "] [dependencies] byteorder = "1.2" fnv = "1.0" +sdset = "0.2" lazy_static = "1.1" [dependencies.fst]
diff --git a/raptor/src/automaton.rs b/raptor/src/automaton.rs index c91b729e2..b3815d301 100644 --- a/raptor/src/automaton.rs +++ b/raptor/src/automaton.rs @@ -16,11 +16,37 @@ pub struct DfaExt { automaton: DFA, } -impl Deref for DfaExt { - type Target = DFA; +impl Automaton for DfaExt { + type State = <DFA as Automaton>::State; - fn deref(&self) -> &Self::Target { - &self.automaton + fn start(&self) -> Self::State { + self.automaton.start() + } + + fn is_match(&self, state: &Self::State) -> bool { + self.automaton.is_match(state) + } + + fn can_match(&self, state: &Self::State) -> bool { + self.automaton.can_match(state) + } + + fn will_always_match(&self, state: &Self::State) -> bool { + self.automaton.will_always_match(state) + } + + fn accept(&self, state: &Self::State, byte: u8) -> Self::State { + self.automaton.accept(state, byte) + } +} + +impl AutomatonExt for DfaExt { + fn eval<B: AsRef<[u8]>>(&self, s: B) -> Distance { + self.automaton.eval(s) + } + + fn query_len(&self) -> usize { + self.query_len + } } @@ -39,12 +65,15 @@ pub trait AutomatonExt: Automaton { fn query_len(&self) -> usize; } -impl AutomatonExt for DfaExt { +impl<T> AutomatonExt for T +where T: Deref, + T::Target: AutomatonExt, +{ fn eval<B: AsRef<[u8]>>(&self, s: B) -> Distance { - self.automaton.eval(s) + (**self).eval(s) } fn query_len(&self) -> usize
{ - self.query_len + (**self).query_len() } } diff --git a/raptor/src/lib.rs b/raptor/src/lib.rs index 269495118..2786d0ca8 100644 --- a/raptor/src/lib.rs +++ b/raptor/src/lib.rs @@ -2,13 +2,14 @@ pub mod rank; pub mod metadata; +pub mod vec_read_only; pub mod automaton; pub use self::metadata::{ Metadata, MetadataBuilder, - Stream, StreamBuilder, - Union, OpBuilder, - IndexedValues, + // Stream, StreamBuilder, + // Union, OpBuilder, + // IndexedValues, }; pub use self::rank::RankedStream; diff --git a/raptor/src/metadata.rs b/raptor/src/metadata.rs deleted file mode 100644 index 9be54e268..000000000 --- a/raptor/src/metadata.rs +++ /dev/null @@ -1,425 +0,0 @@ -use std::sync::Arc; -use std::ops::Deref; -use std::error::Error; -use std::path::Path; -use std::collections::btree_map::{Entry, BTreeMap}; -use std::slice::from_raw_parts; -use std::io::{self, Write}; -use std::mem; -use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; -use fst::{self, Map, MapBuilder, Automaton}; -use fst::raw::MmapReadOnly; -use crate::DocIndex; - -#[repr(C)] -struct Range { - start: u64, - end: u64, -} - -#[derive(Clone)] -enum DocIndexesData { - Shared { - vec: Arc>, - offset: usize, - len: usize, - }, - Mmap(MmapReadOnly), -} - -impl Deref for DocIndexesData { - type Target = [u8]; - - fn deref(&self) -> &Self::Target { - match self { - DocIndexesData::Shared { vec, offset, len } => { - &vec[*offset..offset + len] - }, - DocIndexesData::Mmap(m) => m.as_slice(), - } - } -} - -#[derive(Clone)] -pub struct DocIndexes { - ranges: DocIndexesData, - indexes: DocIndexesData, -} - -impl DocIndexes { - pub unsafe fn from_path>(path: P) -> io::Result { - let mmap = MmapReadOnly::open_path(path)?; - - let range_len = mmap.as_slice().read_u64::()?; - let range_len = range_len as usize * mem::size_of::(); - - let offset = mem::size_of::() as usize; - let ranges = DocIndexesData::Mmap(mmap.range(offset, range_len)); - - let len = mmap.len() - range_len - offset; - let offset = offset + range_len; - let indexes = DocIndexesData::Mmap(mmap.range(offset, len)); - - Ok(DocIndexes { ranges, indexes }) - } - - pub fn from_bytes(vec: Vec) -> io::Result { - let vec = Arc::new(vec); - - let range_len = vec.as_slice().read_u64::()?; - let range_len = range_len as usize * mem::size_of::(); - - let offset = mem::size_of::() as usize; - let ranges = DocIndexesData::Shared { - vec: vec.clone(), - offset, - len: range_len - }; - - let len = vec.len() - range_len - offset; - let offset = offset + range_len; - let indexes = DocIndexesData::Shared { vec, offset, len }; - - Ok(DocIndexes { ranges, indexes }) - } - - pub fn get(&self, index: u64) -> Option<&[DocIndex]> { - self.ranges().get(index as usize).map(|Range { start, end }| { - let start = *start as usize; - let end = *end as usize; - &self.indexes()[start..end] - }) - } - - fn ranges(&self) -> &[Range] { - let slice = &self.ranges; - let ptr = slice.as_ptr() as *const Range; - let len = slice.len() / mem::size_of::(); - unsafe { from_raw_parts(ptr, len) } - } - - fn indexes(&self) -> &[DocIndex] { - let slice = &self.indexes; - let ptr = slice.as_ptr() as *const DocIndex; - let len = slice.len() / mem::size_of::(); - unsafe { from_raw_parts(ptr, len) } - } -} - -pub struct Metadata { - map: Map, - indexes: DocIndexes, -} - -impl Metadata { - pub unsafe fn from_paths(map: P, indexes: Q) -> Result> - where P: AsRef, - Q: AsRef, - { - let map = Map::from_path(map)?; - let indexes = DocIndexes::from_path(indexes)?; - Ok(Metadata::from_raw(map, indexes)) - } - - pub fn 
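The automaton.rs hunk above replaces the `Deref<Target = DFA>` impl on `DfaExt` with an explicit `Automaton` forwarding impl, and adds a blanket `AutomatonExt` impl over anything that derefs to an implementor, which is what lets the rankers keep passing `Rc<DfaExt>` around. A minimal, self-contained sketch of that blanket-impl-over-`Deref` pattern (toy trait and type names, not the crate's real ones):

```rust
use std::ops::Deref;
use std::rc::Rc;

// Stand-in for AutomatonExt: an extension trait we want to be
// usable through smart pointers without wrapping or cloning.
trait QueryLen {
    fn query_len(&self) -> usize;
}

struct Dfa { query: String }

impl QueryLen for Dfa {
    fn query_len(&self) -> usize {
        self.query.len()
    }
}

// Blanket impl: anything that derefs to a QueryLen is one too,
// so Rc<Dfa>, Box<Dfa>, &Dfa all qualify.
impl<T> QueryLen for T
where
    T: Deref,
    T::Target: QueryLen,
{
    fn query_len(&self) -> usize {
        (**self).query_len()
    }
}

fn main() {
    let dfa = Rc::new(Dfa { query: "chameau".into() });
    assert_eq!(dfa.query_len(), 7); // resolved through the blanket impl
}
```

The concrete type must not implement `Deref` itself, otherwise the two impls overlap, which is exactly why this patch removes the `Deref` impl from `DfaExt` and forwards `Automaton` by hand instead.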
from_bytes(map: Vec, indexes: Vec) -> Result> { - let map = Map::from_bytes(map)?; - let indexes = DocIndexes::from_bytes(indexes)?; - Ok(Metadata::from_raw(map, indexes)) - } - - pub fn from_raw(map: Map, indexes: DocIndexes) -> Self { - Metadata { map, indexes } - } - - pub fn get>(&self, key: K) -> Option<&[DocIndex]> { - self.map.get(key).and_then(|index| self.indexes.get(index)) - } - - pub fn as_map(&self) -> &Map { - &self.map - } - - pub fn as_indexes(&self) -> &DocIndexes { - &self.indexes - } - - pub fn explode(self) -> (Map, DocIndexes) { - (self.map, self.indexes) - } -} - -pub struct Inner { - keys: BTreeMap, - indexes: Vec>, - number_docs: usize, -} - -impl Inner { - pub fn new() -> Self { - Inner { - keys: BTreeMap::new(), - indexes: Vec::new(), - number_docs: 0, - } - } - - pub fn number_doc_indexes(&self) -> usize { - self.number_docs - } - - pub fn insert(&mut self, key: String, value: DocIndex) { - match self.keys.entry(key) { - Entry::Vacant(e) => { - let index = self.indexes.len() as u64; - self.indexes.push(vec![value]); - e.insert(index); - }, - Entry::Occupied(e) => { - let index = *e.get(); - let vec = &mut self.indexes[index as usize]; - vec.push(value); - }, - } - self.number_docs += 1; - } -} - -pub struct MetadataBuilder { - inner: Inner, - map: W, - indexes: X, -} - -impl MetadataBuilder -{ - pub fn new(map: W, indexes: X) -> Self { - Self { inner: Inner::new(), map, indexes } - } - - pub fn insert(&mut self, key: String, index: DocIndex) { - self.inner.insert(key, index) - } - - pub fn finish(self) -> Result<(), Box> { - self.into_inner().map(|_| ()) - } - - pub fn into_inner(mut self) -> Result<(W, X), Box> { - let number_docs = self.inner.number_doc_indexes(); - - let mut keys_builder = MapBuilder::new(self.map)?; - keys_builder.extend_iter(self.inner.keys)?; - let map = keys_builder.into_inner()?; - - // write down doc_indexes into the indexes Writer - let (ranges, values) = into_sliced_ranges(self.inner.indexes, number_docs); - let len = ranges.len() as u64; - - // TODO check if this is correct - self.indexes.write_u64::(len)?; - unsafe { - // write Ranges first - let slice = into_u8_slice(ranges.as_slice()); - self.indexes.write_all(slice)?; - - // write Values after - let slice = into_u8_slice(values.as_slice()); - self.indexes.write_all(slice)?; - } - self.indexes.flush()?; - - Ok((map, self.indexes)) - } -} - -fn into_sliced_ranges(vecs: Vec>, number_docs: usize) -> (Vec, Vec) { - let cap = vecs.len(); - let mut ranges = Vec::with_capacity(cap); - let mut values = Vec::with_capacity(number_docs); - - for mut v in &vecs { - let len = v.len() as u64; - let start = ranges.last().map(|&Range { end, .. 
}| end).unwrap_or(0); - - let range = Range { start, end: start + len }; - ranges.push(range); - } - - values.extend(vecs.into_iter().flatten()); - - (ranges, values) -} - -unsafe fn into_u8_slice(slice: &[T]) -> &[u8] { - let ptr = slice.as_ptr() as *const u8; - let len = slice.len() * mem::size_of::(); - from_raw_parts(ptr, len) -} - -pub struct OpBuilder<'m, 'v> { - inner: fst::map::OpBuilder<'m>, - indexes: &'v DocIndexes, -} - -impl<'m, 'v> OpBuilder<'m, 'v> { - pub fn new(indexes: &'v DocIndexes) -> Self { - Self { - inner: fst::map::OpBuilder::new(), - indexes: indexes, - } - } - - pub fn add(mut self, streamable: I) -> Self - where - I: for<'a> fst::IntoStreamer<'a, Into=S, Item=(&'a [u8], u64)>, - S: 'm + for<'a> fst::Streamer<'a, Item=(&'a [u8], u64)>, - { - self.push(streamable); - self - } - - pub fn push(&mut self, streamable: I) - where - I: for<'a> fst::IntoStreamer<'a, Into=S, Item=(&'a [u8], u64)>, - S: 'm + for<'a> fst::Streamer<'a, Item=(&'a [u8], u64)>, - { - self.inner.push(streamable); - } - - pub fn union(self) -> Union<'m, 'v> { - Union { - inner: self.inner.union(), - outs: Vec::new(), - indexes: self.indexes, - } - } -} - -#[derive(Copy, Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] -pub struct IndexedValues<'a> { - pub index: usize, - pub values: &'a [DocIndex], -} - -pub struct Union<'m, 'v> { - inner: fst::map::Union<'m>, - outs: Vec>, - indexes: &'v DocIndexes, -} - -impl<'a, 'm, 'v> fst::Streamer<'a> for Union<'m, 'v> { - type Item = (&'a [u8], &'a [IndexedValues<'a>]); - - fn next(&'a mut self) -> Option { - match self.inner.next() { - Some((s, ivalues)) => { - self.outs.clear(); - self.outs.reserve(ivalues.len()); - for ivalue in ivalues { - if let Some(values) = self.indexes.get(ivalue.value) { - let index = ivalue.index; - self.outs.push(IndexedValues { index, values }) - } - } - Some((s, &self.outs)) - }, - None => None, - } - } -} - -pub struct StreamBuilder<'m, 'v, A> { - inner: fst::map::StreamBuilder<'m, A>, - indexes: &'v DocIndexes, -} - -impl<'m, 'v, 'a, A: 'a> fst::IntoStreamer<'a> for StreamBuilder<'m, 'v, A> -where - A: Automaton, - A::State: Clone, -{ - type Item = >::Item; - type Into = Stream<'m, 'v, A>; - - fn into_stream(self) -> Self::Into { - Stream { - inner: self.inner.into_stream(), - indexes: self.indexes, - } - } -} - -pub struct Stream<'m, 'v, A: Automaton = fst::automaton::AlwaysMatch> { - inner: fst::map::Stream<'m, A>, - indexes: &'v DocIndexes, -} - -impl<'m, 'v, 'a, A: 'a> fst::Streamer<'a> for Stream<'m, 'v, A> -where - A: Automaton, -{ - type Item = (&'a [u8], &'a [DocIndex]); - - fn next(&'a mut self) -> Option { - match self.inner.next() { - Some((key, i)) => { - match self.indexes.get(i) { - Some(values) => Some((key, values)), - None => None, - } - }, - None => None, - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn empty_serialize_deserialize() { - let mapw = Vec::new(); - let indexesw = Vec::new(); - - let builder = MetadataBuilder::new(mapw, indexesw); - let (map, indexes) = builder.into_inner().unwrap(); - - let metas = Metadata::from_bytes(map, indexes).unwrap(); - assert_eq!(metas.get("chameau"), None); - } - - #[test] - fn one_doc_serialize_deserialize() { - let mapw = Vec::new(); - let indexesw = Vec::new(); - - let mut builder = MetadataBuilder::new(mapw, indexesw); - - let doc = DocIndex { document: 12, attribute: 1, attribute_index: 22 }; - builder.insert("chameau".into(), doc); - - let (map, indexes) = builder.into_inner().unwrap(); - - let metas = Metadata::from_bytes(map, 
indexes).unwrap(); - assert_eq!(metas.get("chameau"), Some(&[doc][..])); - } - - #[test] - fn multiple_docs_serialize_deserialize() { - let mapw = Vec::new(); - let indexesw = Vec::new(); - - let mut builder = MetadataBuilder::new(mapw, indexesw); - - let doc1 = DocIndex { document: 12, attribute: 1, attribute_index: 22 }; - let doc2 = DocIndex { document: 31, attribute: 0, attribute_index: 1 }; - builder.insert("chameau".into(), doc1); - builder.insert("chameau".into(), doc2); - - let (map, indexes) = builder.into_inner().unwrap(); - - let metas = Metadata::from_bytes(map, indexes).unwrap(); - assert_eq!(metas.get("chameau"), Some(&[doc1, doc2][..])); - } -} diff --git a/raptor/src/metadata/doc_indexes.rs b/raptor/src/metadata/doc_indexes.rs new file mode 100644 index 000000000..8cc81ae88 --- /dev/null +++ b/raptor/src/metadata/doc_indexes.rs @@ -0,0 +1,203 @@ +use std::collections::btree_map::{BTreeMap, Iter, Entry}; +use std::slice::from_raw_parts; +use std::io::{self, Write}; +use std::path::Path; +use std::ops::Deref; +use std::sync::Arc; +use std::mem; +use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; +use fst::raw::MmapReadOnly; +use crate::DocIndex; + +#[repr(C)] +struct Range { + start: u64, + end: u64, +} + +#[derive(Clone)] +enum DocIndexesData { + Shared { + vec: Arc>, + offset: usize, + len: usize, + }, + Mmap(MmapReadOnly), +} + +impl Deref for DocIndexesData { + type Target = [u8]; + + fn deref(&self) -> &Self::Target { + match self { + DocIndexesData::Shared { vec, offset, len } => { + &vec[*offset..offset + len] + }, + DocIndexesData::Mmap(m) => m.as_slice(), + } + } +} + +#[derive(Clone)] +pub struct DocIndexes { + ranges: DocIndexesData, + indexes: DocIndexesData, +} + +impl DocIndexes { + pub unsafe fn from_path>(path: P) -> io::Result { + let mmap = MmapReadOnly::open_path(path)?; + + let range_len = mmap.as_slice().read_u64::()?; + let range_len = range_len as usize * mem::size_of::(); + + let offset = mem::size_of::() as usize; + let ranges = DocIndexesData::Mmap(mmap.range(offset, range_len)); + + let len = mmap.len() - range_len - offset; + let offset = offset + range_len; + let indexes = DocIndexesData::Mmap(mmap.range(offset, len)); + + Ok(DocIndexes { ranges, indexes }) + } + + pub fn from_bytes(vec: Vec) -> io::Result { + let vec = Arc::new(vec); + + let range_len = vec.as_slice().read_u64::()?; + let range_len = range_len as usize * mem::size_of::(); + + let offset = mem::size_of::() as usize; + let ranges = DocIndexesData::Shared { + vec: vec.clone(), + offset, + len: range_len + }; + + let len = vec.len() - range_len - offset; + let offset = offset + range_len; + let indexes = DocIndexesData::Shared { vec, offset, len }; + + Ok(DocIndexes { ranges, indexes }) + } + + pub fn get(&self, index: u64) -> Option<&[DocIndex]> { + self.ranges().get(index as usize).map(|Range { start, end }| { + let start = *start as usize; + let end = *end as usize; + &self.indexes()[start..end] + }) + } + + fn ranges(&self) -> &[Range] { + let slice = &self.ranges; + let ptr = slice.as_ptr() as *const Range; + let len = slice.len() / mem::size_of::(); + unsafe { from_raw_parts(ptr, len) } + } + + fn indexes(&self) -> &[DocIndex] { + let slice = &self.indexes; + let ptr = slice.as_ptr() as *const DocIndex; + let len = slice.len() / mem::size_of::(); + unsafe { from_raw_parts(ptr, len) } + } +} + +pub struct DocIndexesBuilder { + keys: BTreeMap, + indexes: Vec>, + number_docs: usize, + wtr: W, +} + +impl DocIndexesBuilder { + pub fn new(wtr: W) -> Self { + Self { + keys: 
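The doc_indexes.rs file added here serializes postings as a little-endian `u64` range count, followed by the `Range { start, end }` table, followed by the flat `DocIndex` array; `DocIndexes::get` then just slices the value section with the requested range. A simplified, std-only sketch of that layout, using plain `u64` values instead of the crate's `#[repr(C)]` structs and raw-slice casts:

```rust
// Layout: [count: u64][ranges: (start, end) as u64 pairs][values: u64...]
fn write_indexes(groups: &[Vec<u64>]) -> Vec<u8> {
    let mut out = Vec::new();
    out.extend_from_slice(&(groups.len() as u64).to_le_bytes());

    // Range table: consecutive windows into the flat value array.
    let mut start = 0u64;
    for g in groups {
        let end = start + g.len() as u64;
        out.extend_from_slice(&start.to_le_bytes());
        out.extend_from_slice(&end.to_le_bytes());
        start = end;
    }
    // Flat value array.
    for g in groups {
        for v in g {
            out.extend_from_slice(&v.to_le_bytes());
        }
    }
    out
}

fn read_u64(buf: &[u8], pos: usize) -> u64 {
    let mut bytes = [0u8; 8];
    bytes.copy_from_slice(&buf[pos..pos + 8]);
    u64::from_le_bytes(bytes)
}

// Mirrors DocIndexes::get: look up the index-th range, slice the values.
fn get(buf: &[u8], index: usize) -> Vec<u64> {
    let count = read_u64(buf, 0) as usize;
    let ranges_off = 8;
    let values_off = ranges_off + count * 16;
    let start = read_u64(buf, ranges_off + index * 16) as usize;
    let end = read_u64(buf, ranges_off + index * 16 + 8) as usize;
    (start..end).map(|i| read_u64(buf, values_off + i * 8)).collect()
}

fn main() {
    let buf = write_indexes(&[vec![1, 2, 3], vec![10, 20]]);
    assert_eq!(get(&buf, 1), vec![10, 20]);
}
```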
BTreeMap::new(), + indexes: Vec::new(), + number_docs: 0, + wtr: wtr, + } + } + + pub fn number_doc_indexes(&self) -> usize { + self.number_docs + } + + pub fn insert(&mut self, key: String, value: DocIndex) { + match self.keys.entry(key) { + Entry::Vacant(e) => { + let index = self.indexes.len() as u64; + self.indexes.push(vec![value]); + e.insert(index); + }, + Entry::Occupied(e) => { + let index = *e.get(); + let vec = &mut self.indexes[index as usize]; + vec.push(value); + }, + } + self.number_docs += 1; + } + + pub fn keys(&self) -> Iter { + self.keys.iter() + } + + pub fn finish(self) -> io::Result<()> { + self.into_inner().map(|_| ()) + } + + pub fn into_inner(mut self) -> io::Result { + + for vec in &mut self.indexes { + vec.sort_unstable(); + } + + let (ranges, values) = into_sliced_ranges(self.indexes, self.number_docs); + let len = ranges.len() as u64; + + // TODO check if this is correct + self.wtr.write_u64::(len)?; + unsafe { + // write Ranges first + let slice = into_u8_slice(ranges.as_slice()); + self.wtr.write_all(slice)?; + + // write Values after + let slice = into_u8_slice(values.as_slice()); + self.wtr.write_all(slice)?; + } + + self.wtr.flush()?; + Ok(self.wtr) + } +} + +fn into_sliced_ranges(vecs: Vec>, number_docs: usize) -> (Vec, Vec) { + let cap = vecs.len(); + let mut ranges = Vec::with_capacity(cap); + let mut values = Vec::with_capacity(number_docs); + + // @Improvement: remove bounds duplications: the left bound of a range + // is already the right bound of the previous range, + // we could use a slice window of size 2. + for v in &vecs { + let len = v.len() as u64; + let start = ranges.last().map(|&Range { end, .. }| end).unwrap_or(0); + + let range = Range { start, end: start + len }; + ranges.push(range); + } + + values.extend(vecs.into_iter().flatten()); + + (ranges, values) +} + +unsafe fn into_u8_slice(slice: &[T]) -> &[u8] { + let ptr = slice.as_ptr() as *const u8; + let len = slice.len() * mem::size_of::(); + from_raw_parts(ptr, len) +} diff --git a/raptor/src/metadata/mod.rs b/raptor/src/metadata/mod.rs new file mode 100644 index 000000000..eba764b3d --- /dev/null +++ b/raptor/src/metadata/mod.rs @@ -0,0 +1,138 @@ +// pub mod difference; +// pub mod stream_ops; +mod ops_indexed_value; +pub mod ops; +pub mod doc_indexes; + +use fst::{Map, MapBuilder}; +use self::doc_indexes::{DocIndexes, DocIndexesBuilder}; +use std::error::Error; +use std::path::Path; +use std::io::Write; +use crate::DocIndex; + +pub struct Metadata { + map: Map, + indexes: DocIndexes, +} + +impl Metadata { + pub unsafe fn from_paths(map: P, indexes: Q) -> Result> + where P: AsRef, + Q: AsRef, + { + let map = Map::from_path(map)?; + let indexes = DocIndexes::from_path(indexes)?; + Ok(Metadata { map, indexes }) + } + + pub fn from_bytes(map: Vec, indexes: Vec) -> Result> { + let map = Map::from_bytes(map)?; + let indexes = DocIndexes::from_bytes(indexes)?; + Ok(Metadata { map, indexes }) + } + + pub fn get>(&self, key: K) -> Option<&[DocIndex]> { + self.map.get(key).and_then(|index| self.indexes.get(index)) + } + + pub fn as_map(&self) -> &Map { + &self.map + } + + pub fn as_indexes(&self) -> &DocIndexes { + &self.indexes + } + + pub fn explode(self) -> (Map, DocIndexes) { + (self.map, self.indexes) + } +} + +pub struct MetadataBuilder { + map: W, + indexes: DocIndexesBuilder, +} + +impl MetadataBuilder { + pub fn new(map: W, indexes: X) -> Self { + Self { map, indexes: DocIndexesBuilder::new(indexes) } + } + + pub fn insert(&mut self, key: String, index: DocIndex) { + 
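The `into_u8_slice` helper just above reinterprets a typed slice as raw bytes so the range table and the `DocIndex` array can each be written with a single `write_all`. A standalone sketch of the same cast, with the soundness caveats spelled out:

```rust
use std::mem;
use std::slice::from_raw_parts;

// Reinterpret a slice of plain-old-data values as raw bytes.
// Only sound for #[repr(C)] types with no pointers and no invalid
// bit patterns; the patch uses it for `Range` and `DocIndex`.
unsafe fn into_u8_slice<T>(slice: &[T]) -> &[u8] {
    let ptr = slice.as_ptr() as *const u8;
    let len = slice.len() * mem::size_of::<T>();
    from_raw_parts(ptr, len)
}

fn main() {
    let ranges: [u64; 2] = [3, 7];
    let bytes = unsafe { into_u8_slice(&ranges) };
    assert_eq!(bytes.len(), 16);
    // On a little-endian machine the first 8 bytes encode 3u64,
    // which is why metadata/mod.rs carries a FIXME about writing
    // a magic number to detect endianness mismatches.
    assert_eq!(&bytes[..8], &3u64.to_le_bytes());
}
```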
self.indexes.insert(key, index) + } + + pub fn finish(self) -> Result<(), Box> { + self.into_inner().map(|_| ()) + } + + pub fn into_inner(self) -> Result<(W, X), Box> { + // FIXME insert a magic number that indicates if the endianess + // of the input is the same as the machine that is reading it. + + let map = { + let mut keys_builder = MapBuilder::new(self.map)?; + let keys = self.indexes.keys().map(|(s, v)| (s, *v)); + keys_builder.extend_iter(keys)?; + keys_builder.into_inner()? + }; + + let indexes = self.indexes.into_inner()?; + + Ok((map, indexes)) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::vec_read_only::VecReadOnly; + use crate::metadata::ops::IndexedDocIndexes; + + #[test] + fn empty_serialize_deserialize() { + let mapw = Vec::new(); + let indexesw = Vec::new(); + + let builder = MetadataBuilder::new(mapw, indexesw); + let (map, indexes) = builder.into_inner().unwrap(); + + let metas = Metadata::from_bytes(map, indexes).unwrap(); + assert_eq!(metas.get("chameau"), None); + } + + #[test] + fn one_doc_serialize_deserialize() { + let mapw = Vec::new(); + let indexesw = Vec::new(); + + let mut builder = MetadataBuilder::new(mapw, indexesw); + + let doc = DocIndex { document: 12, attribute: 1, attribute_index: 22 }; + builder.insert("chameau".into(), doc); + + let (map, indexes) = builder.into_inner().unwrap(); + + let metas = Metadata::from_bytes(map, indexes).unwrap(); + assert_eq!(metas.get("chameau"), Some(&[doc][..])); + } + + #[test] + fn multiple_docs_serialize_deserialize() { + let mapw = Vec::new(); + let indexesw = Vec::new(); + + let mut builder = MetadataBuilder::new(mapw, indexesw); + + let doc1 = DocIndex { document: 12, attribute: 1, attribute_index: 22 }; + let doc2 = DocIndex { document: 31, attribute: 0, attribute_index: 1 }; + builder.insert("chameau".into(), doc1); + builder.insert("chameau".into(), doc2); + + let (map, indexes) = builder.into_inner().unwrap(); + + let metas = Metadata::from_bytes(map, indexes).unwrap(); + assert_eq!(metas.get("chameau"), Some(&[doc1, doc2][..])); + } +} diff --git a/raptor/src/metadata/ops.rs b/raptor/src/metadata/ops.rs new file mode 100644 index 000000000..448279e94 --- /dev/null +++ b/raptor/src/metadata/ops.rs @@ -0,0 +1,332 @@ +use std::hash::{Hash, Hasher}; +use std::collections::{HashMap, BTreeMap}; +use fst::{map, Streamer, Automaton}; +use fst::automaton::AlwaysMatch; +use sdset::multi::OpBuilder as SdOpBuilder; +use sdset::{SetOperation, Set}; +use crate::metadata::ops_indexed_value::{ + OpIndexedValueBuilder, UnionIndexedValue, +}; +use crate::metadata::doc_indexes::DocIndexes; +use crate::metadata::Metadata; +use crate::automaton::AutomatonExt; +use crate::vec_read_only::VecReadOnly; +use crate::DocIndex; + +pub struct OpBuilder<'m, A: Automaton> { + // the operation on the maps is always an union. + maps: OpIndexedValueBuilder<'m>, + automatons: Vec, + indexes: Vec<&'m DocIndexes>, +} + +impl<'m> OpBuilder<'m, AlwaysMatch> { + pub fn new() -> Self { + Self { + maps: OpIndexedValueBuilder::new(), + automatons: vec![AlwaysMatch], + indexes: Vec::new(), + } + } +} + +/// Do a set operation on multiple maps with the same automatons. 
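In `MetadataBuilder::into_inner` above, the fst `Map` is built with `MapBuilder::new` plus `extend_iter`, which only works because the keys come out of a `BTreeMap` and are therefore already sorted, as fst requires. A hedged round-trip sketch with the `fst` crate (the patch pins a 0.3-based fork; only calls that appear in the patch are used here):

```rust
use std::collections::BTreeMap;
use fst::{Map, MapBuilder};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // BTreeMap iteration is ordered, so extend_iter receives the
    // sorted keys that MapBuilder insists on.
    let mut keys = BTreeMap::new();
    keys.insert("chameau".to_string(), 0u64);
    keys.insert("girafe".to_string(), 1u64);

    let mut builder = MapBuilder::new(Vec::new())?; // in-memory writer
    builder.extend_iter(keys)?;
    let bytes = builder.into_inner()?;

    let map = Map::from_bytes(bytes)?;
    assert_eq!(map.get("chameau"), Some(0));
    Ok(())
}
```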
+impl<'m, A: 'm + Automaton> OpBuilder<'m, A> { + pub fn with_automatons(automatons: Vec) -> Self { + Self { + maps: OpIndexedValueBuilder::new(), + automatons: automatons, + indexes: Vec::new(), + } + } + + pub fn add(mut self, metadata: &'m Metadata) -> Self where A: Clone { + self.push(metadata); + self + } + + pub fn push(&mut self, metadata: &'m Metadata) where A: Clone { + let mut op = map::OpBuilder::new(); + for automaton in self.automatons.iter().cloned() { + let stream = metadata.as_map().search(automaton); + op.push(stream); + } + + let stream = op.union(); + let indexes = metadata.as_indexes(); + + self.maps.push(stream); + self.indexes.push(indexes); + } + + pub fn union(self) -> Union<'m> { + Union::new(self.maps, self.indexes) + } + + pub fn intersection(self) -> Intersection<'m> { + Intersection::new(self.maps, self.indexes) + } + + pub fn difference(self) -> Difference<'m> { + Difference::new(self.maps, self.indexes) + } + + pub fn symmetric_difference(self) -> SymmetricDifference<'m> { + SymmetricDifference::new(self.maps, self.indexes) + } +} + +#[derive(Debug, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)] +pub struct IndexedDocIndexes { + pub index: usize, + pub doc_indexes: VecReadOnly, +} + +struct SlotIndexedDocIndexes { + index: usize, + start: usize, + len: usize, +} + +macro_rules! logical_operation { + (struct $name:ident, $operation:ident) => { + +pub struct $name<'m> { + maps: UnionIndexedValue<'m>, + indexes: Vec<&'m DocIndexes>, + outs: Vec, +} + +impl<'m> $name<'m> { + fn new(maps: OpIndexedValueBuilder<'m>, indexes: Vec<&'m DocIndexes>) -> Self + { + $name { + maps: maps.union(), + indexes: indexes, + outs: Vec::new(), + } + } +} + +impl<'m, 'a> fst::Streamer<'a> for $name<'m> { + type Item = (&'a [u8], &'a [IndexedDocIndexes]); + + fn next(&'a mut self) -> Option { + match self.maps.next() { + Some((input, ivalues)) => { + self.outs.clear(); + + // @Improvement: better use a `Vec` instead, + // `aut indexes` follow them selfs + let mut builders = HashMap::new(); + for iv in ivalues { + let builder = builders.entry(iv.aut_index).or_insert_with(BTreeMap::new); + builder.insert(iv.rdr_index, iv.value); + } + + let mut doc_indexes = Vec::new(); + let mut doc_indexes_slots = Vec::with_capacity(builders.len()); + for (aut_index, values) in builders.into_iter() { + let mut builder = SdOpBuilder::with_capacity(values.len()); + for (rdr_index, value) in values { + let indexes = self.indexes[rdr_index].get(value).expect("could not find indexes"); + let indexes = Set::new_unchecked(indexes); + builder.push(indexes); + } + + let start = doc_indexes.len(); + builder.$operation().extend_vec(&mut doc_indexes); + let len = doc_indexes.len() - start; + if len == 0 { continue } + + let slot = SlotIndexedDocIndexes { + index: aut_index, + start: start, + len: len, + }; + doc_indexes_slots.push(slot); + } + + let read_only = VecReadOnly::new(doc_indexes); + self.outs.reserve(doc_indexes_slots.len()); + for slot in doc_indexes_slots { + let indexes = IndexedDocIndexes { + index: slot.index, + doc_indexes: read_only.range(slot.start, slot.len), + }; + self.outs.push(indexes); + } + + if self.outs.is_empty() { return None } + Some((input, &self.outs)) + }, + None => None, + } + } +} +}} + +logical_operation!(struct Union, union); +logical_operation!(struct Intersection, intersection); +logical_operation!(struct Difference, difference); +logical_operation!(struct SymmetricDifference, symmetric_difference); + +#[cfg(test)] +mod tests { + use super::*; + use 
crate::metadata::MetadataBuilder; + + fn get_exact_key<'m, I, S>(stream: I, key: &[u8]) -> Option> + where + I: for<'a> fst::IntoStreamer<'a, Into=S, Item=(&'a [u8], &'a [IndexedDocIndexes])>, + S: 'm + for<'a> fst::Streamer<'a, Item=(&'a [u8], &'a [IndexedDocIndexes])>, + { + let mut stream = stream.into_stream(); + while let Some((string, indexes)) = stream.next() { + if string == key { + return Some(indexes[0].doc_indexes.clone()) + } + } + None + } + + #[test] + fn union_two_metadata() { + let doc1 = DocIndex { document: 12, attribute: 1, attribute_index: 22 }; + let doc2 = DocIndex { document: 31, attribute: 0, attribute_index: 1 }; + + let meta1 = { + let mapw = Vec::new(); + let indexesw = Vec::new(); + let mut builder = MetadataBuilder::new(mapw, indexesw); + + builder.insert("chameau".into(), doc1); + + let (map, indexes) = builder.into_inner().unwrap(); + Metadata::from_bytes(map, indexes).unwrap() + }; + + let meta2 = { + let mapw = Vec::new(); + let indexesw = Vec::new(); + let mut builder = MetadataBuilder::new(mapw, indexesw); + + builder.insert("chameau".into(), doc2); + + let (map, indexes) = builder.into_inner().unwrap(); + Metadata::from_bytes(map, indexes).unwrap() + }; + + let metas = OpBuilder::new().add(&meta1).add(&meta2).union(); + let value = get_exact_key(metas, b"chameau"); + + assert_eq!(&*value.unwrap(), &[doc1, doc2][..]); + } + + #[test] + fn intersection_two_metadata() { + let doc1 = DocIndex { document: 31, attribute: 0, attribute_index: 1 }; + let doc2 = DocIndex { document: 31, attribute: 0, attribute_index: 1 }; + + let meta1 = { + let mapw = Vec::new(); + let indexesw = Vec::new(); + let mut builder = MetadataBuilder::new(mapw, indexesw); + + builder.insert("chameau".into(), doc1); + + let (map, indexes) = builder.into_inner().unwrap(); + Metadata::from_bytes(map, indexes).unwrap() + }; + + let meta2 = { + let mapw = Vec::new(); + let indexesw = Vec::new(); + let mut builder = MetadataBuilder::new(mapw, indexesw); + + builder.insert("chameau".into(), doc2); + + let (map, indexes) = builder.into_inner().unwrap(); + Metadata::from_bytes(map, indexes).unwrap() + }; + + let metas = OpBuilder::new().add(&meta1).add(&meta2).intersection(); + let value = get_exact_key(metas, b"chameau"); + + assert_eq!(&*value.unwrap(), &[doc1][..]); + } + + #[test] + fn difference_two_metadata() { + let doc1 = DocIndex { document: 12, attribute: 1, attribute_index: 22 }; + let doc2 = DocIndex { document: 31, attribute: 0, attribute_index: 1 }; + let doc3 = DocIndex { document: 31, attribute: 0, attribute_index: 1 }; + + let meta1 = { + let mapw = Vec::new(); + let indexesw = Vec::new(); + let mut builder = MetadataBuilder::new(mapw, indexesw); + + builder.insert("chameau".into(), doc1); + builder.insert("chameau".into(), doc2); + + let (map, indexes) = builder.into_inner().unwrap(); + Metadata::from_bytes(map, indexes).unwrap() + }; + + let meta2 = { + let mapw = Vec::new(); + let indexesw = Vec::new(); + let mut builder = MetadataBuilder::new(mapw, indexesw); + + builder.insert("chameau".into(), doc3); + + let (map, indexes) = builder.into_inner().unwrap(); + Metadata::from_bytes(map, indexes).unwrap() + }; + + let metas = OpBuilder::new().add(&meta1).add(&meta2).difference(); + let value = get_exact_key(metas, b"chameau"); + + assert_eq!(&*value.unwrap(), &[doc1][..]); + } + + #[test] + fn symmetric_difference_two_metadata() { + let doc1 = DocIndex { document: 12, attribute: 1, attribute_index: 22 }; + let doc2 = DocIndex { document: 31, attribute: 0, attribute_index: 1 }; 
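In the `logical_operation!` macro above, the postings selected for each automaton are handed to `sdset::multi::OpBuilder` and the chosen set operation is materialized with `extend_vec`. A small standalone sketch of that sdset pattern on plain sorted `u64` postings, restricted to the calls the patch itself uses (`Set::new_unchecked`, `with_capacity`, `push`, `union`, `extend_vec`):

```rust
use sdset::multi::OpBuilder;
use sdset::{Set, SetOperation};

fn main() {
    // Postings must already be sorted and deduplicated:
    // Set::new_unchecked does not verify that, exactly as in ops.rs.
    let a = [1u64, 2, 4, 7];
    let b = [2u64, 4, 5];
    let c = [4u64, 7, 9];

    let mut builder = OpBuilder::with_capacity(3);
    builder.push(Set::new_unchecked(&a));
    builder.push(Set::new_unchecked(&b));
    builder.push(Set::new_unchecked(&c));

    // union() could be intersection(), difference(), etc., as in the macro.
    let mut out = Vec::new();
    builder.union().extend_vec(&mut out);
    assert_eq!(out, vec![1, 2, 4, 5, 7, 9]);
}
```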
+ let doc3 = DocIndex { document: 32, attribute: 0, attribute_index: 1 }; + let doc4 = DocIndex { document: 34, attribute: 12, attribute_index: 1 }; + + let meta1 = { + let mapw = Vec::new(); + let indexesw = Vec::new(); + let mut builder = MetadataBuilder::new(mapw, indexesw); + + builder.insert("chameau".into(), doc1); + builder.insert("chameau".into(), doc2); + builder.insert("chameau".into(), doc3); + + let (map, indexes) = builder.into_inner().unwrap(); + Metadata::from_bytes(map, indexes).unwrap() + }; + + let meta2 = { + let mapw = Vec::new(); + let indexesw = Vec::new(); + let mut builder = MetadataBuilder::new(mapw, indexesw); + + builder.insert("chameau".into(), doc2); + builder.insert("chameau".into(), doc3); + builder.insert("chameau".into(), doc4); + + let (map, indexes) = builder.into_inner().unwrap(); + Metadata::from_bytes(map, indexes).unwrap() + }; + + let metas = OpBuilder::new().add(&meta1).add(&meta2).symmetric_difference(); + let value = get_exact_key(metas, b"chameau"); + + assert_eq!(&*value.unwrap(), &[doc1, doc4][..]); + } +} diff --git a/raptor/src/metadata/ops_indexed_value.rs b/raptor/src/metadata/ops_indexed_value.rs new file mode 100644 index 000000000..558b57447 --- /dev/null +++ b/raptor/src/metadata/ops_indexed_value.rs @@ -0,0 +1,197 @@ +use std::collections::BinaryHeap; +use std::rc::Rc; +use std::cmp; +use fst::raw::{self, Output}; +use fst::{self, IntoStreamer, Streamer}; + +type BoxedStream<'f> = Box Streamer<'a, Item=(&'a [u8], &'a [raw::IndexedValue])> + 'f>; + +pub struct OpIndexedValueBuilder<'f> { + streams: Vec>, +} + +impl<'f> OpIndexedValueBuilder<'f> { + pub fn new() -> Self { + Self { streams: Vec::new() } + } + + pub fn push(&mut self, stream: I) + where + I: for<'a> IntoStreamer<'a, Into=S, Item=(&'a [u8], &'a [raw::IndexedValue])>, + S: 'f + for<'a> Streamer<'a, Item=(&'a [u8], &'a [raw::IndexedValue])>, + { + self.streams.push(Box::new(stream.into_stream())); + } + + pub fn union(self) -> UnionIndexedValue<'f> { + UnionIndexedValue { + heap: StreamIndexedValueHeap::new(self.streams), + outs: Vec::new(), + cur_slot: None, + } + } +} + +pub struct UnionIndexedValue<'f> { + heap: StreamIndexedValueHeap<'f>, + outs: Vec, + cur_slot: Option, +} + +impl<'a, 'm> fst::Streamer<'a> for UnionIndexedValue<'m> { + type Item = (&'a [u8], &'a [IndexedValue]); + + fn next(&'a mut self) -> Option { + if let Some(slot) = self.cur_slot.take() { + self.heap.refill(slot); + } + let slot = match self.heap.pop() { + None => return None, + Some(slot) => { + self.cur_slot = Some(slot); + self.cur_slot.as_mut().unwrap() + } + }; + self.outs.clear(); + self.outs.push(slot.indexed_value()); + while let Some(mut slot2) = self.heap.pop_if_equal(slot.input()) { + self.outs.push(slot2.indexed_value()); + self.heap.refill(slot2); + } + Some((slot.input(), &self.outs)) + } +} + +struct StreamIndexedValueHeap<'f> { + rdrs: Vec>, + heap: BinaryHeap, +} + +impl<'f> StreamIndexedValueHeap<'f> { + fn new(streams: Vec>) -> StreamIndexedValueHeap<'f> { + let mut u = StreamIndexedValueHeap { + rdrs: streams, + heap: BinaryHeap::new(), + }; + for i in 0..u.rdrs.len() { + u.refill(SlotIndexedValue::new(i)); + } + u + } + + fn pop(&mut self) -> Option { + self.heap.pop() + } + + fn peek_is_duplicate(&self, key: &[u8]) -> bool { + self.heap.peek().map(|s| s.input() == key).unwrap_or(false) + } + + fn pop_if_equal(&mut self, key: &[u8]) -> Option { + if self.peek_is_duplicate(key) { + self.pop() + } else { + None + } + } + + fn pop_if_le(&mut self, key: &[u8]) -> Option { + if 
self.heap.peek().map(|s| s.input() <= key).unwrap_or(false) { + self.pop() + } else { + None + } + } + + fn num_slots(&self) -> usize { + self.rdrs.len() + } + + fn refill(&mut self, mut slot: SlotIndexedValue) { + if let Some((input, ivalues)) = self.rdrs[slot.rdr_index].next() { + slot.set_input(input); + for values in ivalues { + slot.set_aut_index(values.index); + slot.set_output(values.value); + self.heap.push(slot.clone()); + } + } + } +} + +#[derive(Debug, Clone)] +struct SlotIndexedValue { + rdr_index: usize, + aut_index: usize, + input: Rc>, + output: Output, +} + +#[derive(Debug)] +pub struct IndexedValue { + pub rdr_index: usize, + pub aut_index: usize, + pub value: u64, +} + +impl PartialEq for SlotIndexedValue { + fn eq(&self, other: &Self) -> bool { + (&self.input, self.rdr_index, self.aut_index, self.output) + .eq(&(&other.input, other.rdr_index, other.aut_index, other.output)) + } +} + +impl Eq for SlotIndexedValue { } + +impl PartialOrd for SlotIndexedValue { + fn partial_cmp(&self, other: &Self) -> Option { + (&self.input, self.rdr_index, self.aut_index, self.output) + .partial_cmp(&(&other.input, other.rdr_index, other.aut_index, other.output)) + .map(|ord| ord.reverse()) + } +} + +impl Ord for SlotIndexedValue { + fn cmp(&self, other: &Self) -> cmp::Ordering { + self.partial_cmp(other).unwrap() + } +} + +impl SlotIndexedValue { + fn new(rdr_index: usize) -> SlotIndexedValue { + SlotIndexedValue { + rdr_index: rdr_index, + aut_index: 0, + input: Rc::new(Vec::with_capacity(64)), + output: Output::zero(), + } + } + + fn indexed_value(&self) -> IndexedValue { + IndexedValue { + rdr_index: self.rdr_index, + aut_index: self.aut_index, + value: self.output.value(), + } + } + + fn input(&self) -> &[u8] { + &self.input + } + + fn set_aut_index(&mut self, aut_index: usize) { + self.aut_index = aut_index; + } + + fn set_input(&mut self, input: &[u8]) { + if *self.input != input { + let inner = Rc::make_mut(&mut self.input); + inner.clear(); + inner.extend(input); + } + } + + fn set_output(&mut self, output: u64) { + self.output = Output::new(output); + } +} diff --git a/raptor/src/rank/mod.rs b/raptor/src/rank/mod.rs index 0c857e6aa..bfe49dc9e 100644 --- a/raptor/src/rank/mod.rs +++ b/raptor/src/rank/mod.rs @@ -8,11 +8,13 @@ mod exact; use std::cmp::Ordering; use std::rc::Rc; use std::{mem, vec}; -use fst; +use fst::Streamer; use fnv::FnvHashMap; use group_by::GroupByMut; use crate::automaton::{DfaExt, AutomatonExt}; -use crate::metadata::{DocIndexes, OpBuilder, Union}; +use crate::metadata::Metadata; +use crate::metadata::ops::{OpBuilder, Union}; +use crate::metadata::doc_indexes::DocIndexes; use crate::{Match, DocumentId}; use self::{ @@ -83,20 +85,16 @@ fn matches_into_iter(matches: FnvHashMap>, limit: usize) documents.into_iter() } -pub struct RankedStream<'m, 'v>(RankedStreamInner<'m, 'v>); - -impl<'m, 'v> RankedStream<'m, 'v> { - pub fn new(map: &'m fst::Map, indexes: &'v DocIndexes, automatons: Vec, limit: usize) -> Self { - let mut op = OpBuilder::new(indexes); +pub struct RankedStream<'m>(RankedStreamInner<'m>); +impl<'m> RankedStream<'m> { + pub fn new(metadata: &'m Metadata, automatons: Vec, limit: usize) -> Self { let automatons: Vec<_> = automatons.into_iter().map(Rc::new).collect(); - for automaton in automatons.iter().cloned() { - let stream = map.search(automaton); - op.push(stream); - } + let mut builder = OpBuilder::with_automatons(automatons.clone()); + builder.push(metadata); let inner = RankedStreamInner::Fed { - inner: op.union(), + inner: builder.union(), 
automatons: automatons, limit: limit, matches: FnvHashMap::default(), @@ -106,7 +104,7 @@ impl<'m, 'v> RankedStream<'m, 'v> { } } -impl<'m, 'v, 'a> fst::Streamer<'a> for RankedStream<'m, 'v> { +impl<'m, 'a> fst::Streamer<'a> for RankedStream<'m> { type Item = Document; fn next(&'a mut self) -> Option<Self::Item> { @@ -114,9 +112,9 @@ impl<'m, 'v, 'a> fst::Streamer<'a> for RankedStream<'m, 'v> { } } -enum RankedStreamInner<'m, 'v> { +enum RankedStreamInner<'m> { Fed { - inner: Union<'m, 'v>, + inner: Union<'m>, automatons: Vec<Rc<DfaExt>>, limit: usize, matches: FnvHashMap<DocumentId, Vec<Match>>, @@ -126,7 +124,7 @@ enum RankedStreamInner<'m, 'v> { }, } -impl<'m, 'v, 'a> fst::Streamer<'a> for RankedStreamInner<'m, 'v> { +impl<'m, 'a> fst::Streamer<'a> for RankedStreamInner<'m> { type Item = Document; fn next(&'a mut self) -> Option<Self::Item> { @@ -141,7 +139,7 @@ impl<'m, 'v, 'a> fst::Streamer<'a> for RankedStreamInner<'m, 'v> { let distance = automaton.eval(string).to_u8(); let same_length = string.len() == automaton.query_len(); - for di in iv.values { + for di in iv.doc_indexes.as_slice() { let match_ = Match { query_index: iv.index as u32, distance: distance,
diff --git a/raptor/src/vec_read_only.rs b/raptor/src/vec_read_only.rs new file mode 100644 index 000000000..c0d5b6403 --- /dev/null +++ b/raptor/src/vec_read_only.rs @@ -0,0 +1,44 @@ +use std::ops::Deref; +use std::sync::Arc; + +#[derive(Debug, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)] +pub struct VecReadOnly<T> { + inner: Arc<Vec<T>>, + offset: usize, + len: usize, +} + +impl<T> VecReadOnly<T> { + pub fn new(vec: Vec<T>) -> Self { + let len = vec.len(); + Self { + inner: Arc::new(vec), + offset: 0, + len: len, + } + } + + pub fn len(&self) -> usize { + self.len + } + + pub fn range(&self, offset: usize, len: usize) -> Self { + Self { + inner: self.inner.clone(), + offset: self.offset + offset, + len: len, + } + } + + pub fn as_slice(&self) -> &[T] { + &self.inner[self.offset..self.offset + self.len] + } +} + +impl<T> Deref for VecReadOnly<T> { + type Target = [T]; + + fn deref(&self) -> &Self::Target { + self.as_slice() + } +}
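The vec_read_only.rs module added above is what lets every `IndexedDocIndexes` share a single result allocation: the union output is wrapped once, and each slot only carries an `(offset, len)` window into the same `Arc<Vec<T>>`. A short usage sketch against the API in this file (assumes a dependency on the `raptor` crate from this repository):

```rust
use raptor::vec_read_only::VecReadOnly;

fn main() {
    // One backing Vec; ranges clone the Arc, not the data.
    let all = VecReadOnly::new(vec![10, 20, 30, 40, 50]);

    let head = all.range(0, 2); // window over [10, 20]
    let tail = all.range(2, 3); // window over [30, 40, 50]

    assert_eq!(head.as_slice(), &[10, 20]);
    assert_eq!(&*tail, &[30, 40, 50]); // Deref to &[T] also works
    assert_eq!(all.len(), 5);
}
```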
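Further up, ops_indexed_value.rs merges several `(key, values)` streams by keeping one slot per reader in a `BinaryHeap` whose ordering is reversed, so the smallest key is always popped first and equal keys from the other readers are drained in the same step. A self-contained, std-only sketch of that k-way-union shape over sorted vectors (toy types, `Reverse` instead of the hand-written reversed `Ord`):

```rust
use std::cmp::Reverse;
use std::collections::BinaryHeap;

// Returns each distinct key together with the readers it came from,
// in ascending key order, mirroring UnionIndexedValue::next.
fn union_keys(readers: Vec<Vec<&str>>) -> Vec<(String, Vec<usize>)> {
    let mut cursors = vec![0usize; readers.len()];
    let mut heap = BinaryHeap::new();

    // Seed the heap with the first key of every reader.
    for (rdr_index, rdr) in readers.iter().enumerate() {
        if let Some(key) = rdr.first() {
            heap.push(Reverse((key.to_string(), rdr_index)));
        }
    }

    let mut out = Vec::new();
    while let Some(Reverse((key, rdr_index))) = heap.pop() {
        let mut from = vec![rdr_index];
        // Refill the reader we just popped from.
        cursors[rdr_index] += 1;
        if let Some(next) = readers[rdr_index].get(cursors[rdr_index]) {
            heap.push(Reverse((next.to_string(), rdr_index)));
        }
        // Drain every other reader currently positioned on the same key.
        while heap.peek().map_or(false, |r| (r.0).0 == key) {
            let Reverse((_, i)) = heap.pop().unwrap();
            from.push(i);
            cursors[i] += 1;
            if let Some(next) = readers[i].get(cursors[i]) {
                heap.push(Reverse((next.to_string(), i)));
            }
        }
        out.push((key, from));
    }
    out
}

fn main() {
    let merged = union_keys(vec![vec!["a", "c"], vec!["a", "b"]]);
    assert_eq!(merged[0].0, "a");
    assert_eq!(merged[0].1, vec![0, 1]); // "a" found in both readers
}
```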