use std::collections::btree_map::{BTreeMap, Iter, Entry}; use std::slice::from_raw_parts; use std::io::{self, Write}; use std::path::Path; use std::ops::Deref; use std::sync::Arc; use std::mem; use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; use fst::raw::MmapReadOnly; use crate::DocIndex; #[repr(C)] struct Range { start: u64, end: u64, } #[derive(Clone)] enum DocIndexesData { Shared { vec: Arc>, offset: usize, len: usize, }, Mmap(MmapReadOnly), } impl Deref for DocIndexesData { type Target = [u8]; fn deref(&self) -> &Self::Target { match self { DocIndexesData::Shared { vec, offset, len } => { &vec[*offset..offset + len] }, DocIndexesData::Mmap(m) => m.as_slice(), } } } #[derive(Clone)] pub struct DocIndexes { ranges: DocIndexesData, indexes: DocIndexesData, } impl DocIndexes { pub unsafe fn from_path>(path: P) -> io::Result { let mmap = MmapReadOnly::open_path(path)?; let range_len = mmap.as_slice().read_u64::()?; let range_len = range_len as usize * mem::size_of::(); let offset = mem::size_of::() as usize; let ranges = DocIndexesData::Mmap(mmap.range(offset, range_len)); let len = mmap.len() - range_len - offset; let offset = offset + range_len; let indexes = DocIndexesData::Mmap(mmap.range(offset, len)); Ok(DocIndexes { ranges, indexes }) } pub fn from_bytes(vec: Vec) -> io::Result { let vec = Arc::new(vec); let range_len = vec.as_slice().read_u64::()?; let range_len = range_len as usize * mem::size_of::(); let offset = mem::size_of::() as usize; let ranges = DocIndexesData::Shared { vec: vec.clone(), offset, len: range_len }; let len = vec.len() - range_len - offset; let offset = offset + range_len; let indexes = DocIndexesData::Shared { vec, offset, len }; Ok(DocIndexes { ranges, indexes }) } pub fn get(&self, index: u64) -> Option<&[DocIndex]> { self.ranges().get(index as usize).map(|Range { start, end }| { let start = *start as usize; let end = *end as usize; &self.indexes()[start..end] }) } fn ranges(&self) -> &[Range] { let slice = &self.ranges; let ptr = slice.as_ptr() as *const Range; let len = slice.len() / mem::size_of::(); unsafe { from_raw_parts(ptr, len) } } fn indexes(&self) -> &[DocIndex] { let slice = &self.indexes; let ptr = slice.as_ptr() as *const DocIndex; let len = slice.len() / mem::size_of::(); unsafe { from_raw_parts(ptr, len) } } } pub struct DocIndexesBuilder { keys: BTreeMap, indexes: Vec>, number_docs: usize, wtr: W, } impl DocIndexesBuilder { pub fn new(wtr: W) -> Self { Self { keys: BTreeMap::new(), indexes: Vec::new(), number_docs: 0, wtr: wtr, } } pub fn number_doc_indexes(&self) -> usize { self.number_docs } pub fn insert(&mut self, key: String, value: DocIndex) { match self.keys.entry(key) { Entry::Vacant(e) => { let index = self.indexes.len() as u64; self.indexes.push(vec![value]); e.insert(index); }, Entry::Occupied(e) => { let index = *e.get(); let vec = &mut self.indexes[index as usize]; vec.push(value); }, } self.number_docs += 1; } pub fn keys(&self) -> Iter { self.keys.iter() } pub fn finish(self) -> io::Result<()> { self.into_inner().map(|_| ()) } pub fn into_inner(mut self) -> io::Result { for vec in &mut self.indexes { vec.sort_unstable(); } let (ranges, values) = into_sliced_ranges(self.indexes, self.number_docs); let len = ranges.len() as u64; // TODO check if this is correct self.wtr.write_u64::(len)?; unsafe { // write Ranges first let slice = into_u8_slice(ranges.as_slice()); self.wtr.write_all(slice)?; // write Values after let slice = into_u8_slice(values.as_slice()); self.wtr.write_all(slice)?; } self.wtr.flush()?; Ok(self.wtr) } } fn into_sliced_ranges(vecs: Vec>, number_docs: usize) -> (Vec, Vec) { let cap = vecs.len(); let mut ranges = Vec::with_capacity(cap); let mut values = Vec::with_capacity(number_docs); // @Improvement: remove bounds duplications: the left bound of a range // is already the right bound of the previous range, // we could use a slice window of size 2. for v in &vecs { let len = v.len() as u64; let start = ranges.last().map(|&Range { end, .. }| end).unwrap_or(0); let range = Range { start, end: start + len }; ranges.push(range); } values.extend(vecs.into_iter().flatten()); (ranges, values) } unsafe fn into_u8_slice(slice: &[T]) -> &[u8] { let ptr = slice.as_ptr() as *const u8; let len = slice.len() * mem::size_of::(); from_raw_parts(ptr, len) }