2018-10-28 21:24:04 +08:00
|
|
|
use std::slice::from_raw_parts;
|
|
|
|
use std::io::{self, Write};
|
|
|
|
use std::path::Path;
|
|
|
|
use std::sync::Arc;
|
|
|
|
use std::mem;
|
2018-11-08 19:05:59 +08:00
|
|
|
|
2018-10-28 21:24:04 +08:00
|
|
|
use fst::raw::MmapReadOnly;
|
|
|
|
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
|
2018-11-24 01:00:24 +08:00
|
|
|
use serde::ser::{Serialize, Serializer, SerializeTuple};
|
2018-11-08 19:05:59 +08:00
|
|
|
|
2018-10-28 21:24:04 +08:00
|
|
|
use crate::DocIndex;
|
2018-11-08 19:05:59 +08:00
|
|
|
use crate::data::Data;
|
2018-10-28 21:24:04 +08:00
|
|
|
|
2018-12-01 18:35:16 +08:00
|
|
|
/// Span of `DocIndex` entries belonging to one inserted group.
///
/// `start` and `end` are element positions (not byte offsets) into the
/// `DocIndex` slice; the entries of the group are `start..end`.
///
/// The layout is fixed with `repr(C)` because values of this type are written
/// out as raw bytes by the builder and reinterpreted from raw bytes on read.
#[repr(C)]
#[derive(Debug)]
struct Range {
    /// Position of the first entry of the group.
    start: u64,
    /// Position one past the last entry of the group.
    end: u64,
}
|
|
|
|
|
2018-11-27 00:30:19 +08:00
|
|
|
#[derive(Clone, Default)]
|
2018-10-28 21:24:04 +08:00
|
|
|
pub struct DocIndexes {
|
2018-11-08 19:05:59 +08:00
|
|
|
ranges: Data,
|
|
|
|
indexes: Data,
|
2018-10-28 21:24:04 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
impl DocIndexes {
|
|
|
|
pub unsafe fn from_path<P: AsRef<Path>>(path: P) -> io::Result<Self> {
|
|
|
|
let mmap = MmapReadOnly::open_path(path)?;
|
2018-11-29 00:12:24 +08:00
|
|
|
DocIndexes::from_data(Data::Mmap(mmap))
|
2018-10-28 21:24:04 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
pub fn from_bytes(vec: Vec<u8>) -> io::Result<Self> {
|
2018-11-29 00:12:24 +08:00
|
|
|
let len = vec.len();
|
|
|
|
DocIndexes::from_shared_bytes(Arc::new(vec), 0, len)
|
|
|
|
}
|
|
|
|
|
|
|
|
pub fn from_shared_bytes(bytes: Arc<Vec<u8>>, offset: usize, len: usize) -> io::Result<Self> {
|
|
|
|
let data = Data::Shared { bytes, offset, len };
|
|
|
|
DocIndexes::from_data(data)
|
|
|
|
}
|
2018-10-28 21:24:04 +08:00
|
|
|
|
2018-11-29 00:12:24 +08:00
|
|
|
fn from_data(data: Data) -> io::Result<Self> {
|
|
|
|
let ranges_len_offset = data.len() - mem::size_of::<u64>();
|
|
|
|
let ranges_len = (&data[ranges_len_offset..]).read_u64::<LittleEndian>()?;
|
2018-12-01 18:35:16 +08:00
|
|
|
let ranges_len = ranges_len as usize;
|
2018-10-28 21:24:04 +08:00
|
|
|
|
2018-11-27 00:30:19 +08:00
|
|
|
let ranges_offset = ranges_len_offset - ranges_len;
|
2018-11-29 00:12:24 +08:00
|
|
|
let ranges = data.range(ranges_offset, ranges_len);
|
2018-10-28 21:24:04 +08:00
|
|
|
|
2018-11-29 00:12:24 +08:00
|
|
|
let indexes = data.range(0, ranges_offset);
|
2018-10-28 21:24:04 +08:00
|
|
|
|
|
|
|
Ok(DocIndexes { ranges, indexes })
|
|
|
|
}
|
|
|
|
|
|
|
|
pub fn get(&self, index: u64) -> Option<&[DocIndex]> {
|
|
|
|
self.ranges().get(index as usize).map(|Range { start, end }| {
|
|
|
|
let start = *start as usize;
|
|
|
|
let end = *end as usize;
|
|
|
|
&self.indexes()[start..end]
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
|
|
|
fn ranges(&self) -> &[Range] {
|
|
|
|
let slice = &self.ranges;
|
|
|
|
let ptr = slice.as_ptr() as *const Range;
|
|
|
|
let len = slice.len() / mem::size_of::<Range>();
|
|
|
|
unsafe { from_raw_parts(ptr, len) }
|
|
|
|
}
|
|
|
|
|
|
|
|
fn indexes(&self) -> &[DocIndex] {
|
|
|
|
let slice = &self.indexes;
|
|
|
|
let ptr = slice.as_ptr() as *const DocIndex;
|
|
|
|
let len = slice.len() / mem::size_of::<DocIndex>();
|
|
|
|
unsafe { from_raw_parts(ptr, len) }
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-11-24 01:00:24 +08:00
|
|
|
impl Serialize for DocIndexes {
|
|
|
|
fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
|
|
|
|
let mut tuple = serializer.serialize_tuple(2)?;
|
|
|
|
tuple.serialize_element(self.ranges.as_ref())?;
|
|
|
|
tuple.serialize_element(self.indexes.as_ref())?;
|
|
|
|
tuple.end()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-12-01 18:35:16 +08:00
|
|
|
pub struct DocIndexesBuilder<W> {
|
2018-11-27 00:30:19 +08:00
|
|
|
ranges: Vec<Range>,
|
|
|
|
wtr: W,
|
|
|
|
}
|
|
|
|
|
2018-12-01 18:35:16 +08:00
|
|
|
impl DocIndexesBuilder<Vec<u8>> {
|
2018-11-27 00:30:19 +08:00
|
|
|
pub fn memory() -> Self {
|
2018-12-01 18:35:16 +08:00
|
|
|
DocIndexesBuilder::new(Vec::new())
|
2018-11-27 00:30:19 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-12-01 18:35:16 +08:00
|
|
|
impl<W: Write> DocIndexesBuilder<W> {
|
2018-11-27 00:30:19 +08:00
|
|
|
pub fn new(wtr: W) -> Self {
|
2018-12-01 18:35:16 +08:00
|
|
|
DocIndexesBuilder {
|
2018-11-27 00:30:19 +08:00
|
|
|
ranges: Vec::new(),
|
|
|
|
wtr: wtr,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
pub fn insert(&mut self, indexes: &[DocIndex]) -> io::Result<()> {
|
|
|
|
let len = indexes.len() as u64;
|
2018-12-01 18:35:16 +08:00
|
|
|
let start = self.ranges.last().map(|r| r.end).unwrap_or(0);
|
2018-11-27 00:30:19 +08:00
|
|
|
let range = Range { start, end: start + len };
|
|
|
|
self.ranges.push(range);
|
|
|
|
|
|
|
|
// write the values
|
|
|
|
let indexes = unsafe { into_u8_slice(indexes) };
|
|
|
|
self.wtr.write_all(indexes)
|
|
|
|
}
|
|
|
|
|
|
|
|
pub fn finish(self) -> io::Result<()> {
|
|
|
|
self.into_inner().map(drop)
|
|
|
|
}
|
|
|
|
|
|
|
|
pub fn into_inner(mut self) -> io::Result<W> {
|
|
|
|
// write the ranges
|
|
|
|
let ranges = unsafe { into_u8_slice(self.ranges.as_slice()) };
|
|
|
|
self.wtr.write_all(ranges)?;
|
|
|
|
|
|
|
|
// write the length of the ranges
|
|
|
|
let len = ranges.len() as u64;
|
|
|
|
self.wtr.write_u64::<LittleEndian>(len)?;
|
|
|
|
|
|
|
|
Ok(self.wtr)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-12-01 18:35:16 +08:00
|
|
|
/// Views the memory occupied by `slice` as a slice of bytes.
///
/// The returned slice starts at the same address as `slice` and is
/// `slice.len() * size_of::<T>()` bytes long.
///
/// # Safety
///
/// The caller must make sure every byte of the values in `slice` may be read
/// as a `u8` (for instance, that `T` has no uninitialized padding).
unsafe fn into_u8_slice<T>(slice: &[T]) -> &[u8] {
    // For a slice, `size_of_val` is the element size times the element count.
    let byte_len = mem::size_of_val(slice);
    from_raw_parts(slice.as_ptr() as *const u8, byte_len)
}
|
|
|
|
|
2018-12-01 18:35:16 +08:00
|
|
|
#[cfg(test)]
mod tests {
    use super::*;
    use std::error::Error;

    /// Builds three groups in memory, parses the produced bytes back and
    /// checks that every group is returned unchanged at its position.
    #[test]
    fn serialize_deserialize() -> Result<(), Box<dyn Error>> {
        let a = DocIndex { document_id: 0, attribute: 3, attribute_index: 11 };
        let b = DocIndex { document_id: 1, attribute: 4, attribute_index: 21 };
        let c = DocIndex { document_id: 2, attribute: 8, attribute_index: 2 };

        let mut builder = DocIndexesBuilder::memory();

        builder.insert(&[a])?;
        builder.insert(&[a, b, c])?;
        builder.insert(&[a, c])?;

        let bytes = builder.into_inner()?;
        let docs = DocIndexes::from_bytes(bytes)?;

        assert_eq!(docs.get(0).unwrap(), &[a]);
        assert_eq!(docs.get(1).unwrap(), &[a, b, c]);
        assert_eq!(docs.get(2).unwrap(), &[a, c]);

        Ok(())
    }
}
|