2019-01-01 01:33:59 +08:00
|
|
|
use std::io::{self, Write, Cursor, BufRead};
|
2018-10-28 21:24:04 +08:00
|
|
|
use std::slice::from_raw_parts;
|
2018-12-02 01:37:21 +08:00
|
|
|
use std::mem::size_of;
|
|
|
|
use std::ops::Index;
|
2018-10-28 21:24:04 +08:00
|
|
|
use std::sync::Arc;
|
2018-11-08 19:05:59 +08:00
|
|
|
|
2018-10-28 21:24:04 +08:00
|
|
|
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
|
2018-12-09 21:18:23 +08:00
|
|
|
use sdset::Set;
|
2018-11-08 19:05:59 +08:00
|
|
|
|
2018-10-28 21:24:04 +08:00
|
|
|
use crate::DocIndex;
|
2018-12-30 20:22:02 +08:00
|
|
|
use crate::data::SharedData;
|
2019-01-01 01:33:59 +08:00
|
|
|
use super::into_u8_slice;
|
2018-10-28 21:24:04 +08:00
|
|
|
|
2018-12-01 18:35:16 +08:00
|
|
|
/// Span `start..end` of one inserted set inside the flat `DocIndex` array.
///
/// Both bounds are element positions (not byte offsets): `DocIndexes::get`
/// uses them directly to slice the decoded `DocIndex` array.
///
/// The struct is `#[repr(C)]` because serialized ranges are read back by
/// reinterpreting raw bytes as `Range` values.
#[repr(C)]
#[derive(Debug)]
struct Range {
    /// Position of the first element of the set.
    start: u64,
    /// Position one past the last element of the set.
    end: u64,
}
|
|
|
|
|
2018-11-27 00:30:19 +08:00
|
|
|
#[derive(Clone, Default)]
|
2018-10-28 21:24:04 +08:00
|
|
|
pub struct DocIndexes {
|
2018-12-30 20:22:02 +08:00
|
|
|
ranges: SharedData,
|
|
|
|
indexes: SharedData,
|
2018-10-28 21:24:04 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
impl DocIndexes {
|
2019-01-01 01:33:59 +08:00
|
|
|
pub fn from_bytes(bytes: Vec<u8>) -> io::Result<DocIndexes> {
|
|
|
|
let bytes = Arc::new(bytes);
|
|
|
|
let len = bytes.len();
|
|
|
|
let data = SharedData::new(bytes, 0, len);
|
|
|
|
let mut cursor = Cursor::new(data);
|
|
|
|
DocIndexes::from_cursor(&mut cursor)
|
2018-11-29 00:12:24 +08:00
|
|
|
}
|
|
|
|
|
2019-01-01 01:33:59 +08:00
|
|
|
pub fn from_cursor(cursor: &mut Cursor<SharedData>) -> io::Result<DocIndexes> {
|
|
|
|
let len = cursor.read_u64::<LittleEndian>()? as usize;
|
|
|
|
let offset = cursor.position() as usize;
|
|
|
|
let ranges = cursor.get_ref().range(offset, len);
|
|
|
|
cursor.consume(len);
|
2018-10-28 21:24:04 +08:00
|
|
|
|
2019-01-01 01:33:59 +08:00
|
|
|
let len = cursor.read_u64::<LittleEndian>()? as usize;
|
|
|
|
let offset = cursor.position() as usize;
|
|
|
|
let indexes = cursor.get_ref().range(offset, len);
|
|
|
|
cursor.consume(len);
|
2018-10-28 21:24:04 +08:00
|
|
|
|
|
|
|
Ok(DocIndexes { ranges, indexes })
|
|
|
|
}
|
|
|
|
|
2018-12-30 23:17:18 +08:00
|
|
|
pub fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
|
|
|
|
let ranges_len = self.ranges.len() as u64;
|
2019-01-01 01:33:59 +08:00
|
|
|
let _ = bytes.write_u64::<LittleEndian>(ranges_len);
|
|
|
|
bytes.extend_from_slice(&self.ranges);
|
2018-12-02 01:37:21 +08:00
|
|
|
|
2019-01-01 01:33:59 +08:00
|
|
|
let indexes_len = self.indexes.len() as u64;
|
|
|
|
let _ = bytes.write_u64::<LittleEndian>(indexes_len);
|
2018-12-02 01:37:21 +08:00
|
|
|
bytes.extend_from_slice(&self.indexes);
|
|
|
|
}
|
|
|
|
|
2018-12-09 21:18:23 +08:00
|
|
|
pub fn get(&self, index: usize) -> Option<&Set<DocIndex>> {
|
2018-12-30 23:17:18 +08:00
|
|
|
self.ranges().get(index).map(|Range { start, end }| {
|
2018-10-28 21:24:04 +08:00
|
|
|
let start = *start as usize;
|
|
|
|
let end = *end as usize;
|
2018-12-09 21:18:23 +08:00
|
|
|
let slice = &self.indexes()[start..end];
|
|
|
|
Set::new_unchecked(slice)
|
2018-10-28 21:24:04 +08:00
|
|
|
})
|
|
|
|
}
|
|
|
|
|
|
|
|
fn ranges(&self) -> &[Range] {
|
|
|
|
let slice = &self.ranges;
|
|
|
|
let ptr = slice.as_ptr() as *const Range;
|
2018-12-02 01:37:21 +08:00
|
|
|
let len = slice.len() / size_of::<Range>();
|
2018-10-28 21:24:04 +08:00
|
|
|
unsafe { from_raw_parts(ptr, len) }
|
|
|
|
}
|
|
|
|
|
|
|
|
fn indexes(&self) -> &[DocIndex] {
|
|
|
|
let slice = &self.indexes;
|
|
|
|
let ptr = slice.as_ptr() as *const DocIndex;
|
2018-12-02 01:37:21 +08:00
|
|
|
let len = slice.len() / size_of::<DocIndex>();
|
2018-10-28 21:24:04 +08:00
|
|
|
unsafe { from_raw_parts(ptr, len) }
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-12-02 01:37:21 +08:00
|
|
|
impl Index<usize> for DocIndexes {
|
|
|
|
type Output = [DocIndex];
|
|
|
|
|
|
|
|
fn index(&self, index: usize) -> &Self::Output {
|
|
|
|
match self.get(index) {
|
|
|
|
Some(indexes) => indexes,
|
|
|
|
None => panic!("index {} out of range for a maximum of {} ranges", index, self.ranges().len()),
|
|
|
|
}
|
2018-11-24 01:00:24 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-12-01 18:35:16 +08:00
|
|
|
pub struct DocIndexesBuilder<W> {
|
2018-11-27 00:30:19 +08:00
|
|
|
ranges: Vec<Range>,
|
2019-01-01 01:33:59 +08:00
|
|
|
indexes: Vec<DocIndex>,
|
2018-11-27 00:30:19 +08:00
|
|
|
wtr: W,
|
|
|
|
}
|
|
|
|
|
2018-12-01 18:35:16 +08:00
|
|
|
impl DocIndexesBuilder<Vec<u8>> {
|
2018-11-27 00:30:19 +08:00
|
|
|
pub fn memory() -> Self {
|
2019-01-01 01:33:59 +08:00
|
|
|
DocIndexesBuilder {
|
|
|
|
ranges: Vec::new(),
|
|
|
|
indexes: Vec::new(),
|
|
|
|
wtr: Vec::new(),
|
|
|
|
}
|
2018-11-27 00:30:19 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-12-01 18:35:16 +08:00
|
|
|
impl<W: Write> DocIndexesBuilder<W> {
|
2018-11-27 00:30:19 +08:00
|
|
|
pub fn new(wtr: W) -> Self {
|
2018-12-01 18:35:16 +08:00
|
|
|
DocIndexesBuilder {
|
2018-11-27 00:30:19 +08:00
|
|
|
ranges: Vec::new(),
|
2019-01-01 01:33:59 +08:00
|
|
|
indexes: Vec::new(),
|
2018-11-27 00:30:19 +08:00
|
|
|
wtr: wtr,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-01-01 01:33:59 +08:00
|
|
|
pub fn insert(&mut self, indexes: &Set<DocIndex>) {
|
2018-11-27 00:30:19 +08:00
|
|
|
let len = indexes.len() as u64;
|
2018-12-01 18:35:16 +08:00
|
|
|
let start = self.ranges.last().map(|r| r.end).unwrap_or(0);
|
2018-11-27 00:30:19 +08:00
|
|
|
let range = Range { start, end: start + len };
|
|
|
|
self.ranges.push(range);
|
|
|
|
|
2019-01-01 01:33:59 +08:00
|
|
|
self.indexes.extend_from_slice(indexes);
|
2018-11-27 00:30:19 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
pub fn finish(self) -> io::Result<()> {
|
|
|
|
self.into_inner().map(drop)
|
|
|
|
}
|
|
|
|
|
|
|
|
pub fn into_inner(mut self) -> io::Result<W> {
|
2019-01-01 01:33:59 +08:00
|
|
|
let ranges = unsafe { into_u8_slice(&self.ranges) };
|
|
|
|
let len = ranges.len() as u64;
|
|
|
|
self.wtr.write_u64::<LittleEndian>(len)?;
|
2018-11-27 00:30:19 +08:00
|
|
|
self.wtr.write_all(ranges)?;
|
|
|
|
|
2019-01-01 01:33:59 +08:00
|
|
|
let indexes = unsafe { into_u8_slice(&self.indexes) };
|
|
|
|
let len = indexes.len() as u64;
|
2018-11-27 00:30:19 +08:00
|
|
|
self.wtr.write_u64::<LittleEndian>(len)?;
|
2019-01-01 01:33:59 +08:00
|
|
|
self.wtr.write_all(indexes)?;
|
2018-11-27 00:30:19 +08:00
|
|
|
|
|
|
|
Ok(self.wtr)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-12-01 18:35:16 +08:00
|
|
|
#[cfg(test)]
mod tests {
    use std::error::Error;

    use crate::DocumentId;

    use super::*;

    /// Three distinct `DocIndex` values, in increasing document id order,
    /// shared by every test.
    fn sample_doc_indexes() -> (DocIndex, DocIndex, DocIndex) {
        let a = DocIndex {
            document_id: DocumentId(0),
            attribute: 3,
            word_index: 11,
            char_index: 30,
            char_length: 4,
        };
        let b = DocIndex {
            document_id: DocumentId(1),
            attribute: 4,
            word_index: 21,
            char_index: 35,
            char_length: 6,
        };
        let c = DocIndex {
            document_id: DocumentId(2),
            attribute: 8,
            word_index: 2,
            char_index: 89,
            char_length: 6,
        };

        (a, b, c)
    }

    /// Serializes the sets `[a]`, `[a, b, c]` and `[a, c]`, in that order,
    /// through an in-memory builder.
    fn build_sample_bytes(
        a: DocIndex,
        b: DocIndex,
        c: DocIndex,
    ) -> Result<Vec<u8>, Box<dyn Error>> {
        let mut builder = DocIndexesBuilder::memory();

        builder.insert(Set::new(&[a])?);
        builder.insert(Set::new(&[a, b, c])?);
        builder.insert(Set::new(&[a, c])?);

        Ok(builder.into_inner()?)
    }

    /// What the builder writes can be read back set by set.
    #[test]
    fn builder_serialize_deserialize() -> Result<(), Box<dyn Error>> {
        let (a, b, c) = sample_doc_indexes();

        let bytes = build_sample_bytes(a, b, c)?;
        let docs = DocIndexes::from_bytes(bytes)?;

        assert_eq!(docs.get(0), Some(Set::new(&[a])?));
        assert_eq!(docs.get(1), Some(Set::new(&[a, b, c])?));
        assert_eq!(docs.get(2), Some(Set::new(&[a, c])?));
        assert_eq!(docs.get(3), None);

        Ok(())
    }

    /// `write_to_bytes` reproduces exactly the bytes emitted by the builder.
    #[test]
    fn serialize_deserialize() -> Result<(), Box<dyn Error>> {
        let (a, b, c) = sample_doc_indexes();

        let builder_bytes = build_sample_bytes(a, b, c)?;
        let docs = DocIndexes::from_bytes(builder_bytes.clone())?;

        let mut bytes = Vec::new();
        docs.write_to_bytes(&mut bytes);

        assert_eq!(builder_bytes, bytes);

        Ok(())
    }
}
|