mirror of https://github.com/meilisearch/meilisearch.git
feat: Introduce the "data-index" entry with merge compaction
commit b636e5fe57
parent 0e856db4e6
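A note on the mechanism, as the hunks below suggest: this commit collapses the per-blob storage scheme (a "blobs-order" list plus separate fst/doc-idx/doc-ids entries per blob) into a single "data-index" RocksDB entry whose value is maintained by a merge operator. A RocksDB merge operator behaves like a left fold: the engine stacks the operands written for a key and, at read or compaction time, folds them into the existing value with the registered function. A minimal model of that shape, with illustrative names only (not code from this commit):

    // Illustrative model: RocksDB stacks merge operands for a key and,
    // on read or compaction, folds them into the existing value.
    fn fold_merge<T: Default>(
        existing: Option<T>,
        operands: Vec<T>,
        merge: impl Fn(T, T) -> T,
    ) -> T {
        operands.into_iter().fold(existing.unwrap_or_default(), merge)
    }

Compaction thus performs the index merging lazily and incrementally, which is the "merge compaction" of the commit title.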
@@ -8,10 +8,9 @@ authors = ["Kerollmops <renault.cle@gmail.com>"]
 bincode = "1.0"
 byteorder = "1.2"
 fnv = "1.0"
-fs2 = "0.4"
 lazy_static = "1.1"
 linked-hash-map = { version = "0.5", features = ["serde_impl"] }
-sdset = "0.2"
+sdset = "0.3"
 serde = "1.0"
 serde_derive = "1.0"
 unidecode = "0.3"
@@ -1,17 +1,17 @@
-use crate::vec_read_only::VecReadOnly;
 use std::collections::BinaryHeap;
-use std::{mem, cmp};
 use std::rc::Rc;
+use std::cmp;
 
 use fst::{Automaton, Streamer};
 use fst::automaton::AlwaysMatch;
-use sdset::{Set, SetBuf, SetOperation};
+use sdset::{Set, SetOperation};
 use sdset::duo::OpBuilder as SdOpBuilder;
 use group_by::GroupBy;
 
-use crate::blob::{Blob, Sign};
-use crate::blob::ops::{OpBuilder, Union, IndexedDocIndexes};
 use crate::DocIndex;
+use crate::blob::{Blob, Sign};
+use crate::vec_read_only::VecReadOnly;
+use crate::blob::ops::{OpBuilder, Union, IndexedDocIndexes};
 
 fn group_is_negative(blobs: &&[Blob]) -> bool {
     blobs[0].sign() == Sign::Negative
@@ -12,7 +12,6 @@ use std::error::Error;
 use std::io::{Write, Read};
 use std::{io, fmt, mem};
 
-use fst::Map;
 use uuid::Uuid;
 use rocksdb::rocksdb::{DB, Snapshot};
 use serde::ser::{Serialize, Serializer, SerializeTuple};
@@ -108,100 +107,3 @@ impl Sign {
         }
     }
 }
-
-#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
-pub struct BlobName(Uuid);
-
-impl BlobName {
-    pub fn new() -> BlobName {
-        BlobName(Uuid::new_v4())
-    }
-
-    pub fn as_bytes(&self) -> &[u8; 16] {
-        self.0.as_bytes()
-    }
-}
-
-impl fmt::Display for BlobName {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        f.debug_tuple("BlobName")
-            .field(&self.0.to_hyphenated().to_string())
-            .finish()
-    }
-}
-
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
-pub struct BlobInfo {
-    pub sign: Sign,
-    pub name: BlobName,
-}
-
-impl BlobInfo {
-    pub fn new_positive() -> BlobInfo {
-        BlobInfo {
-            sign: Sign::Positive,
-            name: BlobName::new(),
-        }
-    }
-
-    pub fn new_negative() -> BlobInfo {
-        BlobInfo {
-            sign: Sign::Negative,
-            name: BlobName::new(),
-        }
-    }
-
-    pub fn read_from<R: Read>(reader: R) -> bincode::Result<BlobInfo> {
-        bincode::deserialize_from(reader)
-    }
-
-    pub fn read_from_slice(slice: &[u8]) -> bincode::Result<Vec<BlobInfo>> {
-        let len = slice.len() / mem::size_of::<BlobInfo>();
-        let mut blob_infos = Vec::with_capacity(len);
-
-        let mut cursor = io::Cursor::new(slice);
-        while blob_infos.len() != len {
-            let blob_info = BlobInfo::read_from(&mut cursor)?;
-            blob_infos.push(blob_info);
-        }
-
-        Ok(blob_infos)
-    }
-
-    pub fn write_into<W: Write>(&self, writer: W) -> bincode::Result<()> {
-        bincode::serialize_into(writer, self)
-    }
-}
-
-pub fn blobs_from_blob_infos(infos: &[BlobInfo], snapshot: &Snapshot<&DB>) -> Result<Vec<Blob>, Box<Error>> {
-    let mut blobs = Vec::with_capacity(infos.len());
-
-    for info in infos {
-        let blob = match info.sign {
-            Sign::Positive => {
-                let blob_key = Identifier::blob(info.name).fst_map().build();
-                let map = match snapshot.get(&blob_key)? {
-                    Some(value) => value.to_vec(),
-                    None => return Err(format!("No fst entry found for blob {}", info.name).into()),
-                };
-                let blob_key = Identifier::blob(info.name).document_indexes().build();
-                let doc_idx = match snapshot.get(&blob_key)? {
-                    Some(value) => value.to_vec(),
-                    None => return Err(format!("No doc-idx entry found for blob {}", info.name).into()),
-                };
-                PositiveBlob::from_bytes(map, doc_idx).map(Blob::Positive)?
-            },
-            Sign::Negative => {
-                let blob_key = Identifier::blob(info.name).document_ids().build();
-                let doc_ids = match snapshot.get(&blob_key)? {
-                    Some(value) => value.to_vec(),
-                    None => return Err(format!("No doc-ids entry found for blob {}", info.name).into()),
-                };
-                NegativeBlob::from_bytes(doc_ids).map(Blob::Negative)?
-            },
-        };
-        blobs.push(blob);
-    }
-
-    Ok(blobs)
-}
@@ -1,6 +1,6 @@
-use std::io::{Read, Write};
-use std::error::Error;
+use std::io::Write;
 use std::path::Path;
+use std::error::Error;
 
 use crate::DocumentId;
 use crate::data::{DocIds, DocIdsBuilder};
@@ -24,6 +24,10 @@ impl NegativeBlob {
         Ok(NegativeBlob { doc_ids })
     }
 
+    pub fn from_raw(doc_ids: DocIds) -> Self {
+        NegativeBlob { doc_ids }
+    }
+
     pub fn as_ids(&self) -> &DocIds {
         &self.doc_ids
     }
@@ -1,7 +1,7 @@
-use std::io::{Read, Write};
-use std::error::Error;
-use std::path::Path;
 use std::fmt;
+use std::io::Write;
+use std::path::Path;
+use std::error::Error;
 
 use fst::{Map, MapBuilder};
 
@@ -10,6 +10,7 @@ use crate::data::{DocIndexes, DocIndexesBuilder};
 use serde::ser::{Serialize, Serializer, SerializeTuple};
 use serde::de::{self, Deserialize, Deserializer, SeqAccess, Visitor};
 
+#[derive(Default)]
 pub struct PositiveBlob {
     map: Map,
     indexes: DocIndexes,
@@ -31,6 +32,10 @@ impl PositiveBlob {
         Ok(PositiveBlob { map, indexes })
     }
 
+    pub fn from_raw(map: Map, indexes: DocIndexes) -> Self {
+        PositiveBlob { map, indexes }
+    }
+
     pub fn get<K: AsRef<[u8]>>(&self, key: K) -> Option<&[DocIndex]> {
         self.map.get(key).and_then(|index| self.indexes.get(index))
     }
@@ -109,7 +114,7 @@ impl<W: Write, X: Write> PositiveBlobBuilder<W, X> {
     }
 
     pub fn finish(self) -> Result<(), Box<Error>> {
-        self.into_inner().map(|_| ())
+        self.into_inner().map(drop)
     }
 
     pub fn into_inner(self) -> Result<(W, X), Box<Error>> {
@@ -130,6 +135,10 @@ impl<W: Write, X: Write> PositiveBlobBuilder<W, X> {
 }
 
 impl PositiveBlobBuilder<Vec<u8>, Vec<u8>> {
+    pub fn memory() -> Self {
+        PositiveBlobBuilder::new(Vec::new(), Vec::new())
+    }
+
     pub fn build(self) -> Result<PositiveBlob, Box<Error>> {
         self.into_inner().and_then(|(m, i)| PositiveBlob::from_bytes(m, i))
     }
@@ -35,6 +35,10 @@ impl DocIds {
         Ok(DocIds { data })
     }
 
+    pub fn from_document_ids(vec: Vec<DocumentId>) -> Self {
+        DocIds::from_bytes(unsafe { mem::transmute(vec) }).unwrap()
+    }
+
     pub fn contains(&self, doc: DocumentId) -> bool {
         // FIXME prefer using the sdset::exponential_search function
         self.doc_ids().binary_search(&doc).is_ok()
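One caveat on the added from_document_ids: mem::transmute between Vec<DocumentId> and Vec<u8> keeps the numeric length field, so the resulting byte vector's len appears to count DocumentId elements rather than bytes (and Vec's layout is not guaranteed for transmute in the first place). A hedged, safer shape, assuming DocumentId is u64-sized:

    // Copies ids into a little-endian byte buffer whose length is a real
    // byte count; `u64` stands in for DocumentId here.
    use byteorder::{LittleEndian, WriteBytesExt};

    fn document_ids_to_bytes(ids: &[u64]) -> Vec<u8> {
        let mut bytes = Vec::with_capacity(ids.len() * std::mem::size_of::<u64>());
        for &id in ids {
            bytes.write_u64::<LittleEndian>(id).expect("writing to a Vec cannot fail");
        }
        bytes
    }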
@@ -19,7 +19,7 @@ struct Range {
     end: u64,
 }
 
-#[derive(Clone)]
+#[derive(Clone, Default)]
 pub struct DocIndexes {
     ranges: Data,
     indexes: Data,
@@ -29,15 +29,14 @@ impl DocIndexes {
     pub unsafe fn from_path<P: AsRef<Path>>(path: P) -> io::Result<Self> {
         let mmap = MmapReadOnly::open_path(path)?;
 
-        let range_len = mmap.as_slice().read_u64::<LittleEndian>()?;
-        let range_len = range_len as usize * mem::size_of::<Range>();
+        let ranges_len_offset = mmap.as_slice().len() - mem::size_of::<u64>();
+        let ranges_len = (&mmap.as_slice()[ranges_len_offset..]).read_u64::<LittleEndian>()?;
+        let ranges_len = ranges_len as usize * mem::size_of::<Range>();
 
-        let offset = mem::size_of::<u64>() as usize;
-        let ranges = Data::Mmap(mmap.range(offset, range_len));
+        let ranges_offset = ranges_len_offset - ranges_len;
+        let ranges = Data::Mmap(mmap.range(ranges_offset, ranges_len));
 
-        let len = mmap.len() - range_len - offset;
-        let offset = offset + range_len;
-        let indexes = Data::Mmap(mmap.range(offset, len));
+        let indexes = Data::Mmap(mmap.range(0, ranges_offset));
 
         Ok(DocIndexes { ranges, indexes })
     }
@@ -45,19 +44,22 @@ impl DocIndexes {
     pub fn from_bytes(vec: Vec<u8>) -> io::Result<Self> {
         let vec = Arc::new(vec);
 
-        let range_len = vec.as_slice().read_u64::<LittleEndian>()?;
-        let range_len = range_len as usize * mem::size_of::<Range>();
+        let ranges_len_offset = vec.len() - mem::size_of::<u64>();
+        let ranges_len = (&vec[ranges_len_offset..]).read_u64::<LittleEndian>()?;
+        let ranges_len = ranges_len as usize * mem::size_of::<Range>();
 
-        let offset = mem::size_of::<u64>() as usize;
+        let ranges_offset = ranges_len_offset - ranges_len;
         let ranges = Data::Shared {
             vec: vec.clone(),
-            offset,
-            len: range_len
+            offset: ranges_offset,
+            len: ranges_len,
         };
 
-        let len = vec.len() - range_len - offset;
-        let offset = offset + range_len;
-        let indexes = Data::Shared { vec, offset, len };
+        let indexes = Data::Shared {
+            vec: vec,
+            offset: 0,
+            len: ranges_offset,
+        };
 
         Ok(DocIndexes { ranges, indexes })
     }
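Both constructors now decode the same end-anchored layout: index bytes first, then the packed Range array, then one trailing little-endian u64 holding the number of ranges. Anchoring the metadata at the end lets a writer stream values without knowing their total size up front. A condensed sketch of the decode step, assuming a well-formed buffer (the helper name is made up; Range fields as in this file):

    use std::{io, mem};
    use byteorder::{LittleEndian, ReadBytesExt};

    #[repr(C)]
    struct Range { start: u64, end: u64 }

    // Split a serialized DocIndexes buffer into (indexes, ranges) slices
    // by reading the trailing range count, as the readers above do.
    fn split_layout(bytes: &[u8]) -> io::Result<(&[u8], &[u8])> {
        let len_offset = bytes.len() - mem::size_of::<u64>();
        let count = (&bytes[len_offset..]).read_u64::<LittleEndian>()? as usize;
        let ranges_offset = len_offset - count * mem::size_of::<Range>();
        Ok((&bytes[..ranges_offset], &bytes[ranges_offset..len_offset]))
    }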
@@ -94,6 +96,53 @@ impl Serialize for DocIndexes {
     }
 }
 
+pub struct RawDocIndexesBuilder<W> {
+    ranges: Vec<Range>,
+    wtr: W,
+}
+
+impl RawDocIndexesBuilder<Vec<u8>> {
+    pub fn memory() -> Self {
+        RawDocIndexesBuilder::new(Vec::new())
+    }
+}
+
+impl<W: Write> RawDocIndexesBuilder<W> {
+    pub fn new(wtr: W) -> Self {
+        RawDocIndexesBuilder {
+            ranges: Vec::new(),
+            wtr: wtr,
+        }
+    }
+
+    pub fn insert(&mut self, indexes: &[DocIndex]) -> io::Result<()> {
+        let len = indexes.len() as u64;
+        let start = self.ranges.last().map(|r| r.start).unwrap_or(0);
+        let range = Range { start, end: start + len };
+        self.ranges.push(range);
+
+        // write the values
+        let indexes = unsafe { into_u8_slice(indexes) };
+        self.wtr.write_all(indexes)
+    }
+
+    pub fn finish(self) -> io::Result<()> {
+        self.into_inner().map(drop)
+    }
+
+    pub fn into_inner(mut self) -> io::Result<W> {
+        // write the ranges
+        let ranges = unsafe { into_u8_slice(self.ranges.as_slice()) };
+        self.wtr.write_all(ranges)?;
+
+        // write the length of the ranges
+        let len = ranges.len() as u64;
+        self.wtr.write_u64::<LittleEndian>(len)?;
+
+        Ok(self.wtr)
+    }
+}
+
 pub struct DocIndexesBuilder<W> {
     keys: BTreeMap<String, u64>,
     indexes: Vec<Vec<DocIndex>>,
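Two details of the new RawDocIndexesBuilder look worth flagging. First, insert seeds each new Range from the previous range's start rather than its end, so consecutive ranges overlap. Second, into_inner takes the length of the already-converted byte slice, while DocIndexes::from_bytes scales the trailing word by mem::size_of::<Range>(), i.e. expects an element count (which is what DocIndexesBuilder below writes). Both read as oversights rather than intent. Intended usage is symmetric with the readers; a hedged sketch using the crate's own types:

    // Crate-internal sketch: build a buffer of posting groups, then
    // reload it through the end-anchored reader.
    fn build_and_reload(groups: &[Vec<DocIndex>]) -> std::io::Result<DocIndexes> {
        let mut builder = RawDocIndexesBuilder::memory();
        for group in groups {
            builder.insert(group)?;        // appends values, records a Range
        }
        let bytes = builder.into_inner()?; // [values][ranges][trailing u64]
        DocIndexes::from_bytes(bytes)
    }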
@@ -136,29 +185,27 @@ impl<W: Write> DocIndexesBuilder<W> {
     }
 
     pub fn finish(self) -> io::Result<()> {
-        self.into_inner().map(|_| ())
+        self.into_inner().map(drop)
     }
 
     pub fn into_inner(mut self) -> io::Result<W> {
         for vec in &mut self.indexes {
            vec.sort_unstable();
         }
 
         let (ranges, values) = into_sliced_ranges(self.indexes, self.number_docs);
 
+        // write values first
+        let slice = unsafe { into_u8_slice(values.as_slice()) };
+        self.wtr.write_all(slice)?;
+
+        // write ranges after
+        let slice = unsafe { into_u8_slice(ranges.as_slice()) };
+        self.wtr.write_all(slice)?;
+
+        // write the length of the ranges
         let len = ranges.len() as u64;
-
-        // TODO check if this is correct
         self.wtr.write_u64::<LittleEndian>(len)?;
-        unsafe {
-            // write Ranges first
-            let slice = into_u8_slice(ranges.as_slice());
-            self.wtr.write_all(slice)?;
-
-            // write Values after
-            let slice = into_u8_slice(values.as_slice());
-            self.wtr.write_all(slice)?;
-        }
 
         self.wtr.flush()?;
         Ok(self.wtr)
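The rewritten into_inner matches the end-anchored layout the readers expect: values first, ranges second, the range count last, replacing the old order that wrote the length up front. The writer side, reduced to its ordering (illustrative names, not the crate's API):

    use std::io::{self, Write};
    use byteorder::{LittleEndian, WriteBytesExt};

    fn write_layout<W: Write>(mut wtr: W, values: &[u8], ranges: &[u8], range_count: u64) -> io::Result<()> {
        wtr.write_all(values)?;                      // values first
        wtr.write_all(ranges)?;                      // ranges after
        wtr.write_u64::<LittleEndian>(range_count)?; // trailing count, read last
        wtr.flush()
    }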
@@ -7,7 +7,7 @@ use std::sync::Arc;
 use fst::raw::MmapReadOnly;
 
 pub use self::doc_ids::{DocIds, DocIdsBuilder};
-pub use self::doc_indexes::{DocIndexes, DocIndexesBuilder};
+pub use self::doc_indexes::{DocIndexes, DocIndexesBuilder, RawDocIndexesBuilder};
 
 #[derive(Clone)]
 enum Data {
@@ -19,6 +19,16 @@ enum Data {
     Mmap(MmapReadOnly),
 }
 
+impl Default for Data {
+    fn default() -> Data {
+        Data::Shared {
+            vec: Arc::default(),
+            offset: 0,
+            len: 0,
+        }
+    }
+}
+
 impl Deref for Data {
     type Target = [u8];
 
@@ -3,7 +3,6 @@ use std::io::Write;
 use byteorder::{NetworkEndian, WriteBytesExt};
 
 use crate::index::schema::SchemaAttr;
-use crate::blob::BlobName;
 use crate::DocumentId;
 
 pub struct Identifier {
@@ -17,13 +16,6 @@ impl Identifier {
         Data { inner }
     }
 
-    pub fn blob(name: BlobName) -> Blob {
-        let mut inner = Vec::new();
-        let _ = inner.write(b"blob");
-        let _ = inner.write(name.as_bytes());
-        Blob { inner }
-    }
-
     pub fn document(id: DocumentId) -> Document {
         let mut inner = Vec::new();
         let _ = inner.write(b"docu");
@@ -38,9 +30,9 @@ pub struct Data {
 }
 
 impl Data {
-    pub fn blobs_order(mut self) -> Self {
+    pub fn index(mut self) -> Self {
         let _ = self.inner.write(b"-");
-        let _ = self.inner.write(b"blobs-order");
+        let _ = self.inner.write(b"index");
         self
     }
 
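After this rename, the key that the merge operator guards is the literal concatenation of the segments. Spelled out, assuming Identifier::data() writes a "data" prefix the way its sibling constructors write theirs:

    // Equivalent of Identifier::data().index().build(), spelled out:
    fn data_index_key() -> Vec<u8> {
        let mut inner = Vec::new();
        inner.extend_from_slice(b"data");  // assumed prefix from Identifier::data()
        inner.extend_from_slice(b"-");
        inner.extend_from_slice(b"index");
        inner                              // == b"data-index"
    }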
@@ -55,34 +47,6 @@ impl Data {
     }
 }
 
-pub struct Blob {
-    inner: Vec<u8>,
-}
-
-impl Blob {
-    pub fn document_indexes(mut self) -> Self {
-        let _ = self.inner.write(b"-");
-        let _ = self.inner.write(b"doc-idx");
-        self
-    }
-
-    pub fn document_ids(mut self) -> Self {
-        let _ = self.inner.write(b"-");
-        let _ = self.inner.write(b"doc-ids");
-        self
-    }
-
-    pub fn fst_map(mut self) -> Self {
-        let _ = self.inner.write(b"-");
-        let _ = self.inner.write(b"fst");
-        self
-    }
-
-    pub fn build(self) -> Vec<u8> {
-        self.inner
-    }
-}
-
 pub struct Document {
     inner: Vec<u8>,
 }
src/index/mod.rs (182 lines changed)
@@ -2,76 +2,163 @@ pub mod identifier;
 pub mod schema;
 pub mod update;
 
-use std::io;
-use std::rc::Rc;
 use std::error::Error;
-use std::fs::{self, File};
-use std::fmt::{self, Write};
-use std::ops::{Deref, BitOr};
-use std::path::{Path, PathBuf};
-use std::collections::{BTreeSet, BTreeMap};
+use std::path::Path;
 
-use fs2::FileExt;
+use fst::map::{Map, MapBuilder, OpBuilder};
+use fst::{IntoStreamer, Streamer};
+use sdset::duo::Union as SdUnion;
+use sdset::duo::DifferenceByKey;
+use sdset::{Set, SetOperation};
 use ::rocksdb::rocksdb::Writable;
 use ::rocksdb::{rocksdb, rocksdb_options};
 use ::rocksdb::merge_operator::MergeOperands;
 
+use crate::DocIndex;
+use crate::automaton;
 use crate::rank::Document;
-use crate::data::DocIdsBuilder;
-use crate::{DocIndex, DocumentId};
 use crate::index::schema::Schema;
 use crate::index::update::Update;
+use crate::tokenizer::TokenizerBuilder;
 use crate::index::identifier::Identifier;
-use crate::blob::{PositiveBlobBuilder, PositiveBlob, BlobInfo, Sign, Blob, blobs_from_blob_infos};
-use crate::tokenizer::{TokenizerBuilder, DefaultBuilder, Tokenizer};
 use crate::rank::{criterion, Config, RankedStream};
-use crate::automaton;
+use crate::data::{DocIds, DocIndexes, RawDocIndexesBuilder};
+use crate::blob::{PositiveBlob, NegativeBlob, Blob};
 
-fn simple_vec_append(key: &[u8], value: Option<&[u8]>, operands: &mut MergeOperands) -> Vec<u8> {
-    let mut output = Vec::new();
-    for bytes in operands.chain(value) {
-        output.extend_from_slice(bytes);
+fn union_positives(a: &PositiveBlob, b: &PositiveBlob) -> Result<PositiveBlob, Box<Error>> {
+    let (a_map, a_indexes) = (a.as_map(), a.as_indexes());
+    let (b_map, b_indexes) = (b.as_map(), b.as_indexes());
+
+    let mut map_builder = MapBuilder::memory();
+    let mut indexes_builder = RawDocIndexesBuilder::memory();
+
+    let op_builder = OpBuilder::new().add(a_map).add(b_map);
+    let mut stream = op_builder.union();
+    let mut i = 0;
+
+    while let Some((key, indexed)) = stream.next() {
+        let doc_idx: Vec<DocIndex> = match indexed {
+            [a, b] => {
+                let a_doc_idx = a_indexes.get(a.value).expect("BUG: could not find document indexes");
+                let b_doc_idx = b_indexes.get(b.value).expect("BUG: could not find document indexes");
+
+                let a_doc_idx = Set::new_unchecked(a_doc_idx);
+                let b_doc_idx = Set::new_unchecked(b_doc_idx);
+
+                let sd_union = SdUnion::new(a_doc_idx, b_doc_idx);
+                sd_union.into_set_buf().into_vec()
+            },
+            [a] => {
+                let indexes = if a.index == 0 { a_indexes } else { b_indexes };
+                let doc_idx = indexes.get(a.value).expect("BUG: could not find document indexes");
+                doc_idx.to_vec()
+            },
+            _ => unreachable!(),
+        };
+
+        if !doc_idx.is_empty() {
+            map_builder.insert(key, i)?;
+            indexes_builder.insert(&doc_idx)?;
+            i += 1;
+        }
     }
-    output
+
+    let inner = map_builder.into_inner()?;
+    let map = Map::from_bytes(inner)?;
+
+    let inner = indexes_builder.into_inner()?;
+    let indexes = DocIndexes::from_bytes(inner)?;
+
+    Ok(PositiveBlob::from_raw(map, indexes))
 }
 
-pub struct MergeBuilder {
-    blobs: Vec<Blob>,
+fn union_negatives(a: &NegativeBlob, b: &NegativeBlob) -> NegativeBlob {
+    let a_doc_ids = a.as_ids().doc_ids();
+    let b_doc_ids = b.as_ids().doc_ids();
+
+    let a_doc_ids = Set::new_unchecked(a_doc_ids);
+    let b_doc_ids = Set::new_unchecked(b_doc_ids);
+
+    let sd_union = SdUnion::new(a_doc_ids, b_doc_ids);
+    let doc_ids = sd_union.into_set_buf().into_vec();
+    let doc_ids = DocIds::from_document_ids(doc_ids);
+
+    NegativeBlob::from_raw(doc_ids)
 }
 
-impl MergeBuilder {
-    pub fn new() -> MergeBuilder {
-        MergeBuilder { blobs: Vec::new() }
+fn merge_positive_negative(pos: &PositiveBlob, neg: &NegativeBlob) -> Result<PositiveBlob, Box<Error>> {
+    let (map, indexes) = (pos.as_map(), pos.as_indexes());
+    let doc_ids = neg.as_ids().doc_ids();
+
+    let doc_ids = Set::new_unchecked(doc_ids);
+
+    let mut map_builder = MapBuilder::memory();
+    let mut indexes_builder = RawDocIndexesBuilder::memory();
+
+    let mut stream = map.into_stream();
+    let mut i = 0;
+
+    while let Some((key, index)) = stream.next() {
+        let doc_idx = indexes.get(index).expect("BUG: could not find document indexes");
+        let doc_idx = Set::new_unchecked(doc_idx);
+
+        let diff = DifferenceByKey::new(doc_idx, doc_ids, |&d| d.document_id, |id| *id);
+        let doc_idx: Vec<DocIndex> = diff.into_set_buf().into_vec();
+
+        map_builder.insert(key, i)?;
+        indexes_builder.insert(&doc_idx)?;
+        i += 1;
     }
 
-    pub fn push(&mut self, blob: Blob) {
-        if blob.sign() == Sign::Negative && self.blobs.is_empty() { return }
-        self.blobs.push(blob);
+    let inner = map_builder.into_inner()?;
+    let map = Map::from_bytes(inner)?;
+
+    let inner = indexes_builder.into_inner()?;
+    let indexes = DocIndexes::from_bytes(inner)?;
+
+    Ok(PositiveBlob::from_raw(map, indexes))
+}
+
+#[derive(Default)]
+struct Merge {
+    blob: PositiveBlob,
+}
+
+impl Merge {
+    fn new(blob: PositiveBlob) -> Merge {
+        Merge { blob }
     }
 
-    pub fn merge(self) -> PositiveBlob {
-        unimplemented!()
+    fn merge(&mut self, blob: Blob) {
+        self.blob = match blob {
+            Blob::Positive(blob) => union_positives(&self.blob, &blob).unwrap(),
+            Blob::Negative(blob) => merge_positive_negative(&self.blob, &blob).unwrap(),
+        };
+    }
+
+    fn build(self) -> PositiveBlob {
+        self.blob
     }
 }
 
 fn merge_indexes(key: &[u8], existing_value: Option<&[u8]>, operands: &mut MergeOperands) -> Vec<u8> {
-    if key != b"data-index" { panic!("The merge operator only allow \"data-index\" merging") }
+    if key != b"data-index" { panic!("The merge operator only supports \"data-index\" merging") }
 
-    let mut merge_builder = MergeBuilder::new();
-
-    if let Some(existing_value) = existing_value {
-        let base: PositiveBlob = bincode::deserialize(existing_value).unwrap(); // FIXME what do we do here ?
-        merge_builder.push(Blob::Positive(base));
-    }
+    let mut merge = match existing_value {
+        Some(existing_value) => {
+            let blob = bincode::deserialize(existing_value).expect("BUG: could not deserialize data-index");
+            Merge::new(blob)
+        },
+        None => Merge::default(),
+    };
 
     for bytes in operands {
-        let blob: Blob = bincode::deserialize(bytes).unwrap();
-        merge_builder.push(blob);
+        let blob = bincode::deserialize(bytes).expect("BUG: could not deserialize blobs");
+        merge.merge(blob);
     }
 
-    let blob = merge_builder.merge();
-    // blob.to_vec()
-    unimplemented!()
+    let blob = merge.build();
+    bincode::serialize(&blob).expect("BUG: could not serialize merged blob")
 }
 
 pub struct Index {
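The net effect of Merge is a left fold over the operand stream: each positive blob unions into the accumulated index, each negative blob removes its document ids from every posting list, so order matters — a negative operand only erases what was accumulated before it. A toy model with BTreeSet<u64> standing in for the blob types (the real posting lists also carry positions; this keeps only document identity):

    use std::collections::BTreeSet;

    enum Blob {
        Positive(BTreeSet<u64>),
        Negative(BTreeSet<u64>),
    }

    // Fold the operands into the existing value, as the merge operator does.
    fn compact(existing: BTreeSet<u64>, operands: Vec<Blob>) -> BTreeSet<u64> {
        operands.into_iter().fold(existing, |acc, blob| match blob {
            Blob::Positive(ids) => acc.union(&ids).copied().collect(),
            Blob::Negative(ids) => acc.difference(&ids).copied().collect(),
        })
    }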
@@ -95,7 +182,7 @@ impl Index {
         opts.create_if_missing(true);
 
         let mut cf_opts = rocksdb_options::ColumnFamilyOptions::new();
-        cf_opts.add_merge_operator("blobs order operator", simple_vec_append);
+        cf_opts.add_merge_operator("data-index merge operator", merge_indexes);
 
         let database = rocksdb::DB::open_cf(opts, &path, vec![("default", cf_opts)])?;
 
@@ -114,7 +201,7 @@ impl Index {
         opts.create_if_missing(false);
 
         let mut cf_opts = rocksdb_options::ColumnFamilyOptions::new();
-        cf_opts.add_merge_operator("blobs order operator", simple_vec_append);
+        cf_opts.add_merge_operator("data-index merge operator", merge_indexes);
 
         let database = rocksdb::DB::open_cf(opts, &path, vec![("default", cf_opts)])?;
 
@@ -150,12 +237,9 @@ impl Index {
         // this snapshot will allow consistent reads for the whole search operation
         let snapshot = self.database.snapshot();
 
-        let data_key = Identifier::data().blobs_order().build();
-        let blobs = match snapshot.get(&data_key)? {
-            Some(value) => {
-                let blob_infos = BlobInfo::read_from_slice(&value)?;
-                blobs_from_blob_infos(&blob_infos, &snapshot)?
-            },
+        let index_key = Identifier::data().index().build();
+        let map = match snapshot.get(&index_key)? {
+            Some(value) => bincode::deserialize(&value)?,
             None => Vec::new(),
         };
 
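The search path now performs a single point lookup plus one bincode decode instead of resolving a list of blob infos against separate per-blob entries. The storage contract is a plain serialize/deserialize round trip; a self-contained illustration, with Vec<u64> standing in for the serialized blob type:

    // bincode round trip (bincode 1.x API):
    fn round_trip() -> bincode::Result<()> {
        let blob: Vec<u64> = vec![1, 2, 3];
        let bytes = bincode::serialize(&blob)?;
        let decoded: Vec<u64> = bincode::deserialize(&bytes)?;
        assert_eq!(blob, decoded);
        Ok(())
    }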
@@ -166,7 +250,7 @@ impl Index {
         }
 
         let config = Config {
-            blobs: &blobs,
+            map: map,
             automatons: automatons,
             criteria: criterion::default(),
             distinct: ((), 1),
@@ -1,8 +1,6 @@
 use std::path::PathBuf;
 use std::error::Error;
 
-use ::rocksdb::rocksdb_options;
-
 use crate::blob::{BlobName, Sign};
 
 mod negative_update;
@@ -1,7 +1,6 @@
 use std::collections::BTreeMap;
 use std::path::PathBuf;
 use std::error::Error;
-use std::fmt::Write;
 
 use ::rocksdb::rocksdb_options;
 