mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-11-30 09:04:59 +08:00
Merge pull request #696 from meilisearch/reduce-document-id-size
Reduce document id size from 64bits to 32bits
This commit is contained in:
commit
e40d9e7462
1
Cargo.lock
generated
1
Cargo.lock
generated
@ -1624,7 +1624,6 @@ dependencies = [
|
|||||||
"sdset",
|
"sdset",
|
||||||
"serde",
|
"serde",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
"siphasher",
|
|
||||||
"slice-group-by",
|
"slice-group-by",
|
||||||
"structopt",
|
"structopt",
|
||||||
"tempfile",
|
"tempfile",
|
||||||
|
@ -35,7 +35,6 @@ regex = "1.3.6"
|
|||||||
sdset = "0.4.0"
|
sdset = "0.4.0"
|
||||||
serde = { version = "1.0.105", features = ["derive"] }
|
serde = { version = "1.0.105", features = ["derive"] }
|
||||||
serde_json = { version = "1.0.50", features = ["preserve_order"] }
|
serde_json = { version = "1.0.50", features = ["preserve_order"] }
|
||||||
siphasher = "0.3.2"
|
|
||||||
slice-group-by = "0.2.6"
|
slice-group-by = "0.2.6"
|
||||||
unicase = "2.6.0"
|
unicase = "2.6.0"
|
||||||
zerocopy = "0.3.0"
|
zerocopy = "0.3.0"
|
||||||
|
@ -775,12 +775,12 @@ mod tests {
|
|||||||
assert!(document.is_none());
|
assert!(document.is_none());
|
||||||
|
|
||||||
let document: Option<IgnoredAny> = index
|
let document: Option<IgnoredAny> = index
|
||||||
.document(&reader, None, DocumentId(7_900_334_843_754_999_545))
|
.document(&reader, None, DocumentId(0))
|
||||||
.unwrap();
|
.unwrap();
|
||||||
assert!(document.is_some());
|
assert!(document.is_some());
|
||||||
|
|
||||||
let document: Option<IgnoredAny> = index
|
let document: Option<IgnoredAny> = index
|
||||||
.document(&reader, None, DocumentId(8_367_468_610_878_465_872))
|
.document(&reader, None, DocumentId(1))
|
||||||
.unwrap();
|
.unwrap();
|
||||||
assert!(document.is_some());
|
assert!(document.is_some());
|
||||||
}
|
}
|
||||||
@ -855,12 +855,12 @@ mod tests {
|
|||||||
assert!(document.is_none());
|
assert!(document.is_none());
|
||||||
|
|
||||||
let document: Option<IgnoredAny> = index
|
let document: Option<IgnoredAny> = index
|
||||||
.document(&reader, None, DocumentId(7_900_334_843_754_999_545))
|
.document(&reader, None, DocumentId(0))
|
||||||
.unwrap();
|
.unwrap();
|
||||||
assert!(document.is_some());
|
assert!(document.is_some());
|
||||||
|
|
||||||
let document: Option<IgnoredAny> = index
|
let document: Option<IgnoredAny> = index
|
||||||
.document(&reader, None, DocumentId(8_367_468_610_878_465_872))
|
.document(&reader, None, DocumentId(1))
|
||||||
.unwrap();
|
.unwrap();
|
||||||
assert!(document.is_some());
|
assert!(document.is_some());
|
||||||
|
|
||||||
@ -897,7 +897,7 @@ mod tests {
|
|||||||
|
|
||||||
let reader = db.main_read_txn().unwrap();
|
let reader = db.main_read_txn().unwrap();
|
||||||
let document: Option<serde_json::Value> = index
|
let document: Option<serde_json::Value> = index
|
||||||
.document(&reader, None, DocumentId(7_900_334_843_754_999_545))
|
.document(&reader, None, DocumentId(0))
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
let new_doc1 = serde_json::json!({
|
let new_doc1 = serde_json::json!({
|
||||||
@ -908,7 +908,7 @@ mod tests {
|
|||||||
assert_eq!(document, Some(new_doc1));
|
assert_eq!(document, Some(new_doc1));
|
||||||
|
|
||||||
let document: Option<serde_json::Value> = index
|
let document: Option<serde_json::Value> = index
|
||||||
.document(&reader, None, DocumentId(8_367_468_610_878_465_872))
|
.document(&reader, None, DocumentId(1))
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
let new_doc2 = serde_json::json!({
|
let new_doc2 = serde_json::json!({
|
||||||
@ -1080,14 +1080,14 @@ mod tests {
|
|||||||
assert_matches!(
|
assert_matches!(
|
||||||
iter.next(),
|
iter.next(),
|
||||||
Some(Document {
|
Some(Document {
|
||||||
id: DocumentId(7_900_334_843_754_999_545),
|
id: DocumentId(0),
|
||||||
..
|
..
|
||||||
})
|
})
|
||||||
);
|
);
|
||||||
assert_matches!(
|
assert_matches!(
|
||||||
iter.next(),
|
iter.next(),
|
||||||
Some(Document {
|
Some(Document {
|
||||||
id: DocumentId(8_367_468_610_878_465_872),
|
id: DocumentId(1),
|
||||||
..
|
..
|
||||||
})
|
})
|
||||||
);
|
);
|
||||||
|
@ -191,6 +191,6 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn docindex_mem_size() {
|
fn docindex_mem_size() {
|
||||||
assert_eq!(mem::size_of::<DocIndex>(), 16);
|
assert_eq!(mem::size_of::<DocIndex>(), 12);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -228,7 +228,7 @@ mod tests {
|
|||||||
builder.into_inner().and_then(Set::from_bytes).unwrap()
|
builder.into_inner().and_then(Set::from_bytes).unwrap()
|
||||||
}
|
}
|
||||||
|
|
||||||
const fn doc_index(document_id: u64, word_index: u16) -> DocIndex {
|
const fn doc_index(document_id: u32, word_index: u16) -> DocIndex {
|
||||||
DocIndex {
|
DocIndex {
|
||||||
document_id: DocumentId(document_id),
|
document_id: DocumentId(document_id),
|
||||||
attribute: 0,
|
attribute: 0,
|
||||||
@ -238,7 +238,7 @@ mod tests {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const fn doc_char_index(document_id: u64, word_index: u16, char_index: u16) -> DocIndex {
|
const fn doc_char_index(document_id: u32, word_index: u16, char_index: u16) -> DocIndex {
|
||||||
DocIndex {
|
DocIndex {
|
||||||
document_id: DocumentId(document_id),
|
document_id: DocumentId(document_id),
|
||||||
attribute: 0,
|
attribute: 0,
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
use super::BEU64;
|
use super::BEU32;
|
||||||
use crate::database::MainT;
|
use crate::database::MainT;
|
||||||
use crate::DocumentId;
|
use crate::DocumentId;
|
||||||
use heed::types::{ByteSlice, OwnedType};
|
use heed::types::{ByteSlice, OwnedType};
|
||||||
@ -7,7 +7,7 @@ use std::sync::Arc;
|
|||||||
|
|
||||||
#[derive(Copy, Clone)]
|
#[derive(Copy, Clone)]
|
||||||
pub struct DocsWords {
|
pub struct DocsWords {
|
||||||
pub(crate) docs_words: heed::Database<OwnedType<BEU64>, ByteSlice>,
|
pub(crate) docs_words: heed::Database<OwnedType<BEU32>, ByteSlice>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl DocsWords {
|
impl DocsWords {
|
||||||
@ -17,13 +17,13 @@ impl DocsWords {
|
|||||||
document_id: DocumentId,
|
document_id: DocumentId,
|
||||||
words: &fst::Set,
|
words: &fst::Set,
|
||||||
) -> ZResult<()> {
|
) -> ZResult<()> {
|
||||||
let document_id = BEU64::new(document_id.0);
|
let document_id = BEU32::new(document_id.0);
|
||||||
let bytes = words.as_fst().as_bytes();
|
let bytes = words.as_fst().as_bytes();
|
||||||
self.docs_words.put(writer, &document_id, bytes)
|
self.docs_words.put(writer, &document_id, bytes)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn del_doc_words(self, writer: &mut heed::RwTxn<MainT>, document_id: DocumentId) -> ZResult<bool> {
|
pub fn del_doc_words(self, writer: &mut heed::RwTxn<MainT>, document_id: DocumentId) -> ZResult<bool> {
|
||||||
let document_id = BEU64::new(document_id.0);
|
let document_id = BEU32::new(document_id.0);
|
||||||
self.docs_words.delete(writer, &document_id)
|
self.docs_words.delete(writer, &document_id)
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -36,7 +36,7 @@ impl DocsWords {
|
|||||||
reader: &heed::RoTxn<MainT>,
|
reader: &heed::RoTxn<MainT>,
|
||||||
document_id: DocumentId,
|
document_id: DocumentId,
|
||||||
) -> ZResult<Option<fst::Set>> {
|
) -> ZResult<Option<fst::Set>> {
|
||||||
let document_id = BEU64::new(document_id.0);
|
let document_id = BEU32::new(document_id.0);
|
||||||
match self.docs_words.get(reader, &document_id)? {
|
match self.docs_words.get(reader, &document_id)? {
|
||||||
Some(bytes) => {
|
Some(bytes) => {
|
||||||
let len = bytes.len();
|
let len = bytes.len();
|
||||||
|
75
meilisearch-core/src/store/documents_ids.rs
Normal file
75
meilisearch-core/src/store/documents_ids.rs
Normal file
@ -0,0 +1,75 @@
|
|||||||
|
use std::borrow::Cow;
|
||||||
|
|
||||||
|
use heed::{BytesDecode, BytesEncode};
|
||||||
|
use sdset::Set;
|
||||||
|
|
||||||
|
use crate::DocumentId;
|
||||||
|
use super::cow_set::CowSet;
|
||||||
|
|
||||||
|
pub struct DocumentsIds;
|
||||||
|
|
||||||
|
impl BytesEncode<'_> for DocumentsIds {
|
||||||
|
type EItem = Set<DocumentId>;
|
||||||
|
|
||||||
|
fn bytes_encode(item: &Self::EItem) -> Option<Cow<[u8]>> {
|
||||||
|
CowSet::bytes_encode(item)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> BytesDecode<'a> for DocumentsIds {
|
||||||
|
type DItem = Cow<'a, Set<DocumentId>>;
|
||||||
|
|
||||||
|
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||||
|
CowSet::bytes_decode(bytes)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct DiscoverIds<'a> {
|
||||||
|
ids_iter: std::slice::Iter<'a, DocumentId>,
|
||||||
|
left_id: Option<u32>,
|
||||||
|
right_id: Option<u32>,
|
||||||
|
available_range: std::ops::Range<u32>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl DiscoverIds<'_> {
|
||||||
|
pub fn new(ids: &Set<DocumentId>) -> DiscoverIds {
|
||||||
|
let mut ids_iter = ids.iter();
|
||||||
|
let right_id = ids_iter.next().map(|id| id.0);
|
||||||
|
let available_range = 0..right_id.unwrap_or(u32::max_value());
|
||||||
|
DiscoverIds { ids_iter, left_id: None, right_id, available_range }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Iterator for DiscoverIds<'_> {
|
||||||
|
type Item = DocumentId;
|
||||||
|
|
||||||
|
fn next(&mut self) -> Option<Self::Item> {
|
||||||
|
loop {
|
||||||
|
match self.available_range.next() {
|
||||||
|
// The available range gives us a new id, we return it.
|
||||||
|
Some(id) => return Some(DocumentId(id)),
|
||||||
|
// The available range is exhausted, we need to find the next one.
|
||||||
|
None if self.available_range.end == u32::max_value() => return None,
|
||||||
|
None => loop {
|
||||||
|
self.left_id = self.right_id.take();
|
||||||
|
self.right_id = self.ids_iter.next().map(|id| id.0);
|
||||||
|
match (self.left_id, self.right_id) {
|
||||||
|
// We found a gap in the used ids, we can yield all ids
|
||||||
|
// until the end of the gap
|
||||||
|
(Some(l), Some(r)) => if l.saturating_add(1) != r {
|
||||||
|
self.available_range = (l + 1)..r;
|
||||||
|
break;
|
||||||
|
},
|
||||||
|
// The last used id has been reached, we can use all ids
|
||||||
|
// until u32 MAX
|
||||||
|
(Some(l), None) => {
|
||||||
|
self.available_range = l.saturating_add(1)..u32::max_value();
|
||||||
|
break;
|
||||||
|
},
|
||||||
|
_ => (),
|
||||||
|
}
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
@ -3,28 +3,31 @@ use std::sync::Arc;
|
|||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
|
|
||||||
use chrono::{DateTime, Utc};
|
use chrono::{DateTime, Utc};
|
||||||
use heed::types::{ByteSlice, OwnedType, SerdeBincode, Str};
|
|
||||||
use heed::Result as ZResult;
|
use heed::Result as ZResult;
|
||||||
|
use heed::types::{ByteSlice, OwnedType, SerdeBincode, Str};
|
||||||
use meilisearch_schema::{FieldId, Schema};
|
use meilisearch_schema::{FieldId, Schema};
|
||||||
|
use meilisearch_types::DocumentId;
|
||||||
use sdset::Set;
|
use sdset::Set;
|
||||||
|
|
||||||
use crate::database::MainT;
|
use crate::database::MainT;
|
||||||
use crate::RankedMap;
|
use crate::RankedMap;
|
||||||
use crate::settings::RankingRule;
|
use crate::settings::RankingRule;
|
||||||
use super::cow_set::CowSet;
|
use super::{CowSet, DocumentsIds};
|
||||||
|
|
||||||
|
const ATTRIBUTES_FOR_FACETING_KEY: &str = "attributes-for-faceting";
|
||||||
const CREATED_AT_KEY: &str = "created-at";
|
const CREATED_AT_KEY: &str = "created-at";
|
||||||
const ATTRIBUTES_FOR_FACETING: &str = "attributes-for-faceting";
|
|
||||||
const RANKING_RULES_KEY: &str = "ranking-rules";
|
|
||||||
const DISTINCT_ATTRIBUTE_KEY: &str = "distinct-attribute";
|
|
||||||
const STOP_WORDS_KEY: &str = "stop-words";
|
|
||||||
const SYNONYMS_KEY: &str = "synonyms";
|
|
||||||
const CUSTOMS_KEY: &str = "customs";
|
const CUSTOMS_KEY: &str = "customs";
|
||||||
|
const DISTINCT_ATTRIBUTE_KEY: &str = "distinct-attribute";
|
||||||
|
const EXTERNAL_DOCIDS_KEY: &str = "external-docids";
|
||||||
const FIELDS_FREQUENCY_KEY: &str = "fields-frequency";
|
const FIELDS_FREQUENCY_KEY: &str = "fields-frequency";
|
||||||
|
const INTERNAL_DOCIDS_KEY: &str = "internal-docids";
|
||||||
const NAME_KEY: &str = "name";
|
const NAME_KEY: &str = "name";
|
||||||
const NUMBER_OF_DOCUMENTS_KEY: &str = "number-of-documents";
|
const NUMBER_OF_DOCUMENTS_KEY: &str = "number-of-documents";
|
||||||
const RANKED_MAP_KEY: &str = "ranked-map";
|
const RANKED_MAP_KEY: &str = "ranked-map";
|
||||||
|
const RANKING_RULES_KEY: &str = "ranking-rules";
|
||||||
const SCHEMA_KEY: &str = "schema";
|
const SCHEMA_KEY: &str = "schema";
|
||||||
|
const STOP_WORDS_KEY: &str = "stop-words";
|
||||||
|
const SYNONYMS_KEY: &str = "synonyms";
|
||||||
const UPDATED_AT_KEY: &str = "updated-at";
|
const UPDATED_AT_KEY: &str = "updated-at";
|
||||||
const WORDS_KEY: &str = "words";
|
const WORDS_KEY: &str = "words";
|
||||||
|
|
||||||
@ -71,9 +74,90 @@ impl Main {
|
|||||||
self.main.get::<_, Str, SerdeDatetime>(reader, UPDATED_AT_KEY)
|
self.main.get::<_, Str, SerdeDatetime>(reader, UPDATED_AT_KEY)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn put_internal_docids(self, writer: &mut heed::RwTxn<MainT>, ids: &sdset::Set<DocumentId>) -> ZResult<()> {
|
||||||
|
self.main.put::<_, Str, DocumentsIds>(writer, INTERNAL_DOCIDS_KEY, ids)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn internal_docids<'txn>(self, reader: &'txn heed::RoTxn<MainT>) -> ZResult<Cow<'txn, sdset::Set<DocumentId>>> {
|
||||||
|
match self.main.get::<_, Str, DocumentsIds>(reader, INTERNAL_DOCIDS_KEY)? {
|
||||||
|
Some(ids) => Ok(ids),
|
||||||
|
None => Ok(Cow::default()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn merge_internal_docids(self, writer: &mut heed::RwTxn<MainT>, new_ids: &sdset::Set<DocumentId>) -> ZResult<()> {
|
||||||
|
use sdset::SetOperation;
|
||||||
|
|
||||||
|
// We do a union of the old and new internal ids.
|
||||||
|
let internal_docids = self.internal_docids(writer)?;
|
||||||
|
let internal_docids = sdset::duo::Union::new(&internal_docids, new_ids).into_set_buf();
|
||||||
|
self.put_internal_docids(writer, &internal_docids)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn remove_internal_docids(self, writer: &mut heed::RwTxn<MainT>, ids: &sdset::Set<DocumentId>) -> ZResult<()> {
|
||||||
|
use sdset::SetOperation;
|
||||||
|
|
||||||
|
// We do a difference of the old and new internal ids.
|
||||||
|
let internal_docids = self.internal_docids(writer)?;
|
||||||
|
let internal_docids = sdset::duo::Difference::new(&internal_docids, ids).into_set_buf();
|
||||||
|
self.put_internal_docids(writer, &internal_docids)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn put_external_docids(self, writer: &mut heed::RwTxn<MainT>, ids: &fst::Map) -> ZResult<()> {
|
||||||
|
self.main.put::<_, Str, ByteSlice>(writer, EXTERNAL_DOCIDS_KEY, ids.as_fst().as_bytes())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn merge_external_docids(self, writer: &mut heed::RwTxn<MainT>, new_docids: &fst::Map) -> ZResult<()> {
|
||||||
|
use fst::{Streamer, IntoStreamer};
|
||||||
|
|
||||||
|
// Do a union of the old and the new set of external docids.
|
||||||
|
let external_docids = self.external_docids(writer)?;
|
||||||
|
let mut op = external_docids.op().add(new_docids.into_stream()).r#union();
|
||||||
|
let mut build = fst::MapBuilder::memory();
|
||||||
|
while let Some((docid, values)) = op.next() {
|
||||||
|
build.insert(docid, values[0].value).unwrap();
|
||||||
|
}
|
||||||
|
let external_docids = build.into_inner().unwrap();
|
||||||
|
|
||||||
|
// TODO prefer using self.put_external_docids
|
||||||
|
self.main.put::<_, Str, ByteSlice>(writer, EXTERNAL_DOCIDS_KEY, external_docids.as_slice())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn remove_external_docids(self, writer: &mut heed::RwTxn<MainT>, ids: &fst::Map) -> ZResult<()> {
|
||||||
|
use fst::{Streamer, IntoStreamer};
|
||||||
|
|
||||||
|
// Do a difference: keep the old external docids that are not in the set to remove.
|
||||||
|
let external_docids = self.external_docids(writer)?;
|
||||||
|
let mut op = external_docids.op().add(ids.into_stream()).difference();
|
||||||
|
let mut build = fst::MapBuilder::memory();
|
||||||
|
while let Some((docid, values)) = op.next() {
|
||||||
|
build.insert(docid, values[0].value).unwrap();
|
||||||
|
}
|
||||||
|
let external_docids = build.into_inner().unwrap();
|
||||||
|
|
||||||
|
// TODO prefer using self.put_external_docids
|
||||||
|
self.main.put::<_, Str, ByteSlice>(writer, EXTERNAL_DOCIDS_KEY, external_docids.as_slice())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn external_docids(self, reader: &heed::RoTxn<MainT>) -> ZResult<fst::Map> {
|
||||||
|
match self.main.get::<_, Str, ByteSlice>(reader, EXTERNAL_DOCIDS_KEY)? {
|
||||||
|
Some(bytes) => {
|
||||||
|
let len = bytes.len();
|
||||||
|
let bytes = Arc::new(bytes.to_owned());
|
||||||
|
let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap();
|
||||||
|
Ok(fst::Map::from(fst))
|
||||||
|
},
|
||||||
|
None => Ok(fst::Map::default()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn external_to_internal_docid(self, reader: &heed::RoTxn<MainT>, external_docid: &str) -> ZResult<Option<DocumentId>> {
|
||||||
|
let external_ids = self.external_docids(reader)?;
|
||||||
|
Ok(external_ids.get(external_docid).map(|id| DocumentId(id as u32)))
|
||||||
|
}
|
||||||
|
|
||||||
pub fn put_words_fst(self, writer: &mut heed::RwTxn<MainT>, fst: &fst::Set) -> ZResult<()> {
|
pub fn put_words_fst(self, writer: &mut heed::RwTxn<MainT>, fst: &fst::Set) -> ZResult<()> {
|
||||||
let bytes = fst.as_fst().as_bytes();
|
self.main.put::<_, Str, ByteSlice>(writer, WORDS_KEY, fst.as_fst().as_bytes())
|
||||||
self.main.put::<_, Str, ByteSlice>(writer, WORDS_KEY, bytes)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub unsafe fn static_words_fst(self, reader: &heed::RoTxn<MainT>) -> ZResult<Option<fst::Set>> {
|
pub unsafe fn static_words_fst(self, reader: &heed::RoTxn<MainT>) -> ZResult<Option<fst::Set>> {
|
||||||
@ -82,7 +166,7 @@ impl Main {
|
|||||||
let bytes: &'static [u8] = std::mem::transmute(bytes);
|
let bytes: &'static [u8] = std::mem::transmute(bytes);
|
||||||
let set = fst::Set::from_static_slice(bytes).unwrap();
|
let set = fst::Set::from_static_slice(bytes).unwrap();
|
||||||
Ok(Some(set))
|
Ok(Some(set))
|
||||||
}
|
},
|
||||||
None => Ok(None),
|
None => Ok(None),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -94,7 +178,7 @@ impl Main {
|
|||||||
let bytes = Arc::new(bytes.to_owned());
|
let bytes = Arc::new(bytes.to_owned());
|
||||||
let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap();
|
let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap();
|
||||||
Ok(Some(fst::Set::from(fst)))
|
Ok(Some(fst::Set::from(fst)))
|
||||||
}
|
},
|
||||||
None => Ok(None),
|
None => Ok(None),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -193,15 +277,15 @@ impl Main {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub fn attributes_for_faceting<'txn>(&self, reader: &'txn heed::RoTxn<MainT>) -> ZResult<Option<Cow<'txn, Set<FieldId>>>> {
|
pub fn attributes_for_faceting<'txn>(&self, reader: &'txn heed::RoTxn<MainT>) -> ZResult<Option<Cow<'txn, Set<FieldId>>>> {
|
||||||
self.main.get::<_, Str, CowSet<FieldId>>(reader, ATTRIBUTES_FOR_FACETING)
|
self.main.get::<_, Str, CowSet<FieldId>>(reader, ATTRIBUTES_FOR_FACETING_KEY)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn put_attributes_for_faceting(self, writer: &mut heed::RwTxn<MainT>, attributes: &Set<FieldId>) -> ZResult<()> {
|
pub fn put_attributes_for_faceting(self, writer: &mut heed::RwTxn<MainT>, attributes: &Set<FieldId>) -> ZResult<()> {
|
||||||
self.main.put::<_, Str, CowSet<FieldId>>(writer, ATTRIBUTES_FOR_FACETING, attributes)
|
self.main.put::<_, Str, CowSet<FieldId>>(writer, ATTRIBUTES_FOR_FACETING_KEY, attributes)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn delete_attributes_for_faceting(self, writer: &mut heed::RwTxn<MainT>) -> ZResult<bool> {
|
pub fn delete_attributes_for_faceting(self, writer: &mut heed::RwTxn<MainT>) -> ZResult<bool> {
|
||||||
self.main.delete::<_, Str>(writer, ATTRIBUTES_FOR_FACETING)
|
self.main.delete::<_, Str>(writer, ATTRIBUTES_FOR_FACETING_KEY)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn ranking_rules(&self, reader: &heed::RoTxn<MainT>) -> ZResult<Option<Vec<RankingRule>>> {
|
pub fn ranking_rules(&self, reader: &heed::RoTxn<MainT>) -> ZResult<Option<Vec<RankingRule>>> {
|
||||||
|
@ -1,26 +1,27 @@
|
|||||||
mod cow_set;
|
mod cow_set;
|
||||||
mod docs_words;
|
mod docs_words;
|
||||||
mod prefix_documents_cache;
|
mod documents_ids;
|
||||||
mod prefix_postings_lists_cache;
|
|
||||||
mod documents_fields;
|
mod documents_fields;
|
||||||
mod documents_fields_counts;
|
mod documents_fields_counts;
|
||||||
|
mod facets;
|
||||||
mod main;
|
mod main;
|
||||||
mod postings_lists;
|
mod postings_lists;
|
||||||
|
mod prefix_documents_cache;
|
||||||
|
mod prefix_postings_lists_cache;
|
||||||
mod synonyms;
|
mod synonyms;
|
||||||
mod updates;
|
mod updates;
|
||||||
mod updates_results;
|
mod updates_results;
|
||||||
mod facets;
|
|
||||||
|
|
||||||
|
pub use self::cow_set::CowSet;
|
||||||
pub use self::docs_words::DocsWords;
|
pub use self::docs_words::DocsWords;
|
||||||
pub use self::facets::Facets;
|
|
||||||
pub use self::prefix_documents_cache::PrefixDocumentsCache;
|
|
||||||
pub use self::prefix_postings_lists_cache::PrefixPostingsListsCache;
|
|
||||||
pub use self::documents_fields::{DocumentFieldsIter, DocumentsFields};
|
pub use self::documents_fields::{DocumentFieldsIter, DocumentsFields};
|
||||||
pub use self::documents_fields_counts::{
|
pub use self::documents_fields_counts::{DocumentFieldsCountsIter, DocumentsFieldsCounts, DocumentsIdsIter};
|
||||||
DocumentFieldsCountsIter, DocumentsFieldsCounts, DocumentsIdsIter,
|
pub use self::documents_ids::{DocumentsIds, DiscoverIds};
|
||||||
};
|
pub use self::facets::Facets;
|
||||||
pub use self::main::Main;
|
pub use self::main::Main;
|
||||||
pub use self::postings_lists::PostingsLists;
|
pub use self::postings_lists::PostingsLists;
|
||||||
|
pub use self::prefix_documents_cache::PrefixDocumentsCache;
|
||||||
|
pub use self::prefix_postings_lists_cache::PrefixPostingsListsCache;
|
||||||
pub use self::synonyms::Synonyms;
|
pub use self::synonyms::Synonyms;
|
||||||
pub use self::updates::Updates;
|
pub use self::updates::Updates;
|
||||||
pub use self::updates_results::UpdatesResults;
|
pub use self::updates_results::UpdatesResults;
|
||||||
@ -44,20 +45,21 @@ use crate::serde::Deserializer;
|
|||||||
use crate::settings::SettingsUpdate;
|
use crate::settings::SettingsUpdate;
|
||||||
use crate::{query_builder::QueryBuilder, update, DocIndex, DocumentId, Error, MResult};
|
use crate::{query_builder::QueryBuilder, update, DocIndex, DocumentId, Error, MResult};
|
||||||
|
|
||||||
|
type BEU32 = zerocopy::U32<byteorder::BigEndian>;
|
||||||
type BEU64 = zerocopy::U64<byteorder::BigEndian>;
|
type BEU64 = zerocopy::U64<byteorder::BigEndian>;
|
||||||
pub type BEU16 = zerocopy::U16<byteorder::BigEndian>;
|
pub type BEU16 = zerocopy::U16<byteorder::BigEndian>;
|
||||||
|
|
||||||
#[derive(Debug, Copy, Clone, AsBytes, FromBytes)]
|
#[derive(Debug, Copy, Clone, AsBytes, FromBytes)]
|
||||||
#[repr(C)]
|
#[repr(C)]
|
||||||
pub struct DocumentFieldIndexedKey {
|
pub struct DocumentFieldIndexedKey {
|
||||||
docid: BEU64,
|
docid: BEU32,
|
||||||
indexed_pos: BEU16,
|
indexed_pos: BEU16,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl DocumentFieldIndexedKey {
|
impl DocumentFieldIndexedKey {
|
||||||
fn new(docid: DocumentId, indexed_pos: IndexedPos) -> DocumentFieldIndexedKey {
|
fn new(docid: DocumentId, indexed_pos: IndexedPos) -> DocumentFieldIndexedKey {
|
||||||
DocumentFieldIndexedKey {
|
DocumentFieldIndexedKey {
|
||||||
docid: BEU64::new(docid.0),
|
docid: BEU32::new(docid.0),
|
||||||
indexed_pos: BEU16::new(indexed_pos.0),
|
indexed_pos: BEU16::new(indexed_pos.0),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -66,14 +68,14 @@ impl DocumentFieldIndexedKey {
|
|||||||
#[derive(Debug, Copy, Clone, AsBytes, FromBytes)]
|
#[derive(Debug, Copy, Clone, AsBytes, FromBytes)]
|
||||||
#[repr(C)]
|
#[repr(C)]
|
||||||
pub struct DocumentFieldStoredKey {
|
pub struct DocumentFieldStoredKey {
|
||||||
docid: BEU64,
|
docid: BEU32,
|
||||||
field_id: BEU16,
|
field_id: BEU16,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl DocumentFieldStoredKey {
|
impl DocumentFieldStoredKey {
|
||||||
fn new(docid: DocumentId, field_id: FieldId) -> DocumentFieldStoredKey {
|
fn new(docid: DocumentId, field_id: FieldId) -> DocumentFieldStoredKey {
|
||||||
DocumentFieldStoredKey {
|
DocumentFieldStoredKey {
|
||||||
docid: BEU64::new(docid.0),
|
docid: BEU32::new(docid.0),
|
||||||
field_id: BEU16::new(field_id.0),
|
field_id: BEU16::new(field_id.0),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -97,7 +99,7 @@ impl<'a> BytesEncode<'a> for PostingsCodec {
|
|||||||
|
|
||||||
let mut buffer = Vec::with_capacity(u64_size + docids_size + matches_size);
|
let mut buffer = Vec::with_capacity(u64_size + docids_size + matches_size);
|
||||||
|
|
||||||
let docids_len = item.docids.len();
|
let docids_len = item.docids.len() as u64;
|
||||||
buffer.extend_from_slice(&docids_len.to_be_bytes());
|
buffer.extend_from_slice(&docids_len.to_be_bytes());
|
||||||
buffer.extend_from_slice(item.docids.as_bytes());
|
buffer.extend_from_slice(item.docids.as_bytes());
|
||||||
buffer.extend_from_slice(item.matches.as_bytes());
|
buffer.extend_from_slice(item.matches.as_bytes());
|
||||||
|
@ -4,7 +4,7 @@ use heed::types::{OwnedType, CowSlice};
|
|||||||
use heed::Result as ZResult;
|
use heed::Result as ZResult;
|
||||||
use zerocopy::{AsBytes, FromBytes};
|
use zerocopy::{AsBytes, FromBytes};
|
||||||
|
|
||||||
use super::BEU64;
|
use super::{BEU64, BEU32};
|
||||||
use crate::{DocumentId, Highlight};
|
use crate::{DocumentId, Highlight};
|
||||||
use crate::database::MainT;
|
use crate::database::MainT;
|
||||||
|
|
||||||
@ -13,15 +13,15 @@ use crate::database::MainT;
|
|||||||
pub struct PrefixKey {
|
pub struct PrefixKey {
|
||||||
prefix: [u8; 4],
|
prefix: [u8; 4],
|
||||||
index: BEU64,
|
index: BEU64,
|
||||||
docid: BEU64,
|
docid: BEU32,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl PrefixKey {
|
impl PrefixKey {
|
||||||
pub fn new(prefix: [u8; 4], index: u64, docid: u64) -> PrefixKey {
|
pub fn new(prefix: [u8; 4], index: u64, docid: u32) -> PrefixKey {
|
||||||
PrefixKey {
|
PrefixKey {
|
||||||
prefix,
|
prefix,
|
||||||
index: BEU64::new(index),
|
index: BEU64::new(index),
|
||||||
docid: BEU64::new(docid),
|
docid: BEU32::new(docid),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -54,7 +54,7 @@ impl PrefixDocumentsCache {
|
|||||||
prefix: [u8; 4],
|
prefix: [u8; 4],
|
||||||
) -> ZResult<PrefixDocumentsIter<'txn>> {
|
) -> ZResult<PrefixDocumentsIter<'txn>> {
|
||||||
let start = PrefixKey::new(prefix, 0, 0);
|
let start = PrefixKey::new(prefix, 0, 0);
|
||||||
let end = PrefixKey::new(prefix, u64::max_value(), u64::max_value());
|
let end = PrefixKey::new(prefix, u64::max_value(), u32::max_value());
|
||||||
let iter = self.prefix_documents_cache.range(reader, &(start..=end))?;
|
let iter = self.prefix_documents_cache.range(reader, &(start..=end))?;
|
||||||
Ok(PrefixDocumentsIter { iter })
|
Ok(PrefixDocumentsIter { iter })
|
||||||
}
|
}
|
||||||
|
@ -7,6 +7,8 @@ pub fn apply_clear_all(
|
|||||||
index: &store::Index,
|
index: &store::Index,
|
||||||
) -> MResult<()> {
|
) -> MResult<()> {
|
||||||
index.main.put_words_fst(writer, &fst::Set::default())?;
|
index.main.put_words_fst(writer, &fst::Set::default())?;
|
||||||
|
index.main.put_external_docids(writer, &fst::Map::default())?;
|
||||||
|
index.main.put_internal_docids(writer, &sdset::SetBuf::default())?;
|
||||||
index.main.put_ranked_map(writer, &RankedMap::default())?;
|
index.main.put_ranked_map(writer, &RankedMap::default())?;
|
||||||
index.main.put_number_of_documents(writer, |_| 0)?;
|
index.main.put_number_of_documents(writer, |_| 0)?;
|
||||||
index.documents_fields.clear(writer)?;
|
index.documents_fields.clear(writer)?;
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
use std::collections::HashMap;
|
use std::collections::{HashMap, BTreeMap};
|
||||||
|
|
||||||
use fst::{set::OpBuilder, SetBuilder};
|
use fst::{set::OpBuilder, SetBuilder};
|
||||||
use indexmap::IndexMap;
|
use indexmap::IndexMap;
|
||||||
@ -13,7 +13,7 @@ use crate::database::{UpdateEvent, UpdateEventsEmitter};
|
|||||||
use crate::facets;
|
use crate::facets;
|
||||||
use crate::raw_indexer::RawIndexer;
|
use crate::raw_indexer::RawIndexer;
|
||||||
use crate::serde::Deserializer;
|
use crate::serde::Deserializer;
|
||||||
use crate::store::{self, DocumentsFields, DocumentsFieldsCounts};
|
use crate::store::{self, DocumentsFields, DocumentsFieldsCounts, DiscoverIds};
|
||||||
use crate::update::helpers::{index_value, value_to_number, extract_document_id};
|
use crate::update::helpers::{index_value, value_to_number, extract_document_id};
|
||||||
use crate::update::{apply_documents_deletion, compute_short_prefixes, next_update_id, Update};
|
use crate::update::{apply_documents_deletion, compute_short_prefixes, next_update_id, Update};
|
||||||
use crate::{Error, MResult, RankedMap};
|
use crate::{Error, MResult, RankedMap};
|
||||||
@ -148,23 +148,40 @@ pub fn apply_addition<'a, 'b>(
|
|||||||
index: &store::Index,
|
index: &store::Index,
|
||||||
new_documents: Vec<IndexMap<String, Value>>,
|
new_documents: Vec<IndexMap<String, Value>>,
|
||||||
partial: bool
|
partial: bool
|
||||||
) -> MResult<()> {
|
) -> MResult<()>
|
||||||
let mut documents_additions = HashMap::new();
|
{
|
||||||
|
|
||||||
let mut schema = match index.main.schema(writer)? {
|
let mut schema = match index.main.schema(writer)? {
|
||||||
Some(schema) => schema,
|
Some(schema) => schema,
|
||||||
None => return Err(Error::SchemaMissing),
|
None => return Err(Error::SchemaMissing),
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Retrieve the documents ids related structures
|
||||||
|
let external_docids = index.main.external_docids(writer)?;
|
||||||
|
let internal_docids = index.main.internal_docids(writer)?;
|
||||||
|
let mut available_ids = DiscoverIds::new(&internal_docids);
|
||||||
|
|
||||||
let primary_key = schema.primary_key().ok_or(Error::MissingPrimaryKey)?;
|
let primary_key = schema.primary_key().ok_or(Error::MissingPrimaryKey)?;
|
||||||
|
|
||||||
// 1. store documents ids for future deletion
|
// 1. store documents ids for future deletion
|
||||||
|
let mut documents_additions = HashMap::new();
|
||||||
|
let mut new_external_docids = BTreeMap::new();
|
||||||
|
let mut new_internal_docids = Vec::with_capacity(new_documents.len());
|
||||||
|
|
||||||
for mut document in new_documents {
|
for mut document in new_documents {
|
||||||
let document_id = extract_document_id(&primary_key, &document)?;
|
let (internal_docid, external_docid) =
|
||||||
|
extract_document_id(
|
||||||
|
&primary_key,
|
||||||
|
&document,
|
||||||
|
&external_docids,
|
||||||
|
&mut available_ids,
|
||||||
|
)?;
|
||||||
|
|
||||||
|
new_external_docids.insert(external_docid, internal_docid.0);
|
||||||
|
new_internal_docids.push(internal_docid);
|
||||||
|
|
||||||
if partial {
|
if partial {
|
||||||
let mut deserializer = Deserializer {
|
let mut deserializer = Deserializer {
|
||||||
document_id,
|
document_id: internal_docid,
|
||||||
reader: writer,
|
reader: writer,
|
||||||
documents_fields: index.documents_fields,
|
documents_fields: index.documents_fields,
|
||||||
schema: &schema,
|
schema: &schema,
|
||||||
@ -178,12 +195,12 @@ pub fn apply_addition<'a, 'b>(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
documents_additions.insert(document_id, document);
|
documents_additions.insert(internal_docid, document);
|
||||||
}
|
}
|
||||||
|
|
||||||
// 2. remove the documents posting lists
|
// 2. remove the documents postings lists
|
||||||
let number_of_inserted_documents = documents_additions.len();
|
let number_of_inserted_documents = documents_additions.len();
|
||||||
let documents_ids = documents_additions.iter().map(|(id, _)| *id).collect();
|
let documents_ids = new_external_docids.iter().map(|(id, _)| id.clone()).collect();
|
||||||
apply_documents_deletion(writer, index, documents_ids)?;
|
apply_documents_deletion(writer, index, documents_ids)?;
|
||||||
|
|
||||||
let mut ranked_map = match index.main.ranked_map(writer)? {
|
let mut ranked_map = match index.main.ranked_map(writer)? {
|
||||||
@ -233,6 +250,11 @@ pub fn apply_addition<'a, 'b>(
|
|||||||
|
|
||||||
index.main.put_schema(writer, &schema)?;
|
index.main.put_schema(writer, &schema)?;
|
||||||
|
|
||||||
|
let new_external_docids = fst::Map::from_iter(new_external_docids.iter().map(|(ext, id)| (ext, *id as u64)))?;
|
||||||
|
let new_internal_docids = sdset::SetBuf::from_dirty(new_internal_docids);
|
||||||
|
index.main.merge_external_docids(writer, &new_external_docids)?;
|
||||||
|
index.main.merge_internal_docids(writer, &new_internal_docids)?;
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -14,7 +14,7 @@ pub struct DocumentsDeletion {
|
|||||||
updates_store: store::Updates,
|
updates_store: store::Updates,
|
||||||
updates_results_store: store::UpdatesResults,
|
updates_results_store: store::UpdatesResults,
|
||||||
updates_notifier: UpdateEventsEmitter,
|
updates_notifier: UpdateEventsEmitter,
|
||||||
documents: Vec<DocumentId>,
|
external_docids: Vec<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl DocumentsDeletion {
|
impl DocumentsDeletion {
|
||||||
@ -27,12 +27,12 @@ impl DocumentsDeletion {
|
|||||||
updates_store,
|
updates_store,
|
||||||
updates_results_store,
|
updates_results_store,
|
||||||
updates_notifier,
|
updates_notifier,
|
||||||
documents: Vec::new(),
|
external_docids: Vec::new(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn delete_document_by_id(&mut self, document_id: DocumentId) {
|
pub fn delete_document_by_external_docid(&mut self, document_id: String) {
|
||||||
self.documents.push(document_id);
|
self.external_docids.push(document_id);
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn finalize(self, writer: &mut heed::RwTxn<UpdateT>) -> MResult<u64> {
|
pub fn finalize(self, writer: &mut heed::RwTxn<UpdateT>) -> MResult<u64> {
|
||||||
@ -41,15 +41,15 @@ impl DocumentsDeletion {
|
|||||||
writer,
|
writer,
|
||||||
self.updates_store,
|
self.updates_store,
|
||||||
self.updates_results_store,
|
self.updates_results_store,
|
||||||
self.documents,
|
self.external_docids,
|
||||||
)?;
|
)?;
|
||||||
Ok(update_id)
|
Ok(update_id)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Extend<DocumentId> for DocumentsDeletion {
|
impl Extend<String> for DocumentsDeletion {
|
||||||
fn extend<T: IntoIterator<Item = DocumentId>>(&mut self, iter: T) {
|
fn extend<T: IntoIterator<Item=String>>(&mut self, iter: T) {
|
||||||
self.documents.extend(iter)
|
self.external_docids.extend(iter)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -57,11 +57,11 @@ pub fn push_documents_deletion(
|
|||||||
writer: &mut heed::RwTxn<UpdateT>,
|
writer: &mut heed::RwTxn<UpdateT>,
|
||||||
updates_store: store::Updates,
|
updates_store: store::Updates,
|
||||||
updates_results_store: store::UpdatesResults,
|
updates_results_store: store::UpdatesResults,
|
||||||
deletion: Vec<DocumentId>,
|
external_docids: Vec<String>,
|
||||||
) -> MResult<u64> {
|
) -> MResult<u64> {
|
||||||
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
|
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
|
||||||
|
|
||||||
let update = Update::documents_deletion(deletion);
|
let update = Update::documents_deletion(external_docids);
|
||||||
updates_store.put_update(writer, last_update_id, &update)?;
|
updates_store.put_update(writer, last_update_id, &update)?;
|
||||||
|
|
||||||
Ok(last_update_id)
|
Ok(last_update_id)
|
||||||
@ -70,8 +70,24 @@ pub fn push_documents_deletion(
|
|||||||
pub fn apply_documents_deletion(
|
pub fn apply_documents_deletion(
|
||||||
writer: &mut heed::RwTxn<MainT>,
|
writer: &mut heed::RwTxn<MainT>,
|
||||||
index: &store::Index,
|
index: &store::Index,
|
||||||
deletion: Vec<DocumentId>,
|
external_docids: Vec<String>,
|
||||||
) -> MResult<()> {
|
) -> MResult<()>
|
||||||
|
{
|
||||||
|
let (external_docids, internal_docids) = {
|
||||||
|
let new_external_docids = SetBuf::from_dirty(external_docids);
|
||||||
|
let mut internal_docids = Vec::new();
|
||||||
|
|
||||||
|
let old_external_docids = index.main.external_docids(writer)?;
|
||||||
|
for external_docid in new_external_docids.as_slice() {
|
||||||
|
if let Some(id) = old_external_docids.get(external_docid) {
|
||||||
|
internal_docids.push(DocumentId(id as u32));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let new_external_docids = fst::Map::from_iter(new_external_docids.into_iter().map(|k| (k, 0))).unwrap();
|
||||||
|
(new_external_docids, SetBuf::from_dirty(internal_docids))
|
||||||
|
};
|
||||||
|
|
||||||
let schema = match index.main.schema(writer)? {
|
let schema = match index.main.schema(writer)? {
|
||||||
Some(schema) => schema,
|
Some(schema) => schema,
|
||||||
None => return Err(Error::SchemaMissing),
|
None => return Err(Error::SchemaMissing),
|
||||||
@ -84,16 +100,15 @@ pub fn apply_documents_deletion(
|
|||||||
|
|
||||||
// facet filters deletion
|
// facet filters deletion
|
||||||
if let Some(attributes_for_facetting) = index.main.attributes_for_faceting(writer)? {
|
if let Some(attributes_for_facetting) = index.main.attributes_for_faceting(writer)? {
|
||||||
let facet_map = facets::facet_map_from_docids(writer, &index, &deletion, &attributes_for_facetting)?;
|
let facet_map = facets::facet_map_from_docids(writer, &index, &internal_docids, &attributes_for_facetting)?;
|
||||||
index.facets.remove(writer, facet_map)?;
|
index.facets.remove(writer, facet_map)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
// collect the ranked attributes according to the schema
|
// collect the ranked attributes according to the schema
|
||||||
let ranked_fields = schema.ranked();
|
let ranked_fields = schema.ranked();
|
||||||
|
|
||||||
let idset = SetBuf::from_dirty(deletion);
|
|
||||||
let mut words_document_ids = HashMap::new();
|
let mut words_document_ids = HashMap::new();
|
||||||
for id in idset {
|
for id in internal_docids.iter().cloned() {
|
||||||
// remove all the ranked attributes from the ranked_map
|
// remove all the ranked attributes from the ranked_map
|
||||||
for ranked_attr in ranked_fields {
|
for ranked_attr in ranked_fields {
|
||||||
ranked_map.remove(id, *ranked_attr);
|
ranked_map.remove(id, *ranked_attr);
|
||||||
@ -163,6 +178,10 @@ pub fn apply_documents_deletion(
|
|||||||
index.main.put_ranked_map(writer, &ranked_map)?;
|
index.main.put_ranked_map(writer, &ranked_map)?;
|
||||||
index.main.put_number_of_documents(writer, |old| old - deleted_documents_len)?;
|
index.main.put_number_of_documents(writer, |old| old - deleted_documents_len)?;
|
||||||
|
|
||||||
|
// We apply the changes to the user and internal ids
|
||||||
|
index.main.remove_external_docids(writer, &external_docids)?;
|
||||||
|
index.main.remove_internal_docids(writer, &internal_docids)?;
|
||||||
|
|
||||||
compute_short_prefixes(writer, index)?;
|
compute_short_prefixes(writer, index)?;
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
|
@ -1,16 +1,15 @@
|
|||||||
use std::fmt::Write as _;
|
use std::fmt::Write as _;
|
||||||
use std::hash::{Hash, Hasher};
|
|
||||||
|
|
||||||
use indexmap::IndexMap;
|
use indexmap::IndexMap;
|
||||||
use meilisearch_schema::IndexedPos;
|
use meilisearch_schema::IndexedPos;
|
||||||
use meilisearch_types::DocumentId;
|
use meilisearch_types::DocumentId;
|
||||||
use ordered_float::OrderedFloat;
|
use ordered_float::OrderedFloat;
|
||||||
use serde_json::Value;
|
use serde_json::Value;
|
||||||
use siphasher::sip::SipHasher;
|
|
||||||
|
|
||||||
|
use crate::Number;
|
||||||
use crate::raw_indexer::RawIndexer;
|
use crate::raw_indexer::RawIndexer;
|
||||||
use crate::serde::SerializerError;
|
use crate::serde::SerializerError;
|
||||||
use crate::Number;
|
use crate::store::DiscoverIds;
|
||||||
|
|
||||||
/// Returns the number of words indexed or `None` if the type is unindexable.
|
/// Returns the number of words indexed or `None` if the type is unindexable.
|
||||||
pub fn index_value(
|
pub fn index_value(
|
||||||
@ -96,28 +95,43 @@ pub fn value_to_number(value: &Value) -> Option<Number> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Validates a string representation to be a correct document id and
|
/// Validates a string representation to be a correct document id and returns
|
||||||
/// returns the hash of the given type, this is the way we produce documents ids.
|
/// the corresponding id or generate a new one, this is the way we produce documents ids.
|
||||||
pub fn compute_document_id(string: &str) -> Result<DocumentId, SerializerError> {
|
pub fn discover_document_id(
|
||||||
if string.chars().all(|x| x.is_ascii_alphanumeric() || x == '-' || x == '_') {
|
docid: &str,
|
||||||
let mut s = SipHasher::new();
|
external_docids: &fst::Map,
|
||||||
string.hash(&mut s);
|
available_docids: &mut DiscoverIds<'_>,
|
||||||
Ok(DocumentId(s.finish()))
|
) -> Result<DocumentId, SerializerError>
|
||||||
|
{
|
||||||
|
if docid.chars().all(|x| x.is_ascii_alphanumeric() || x == '-' || x == '_') {
|
||||||
|
match external_docids.get(docid) {
|
||||||
|
Some(id) => Ok(DocumentId(id as u32)),
|
||||||
|
None => {
|
||||||
|
let internal_id = available_docids.next().expect("no more ids available");
|
||||||
|
Ok(internal_id)
|
||||||
|
},
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
Err(SerializerError::InvalidDocumentIdFormat)
|
Err(SerializerError::InvalidDocumentIdFormat)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Extracts and validates the document id of a document.
|
/// Extracts and validates the document id of a document.
|
||||||
pub fn extract_document_id(primary_key: &str, document: &IndexMap<String, Value>) -> Result<DocumentId, SerializerError> {
|
pub fn extract_document_id(
|
||||||
|
primary_key: &str,
|
||||||
|
document: &IndexMap<String, Value>,
|
||||||
|
external_docids: &fst::Map,
|
||||||
|
available_docids: &mut DiscoverIds<'_>,
|
||||||
|
) -> Result<(DocumentId, String), SerializerError>
|
||||||
|
{
|
||||||
match document.get(primary_key) {
|
match document.get(primary_key) {
|
||||||
Some(value) => {
|
Some(value) => {
|
||||||
let string = match value {
|
let docid = match value {
|
||||||
Value::Number(number) => number.to_string(),
|
Value::Number(number) => number.to_string(),
|
||||||
Value::String(string) => string.clone(),
|
Value::String(string) => string.clone(),
|
||||||
_ => return Err(SerializerError::InvalidDocumentIdFormat),
|
_ => return Err(SerializerError::InvalidDocumentIdFormat),
|
||||||
};
|
};
|
||||||
compute_document_id(&string)
|
discover_document_id(&docid, external_docids, available_docids).map(|id| (id, docid))
|
||||||
}
|
}
|
||||||
None => Err(SerializerError::DocumentIdNotFound),
|
None => Err(SerializerError::DocumentIdNotFound),
|
||||||
}
|
}
|
||||||
|
@ -9,7 +9,7 @@ pub use self::clear_all::{apply_clear_all, push_clear_all};
|
|||||||
pub use self::customs_update::{apply_customs_update, push_customs_update};
|
pub use self::customs_update::{apply_customs_update, push_customs_update};
|
||||||
pub use self::documents_addition::{apply_documents_addition, apply_documents_partial_addition, DocumentsAddition};
|
pub use self::documents_addition::{apply_documents_addition, apply_documents_partial_addition, DocumentsAddition};
|
||||||
pub use self::documents_deletion::{apply_documents_deletion, DocumentsDeletion};
|
pub use self::documents_deletion::{apply_documents_deletion, DocumentsDeletion};
|
||||||
pub use self::helpers::{index_value, value_to_string, value_to_number, compute_document_id, extract_document_id};
|
pub use self::helpers::{index_value, value_to_string, value_to_number, discover_document_id, extract_document_id};
|
||||||
pub use self::settings_update::{apply_settings_update, push_settings_update};
|
pub use self::settings_update::{apply_settings_update, push_settings_update};
|
||||||
|
|
||||||
use std::cmp;
|
use std::cmp;
|
||||||
@ -24,7 +24,7 @@ use sdset::Set;
|
|||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use serde_json::Value;
|
use serde_json::Value;
|
||||||
|
|
||||||
use crate::{store, DocumentId, MResult};
|
use crate::{store, MResult};
|
||||||
use crate::database::{MainT, UpdateT};
|
use crate::database::{MainT, UpdateT};
|
||||||
use crate::settings::SettingsUpdate;
|
use crate::settings::SettingsUpdate;
|
||||||
|
|
||||||
@ -63,7 +63,7 @@ impl Update {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn documents_deletion(data: Vec<DocumentId>) -> Update {
|
fn documents_deletion(data: Vec<String>) -> Update {
|
||||||
Update {
|
Update {
|
||||||
data: UpdateData::DocumentsDeletion(data),
|
data: UpdateData::DocumentsDeletion(data),
|
||||||
enqueued_at: Utc::now(),
|
enqueued_at: Utc::now(),
|
||||||
@ -84,7 +84,7 @@ pub enum UpdateData {
|
|||||||
Customs(Vec<u8>),
|
Customs(Vec<u8>),
|
||||||
DocumentsAddition(Vec<IndexMap<String, Value>>),
|
DocumentsAddition(Vec<IndexMap<String, Value>>),
|
||||||
DocumentsPartial(Vec<IndexMap<String, Value>>),
|
DocumentsPartial(Vec<IndexMap<String, Value>>),
|
||||||
DocumentsDeletion(Vec<DocumentId>),
|
DocumentsDeletion(Vec<String>),
|
||||||
Settings(SettingsUpdate)
|
Settings(SettingsUpdate)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -22,7 +22,7 @@ pub enum ResponseError {
|
|||||||
NotFound(String),
|
NotFound(String),
|
||||||
OpenIndex(String),
|
OpenIndex(String),
|
||||||
FilterParsing(String),
|
FilterParsing(String),
|
||||||
RetrieveDocument(u64, String),
|
RetrieveDocument(u32, String),
|
||||||
SearchDocuments(String),
|
SearchDocuments(String),
|
||||||
PayloadTooLarge,
|
PayloadTooLarge,
|
||||||
UnsupportedMediaType,
|
UnsupportedMediaType,
|
||||||
@ -116,7 +116,7 @@ impl ResponseError {
|
|||||||
ResponseError::Maintenance
|
ResponseError::Maintenance
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn retrieve_document(doc_id: u64, err: impl fmt::Display) -> ResponseError {
|
pub fn retrieve_document(doc_id: u32, err: impl fmt::Display) -> ResponseError {
|
||||||
ResponseError::RetrieveDocument(doc_id, err.to_string())
|
ResponseError::RetrieveDocument(doc_id, err.to_string())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -3,7 +3,7 @@ use std::collections::{BTreeSet, HashSet};
|
|||||||
use actix_web::{web, HttpResponse};
|
use actix_web::{web, HttpResponse};
|
||||||
use actix_web_macros::{delete, get, post, put};
|
use actix_web_macros::{delete, get, post, put};
|
||||||
use indexmap::IndexMap;
|
use indexmap::IndexMap;
|
||||||
use meilisearch_core::{update, Error};
|
use meilisearch_core::update;
|
||||||
use serde::Deserialize;
|
use serde::Deserialize;
|
||||||
use serde_json::Value;
|
use serde_json::Value;
|
||||||
|
|
||||||
@ -43,11 +43,13 @@ async fn get_document(
|
|||||||
.open_index(&path.index_uid)
|
.open_index(&path.index_uid)
|
||||||
.ok_or(ResponseError::index_not_found(&path.index_uid))?;
|
.ok_or(ResponseError::index_not_found(&path.index_uid))?;
|
||||||
|
|
||||||
let document_id = update::compute_document_id(&path.document_id).map_err(Error::Serializer)?;
|
|
||||||
let reader = data.db.main_read_txn()?;
|
let reader = data.db.main_read_txn()?;
|
||||||
|
let internal_id = index.main
|
||||||
|
.external_to_internal_docid(&reader, &path.document_id)?
|
||||||
|
.ok_or(ResponseError::document_not_found(&path.document_id))?;
|
||||||
|
|
||||||
let response: Document = index
|
let response: Document = index
|
||||||
.document(&reader, None, document_id)?
|
.document(&reader, None, internal_id)?
|
||||||
.ok_or(ResponseError::document_not_found(&path.document_id))?;
|
.ok_or(ResponseError::document_not_found(&path.document_id))?;
|
||||||
|
|
||||||
Ok(HttpResponse::Ok().json(response))
|
Ok(HttpResponse::Ok().json(response))
|
||||||
@ -66,12 +68,10 @@ async fn delete_document(
|
|||||||
.open_index(&path.index_uid)
|
.open_index(&path.index_uid)
|
||||||
.ok_or(ResponseError::index_not_found(&path.index_uid))?;
|
.ok_or(ResponseError::index_not_found(&path.index_uid))?;
|
||||||
|
|
||||||
let document_id = update::compute_document_id(&path.document_id).map_err(Error::Serializer)?;
|
|
||||||
|
|
||||||
let mut update_writer = data.db.update_write_txn()?;
|
let mut update_writer = data.db.update_write_txn()?;
|
||||||
|
|
||||||
let mut documents_deletion = index.documents_deletion();
|
let mut documents_deletion = index.documents_deletion();
|
||||||
documents_deletion.delete_document_by_id(document_id);
|
documents_deletion.delete_document_by_external_docid(path.document_id.clone());
|
||||||
|
|
||||||
let update_id = documents_deletion.finalize(&mut update_writer)?;
|
let update_id = documents_deletion.finalize(&mut update_writer)?;
|
||||||
|
|
||||||
@ -239,8 +239,7 @@ async fn delete_documents(
|
|||||||
|
|
||||||
for document_id in body.into_inner() {
|
for document_id in body.into_inner() {
|
||||||
let document_id = update::value_to_string(&document_id);
|
let document_id = update::value_to_string(&document_id);
|
||||||
let document_id = update::compute_document_id(&document_id).map_err(Error::Serializer)?;
|
documents_deletion.delete_document_by_external_docid(document_id);
|
||||||
documents_deletion.delete_document_by_id(document_id);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
let update_id = documents_deletion.finalize(&mut writer)?;
|
let update_id = documents_deletion.finalize(&mut writer)?;
|
||||||
|
@ -12,7 +12,7 @@ use serde::{Deserialize, Serialize};
|
|||||||
#[cfg_attr(feature = "zerocopy", derive(AsBytes, FromBytes))]
|
#[cfg_attr(feature = "zerocopy", derive(AsBytes, FromBytes))]
|
||||||
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
|
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
|
||||||
#[repr(C)]
|
#[repr(C)]
|
||||||
pub struct DocumentId(pub u64);
|
pub struct DocumentId(pub u32);
|
||||||
|
|
||||||
/// This structure represent the position of a word
|
/// This structure represent the position of a word
|
||||||
/// in a document and its attributes.
|
/// in a document and its attributes.
|
||||||
|
Loading…
Reference in New Issue
Block a user