Make documents additions accept only the last duplicate document

This commit is contained in:
Clément Renault 2019-10-28 20:40:33 +01:00
parent af96050944
commit ac12a4b9c9
No known key found for this signature in database
GPG Key ID: 0151CDAB43460DAE
2 changed files with 8 additions and 16 deletions

View File

@@ -12,7 +12,6 @@ pub enum Error {
     SchemaMissing,
     WordIndexMissing,
     MissingDocumentId,
-    DuplicateDocument,
     Zlmdb(heed::Error),
     Fst(fst::Error),
     SerdeJson(SerdeJsonError),
@@ -80,7 +79,6 @@ impl fmt::Display for Error {
     SchemaMissing => write!(f, "this index does not have a schema"),
     WordIndexMissing => write!(f, "this index does not have a word index"),
     MissingDocumentId => write!(f, "document id is missing"),
-    DuplicateDocument => write!(f, "update contains documents with the same id"),
     Zlmdb(e) => write!(f, "heed error; {}", e),
     Fst(e) => write!(f, "fst error; {}", e),
     SerdeJson(e) => write!(f, "serde json error; {}", e),

View File

@@ -1,4 +1,4 @@
-use std::collections::{HashMap, HashSet};
+use std::collections::HashMap;
 use fst::{set::OpBuilder, SetBuilder};
 use sdset::{duo::Union, SetOperation};
@@ -86,7 +86,7 @@ pub fn apply_documents_addition(
     docs_words_store: store::DocsWords,
     addition: Vec<serde_json::Value>,
 ) -> MResult<()> {
-    let mut documents_ids = HashSet::new();
+    let mut documents_additions = HashMap::new();
     let mut indexer = RawIndexer::new();

     let schema = match main_store.schema(writer)? {
@@ -97,19 +97,18 @@ pub fn apply_documents_addition(
     let identifier = schema.identifier_name();

     // 1. store documents ids for future deletion
-    for document in addition.iter() {
+    for document in addition {
         let document_id = match extract_document_id(identifier, &document)? {
             Some(id) => id,
             None => return Err(Error::MissingDocumentId),
         };

-        if !documents_ids.insert(document_id) {
-            return Err(Error::DuplicateDocument);
-        }
+        documents_additions.insert(document_id, document);
     }
     // 2. remove the documents posting lists
-    let number_of_inserted_documents = documents_ids.len();
+    let number_of_inserted_documents = documents_additions.len();
+    let documents_ids = documents_additions.iter().map(|(id, _)| *id).collect();
     apply_documents_deletion(
         writer,
         main_store,
@@ -117,7 +116,7 @@ pub fn apply_documents_addition(
         documents_fields_counts_store,
         postings_lists_store,
         docs_words_store,
-        documents_ids.into_iter().collect(),
+        documents_ids,
     )?;
     let mut ranked_map = match main_store.ranked_map(writer)? {
@@ -126,12 +125,7 @@ pub fn apply_documents_addition(
     };

     // 3. index the documents fields in the stores
-    for document in addition {
-        let document_id = match extract_document_id(identifier, &document)? {
-            Some(id) => id,
-            None => return Err(Error::MissingDocumentId),
-        };
+    for (document_id, document) in documents_additions {
         let serializer = Serializer {
             txn: writer,
             schema: &schema,