2024-10-28 14:23:20 +01:00
|
|
|
use bumpalo::Bump;
|
2024-08-28 18:45:16 +02:00
|
|
|
use heed::RoTxn;
|
|
|
|
|
2024-12-05 15:50:12 +01:00
|
|
|
use super::document::{
|
|
|
|
Document as _, DocumentFromDb, DocumentFromVersions, MergedDocument, Versions,
|
|
|
|
};
|
|
|
|
use super::extract::perm_json_p;
|
2024-10-28 16:18:48 +01:00
|
|
|
use super::vector_document::{
|
|
|
|
MergedVectorDocument, VectorDocumentFromDb, VectorDocumentFromVersions,
|
|
|
|
};
|
2024-10-03 18:08:09 +02:00
|
|
|
use crate::documents::FieldIdMapper;
|
2024-10-30 13:50:51 +01:00
|
|
|
use crate::vector::EmbeddingConfigs;
|
2024-10-03 18:08:09 +02:00
|
|
|
use crate::{DocumentId, Index, Result};
|
2024-08-28 18:45:16 +02:00
|
|
|
|
2024-10-03 18:08:09 +02:00
|
|
|
pub enum DocumentChange<'doc> {
|
2024-10-14 15:40:42 +02:00
|
|
|
Deletion(Deletion<'doc>),
|
2024-10-03 18:08:09 +02:00
|
|
|
Update(Update<'doc>),
|
|
|
|
Insertion(Insertion<'doc>),
|
2024-08-28 18:45:16 +02:00
|
|
|
}
|
|
|
|
|
2024-10-14 15:40:42 +02:00
|
|
|
pub struct Deletion<'doc> {
|
2024-10-03 18:08:09 +02:00
|
|
|
docid: DocumentId,
|
2024-10-14 15:40:42 +02:00
|
|
|
external_document_id: &'doc str,
|
2024-08-28 18:45:16 +02:00
|
|
|
}
|
|
|
|
|
2024-10-03 18:08:09 +02:00
|
|
|
pub struct Update<'doc> {
|
|
|
|
docid: DocumentId,
|
2024-10-14 15:40:42 +02:00
|
|
|
external_document_id: &'doc str,
|
2024-10-28 14:23:20 +01:00
|
|
|
new: Versions<'doc>,
|
2024-10-03 18:08:09 +02:00
|
|
|
has_deletion: bool,
|
2024-08-28 18:45:16 +02:00
|
|
|
}
|
|
|
|
|
2024-10-03 18:08:09 +02:00
|
|
|
pub struct Insertion<'doc> {
|
|
|
|
docid: DocumentId,
|
2024-10-14 15:40:42 +02:00
|
|
|
external_document_id: &'doc str,
|
2024-10-28 14:23:20 +01:00
|
|
|
new: Versions<'doc>,
|
2024-08-28 18:45:16 +02:00
|
|
|
}
|
|
|
|
|
2024-10-03 18:08:09 +02:00
|
|
|
impl<'doc> DocumentChange<'doc> {
|
2024-09-03 11:02:39 +02:00
|
|
|
pub fn docid(&self) -> DocumentId {
|
2024-08-28 18:45:16 +02:00
|
|
|
match &self {
|
|
|
|
Self::Deletion(inner) => inner.docid(),
|
|
|
|
Self::Update(inner) => inner.docid(),
|
|
|
|
Self::Insertion(inner) => inner.docid(),
|
|
|
|
}
|
|
|
|
}
|
2024-10-03 18:08:09 +02:00
|
|
|
|
2024-10-14 15:40:42 +02:00
|
|
|
pub fn external_docid(&self) -> &'doc str {
|
2024-10-03 18:08:09 +02:00
|
|
|
match self {
|
|
|
|
DocumentChange::Deletion(deletion) => deletion.external_document_id(),
|
|
|
|
DocumentChange::Update(update) => update.external_document_id(),
|
|
|
|
DocumentChange::Insertion(insertion) => insertion.external_document_id(),
|
|
|
|
}
|
|
|
|
}
|
2024-08-28 18:45:16 +02:00
|
|
|
}
|
|
|
|
|
2024-10-14 15:40:42 +02:00
|
|
|
impl<'doc> Deletion<'doc> {
|
|
|
|
pub fn create(docid: DocumentId, external_document_id: &'doc str) -> Self {
|
2024-10-03 18:08:09 +02:00
|
|
|
Self { docid, external_document_id }
|
2024-08-28 18:45:16 +02:00
|
|
|
}
|
|
|
|
|
2024-09-03 11:02:39 +02:00
|
|
|
pub fn docid(&self) -> DocumentId {
|
2024-08-28 18:45:16 +02:00
|
|
|
self.docid
|
|
|
|
}
|
|
|
|
|
2024-10-14 15:40:42 +02:00
|
|
|
pub fn external_document_id(&self) -> &'doc str {
|
|
|
|
self.external_document_id
|
2024-10-02 11:32:19 +02:00
|
|
|
}
|
|
|
|
|
2024-10-03 18:08:09 +02:00
|
|
|
pub fn current<'a, Mapper: FieldIdMapper>(
|
2024-09-03 15:14:16 +02:00
|
|
|
&self,
|
|
|
|
rtxn: &'a RoTxn,
|
|
|
|
index: &'a Index,
|
2024-10-03 18:08:09 +02:00
|
|
|
mapper: &'a Mapper,
|
|
|
|
) -> Result<DocumentFromDb<'a, Mapper>> {
|
|
|
|
Ok(DocumentFromDb::new(self.docid, rtxn, index, mapper)?.ok_or(
|
|
|
|
crate::error::UserError::UnknownInternalDocumentId { document_id: self.docid },
|
|
|
|
)?)
|
2024-08-28 18:45:16 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-10-03 18:08:09 +02:00
|
|
|
impl<'doc> Insertion<'doc> {
|
2024-10-28 14:23:20 +01:00
|
|
|
pub fn create(docid: DocumentId, external_document_id: &'doc str, new: Versions<'doc>) -> Self {
|
2024-10-02 11:32:19 +02:00
|
|
|
Insertion { docid, external_document_id, new }
|
2024-08-29 12:06:44 +02:00
|
|
|
}
|
|
|
|
|
2024-09-03 11:02:39 +02:00
|
|
|
pub fn docid(&self) -> DocumentId {
|
2024-08-28 18:45:16 +02:00
|
|
|
self.docid
|
|
|
|
}
|
|
|
|
|
2024-10-14 15:40:42 +02:00
|
|
|
pub fn external_document_id(&self) -> &'doc str {
|
|
|
|
self.external_document_id
|
2024-10-02 11:32:19 +02:00
|
|
|
}
|
2024-10-28 16:18:48 +01:00
|
|
|
pub fn inserted(&self) -> DocumentFromVersions<'_, 'doc> {
|
2024-10-28 14:23:20 +01:00
|
|
|
DocumentFromVersions::new(&self.new)
|
|
|
|
}
|
|
|
|
|
|
|
|
pub fn inserted_vectors(
|
|
|
|
&self,
|
|
|
|
doc_alloc: &'doc Bump,
|
2024-10-30 13:50:51 +01:00
|
|
|
embedders: &'doc EmbeddingConfigs,
|
2024-10-28 14:23:20 +01:00
|
|
|
) -> Result<Option<VectorDocumentFromVersions<'doc>>> {
|
2024-11-12 22:49:22 +01:00
|
|
|
VectorDocumentFromVersions::new(self.external_document_id, &self.new, doc_alloc, embedders)
|
2024-08-28 18:45:16 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-10-03 18:08:09 +02:00
|
|
|
impl<'doc> Update<'doc> {
|
2024-08-29 12:06:44 +02:00
|
|
|
pub fn create(
|
|
|
|
docid: DocumentId,
|
2024-10-14 15:40:42 +02:00
|
|
|
external_document_id: &'doc str,
|
2024-10-28 14:23:20 +01:00
|
|
|
new: Versions<'doc>,
|
2024-10-03 18:08:09 +02:00
|
|
|
has_deletion: bool,
|
2024-08-29 12:06:44 +02:00
|
|
|
) -> Self {
|
2024-10-03 18:08:09 +02:00
|
|
|
Update { docid, new, external_document_id, has_deletion }
|
2024-08-29 12:06:44 +02:00
|
|
|
}
|
|
|
|
|
2024-09-03 11:02:39 +02:00
|
|
|
pub fn docid(&self) -> DocumentId {
|
2024-08-28 18:45:16 +02:00
|
|
|
self.docid
|
|
|
|
}
|
|
|
|
|
2024-10-14 15:40:42 +02:00
|
|
|
pub fn external_document_id(&self) -> &'doc str {
|
|
|
|
self.external_document_id
|
2024-10-02 11:32:19 +02:00
|
|
|
}
|
2024-10-03 18:08:09 +02:00
|
|
|
pub fn current<'a, Mapper: FieldIdMapper>(
|
2024-09-03 15:14:16 +02:00
|
|
|
&self,
|
|
|
|
rtxn: &'a RoTxn,
|
|
|
|
index: &'a Index,
|
2024-10-03 18:08:09 +02:00
|
|
|
mapper: &'a Mapper,
|
|
|
|
) -> Result<DocumentFromDb<'a, Mapper>> {
|
|
|
|
Ok(DocumentFromDb::new(self.docid, rtxn, index, mapper)?.ok_or(
|
|
|
|
crate::error::UserError::UnknownInternalDocumentId { document_id: self.docid },
|
|
|
|
)?)
|
2024-08-28 18:45:16 +02:00
|
|
|
}
|
|
|
|
|
2024-10-28 14:23:20 +01:00
|
|
|
pub fn current_vectors<'a, Mapper: FieldIdMapper>(
|
2024-10-03 18:08:09 +02:00
|
|
|
&self,
|
|
|
|
rtxn: &'a RoTxn,
|
|
|
|
index: &'a Index,
|
|
|
|
mapper: &'a Mapper,
|
2024-10-28 14:23:20 +01:00
|
|
|
doc_alloc: &'a Bump,
|
|
|
|
) -> Result<VectorDocumentFromDb<'a>> {
|
|
|
|
Ok(VectorDocumentFromDb::new(self.docid, index, rtxn, mapper, doc_alloc)?.ok_or(
|
|
|
|
crate::error::UserError::UnknownInternalDocumentId { document_id: self.docid },
|
|
|
|
)?)
|
|
|
|
}
|
|
|
|
|
|
|
|
pub fn updated(&self) -> DocumentFromVersions<'_, 'doc> {
|
|
|
|
DocumentFromVersions::new(&self.new)
|
|
|
|
}
|
|
|
|
|
2024-10-28 16:18:48 +01:00
|
|
|
pub fn merged<'t, Mapper: FieldIdMapper>(
|
2024-10-28 14:23:20 +01:00
|
|
|
&self,
|
|
|
|
rtxn: &'t RoTxn,
|
|
|
|
index: &'t Index,
|
|
|
|
mapper: &'t Mapper,
|
|
|
|
) -> Result<MergedDocument<'_, 'doc, 't, Mapper>> {
|
2024-10-03 18:08:09 +02:00
|
|
|
if self.has_deletion {
|
2024-10-28 14:23:20 +01:00
|
|
|
Ok(MergedDocument::without_db(DocumentFromVersions::new(&self.new)))
|
2024-10-03 18:08:09 +02:00
|
|
|
} else {
|
2024-10-28 14:23:20 +01:00
|
|
|
MergedDocument::with_db(
|
|
|
|
self.docid,
|
|
|
|
rtxn,
|
|
|
|
index,
|
|
|
|
mapper,
|
|
|
|
DocumentFromVersions::new(&self.new),
|
|
|
|
)
|
2024-10-03 18:08:09 +02:00
|
|
|
}
|
|
|
|
}
|
2024-10-28 14:23:20 +01:00
|
|
|
|
2024-12-05 15:50:12 +01:00
|
|
|
/// Returns whether the updated version of the document is different from the current version for the passed subset of fields.
|
|
|
|
///
|
|
|
|
/// `true` if at least one top-level-field that is a exactly a member of field or a parent of a member of field changed.
|
|
|
|
/// Otherwise `false`.
|
|
|
|
pub fn has_changed_for_fields<'t, Mapper: FieldIdMapper>(
|
|
|
|
&self,
|
|
|
|
fields: Option<&[&str]>,
|
|
|
|
rtxn: &'t RoTxn,
|
|
|
|
index: &'t Index,
|
|
|
|
mapper: &'t Mapper,
|
|
|
|
) -> Result<bool> {
|
|
|
|
let mut changed = false;
|
|
|
|
let mut cached_current = None;
|
|
|
|
let mut updated_selected_field_count = 0;
|
|
|
|
|
|
|
|
for entry in self.updated().iter_top_level_fields() {
|
|
|
|
let (key, updated_value) = entry?;
|
|
|
|
|
|
|
|
if perm_json_p::select_field(key, fields, &[]) == perm_json_p::Selection::Skip {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
updated_selected_field_count += 1;
|
|
|
|
let current = match cached_current {
|
|
|
|
Some(current) => current,
|
|
|
|
None => self.current(rtxn, index, mapper)?,
|
|
|
|
};
|
|
|
|
let current_value = current.top_level_field(key)?;
|
|
|
|
let Some(current_value) = current_value else {
|
|
|
|
changed = true;
|
|
|
|
break;
|
|
|
|
};
|
|
|
|
|
|
|
|
if current_value.get() != updated_value.get() {
|
|
|
|
changed = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
cached_current = Some(current);
|
|
|
|
}
|
|
|
|
|
|
|
|
if !self.has_deletion {
|
|
|
|
// no field deletion, so fields that don't appear in `updated` cannot have changed
|
|
|
|
return Ok(changed);
|
|
|
|
}
|
|
|
|
|
|
|
|
if changed {
|
|
|
|
return Ok(true);
|
|
|
|
}
|
|
|
|
|
|
|
|
// we saw all updated fields, and set `changed` if any field wasn't in `current`.
|
|
|
|
// so if there are as many fields in `current` as in `updated`, then nothing changed.
|
|
|
|
// If there is any more fields in `current`, then they are missing in `updated`.
|
|
|
|
let has_deleted_fields = {
|
|
|
|
let current = match cached_current {
|
|
|
|
Some(current) => current,
|
|
|
|
None => self.current(rtxn, index, mapper)?,
|
|
|
|
};
|
|
|
|
|
|
|
|
let mut current_selected_field_count = 0;
|
|
|
|
for entry in current.iter_top_level_fields() {
|
|
|
|
let (key, _) = entry?;
|
|
|
|
|
|
|
|
if perm_json_p::select_field(key, fields, &[]) == perm_json_p::Selection::Skip {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
current_selected_field_count += 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
current_selected_field_count != updated_selected_field_count
|
|
|
|
};
|
|
|
|
|
|
|
|
Ok(has_deleted_fields)
|
|
|
|
}
|
|
|
|
|
2024-10-28 14:23:20 +01:00
|
|
|
pub fn updated_vectors(
|
|
|
|
&self,
|
|
|
|
doc_alloc: &'doc Bump,
|
2024-10-30 13:50:51 +01:00
|
|
|
embedders: &'doc EmbeddingConfigs,
|
2024-10-28 14:23:20 +01:00
|
|
|
) -> Result<Option<VectorDocumentFromVersions<'doc>>> {
|
2024-11-12 22:49:22 +01:00
|
|
|
VectorDocumentFromVersions::new(self.external_document_id, &self.new, doc_alloc, embedders)
|
2024-10-28 14:23:20 +01:00
|
|
|
}
|
2024-10-28 16:18:48 +01:00
|
|
|
|
|
|
|
pub fn merged_vectors<Mapper: FieldIdMapper>(
|
|
|
|
&self,
|
|
|
|
rtxn: &'doc RoTxn,
|
|
|
|
index: &'doc Index,
|
|
|
|
mapper: &'doc Mapper,
|
|
|
|
doc_alloc: &'doc Bump,
|
2024-10-30 13:50:51 +01:00
|
|
|
embedders: &'doc EmbeddingConfigs,
|
2024-10-28 16:18:48 +01:00
|
|
|
) -> Result<Option<MergedVectorDocument<'doc>>> {
|
|
|
|
if self.has_deletion {
|
2024-11-12 22:49:22 +01:00
|
|
|
MergedVectorDocument::without_db(
|
|
|
|
self.external_document_id,
|
|
|
|
&self.new,
|
|
|
|
doc_alloc,
|
|
|
|
embedders,
|
|
|
|
)
|
2024-10-28 16:18:48 +01:00
|
|
|
} else {
|
2024-10-30 13:50:51 +01:00
|
|
|
MergedVectorDocument::with_db(
|
2024-11-12 22:49:22 +01:00
|
|
|
self.docid,
|
|
|
|
self.external_document_id,
|
|
|
|
index,
|
|
|
|
rtxn,
|
|
|
|
mapper,
|
|
|
|
&self.new,
|
|
|
|
doc_alloc,
|
|
|
|
embedders,
|
2024-10-30 13:50:51 +01:00
|
|
|
)
|
2024-10-28 16:18:48 +01:00
|
|
|
}
|
|
|
|
}
|
2024-10-03 18:08:09 +02:00
|
|
|
}
|