meilisearch/crates/milli/src/update/new/document_change.rs

281 lines
8.3 KiB
Rust
Raw Normal View History

2024-10-28 14:23:20 +01:00
use bumpalo::Bump;
use heed::RoTxn;
2024-12-05 15:50:12 +01:00
use super::document::{
Document as _, DocumentFromDb, DocumentFromVersions, MergedDocument, Versions,
};
use super::extract::perm_json_p;
2024-10-28 16:18:48 +01:00
use super::vector_document::{
MergedVectorDocument, VectorDocumentFromDb, VectorDocumentFromVersions,
};
2024-10-03 18:08:09 +02:00
use crate::documents::FieldIdMapper;
2024-10-30 13:50:51 +01:00
use crate::vector::EmbeddingConfigs;
2024-10-03 18:08:09 +02:00
use crate::{DocumentId, Index, Result};
2024-10-03 18:08:09 +02:00
pub enum DocumentChange<'doc> {
2024-10-14 15:40:42 +02:00
Deletion(Deletion<'doc>),
2024-10-03 18:08:09 +02:00
Update(Update<'doc>),
Insertion(Insertion<'doc>),
}
2024-10-14 15:40:42 +02:00
pub struct Deletion<'doc> {
2024-10-03 18:08:09 +02:00
docid: DocumentId,
2024-10-14 15:40:42 +02:00
external_document_id: &'doc str,
}
2024-10-03 18:08:09 +02:00
pub struct Update<'doc> {
docid: DocumentId,
2024-10-14 15:40:42 +02:00
external_document_id: &'doc str,
2024-10-28 14:23:20 +01:00
new: Versions<'doc>,
2024-10-03 18:08:09 +02:00
has_deletion: bool,
}
2024-10-03 18:08:09 +02:00
pub struct Insertion<'doc> {
docid: DocumentId,
2024-10-14 15:40:42 +02:00
external_document_id: &'doc str,
2024-10-28 14:23:20 +01:00
new: Versions<'doc>,
}
2024-10-03 18:08:09 +02:00
impl<'doc> DocumentChange<'doc> {
2024-09-03 11:02:39 +02:00
pub fn docid(&self) -> DocumentId {
match &self {
Self::Deletion(inner) => inner.docid(),
Self::Update(inner) => inner.docid(),
Self::Insertion(inner) => inner.docid(),
}
}
2024-10-03 18:08:09 +02:00
2024-10-14 15:40:42 +02:00
pub fn external_docid(&self) -> &'doc str {
2024-10-03 18:08:09 +02:00
match self {
DocumentChange::Deletion(deletion) => deletion.external_document_id(),
DocumentChange::Update(update) => update.external_document_id(),
DocumentChange::Insertion(insertion) => insertion.external_document_id(),
}
}
}
2024-10-14 15:40:42 +02:00
impl<'doc> Deletion<'doc> {
pub fn create(docid: DocumentId, external_document_id: &'doc str) -> Self {
2024-10-03 18:08:09 +02:00
Self { docid, external_document_id }
}
2024-09-03 11:02:39 +02:00
pub fn docid(&self) -> DocumentId {
self.docid
}
2024-10-14 15:40:42 +02:00
pub fn external_document_id(&self) -> &'doc str {
self.external_document_id
}
2024-10-03 18:08:09 +02:00
pub fn current<'a, Mapper: FieldIdMapper>(
&self,
rtxn: &'a RoTxn,
index: &'a Index,
2024-10-03 18:08:09 +02:00
mapper: &'a Mapper,
) -> Result<DocumentFromDb<'a, Mapper>> {
Ok(DocumentFromDb::new(self.docid, rtxn, index, mapper)?.ok_or(
crate::error::UserError::UnknownInternalDocumentId { document_id: self.docid },
)?)
}
}
2024-10-03 18:08:09 +02:00
impl<'doc> Insertion<'doc> {
2024-10-28 14:23:20 +01:00
pub fn create(docid: DocumentId, external_document_id: &'doc str, new: Versions<'doc>) -> Self {
Insertion { docid, external_document_id, new }
}
2024-09-03 11:02:39 +02:00
pub fn docid(&self) -> DocumentId {
self.docid
}
2024-10-14 15:40:42 +02:00
pub fn external_document_id(&self) -> &'doc str {
self.external_document_id
}
2024-10-28 16:18:48 +01:00
pub fn inserted(&self) -> DocumentFromVersions<'_, 'doc> {
2024-10-28 14:23:20 +01:00
DocumentFromVersions::new(&self.new)
}
pub fn inserted_vectors(
&self,
doc_alloc: &'doc Bump,
2024-10-30 13:50:51 +01:00
embedders: &'doc EmbeddingConfigs,
2024-10-28 14:23:20 +01:00
) -> Result<Option<VectorDocumentFromVersions<'doc>>> {
VectorDocumentFromVersions::new(self.external_document_id, &self.new, doc_alloc, embedders)
}
}
2024-10-03 18:08:09 +02:00
impl<'doc> Update<'doc> {
pub fn create(
docid: DocumentId,
2024-10-14 15:40:42 +02:00
external_document_id: &'doc str,
2024-10-28 14:23:20 +01:00
new: Versions<'doc>,
2024-10-03 18:08:09 +02:00
has_deletion: bool,
) -> Self {
2024-10-03 18:08:09 +02:00
Update { docid, new, external_document_id, has_deletion }
}
2024-09-03 11:02:39 +02:00
pub fn docid(&self) -> DocumentId {
self.docid
}
2024-10-14 15:40:42 +02:00
pub fn external_document_id(&self) -> &'doc str {
self.external_document_id
}
2024-10-03 18:08:09 +02:00
pub fn current<'a, Mapper: FieldIdMapper>(
&self,
rtxn: &'a RoTxn,
index: &'a Index,
2024-10-03 18:08:09 +02:00
mapper: &'a Mapper,
) -> Result<DocumentFromDb<'a, Mapper>> {
Ok(DocumentFromDb::new(self.docid, rtxn, index, mapper)?.ok_or(
crate::error::UserError::UnknownInternalDocumentId { document_id: self.docid },
)?)
}
2024-10-28 14:23:20 +01:00
pub fn current_vectors<'a, Mapper: FieldIdMapper>(
2024-10-03 18:08:09 +02:00
&self,
rtxn: &'a RoTxn,
index: &'a Index,
mapper: &'a Mapper,
2024-10-28 14:23:20 +01:00
doc_alloc: &'a Bump,
) -> Result<VectorDocumentFromDb<'a>> {
Ok(VectorDocumentFromDb::new(self.docid, index, rtxn, mapper, doc_alloc)?.ok_or(
crate::error::UserError::UnknownInternalDocumentId { document_id: self.docid },
)?)
}
pub fn updated(&self) -> DocumentFromVersions<'_, 'doc> {
DocumentFromVersions::new(&self.new)
}
2024-10-28 16:18:48 +01:00
pub fn merged<'t, Mapper: FieldIdMapper>(
2024-10-28 14:23:20 +01:00
&self,
rtxn: &'t RoTxn,
index: &'t Index,
mapper: &'t Mapper,
) -> Result<MergedDocument<'_, 'doc, 't, Mapper>> {
2024-10-03 18:08:09 +02:00
if self.has_deletion {
2024-10-28 14:23:20 +01:00
Ok(MergedDocument::without_db(DocumentFromVersions::new(&self.new)))
2024-10-03 18:08:09 +02:00
} else {
2024-10-28 14:23:20 +01:00
MergedDocument::with_db(
self.docid,
rtxn,
index,
mapper,
DocumentFromVersions::new(&self.new),
)
2024-10-03 18:08:09 +02:00
}
}
2024-10-28 14:23:20 +01:00
2024-12-05 15:50:12 +01:00
/// Returns whether the updated version of the document is different from the current version for the passed subset of fields.
///
/// `true` if at least one top-level-field that is a exactly a member of field or a parent of a member of field changed.
/// Otherwise `false`.
pub fn has_changed_for_fields<'t, Mapper: FieldIdMapper>(
&self,
fields: Option<&[&str]>,
rtxn: &'t RoTxn,
index: &'t Index,
mapper: &'t Mapper,
) -> Result<bool> {
let mut changed = false;
let mut cached_current = None;
let mut updated_selected_field_count = 0;
for entry in self.updated().iter_top_level_fields() {
let (key, updated_value) = entry?;
if perm_json_p::select_field(key, fields, &[]) == perm_json_p::Selection::Skip {
continue;
}
updated_selected_field_count += 1;
let current = match cached_current {
Some(current) => current,
None => self.current(rtxn, index, mapper)?,
};
let current_value = current.top_level_field(key)?;
let Some(current_value) = current_value else {
changed = true;
break;
};
if current_value.get() != updated_value.get() {
changed = true;
break;
}
cached_current = Some(current);
}
if !self.has_deletion {
// no field deletion, so fields that don't appear in `updated` cannot have changed
return Ok(changed);
}
if changed {
return Ok(true);
}
// we saw all updated fields, and set `changed` if any field wasn't in `current`.
// so if there are as many fields in `current` as in `updated`, then nothing changed.
// If there is any more fields in `current`, then they are missing in `updated`.
let has_deleted_fields = {
let current = match cached_current {
Some(current) => current,
None => self.current(rtxn, index, mapper)?,
};
let mut current_selected_field_count = 0;
for entry in current.iter_top_level_fields() {
let (key, _) = entry?;
if perm_json_p::select_field(key, fields, &[]) == perm_json_p::Selection::Skip {
continue;
}
current_selected_field_count += 1;
}
current_selected_field_count != updated_selected_field_count
};
Ok(has_deleted_fields)
}
2024-10-28 14:23:20 +01:00
pub fn updated_vectors(
&self,
doc_alloc: &'doc Bump,
2024-10-30 13:50:51 +01:00
embedders: &'doc EmbeddingConfigs,
2024-10-28 14:23:20 +01:00
) -> Result<Option<VectorDocumentFromVersions<'doc>>> {
VectorDocumentFromVersions::new(self.external_document_id, &self.new, doc_alloc, embedders)
2024-10-28 14:23:20 +01:00
}
2024-10-28 16:18:48 +01:00
pub fn merged_vectors<Mapper: FieldIdMapper>(
&self,
rtxn: &'doc RoTxn,
index: &'doc Index,
mapper: &'doc Mapper,
doc_alloc: &'doc Bump,
2024-10-30 13:50:51 +01:00
embedders: &'doc EmbeddingConfigs,
2024-10-28 16:18:48 +01:00
) -> Result<Option<MergedVectorDocument<'doc>>> {
if self.has_deletion {
MergedVectorDocument::without_db(
self.external_document_id,
&self.new,
doc_alloc,
embedders,
)
2024-10-28 16:18:48 +01:00
} else {
2024-10-30 13:50:51 +01:00
MergedVectorDocument::with_db(
self.docid,
self.external_document_id,
index,
rtxn,
mapper,
&self.new,
doc_alloc,
embedders,
2024-10-30 13:50:51 +01:00
)
2024-10-28 16:18:48 +01:00
}
}
2024-10-03 18:08:09 +02:00
}