From 65470e26e0d14a3b10086c0d376d96cb6e08f99b Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 28 Oct 2024 14:23:20 +0100 Subject: [PATCH] Document trait changes --- milli/src/update/new/document.rs | 110 +++++++++++++++++------- milli/src/update/new/document_change.rs | 68 +++++++++++---- 2 files changed, 129 insertions(+), 49 deletions(-) diff --git a/milli/src/update/new/document.rs b/milli/src/update/new/document.rs index 4948f8e31..be09feb5a 100644 --- a/milli/src/update/new/document.rs +++ b/milli/src/update/new/document.rs @@ -20,6 +20,14 @@ pub trait Document<'doc> { /// - The `_vectors` and `_geo` fields are **ignored** by this method, meaning they are **not returned** by this method. fn iter_top_level_fields(&self) -> impl Iterator>; + fn len(&self) -> usize; + + fn is_empty(&self) -> bool { + self.len() == 0 + } + + fn top_level_field(&self, k: &str) -> Result>; + /// Returns the unparsed value of the `_vectors` field from the document data. /// /// This field alone is insufficient to retrieve vectors, as they may be stored in a dedicated location in the database. @@ -37,6 +45,7 @@ pub trait Document<'doc> { fn geo_field(&self) -> Result>; } +#[derive(Debug)] pub struct DocumentFromDb<'t, Mapper: FieldIdMapper> where Mapper: FieldIdMapper, @@ -84,6 +93,14 @@ impl<'t, Mapper: FieldIdMapper> Document<'t> for DocumentFromDb<'t, Mapper> { fn geo_field(&self) -> Result> { self.field("_geo") } + + fn len(&self) -> usize { + self.content.iter().count() + } + + fn top_level_field(&self, k: &str) -> Result> { + self.field(k) + } } impl<'t, Mapper: FieldIdMapper> DocumentFromDb<'t, Mapper> { @@ -107,18 +124,18 @@ impl<'t, Mapper: FieldIdMapper> DocumentFromDb<'t, Mapper> { } } -#[derive(Clone, Copy)] -pub struct DocumentFromVersions<'doc> { - versions: Versions<'doc>, +#[derive(Debug)] +pub struct DocumentFromVersions<'a, 'doc> { + versions: &'a Versions<'doc>, } -impl<'doc> DocumentFromVersions<'doc> { - pub fn new(versions: Versions<'doc>) -> Self { +impl<'a, 'doc> DocumentFromVersions<'a, 'doc> { + pub fn new(versions: &'a Versions<'doc>) -> Self { Self { versions } } } -impl<'doc> Document<'doc> for DocumentFromVersions<'doc> { +impl<'a, 'doc> Document<'doc> for DocumentFromVersions<'a, 'doc> { fn iter_top_level_fields(&self) -> impl Iterator> { self.versions.iter_top_level_fields().map(Ok) } @@ -130,16 +147,25 @@ impl<'doc> Document<'doc> for DocumentFromVersions<'doc> { fn geo_field(&self) -> Result> { Ok(self.versions.geo_field()) } + + fn len(&self) -> usize { + self.versions.len() + } + + fn top_level_field(&self, k: &str) -> Result> { + Ok(self.versions.top_level_field(k)) + } } -pub struct MergedDocument<'doc, 't, Mapper: FieldIdMapper> { - new_doc: DocumentFromVersions<'doc>, +#[derive(Debug)] +pub struct MergedDocument<'a, 'doc, 't, Mapper: FieldIdMapper> { + new_doc: DocumentFromVersions<'a, 'doc>, db: Option>, } -impl<'doc, 't, Mapper: FieldIdMapper> MergedDocument<'doc, 't, Mapper> { +impl<'a, 'doc, 't, Mapper: FieldIdMapper> MergedDocument<'a, 'doc, 't, Mapper> { pub fn new( - new_doc: DocumentFromVersions<'doc>, + new_doc: DocumentFromVersions<'a, 'doc>, db: Option>, ) -> Self { Self { new_doc, db } @@ -150,19 +176,19 @@ impl<'doc, 't, Mapper: FieldIdMapper> MergedDocument<'doc, 't, Mapper> { rtxn: &'t RoTxn, index: &'t Index, db_fields_ids_map: &'t Mapper, - new_doc: DocumentFromVersions<'doc>, + new_doc: DocumentFromVersions<'a, 'doc>, ) -> Result { let db = DocumentFromDb::new(docid, rtxn, index, db_fields_ids_map)?; Ok(Self { new_doc, db }) } - pub fn without_db(new_doc: DocumentFromVersions<'doc>) -> Self { + pub fn without_db(new_doc: DocumentFromVersions<'a, 'doc>) -> Self { Self { new_doc, db: None } } } impl<'d, 'doc: 'd, 't: 'd, Mapper: FieldIdMapper> Document<'d> - for MergedDocument<'doc, 't, Mapper> + for MergedDocument<'d, 'doc, 't, Mapper> { fn iter_top_level_fields(&self) -> impl Iterator> { let mut new_doc_it = self.new_doc.iter_top_level_fields(); @@ -209,6 +235,20 @@ impl<'d, 'doc: 'd, 't: 'd, Mapper: FieldIdMapper> Document<'d> db.geo_field() } + + fn len(&self) -> usize { + self.iter_top_level_fields().count() + } + + fn top_level_field(&self, k: &str) -> Result> { + if let Some(f) = self.new_doc.top_level_field(k)? { + return Ok(Some(f)); + } + if let Some(db) = self.db { + return db.field(k); + } + Ok(None) + } } impl<'doc, D> Document<'doc> for &D @@ -226,6 +266,14 @@ where fn geo_field(&self) -> Result> { D::geo_field(self) } + + fn len(&self) -> usize { + D::len(self) + } + + fn top_level_field(&self, k: &str) -> Result> { + D::top_level_field(self, k) + } } /// Turn this document into an obkv, whose fields are indexed by the provided `FieldIdMapper`. @@ -301,11 +349,9 @@ where pub type Entry<'doc> = (&'doc str, &'doc RawValue); -#[derive(Clone, Copy)] +#[derive(Debug)] pub struct Versions<'doc> { - data: &'doc [Entry<'doc>], - vectors: Option<&'doc RawValue>, - geo: Option<&'doc RawValue>, + data: RawMap<'doc>, } impl<'doc> Versions<'doc> { @@ -324,26 +370,30 @@ impl<'doc> Versions<'doc> { } pub fn single(version: RawMap<'doc>) -> Self { - let vectors_id = version.get_index(RESERVED_VECTORS_FIELD_NAME); - let geo_id = version.get_index("_geo"); - let mut data = version.into_vec(); - let geo = geo_id.map(|geo_id| data.remove(geo_id).1); - let vectors = vectors_id.map(|vectors_id| data.remove(vectors_id).1); - - let data = data.into_bump_slice(); - - Self { data, geo, vectors } + Self { data: version } } - pub fn iter_top_level_fields(&self) -> impl Iterator> { - self.data.iter().copied() + pub fn iter_top_level_fields(&self) -> raw_collections::map::iter::Iter<'doc, '_> { + /// FIXME: ignore vectors and _geo + self.data.iter() } pub fn vectors_field(&self) -> Option<&'doc RawValue> { - self.vectors + self.data.get(RESERVED_VECTORS_FIELD_NAME) } pub fn geo_field(&self) -> Option<&'doc RawValue> { - self.geo + self.data.get("_geo") + } + + pub fn len(&self) -> usize { + self.data.len() + } + + pub fn is_empty(&self) -> bool { + self.data.is_empty() + } + pub fn top_level_field(&self, k: &str) -> Option<&'doc RawValue> { + self.data.get(k) } } diff --git a/milli/src/update/new/document_change.rs b/milli/src/update/new/document_change.rs index f277637d5..c55113b74 100644 --- a/milli/src/update/new/document_change.rs +++ b/milli/src/update/new/document_change.rs @@ -1,6 +1,8 @@ +use bumpalo::Bump; use heed::RoTxn; -use super::document::{DocumentFromDb, DocumentFromVersions, MergedDocument}; +use super::document::{DocumentFromDb, DocumentFromVersions, MergedDocument, Versions}; +use super::vector_document::{VectorDocumentFromDb, VectorDocumentFromVersions}; use crate::documents::FieldIdMapper; use crate::{DocumentId, Index, Result}; @@ -18,14 +20,14 @@ pub struct Deletion<'doc> { pub struct Update<'doc> { docid: DocumentId, external_document_id: &'doc str, - new: DocumentFromVersions<'doc>, + new: Versions<'doc>, has_deletion: bool, } pub struct Insertion<'doc> { docid: DocumentId, external_document_id: &'doc str, - new: DocumentFromVersions<'doc>, + new: Versions<'doc>, } impl<'doc> DocumentChange<'doc> { @@ -72,11 +74,7 @@ impl<'doc> Deletion<'doc> { } impl<'doc> Insertion<'doc> { - pub fn create( - docid: DocumentId, - external_document_id: &'doc str, - new: DocumentFromVersions<'doc>, - ) -> Self { + pub fn create(docid: DocumentId, external_document_id: &'doc str, new: Versions<'doc>) -> Self { Insertion { docid, external_document_id, new } } @@ -87,8 +85,15 @@ impl<'doc> Insertion<'doc> { pub fn external_document_id(&self) -> &'doc str { self.external_document_id } - pub fn new(&self) -> DocumentFromVersions<'doc> { - self.new + pub fn new(&self) -> DocumentFromVersions<'_, 'doc> { + DocumentFromVersions::new(&self.new) + } + + pub fn inserted_vectors( + &self, + doc_alloc: &'doc Bump, + ) -> Result>> { + VectorDocumentFromVersions::new(&self.new, doc_alloc) } } @@ -96,7 +101,7 @@ impl<'doc> Update<'doc> { pub fn create( docid: DocumentId, external_document_id: &'doc str, - new: DocumentFromVersions<'doc>, + new: Versions<'doc>, has_deletion: bool, ) -> Self { Update { docid, new, external_document_id, has_deletion } @@ -120,20 +125,45 @@ impl<'doc> Update<'doc> { )?) } - pub fn updated(&self) -> DocumentFromVersions<'doc> { - self.new - } - - pub fn new<'a, Mapper: FieldIdMapper>( + pub fn current_vectors<'a, Mapper: FieldIdMapper>( &self, rtxn: &'a RoTxn, index: &'a Index, mapper: &'a Mapper, - ) -> Result> { + doc_alloc: &'a Bump, + ) -> Result> { + Ok(VectorDocumentFromDb::new(self.docid, index, rtxn, mapper, doc_alloc)?.ok_or( + crate::error::UserError::UnknownInternalDocumentId { document_id: self.docid }, + )?) + } + + pub fn updated(&self) -> DocumentFromVersions<'_, 'doc> { + DocumentFromVersions::new(&self.new) + } + + pub fn new<'t, Mapper: FieldIdMapper>( + &self, + rtxn: &'t RoTxn, + index: &'t Index, + mapper: &'t Mapper, + ) -> Result> { if self.has_deletion { - Ok(MergedDocument::without_db(self.new)) + Ok(MergedDocument::without_db(DocumentFromVersions::new(&self.new))) } else { - MergedDocument::with_db(self.docid, rtxn, index, mapper, self.new) + MergedDocument::with_db( + self.docid, + rtxn, + index, + mapper, + DocumentFromVersions::new(&self.new), + ) } } + + pub fn updated_vectors( + &self, + doc_alloc: &'doc Bump, + ) -> Result>> { + VectorDocumentFromVersions::new(&self.new, doc_alloc) + } }