Document trait changes

This commit is contained in:
Louis Dureuil 2024-10-28 14:23:20 +01:00
parent bbb67ae0a8
commit 65470e26e0
No known key found for this signature in database
2 changed files with 129 additions and 49 deletions

View File

@ -20,6 +20,14 @@ pub trait Document<'doc> {
/// - The `_vectors` and `_geo` fields are **ignored** by this method, meaning they are **not returned** by this method. /// - The `_vectors` and `_geo` fields are **ignored** by this method, meaning they are **not returned** by this method.
fn iter_top_level_fields(&self) -> impl Iterator<Item = Result<(&'doc str, &'doc RawValue)>>; fn iter_top_level_fields(&self) -> impl Iterator<Item = Result<(&'doc str, &'doc RawValue)>>;
/// Returns the number of top-level fields of the document.
///
/// NOTE(review): the contract does not state whether `_vectors` and `_geo` are counted;
/// confirm against the implementors before relying on it.
fn len(&self) -> usize;
/// Returns `true` when [`Self::len`] reports `0`.
fn is_empty(&self) -> bool {
self.len() == 0
}
/// Looks up a single top-level field by its name `k`, returning its unparsed value.
///
/// NOTE(review): presumably `Ok(None)` means the field is absent and `Err` means the lookup
/// itself failed — confirm against the implementors.
fn top_level_field(&self, k: &str) -> Result<Option<&'doc RawValue>>;
/// Returns the unparsed value of the `_vectors` field from the document data. /// Returns the unparsed value of the `_vectors` field from the document data.
/// ///
/// This field alone is insufficient to retrieve vectors, as they may be stored in a dedicated location in the database. /// This field alone is insufficient to retrieve vectors, as they may be stored in a dedicated location in the database.
@ -37,6 +45,7 @@ pub trait Document<'doc> {
fn geo_field(&self) -> Result<Option<&'doc RawValue>>; fn geo_field(&self) -> Result<Option<&'doc RawValue>>;
} }
#[derive(Debug)]
pub struct DocumentFromDb<'t, Mapper: FieldIdMapper> pub struct DocumentFromDb<'t, Mapper: FieldIdMapper>
where where
Mapper: FieldIdMapper, Mapper: FieldIdMapper,
@ -84,6 +93,14 @@ impl<'t, Mapper: FieldIdMapper> Document<'t> for DocumentFromDb<'t, Mapper> {
/// Returns the unparsed `_geo` field, looked up through `field`.
fn geo_field(&self) -> Result<Option<&'t RawValue>> {
    let field_name = "_geo";
    self.field(field_name)
}
/// Counts the entries produced by iterating over `self.content`.
///
/// The count is obtained by walking the iterator, not read from a stored length.
fn len(&self) -> usize {
    let entries = self.content.iter();
    entries.count()
}
/// Returns the unparsed value of the top-level field named `k` by delegating to `field`.
fn top_level_field(&self, k: &str) -> Result<Option<&'t RawValue>> {
self.field(k)
}
} }
impl<'t, Mapper: FieldIdMapper> DocumentFromDb<'t, Mapper> { impl<'t, Mapper: FieldIdMapper> DocumentFromDb<'t, Mapper> {
@ -107,18 +124,18 @@ impl<'t, Mapper: FieldIdMapper> DocumentFromDb<'t, Mapper> {
} }
} }
#[derive(Clone, Copy)] #[derive(Debug)]
pub struct DocumentFromVersions<'doc> { pub struct DocumentFromVersions<'a, 'doc> {
versions: Versions<'doc>, versions: &'a Versions<'doc>,
} }
impl<'doc> DocumentFromVersions<'doc> { impl<'a, 'doc> DocumentFromVersions<'a, 'doc> {
pub fn new(versions: Versions<'doc>) -> Self { pub fn new(versions: &'a Versions<'doc>) -> Self {
Self { versions } Self { versions }
} }
} }
impl<'doc> Document<'doc> for DocumentFromVersions<'doc> { impl<'a, 'doc> Document<'doc> for DocumentFromVersions<'a, 'doc> {
fn iter_top_level_fields(&self) -> impl Iterator<Item = Result<(&'doc str, &'doc RawValue)>> { fn iter_top_level_fields(&self) -> impl Iterator<Item = Result<(&'doc str, &'doc RawValue)>> {
self.versions.iter_top_level_fields().map(Ok) self.versions.iter_top_level_fields().map(Ok)
} }
@ -130,16 +147,25 @@ impl<'doc> Document<'doc> for DocumentFromVersions<'doc> {
/// Returns the unparsed `_geo` field of the borrowed versions; this lookup never fails.
fn geo_field(&self) -> Result<Option<&'doc RawValue>> {
    let geo = self.versions.geo_field();
    Ok(geo)
}
/// Returns the length reported by the borrowed [`Versions`].
fn len(&self) -> usize {
    let versions = self.versions;
    versions.len()
}
/// Looks up the field named `k` in the borrowed versions; this lookup never fails.
fn top_level_field(&self, k: &str) -> Result<Option<&'doc RawValue>> {
    let value = self.versions.top_level_field(k);
    Ok(value)
}
} }
pub struct MergedDocument<'doc, 't, Mapper: FieldIdMapper> { #[derive(Debug)]
new_doc: DocumentFromVersions<'doc>, pub struct MergedDocument<'a, 'doc, 't, Mapper: FieldIdMapper> {
new_doc: DocumentFromVersions<'a, 'doc>,
db: Option<DocumentFromDb<'t, Mapper>>, db: Option<DocumentFromDb<'t, Mapper>>,
} }
impl<'doc, 't, Mapper: FieldIdMapper> MergedDocument<'doc, 't, Mapper> { impl<'a, 'doc, 't, Mapper: FieldIdMapper> MergedDocument<'a, 'doc, 't, Mapper> {
pub fn new( pub fn new(
new_doc: DocumentFromVersions<'doc>, new_doc: DocumentFromVersions<'a, 'doc>,
db: Option<DocumentFromDb<'t, Mapper>>, db: Option<DocumentFromDb<'t, Mapper>>,
) -> Self { ) -> Self {
Self { new_doc, db } Self { new_doc, db }
@ -150,19 +176,19 @@ impl<'doc, 't, Mapper: FieldIdMapper> MergedDocument<'doc, 't, Mapper> {
rtxn: &'t RoTxn, rtxn: &'t RoTxn,
index: &'t Index, index: &'t Index,
db_fields_ids_map: &'t Mapper, db_fields_ids_map: &'t Mapper,
new_doc: DocumentFromVersions<'doc>, new_doc: DocumentFromVersions<'a, 'doc>,
) -> Result<Self> { ) -> Result<Self> {
let db = DocumentFromDb::new(docid, rtxn, index, db_fields_ids_map)?; let db = DocumentFromDb::new(docid, rtxn, index, db_fields_ids_map)?;
Ok(Self { new_doc, db }) Ok(Self { new_doc, db })
} }
pub fn without_db(new_doc: DocumentFromVersions<'doc>) -> Self { pub fn without_db(new_doc: DocumentFromVersions<'a, 'doc>) -> Self {
Self { new_doc, db: None } Self { new_doc, db: None }
} }
} }
impl<'d, 'doc: 'd, 't: 'd, Mapper: FieldIdMapper> Document<'d> impl<'d, 'doc: 'd, 't: 'd, Mapper: FieldIdMapper> Document<'d>
for MergedDocument<'doc, 't, Mapper> for MergedDocument<'d, 'doc, 't, Mapper>
{ {
fn iter_top_level_fields(&self) -> impl Iterator<Item = Result<(&'d str, &'d RawValue)>> { fn iter_top_level_fields(&self) -> impl Iterator<Item = Result<(&'d str, &'d RawValue)>> {
let mut new_doc_it = self.new_doc.iter_top_level_fields(); let mut new_doc_it = self.new_doc.iter_top_level_fields();
@ -209,6 +235,20 @@ impl<'d, 'doc: 'd, 't: 'd, Mapper: FieldIdMapper> Document<'d>
db.geo_field() db.geo_field()
} }
/// Counts the items yielded by `iter_top_level_fields`.
///
/// The count walks the whole merged iterator and includes any `Err` items it yields.
fn len(&self) -> usize {
    let merged_fields = self.iter_top_level_fields();
    merged_fields.count()
}
/// Looks up the field named `k`, preferring the new version over the database version.
///
/// The database is only queried when the new version does not contain the field; without
/// a database version the result is `Ok(None)`.
fn top_level_field(&self, k: &str) -> Result<Option<&'d RawValue>> {
    match self.new_doc.top_level_field(k)? {
        Some(from_new) => Ok(Some(from_new)),
        None => match &self.db {
            Some(db) => db.field(k),
            None => Ok(None),
        },
    }
}
} }
impl<'doc, D> Document<'doc> for &D impl<'doc, D> Document<'doc> for &D
@ -226,6 +266,14 @@ where
/// Forwards to the `geo_field` implementation of the referenced document.
fn geo_field(&self) -> Result<Option<&'doc RawValue>> {
    (**self).geo_field()
}
fn len(&self) -> usize {
D::len(self)
}
/// Forwards to the `top_level_field` implementation of the referenced document.
fn top_level_field(&self, k: &str) -> Result<Option<&'doc RawValue>> {
    (**self).top_level_field(k)
}
} }
/// Turn this document into an obkv, whose fields are indexed by the provided `FieldIdMapper`. /// Turn this document into an obkv, whose fields are indexed by the provided `FieldIdMapper`.
@ -301,11 +349,9 @@ where
pub type Entry<'doc> = (&'doc str, &'doc RawValue); pub type Entry<'doc> = (&'doc str, &'doc RawValue);
#[derive(Clone, Copy)] #[derive(Debug)]
pub struct Versions<'doc> { pub struct Versions<'doc> {
data: &'doc [Entry<'doc>], data: RawMap<'doc>,
vectors: Option<&'doc RawValue>,
geo: Option<&'doc RawValue>,
} }
impl<'doc> Versions<'doc> { impl<'doc> Versions<'doc> {
@ -324,26 +370,30 @@ impl<'doc> Versions<'doc> {
} }
pub fn single(version: RawMap<'doc>) -> Self { pub fn single(version: RawMap<'doc>) -> Self {
let vectors_id = version.get_index(RESERVED_VECTORS_FIELD_NAME); Self { data: version }
let geo_id = version.get_index("_geo");
let mut data = version.into_vec();
let geo = geo_id.map(|geo_id| data.remove(geo_id).1);
let vectors = vectors_id.map(|vectors_id| data.remove(vectors_id).1);
let data = data.into_bump_slice();
Self { data, geo, vectors }
} }
/// Returns an iterator over every entry of the underlying map.
///
/// The reserved vectors field and `_geo` are currently yielded like any other entry.
pub fn iter_top_level_fields(&self) -> raw_collections::map::iter::Iter<'doc, '_> {
    // FIXME: ignore vectors and _geo
    self.data.iter()
}
pub fn vectors_field(&self) -> Option<&'doc RawValue> { pub fn vectors_field(&self) -> Option<&'doc RawValue> {
self.vectors self.data.get(RESERVED_VECTORS_FIELD_NAME)
} }
pub fn geo_field(&self) -> Option<&'doc RawValue> { pub fn geo_field(&self) -> Option<&'doc RawValue> {
self.geo self.data.get("_geo")
}
/// Returns the length of the underlying map.
pub fn len(&self) -> usize {
    let map = &self.data;
    map.len()
}
/// Returns `true` when the underlying map reports itself as empty.
pub fn is_empty(&self) -> bool {
    let map = &self.data;
    map.is_empty()
}
pub fn top_level_field(&self, k: &str) -> Option<&'doc RawValue> {
self.data.get(k)
} }
} }

View File

@ -1,6 +1,8 @@
use bumpalo::Bump;
use heed::RoTxn; use heed::RoTxn;
use super::document::{DocumentFromDb, DocumentFromVersions, MergedDocument}; use super::document::{DocumentFromDb, DocumentFromVersions, MergedDocument, Versions};
use super::vector_document::{VectorDocumentFromDb, VectorDocumentFromVersions};
use crate::documents::FieldIdMapper; use crate::documents::FieldIdMapper;
use crate::{DocumentId, Index, Result}; use crate::{DocumentId, Index, Result};
@ -18,14 +20,14 @@ pub struct Deletion<'doc> {
pub struct Update<'doc> { pub struct Update<'doc> {
docid: DocumentId, docid: DocumentId,
external_document_id: &'doc str, external_document_id: &'doc str,
new: DocumentFromVersions<'doc>, new: Versions<'doc>,
has_deletion: bool, has_deletion: bool,
} }
pub struct Insertion<'doc> { pub struct Insertion<'doc> {
docid: DocumentId, docid: DocumentId,
external_document_id: &'doc str, external_document_id: &'doc str,
new: DocumentFromVersions<'doc>, new: Versions<'doc>,
} }
impl<'doc> DocumentChange<'doc> { impl<'doc> DocumentChange<'doc> {
@ -72,11 +74,7 @@ impl<'doc> Deletion<'doc> {
} }
impl<'doc> Insertion<'doc> { impl<'doc> Insertion<'doc> {
pub fn create( pub fn create(docid: DocumentId, external_document_id: &'doc str, new: Versions<'doc>) -> Self {
docid: DocumentId,
external_document_id: &'doc str,
new: DocumentFromVersions<'doc>,
) -> Self {
Insertion { docid, external_document_id, new } Insertion { docid, external_document_id, new }
} }
@ -87,8 +85,15 @@ impl<'doc> Insertion<'doc> {
/// Returns the external id of the inserted document.
pub fn external_document_id(&self) -> &'doc str {
    let id: &'doc str = self.external_document_id;
    id
}
pub fn new(&self) -> DocumentFromVersions<'doc> { pub fn new(&self) -> DocumentFromVersions<'_, 'doc> {
self.new DocumentFromVersions::new(&self.new)
}
pub fn inserted_vectors(
&self,
doc_alloc: &'doc Bump,
) -> Result<Option<VectorDocumentFromVersions<'doc>>> {
VectorDocumentFromVersions::new(&self.new, doc_alloc)
} }
} }
@ -96,7 +101,7 @@ impl<'doc> Update<'doc> {
pub fn create( pub fn create(
docid: DocumentId, docid: DocumentId,
external_document_id: &'doc str, external_document_id: &'doc str,
new: DocumentFromVersions<'doc>, new: Versions<'doc>,
has_deletion: bool, has_deletion: bool,
) -> Self { ) -> Self {
Update { docid, new, external_document_id, has_deletion } Update { docid, new, external_document_id, has_deletion }
@ -120,20 +125,45 @@ impl<'doc> Update<'doc> {
)?) )?)
} }
pub fn updated(&self) -> DocumentFromVersions<'doc> { pub fn current_vectors<'a, Mapper: FieldIdMapper>(
self.new
}
pub fn new<'a, Mapper: FieldIdMapper>(
&self, &self,
rtxn: &'a RoTxn, rtxn: &'a RoTxn,
index: &'a Index, index: &'a Index,
mapper: &'a Mapper, mapper: &'a Mapper,
) -> Result<MergedDocument<'doc, 'a, Mapper>> { doc_alloc: &'a Bump,
if self.has_deletion { ) -> Result<VectorDocumentFromDb<'a>> {
Ok(MergedDocument::without_db(self.new)) Ok(VectorDocumentFromDb::new(self.docid, index, rtxn, mapper, doc_alloc)?.ok_or(
} else { crate::error::UserError::UnknownInternalDocumentId { document_id: self.docid },
MergedDocument::with_db(self.docid, rtxn, index, mapper, self.new) )?)
} }
/// Returns a document view borrowing only the new versions, without the database version.
pub fn updated(&self) -> DocumentFromVersions<'_, 'doc> {
    let versions = &self.new;
    DocumentFromVersions::new(versions)
}
/// Builds the merged document for this update.
///
/// When `has_deletion` is set the result has no database version; otherwise the database
/// version is loaded through `MergedDocument::with_db`, whose errors are returned as-is.
pub fn new<'t, Mapper: FieldIdMapper>(
    &self,
    rtxn: &'t RoTxn,
    index: &'t Index,
    mapper: &'t Mapper,
) -> Result<MergedDocument<'_, 'doc, 't, Mapper>> {
    // Both outcomes wrap the same borrowed versions, so build the view once.
    let new_doc = DocumentFromVersions::new(&self.new);
    if self.has_deletion {
        return Ok(MergedDocument::without_db(new_doc));
    }
    MergedDocument::with_db(self.docid, rtxn, index, mapper, new_doc)
}
pub fn updated_vectors(
&self,
doc_alloc: &'doc Bump,
) -> Result<Option<VectorDocumentFromVersions<'doc>>> {
VectorDocumentFromVersions::new(&self.new, doc_alloc)
} }
} }