Extract the vectors from the documents

This commit is contained in:
Clément Renault 2023-06-08 11:51:55 +02:00
parent 34349faeae
commit 7ac2f1489d
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
3 changed files with 44 additions and 0 deletions

View File

@ -47,6 +47,7 @@ pub(crate) fn data_from_obkv_documents(
faceted_fields: HashSet<FieldId>, faceted_fields: HashSet<FieldId>,
primary_key_id: FieldId, primary_key_id: FieldId,
geo_fields_ids: Option<(FieldId, FieldId)>, geo_fields_ids: Option<(FieldId, FieldId)>,
vector_field_id: Option<FieldId>,
stop_words: Option<fst::Set<&[u8]>>, stop_words: Option<fst::Set<&[u8]>>,
max_positions_per_attributes: Option<u32>, max_positions_per_attributes: Option<u32>,
exact_attributes: HashSet<FieldId>, exact_attributes: HashSet<FieldId>,
@ -71,6 +72,7 @@ pub(crate) fn data_from_obkv_documents(
&faceted_fields, &faceted_fields,
primary_key_id, primary_key_id,
geo_fields_ids, geo_fields_ids,
vector_field_id,
&stop_words, &stop_words,
max_positions_per_attributes, max_positions_per_attributes,
) )
@ -281,6 +283,7 @@ fn send_and_extract_flattened_documents_data(
faceted_fields: &HashSet<FieldId>, faceted_fields: &HashSet<FieldId>,
primary_key_id: FieldId, primary_key_id: FieldId,
geo_fields_ids: Option<(FieldId, FieldId)>, geo_fields_ids: Option<(FieldId, FieldId)>,
vector_field_id: Option<FieldId>,
stop_words: &Option<fst::Set<&[u8]>>, stop_words: &Option<fst::Set<&[u8]>>,
max_positions_per_attributes: Option<u32>, max_positions_per_attributes: Option<u32>,
) -> Result<( ) -> Result<(
@ -309,6 +312,20 @@ fn send_and_extract_flattened_documents_data(
}); });
} }
if let Some(vector_field_id) = vector_field_id {
let documents_chunk_cloned = flattened_documents_chunk.clone();
let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
rayon::spawn(move || {
let result = extract_vector_points(documents_chunk_cloned, indexer, vector_field_id);
let _ = match result {
Ok(vector_points) => {
lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints(vector_points)))
}
Err(error) => lmdb_writer_sx_cloned.send(Err(error)),
};
});
}
let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) = let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) =
rayon::join( rayon::join(
|| { || {

View File

@ -304,6 +304,8 @@ where
} }
None => None, None => None,
}; };
// get the fid of the `_vector` field.
let vector_field_id = self.index.fields_ids_map(self.wtxn)?.id("_vector");
let stop_words = self.index.stop_words(self.wtxn)?; let stop_words = self.index.stop_words(self.wtxn)?;
let exact_attributes = self.index.exact_attributes_ids(self.wtxn)?; let exact_attributes = self.index.exact_attributes_ids(self.wtxn)?;
@ -340,6 +342,7 @@ where
faceted_fields, faceted_fields,
primary_key_id, primary_key_id,
geo_fields_ids, geo_fields_ids,
vector_field_id,
stop_words, stop_words,
max_positions_per_attributes, max_positions_per_attributes,
exact_attributes, exact_attributes,

View File

@ -38,6 +38,7 @@ pub(crate) enum TypedChunk {
FieldIdFacetIsNullDocids(grenad::Reader<File>), FieldIdFacetIsNullDocids(grenad::Reader<File>),
FieldIdFacetIsEmptyDocids(grenad::Reader<File>), FieldIdFacetIsEmptyDocids(grenad::Reader<File>),
GeoPoints(grenad::Reader<File>), GeoPoints(grenad::Reader<File>),
VectorPoints(grenad::Reader<File>),
ScriptLanguageDocids(HashMap<(Script, Language), RoaringBitmap>), ScriptLanguageDocids(HashMap<(Script, Language), RoaringBitmap>),
} }
@ -221,6 +222,29 @@ pub(crate) fn write_typed_chunk_into_index(
index.put_geo_rtree(wtxn, &rtree)?; index.put_geo_rtree(wtxn, &rtree)?;
index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?; index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?;
} }
TypedChunk::VectorPoints(vector_points) => {
// let mut rtree = index.geo_rtree(wtxn)?.unwrap_or_default();
// let mut geo_faceted_docids = index.geo_faceted_documents_ids(wtxn)?;
// let mut cursor = geo_points.into_cursor()?;
// while let Some((key, value)) = cursor.move_on_next()? {
// // convert the key back to a u32 (4 bytes)
// let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
// // convert the latitude and longitude back to a f64 (8 bytes)
// let (lat, tail) = helpers::try_split_array_at::<u8, 8>(value).unwrap();
// let (lng, _) = helpers::try_split_array_at::<u8, 8>(tail).unwrap();
// let point = [f64::from_ne_bytes(lat), f64::from_ne_bytes(lng)];
// let xyz_point = lat_lng_to_xyz(&point);
// rtree.insert(GeoPoint::new(xyz_point, (docid, point)));
// geo_faceted_docids.insert(docid);
// }
// index.put_geo_rtree(wtxn, &rtree)?;
// index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?;
todo!("index vector points")
}
TypedChunk::ScriptLanguageDocids(hash_pair) => { TypedChunk::ScriptLanguageDocids(hash_pair) => {
let mut buffer = Vec::new(); let mut buffer = Vec::new();
for (key, value) in hash_pair { for (key, value) in hash_pair {