diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 128fc29c0..fdc6f5616 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -47,6 +47,7 @@ pub(crate) fn data_from_obkv_documents( faceted_fields: HashSet, primary_key_id: FieldId, geo_fields_ids: Option<(FieldId, FieldId)>, + vector_field_id: Option, stop_words: Option>, max_positions_per_attributes: Option, exact_attributes: HashSet, @@ -71,6 +72,7 @@ pub(crate) fn data_from_obkv_documents( &faceted_fields, primary_key_id, geo_fields_ids, + vector_field_id, &stop_words, max_positions_per_attributes, ) @@ -281,6 +283,7 @@ fn send_and_extract_flattened_documents_data( faceted_fields: &HashSet, primary_key_id: FieldId, geo_fields_ids: Option<(FieldId, FieldId)>, + vector_field_id: Option, stop_words: &Option>, max_positions_per_attributes: Option, ) -> Result<( @@ -309,6 +312,20 @@ fn send_and_extract_flattened_documents_data( }); } + if let Some(vector_field_id) = vector_field_id { + let documents_chunk_cloned = flattened_documents_chunk.clone(); + let lmdb_writer_sx_cloned = lmdb_writer_sx.clone(); + rayon::spawn(move || { + let result = extract_vector_points(documents_chunk_cloned, indexer, vector_field_id); + let _ = match result { + Ok(vector_points) => { + lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints(vector_points))) + } + Err(error) => lmdb_writer_sx_cloned.send(Err(error)), + }; + }); + } + let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) = rayon::join( || { diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 993f87a1f..adbab54db 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -304,6 +304,8 @@ where } None => None, }; + // get the fid of the `_vector` field. + let vector_field_id = self.index.fields_ids_map(self.wtxn)?.id("_vector"); let stop_words = self.index.stop_words(self.wtxn)?; let exact_attributes = self.index.exact_attributes_ids(self.wtxn)?; @@ -340,6 +342,7 @@ where faceted_fields, primary_key_id, geo_fields_ids, + vector_field_id, stop_words, max_positions_per_attributes, exact_attributes, diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 89b10bffe..8b3477948 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -38,6 +38,7 @@ pub(crate) enum TypedChunk { FieldIdFacetIsNullDocids(grenad::Reader), FieldIdFacetIsEmptyDocids(grenad::Reader), GeoPoints(grenad::Reader), + VectorPoints(grenad::Reader), ScriptLanguageDocids(HashMap<(Script, Language), RoaringBitmap>), } @@ -221,6 +222,29 @@ pub(crate) fn write_typed_chunk_into_index( index.put_geo_rtree(wtxn, &rtree)?; index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?; } + TypedChunk::VectorPoints(vector_points) => { + // let mut rtree = index.geo_rtree(wtxn)?.unwrap_or_default(); + // let mut geo_faceted_docids = index.geo_faceted_documents_ids(wtxn)?; + + // let mut cursor = geo_points.into_cursor()?; + // while let Some((key, value)) = cursor.move_on_next()? { + // // convert the key back to a u32 (4 bytes) + // let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap(); + + // // convert the latitude and longitude back to a f64 (8 bytes) + // let (lat, tail) = helpers::try_split_array_at::(value).unwrap(); + // let (lng, _) = helpers::try_split_array_at::(tail).unwrap(); + // let point = [f64::from_ne_bytes(lat), f64::from_ne_bytes(lng)]; + // let xyz_point = lat_lng_to_xyz(&point); + + // rtree.insert(GeoPoint::new(xyz_point, (docid, point))); + // geo_faceted_docids.insert(docid); + // } + // index.put_geo_rtree(wtxn, &rtree)?; + // index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?; + + todo!("index vector points") + } TypedChunk::ScriptLanguageDocids(hash_pair) => { let mut buffer = Vec::new(); for (key, value) in hash_pair {