mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-11-26 03:55:07 +08:00
Extract the vectors from the documents
This commit is contained in:
parent
34349faeae
commit
7ac2f1489d
@ -47,6 +47,7 @@ pub(crate) fn data_from_obkv_documents(
|
|||||||
faceted_fields: HashSet<FieldId>,
|
faceted_fields: HashSet<FieldId>,
|
||||||
primary_key_id: FieldId,
|
primary_key_id: FieldId,
|
||||||
geo_fields_ids: Option<(FieldId, FieldId)>,
|
geo_fields_ids: Option<(FieldId, FieldId)>,
|
||||||
|
vector_field_id: Option<FieldId>,
|
||||||
stop_words: Option<fst::Set<&[u8]>>,
|
stop_words: Option<fst::Set<&[u8]>>,
|
||||||
max_positions_per_attributes: Option<u32>,
|
max_positions_per_attributes: Option<u32>,
|
||||||
exact_attributes: HashSet<FieldId>,
|
exact_attributes: HashSet<FieldId>,
|
||||||
@ -71,6 +72,7 @@ pub(crate) fn data_from_obkv_documents(
|
|||||||
&faceted_fields,
|
&faceted_fields,
|
||||||
primary_key_id,
|
primary_key_id,
|
||||||
geo_fields_ids,
|
geo_fields_ids,
|
||||||
|
vector_field_id,
|
||||||
&stop_words,
|
&stop_words,
|
||||||
max_positions_per_attributes,
|
max_positions_per_attributes,
|
||||||
)
|
)
|
||||||
@ -281,6 +283,7 @@ fn send_and_extract_flattened_documents_data(
|
|||||||
faceted_fields: &HashSet<FieldId>,
|
faceted_fields: &HashSet<FieldId>,
|
||||||
primary_key_id: FieldId,
|
primary_key_id: FieldId,
|
||||||
geo_fields_ids: Option<(FieldId, FieldId)>,
|
geo_fields_ids: Option<(FieldId, FieldId)>,
|
||||||
|
vector_field_id: Option<FieldId>,
|
||||||
stop_words: &Option<fst::Set<&[u8]>>,
|
stop_words: &Option<fst::Set<&[u8]>>,
|
||||||
max_positions_per_attributes: Option<u32>,
|
max_positions_per_attributes: Option<u32>,
|
||||||
) -> Result<(
|
) -> Result<(
|
||||||
@ -309,6 +312,20 @@ fn send_and_extract_flattened_documents_data(
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if let Some(vector_field_id) = vector_field_id {
|
||||||
|
let documents_chunk_cloned = flattened_documents_chunk.clone();
|
||||||
|
let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
|
||||||
|
rayon::spawn(move || {
|
||||||
|
let result = extract_vector_points(documents_chunk_cloned, indexer, vector_field_id);
|
||||||
|
let _ = match result {
|
||||||
|
Ok(vector_points) => {
|
||||||
|
lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints(vector_points)))
|
||||||
|
}
|
||||||
|
Err(error) => lmdb_writer_sx_cloned.send(Err(error)),
|
||||||
|
};
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) =
|
let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) =
|
||||||
rayon::join(
|
rayon::join(
|
||||||
|| {
|
|| {
|
||||||
|
@ -304,6 +304,8 @@ where
|
|||||||
}
|
}
|
||||||
None => None,
|
None => None,
|
||||||
};
|
};
|
||||||
|
// Look up the field id of the `_vector` field, if the fields ids map contains one.
|
||||||
|
let vector_field_id = self.index.fields_ids_map(self.wtxn)?.id("_vector");
|
||||||
|
|
||||||
let stop_words = self.index.stop_words(self.wtxn)?;
|
let stop_words = self.index.stop_words(self.wtxn)?;
|
||||||
let exact_attributes = self.index.exact_attributes_ids(self.wtxn)?;
|
let exact_attributes = self.index.exact_attributes_ids(self.wtxn)?;
|
||||||
@ -340,6 +342,7 @@ where
|
|||||||
faceted_fields,
|
faceted_fields,
|
||||||
primary_key_id,
|
primary_key_id,
|
||||||
geo_fields_ids,
|
geo_fields_ids,
|
||||||
|
vector_field_id,
|
||||||
stop_words,
|
stop_words,
|
||||||
max_positions_per_attributes,
|
max_positions_per_attributes,
|
||||||
exact_attributes,
|
exact_attributes,
|
||||||
|
@ -38,6 +38,7 @@ pub(crate) enum TypedChunk {
|
|||||||
FieldIdFacetIsNullDocids(grenad::Reader<File>),
|
FieldIdFacetIsNullDocids(grenad::Reader<File>),
|
||||||
FieldIdFacetIsEmptyDocids(grenad::Reader<File>),
|
FieldIdFacetIsEmptyDocids(grenad::Reader<File>),
|
||||||
GeoPoints(grenad::Reader<File>),
|
GeoPoints(grenad::Reader<File>),
|
||||||
|
VectorPoints(grenad::Reader<File>),
|
||||||
ScriptLanguageDocids(HashMap<(Script, Language), RoaringBitmap>),
|
ScriptLanguageDocids(HashMap<(Script, Language), RoaringBitmap>),
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -221,6 +222,29 @@ pub(crate) fn write_typed_chunk_into_index(
|
|||||||
index.put_geo_rtree(wtxn, &rtree)?;
|
index.put_geo_rtree(wtxn, &rtree)?;
|
||||||
index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?;
|
index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?;
|
||||||
}
|
}
|
||||||
|
TypedChunk::VectorPoints(vector_points) => {
|
||||||
|
// let mut rtree = index.geo_rtree(wtxn)?.unwrap_or_default();
|
||||||
|
// let mut geo_faceted_docids = index.geo_faceted_documents_ids(wtxn)?;
|
||||||
|
|
||||||
|
// let mut cursor = geo_points.into_cursor()?;
|
||||||
|
// while let Some((key, value)) = cursor.move_on_next()? {
|
||||||
|
// // convert the key back to a u32 (4 bytes)
|
||||||
|
// let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
|
||||||
|
|
||||||
|
// // convert the latitude and longitude back to a f64 (8 bytes)
|
||||||
|
// let (lat, tail) = helpers::try_split_array_at::<u8, 8>(value).unwrap();
|
||||||
|
// let (lng, _) = helpers::try_split_array_at::<u8, 8>(tail).unwrap();
|
||||||
|
// let point = [f64::from_ne_bytes(lat), f64::from_ne_bytes(lng)];
|
||||||
|
// let xyz_point = lat_lng_to_xyz(&point);
|
||||||
|
|
||||||
|
// rtree.insert(GeoPoint::new(xyz_point, (docid, point)));
|
||||||
|
// geo_faceted_docids.insert(docid);
|
||||||
|
// }
|
||||||
|
// index.put_geo_rtree(wtxn, &rtree)?;
|
||||||
|
// index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?;
|
||||||
|
|
||||||
|
todo!("index vector points")
|
||||||
|
}
|
||||||
TypedChunk::ScriptLanguageDocids(hash_pair) => {
|
TypedChunk::ScriptLanguageDocids(hash_pair) => {
|
||||||
let mut buffer = Vec::new();
|
let mut buffer = Vec::new();
|
||||||
for (key, value) in hash_pair {
|
for (key, value) in hash_pair {
|
||||||
|
Loading…
Reference in New Issue
Block a user