mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-11-22 10:07:40 +08:00
Create a new _vector extractor
This commit is contained in:
parent
f105df6599
commit
34349faeae
1
Cargo.lock
generated
1
Cargo.lock
generated
@ -2683,6 +2683,7 @@ dependencies = [
|
|||||||
"bimap",
|
"bimap",
|
||||||
"bincode",
|
"bincode",
|
||||||
"bstr",
|
"bstr",
|
||||||
|
"bytemuck",
|
||||||
"byteorder",
|
"byteorder",
|
||||||
"charabia",
|
"charabia",
|
||||||
"concat-arrays",
|
"concat-arrays",
|
||||||
|
@ -15,6 +15,7 @@ license.workspace = true
|
|||||||
bimap = { version = "0.6.3", features = ["serde"] }
|
bimap = { version = "0.6.3", features = ["serde"] }
|
||||||
bincode = "1.3.3"
|
bincode = "1.3.3"
|
||||||
bstr = "1.4.0"
|
bstr = "1.4.0"
|
||||||
|
bytemuck = "1.13.1"
|
||||||
byteorder = "1.4.3"
|
byteorder = "1.4.3"
|
||||||
charabia = { version = "0.7.2", default-features = false }
|
charabia = { version = "0.7.2", default-features = false }
|
||||||
concat-arrays = "0.1.2"
|
concat-arrays = "0.1.2"
|
||||||
|
@ -0,0 +1,40 @@
|
|||||||
|
use std::fs::File;
|
||||||
|
use std::io;
|
||||||
|
|
||||||
|
use bytemuck::cast_slice;
|
||||||
|
use serde_json::from_slice;
|
||||||
|
|
||||||
|
use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
|
||||||
|
use crate::{FieldId, InternalError, Result};
|
||||||
|
|
||||||
|
/// Extracts the embedding vector contained in each document under the `_vector` field.
|
||||||
|
///
|
||||||
|
/// Returns the generated grenad reader containing the docid as key associated to the Vec<f32>
|
||||||
|
#[logging_timer::time]
|
||||||
|
pub fn extract_vector_points<R: io::Read + io::Seek>(
|
||||||
|
obkv_documents: grenad::Reader<R>,
|
||||||
|
indexer: GrenadParameters,
|
||||||
|
vector_fid: FieldId,
|
||||||
|
) -> Result<grenad::Reader<File>> {
|
||||||
|
let mut writer = create_writer(
|
||||||
|
indexer.chunk_compression_type,
|
||||||
|
indexer.chunk_compression_level,
|
||||||
|
tempfile::tempfile()?,
|
||||||
|
);
|
||||||
|
|
||||||
|
let mut cursor = obkv_documents.into_cursor()?;
|
||||||
|
while let Some((docid_bytes, value)) = cursor.move_on_next()? {
|
||||||
|
let obkv = obkv::KvReader::new(value);
|
||||||
|
|
||||||
|
// first we get the _vector field
|
||||||
|
if let Some(vector) = obkv.get(vector_fid) {
|
||||||
|
// try to extract the vector
|
||||||
|
let vector: Vec<f32> = from_slice(vector).map_err(InternalError::SerdeJson).unwrap();
|
||||||
|
let bytes = cast_slice(&vector);
|
||||||
|
writer.insert(docid_bytes, bytes)?;
|
||||||
|
}
|
||||||
|
// else => the _vector object was `null`, there is nothing to do
|
||||||
|
}
|
||||||
|
|
||||||
|
writer_into_reader(writer)
|
||||||
|
}
|
@ -4,6 +4,7 @@ mod extract_facet_string_docids;
|
|||||||
mod extract_fid_docid_facet_values;
|
mod extract_fid_docid_facet_values;
|
||||||
mod extract_fid_word_count_docids;
|
mod extract_fid_word_count_docids;
|
||||||
mod extract_geo_points;
|
mod extract_geo_points;
|
||||||
|
mod extract_vector_points;
|
||||||
mod extract_word_docids;
|
mod extract_word_docids;
|
||||||
mod extract_word_fid_docids;
|
mod extract_word_fid_docids;
|
||||||
mod extract_word_pair_proximity_docids;
|
mod extract_word_pair_proximity_docids;
|
||||||
@ -22,6 +23,7 @@ use self::extract_facet_string_docids::extract_facet_string_docids;
|
|||||||
use self::extract_fid_docid_facet_values::{extract_fid_docid_facet_values, ExtractedFacetValues};
|
use self::extract_fid_docid_facet_values::{extract_fid_docid_facet_values, ExtractedFacetValues};
|
||||||
use self::extract_fid_word_count_docids::extract_fid_word_count_docids;
|
use self::extract_fid_word_count_docids::extract_fid_word_count_docids;
|
||||||
use self::extract_geo_points::extract_geo_points;
|
use self::extract_geo_points::extract_geo_points;
|
||||||
|
use self::extract_vector_points::extract_vector_points;
|
||||||
use self::extract_word_docids::extract_word_docids;
|
use self::extract_word_docids::extract_word_docids;
|
||||||
use self::extract_word_fid_docids::extract_word_fid_docids;
|
use self::extract_word_fid_docids::extract_word_fid_docids;
|
||||||
use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids;
|
use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids;
|
||||||
|
Loading…
Reference in New Issue
Block a user