mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-11-22 18:17:39 +08:00
WIP
This commit is contained in:
parent
f6483cf15d
commit
afa3ae0cbd
@ -689,9 +689,8 @@ where
|
||||
key: None,
|
||||
},
|
||||
)?;
|
||||
let first_id = crate::vector::arroy_db_range_for_embedder(index).next().unwrap();
|
||||
let reader =
|
||||
ArroyWrapper::new(self.index.vector_arroy, first_id, action.was_quantized);
|
||||
ArroyWrapper::new(self.index.vector_arroy, index, action.was_quantized);
|
||||
let dim = reader.dimensions(self.wtxn)?;
|
||||
dimension.insert(name.to_string(), dim);
|
||||
}
|
||||
@ -713,17 +712,11 @@ where
|
||||
let is_quantizing = embedder_config.map_or(false, |action| action.is_being_quantized);
|
||||
|
||||
pool.install(|| {
|
||||
for k in crate::vector::arroy_db_range_for_embedder(embedder_index) {
|
||||
let mut writer = ArroyWrapper::new(vector_arroy, k, was_quantized);
|
||||
let mut writer = ArroyWrapper::new(vector_arroy, embedder_index, was_quantized);
|
||||
if is_quantizing {
|
||||
writer.quantize(wtxn, k, dimension)?;
|
||||
writer.quantize(wtxn, dimension)?;
|
||||
}
|
||||
if writer.need_build(wtxn, dimension)? {
|
||||
writer.build(wtxn, &mut rng, dimension)?;
|
||||
} else if writer.is_empty(wtxn, dimension)? {
|
||||
break;
|
||||
}
|
||||
}
|
||||
Result::Ok(())
|
||||
})
|
||||
.map_err(InternalError::from)??;
|
||||
|
@ -673,22 +673,14 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
.get(&embedder_name)
|
||||
.map_or(false, |conf| conf.2);
|
||||
// FIXME: allow customizing distance
|
||||
let writers: Vec<_> = crate::vector::arroy_db_range_for_embedder(embedder_index)
|
||||
.map(|k| ArroyWrapper::new(index.vector_arroy, k, binary_quantized))
|
||||
.collect();
|
||||
let writer = ArroyWrapper::new(index.vector_arroy, embedder_index, binary_quantized);
|
||||
|
||||
// remove vectors for docids we want them removed
|
||||
let merger = remove_vectors_builder.build();
|
||||
let mut iter = merger.into_stream_merger_iter()?;
|
||||
while let Some((key, _)) = iter.next()? {
|
||||
let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
|
||||
|
||||
for writer in &writers {
|
||||
// Uses invariant: vectors are packed in the first writers.
|
||||
if !writer.del_item(wtxn, expected_dimension, docid)? {
|
||||
break;
|
||||
}
|
||||
}
|
||||
writer.del_item(wtxn, expected_dimension, docid)?;
|
||||
}
|
||||
|
||||
// add generated embeddings
|
||||
@ -716,9 +708,7 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
embeddings.embedding_count(),
|
||||
)));
|
||||
}
|
||||
for (embedding, writer) in embeddings.iter().zip(&writers) {
|
||||
writer.add_item(wtxn, expected_dimension, docid, embedding)?;
|
||||
}
|
||||
writer.add_items(wtxn, expected_dimension, docid, embeddings)?;
|
||||
}
|
||||
|
||||
// perform the manual diff
|
||||
|
@ -32,61 +32,70 @@ pub const REQUEST_PARALLELISM: usize = 40;
|
||||
|
||||
pub struct ArroyWrapper {
|
||||
quantized: bool,
|
||||
index: u16,
|
||||
index: u8,
|
||||
database: arroy::Database<Unspecified>,
|
||||
}
|
||||
|
||||
impl ArroyWrapper {
|
||||
pub fn new(database: arroy::Database<Unspecified>, index: u16, quantized: bool) -> Self {
|
||||
pub fn new(database: arroy::Database<Unspecified>, index: u8, quantized: bool) -> Self {
|
||||
Self { database, index, quantized }
|
||||
}
|
||||
|
||||
pub fn index(&self) -> u16 {
|
||||
pub fn index(&self) -> u8 {
|
||||
self.index
|
||||
}
|
||||
|
||||
pub fn dimensions(&self, rtxn: &RoTxn) -> Result<usize, arroy::Error> {
|
||||
let first_id = arroy_db_range_for_embedder(self.index).next().unwrap();
|
||||
if self.quantized {
|
||||
Ok(arroy::Reader::open(rtxn, self.index, self.quantized_db())?.dimensions())
|
||||
Ok(arroy::Reader::open(rtxn, first_id, self.quantized_db())?.dimensions())
|
||||
} else {
|
||||
Ok(arroy::Reader::open(rtxn, self.index, self.angular_db())?.dimensions())
|
||||
Ok(arroy::Reader::open(rtxn, first_id, self.angular_db())?.dimensions())
|
||||
}
|
||||
}
|
||||
|
||||
pub fn quantize(
|
||||
&mut self,
|
||||
wtxn: &mut RwTxn,
|
||||
index: u16,
|
||||
dimension: usize,
|
||||
) -> Result<(), arroy::Error> {
|
||||
pub fn quantize(&mut self, wtxn: &mut RwTxn, dimension: usize) -> Result<(), arroy::Error> {
|
||||
if !self.quantized {
|
||||
for index in arroy_db_range_for_embedder(self.index) {
|
||||
let writer = arroy::Writer::new(self.angular_db(), index, dimension);
|
||||
writer.prepare_changing_distance::<BinaryQuantizedAngular>(wtxn)?;
|
||||
}
|
||||
self.quantized = true;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// TODO: We can stop early when we find an empty DB
|
||||
pub fn need_build(&self, rtxn: &RoTxn, dimension: usize) -> Result<bool, arroy::Error> {
|
||||
if self.quantized {
|
||||
arroy::Writer::new(self.quantized_db(), self.index, dimension).need_build(rtxn)
|
||||
for index in arroy_db_range_for_embedder(self.index) {
|
||||
let need_build = if self.quantized {
|
||||
arroy::Writer::new(self.quantized_db(), index, dimension).need_build(rtxn)
|
||||
} else {
|
||||
arroy::Writer::new(self.angular_db(), self.index, dimension).need_build(rtxn)
|
||||
arroy::Writer::new(self.angular_db(), index, dimension).need_build(rtxn)
|
||||
};
|
||||
if need_build? {
|
||||
return Ok(true);
|
||||
}
|
||||
}
|
||||
Ok(false)
|
||||
}
|
||||
|
||||
/// TODO: We should early exit when it doesn't need to be built
|
||||
pub fn build<R: rand::Rng + rand::SeedableRng>(
|
||||
&self,
|
||||
wtxn: &mut RwTxn,
|
||||
rng: &mut R,
|
||||
dimension: usize,
|
||||
) -> Result<(), arroy::Error> {
|
||||
for index in arroy_db_range_for_embedder(self.index) {
|
||||
if self.quantized {
|
||||
arroy::Writer::new(self.quantized_db(), self.index, dimension).build(wtxn, rng, None)
|
||||
arroy::Writer::new(self.quantized_db(), index, dimension).build(wtxn, rng, None)?
|
||||
} else {
|
||||
arroy::Writer::new(self.angular_db(), self.index, dimension).build(wtxn, rng, None)
|
||||
arroy::Writer::new(self.angular_db(), index, dimension).build(wtxn, rng, None)?
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn add_item(
|
||||
&self,
|
||||
|
Loading…
Reference in New Issue
Block a user