Mirror of https://github.com/meilisearch/meilisearch.git, synced 2024-11-23 02:27:40 +08:00
WIP

This commit is contained in:
parent dde3a04679
commit cb4ebe163e

Cargo.lock (generated): 2
@@ -383,7 +383,7 @@ dependencies = [
 [[package]]
 name = "arroy"
 version = "0.1.0"
-source = "git+https://github.com/meilisearch/arroy.git#4b59476f457e5443ff250ea10d40d8b66a692674"
+source = "git+https://github.com/meilisearch/arroy.git#0079af0ec960bc9c51dd66e898a6b5e980cbb083"
 dependencies = [
  "bytemuck",
  "byteorder",
@@ -61,6 +61,8 @@ pub enum InternalError {
     AbortedIndexation,
     #[error("The matching words list contains at least one invalid member.")]
     InvalidMatchingWords,
+    #[error(transparent)]
+    ArroyError(#[from] arroy::Error),
 }
 
 #[derive(Error, Debug)]
@@ -190,6 +192,24 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco
     InvalidPromptForEmbeddings(String, crate::prompt::error::NewPromptError),
 }
 
+impl From<arroy::Error> for Error {
+    fn from(value: arroy::Error) -> Self {
+        match value {
+            arroy::Error::Heed(heed) => heed.into(),
+            arroy::Error::Io(io) => io.into(),
+            arroy::Error::InvalidVecDimension { expected, received } => {
+                Error::UserError(UserError::InvalidVectorDimensions { expected, found: received })
+            }
+            arroy::Error::DatabaseFull
+            | arroy::Error::InvalidItemAppend
+            | arroy::Error::UnmatchingDistance { .. }
+            | arroy::Error::MissingMetadata => {
+                Error::InternalError(InternalError::ArroyError(value))
+            }
+        }
+    }
+}
+
 #[derive(Error, Debug)]
 pub enum GeoError {
     #[error("The `_geo` field in the document with the id: `{document_id}` is not an object. Was expecting an object with the `_geo.lat` and `_geo.lng` fields but instead got `{value}`.")]
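The new `From<arroy::Error> for Error` impl above lets arroy calls inside milli use the `?` operator while still separating user-facing errors from internal ones. Below is a minimal, self-contained sketch of that routing pattern; the `StoreError`/`AppError` types and `store_vector` function are hypothetical stand-ins, not code from this commit.

    // Hypothetical error types, only to illustrate the From-based routing used above.
    #[derive(Debug)]
    #[allow(dead_code)]
    enum StoreError {
        InvalidDimension { expected: usize, received: usize },
        Corrupted,
    }

    #[derive(Debug)]
    enum AppError {
        User(String),
        Internal(StoreError),
    }

    impl From<StoreError> for AppError {
        fn from(value: StoreError) -> Self {
            match value {
                // A wrong vector dimension is the caller's fault: surface it as a user error.
                StoreError::InvalidDimension { expected, received } => {
                    AppError::User(format!("expected {expected} dimensions, got {received}"))
                }
                // Everything else stays an internal error and keeps the original value.
                other => AppError::Internal(other),
            }
        }
    }

    fn check_dimensions(expected: usize, vector: &[f32]) -> Result<(), StoreError> {
        if vector.len() != expected {
            return Err(StoreError::InvalidDimension { expected, received: vector.len() });
        }
        Ok(())
    }

    fn store_vector(dimensions: usize, vector: &[f32]) -> Result<(), AppError> {
        // `?` converts the StoreError into an AppError through the From impl above.
        check_dimensions(dimensions, vector)?;
        Ok(())
    }

    fn main() {
        println!("{:?}", store_vector(3, &[0.1, 0.2]));
    }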
@@ -70,7 +70,6 @@ pub mod main_key {
     pub const SORT_FACET_VALUES_BY: &str = "sort-facet-values-by";
     pub const PAGINATION_MAX_TOTAL_HITS: &str = "pagination-max-total-hits";
     pub const PROXIMITY_PRECISION: &str = "proximity-precision";
-    pub const VECTOR_UNAVAILABLE_VECTOR_IDS: &str = "vector-unavailable-vector-ids";
     pub const EMBEDDING_CONFIGS: &str = "embedding_configs";
 }
 
@@ -97,8 +96,6 @@ pub mod db_name {
     pub const FACET_ID_STRING_FST: &str = "facet-id-string-fst";
     pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s";
     pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings";
-    pub const VECTOR_ID_DOCID: &str = "vector-id-docids";
-    pub const VECTOR_DOCID_IDS: &str = "vector-docid-ids";
     pub const VECTOR_EMBEDDER_CATEGORY_ID: &str = "vector-embedder-category-id";
     pub const VECTOR_ARROY: &str = "vector-arroy";
     pub const DOCUMENTS: &str = "documents";
@@ -167,16 +164,10 @@ pub struct Index {
     /// Maps the document id, the facet field id and the strings.
     pub field_id_docid_facet_strings: Database<FieldDocIdFacetStringCodec, Str>,
 
-    /// Maps a vector id to its document id.
-    pub vector_id_docid: Database<BEU32, BEU32>,
-    /// Maps a doc id to its vector ids.
-    pub docid_vector_ids: Database<BEU32, CboRoaringBitmapCodec>,
-
     /// Maps an embedder name to its id in the arroy store.
-    pub embedder_category_id: Database<Str, BEU16>,
-
+    pub embedder_category_id: Database<Str, U8>,
     /// Vector store based on arroy™.
-    pub vector_arroy: arroy::Database<arroy::distances::DotProduct>,
+    pub vector_arroy: arroy::Database<arroy::distances::Angular>,
 
     /// Maps the document id to the document as an obkv store.
     pub(crate) documents: Database<BEU32, ObkvCodec>,
@@ -191,7 +182,7 @@ impl Index {
     ) -> Result<Index> {
         use db_name::*;
 
-        options.max_dbs(27);
+        options.max_dbs(25);
 
         let env = options.open(path)?;
         let mut wtxn = env.write_txn()?;
@@ -232,8 +223,6 @@ impl Index {
         let field_id_docid_facet_strings =
             env.create_database(&mut wtxn, Some(FIELD_ID_DOCID_FACET_STRINGS))?;
         // vector stuff
-        let vector_id_docid = env.create_database(&mut wtxn, Some(VECTOR_ID_DOCID))?;
-        let docid_vector_ids = env.create_database(&mut wtxn, Some(VECTOR_DOCID_IDS))?;
         let embedder_category_id =
             env.create_database(&mut wtxn, Some(VECTOR_EMBEDDER_CATEGORY_ID))?;
         let vector_arroy = env.create_database(&mut wtxn, Some(VECTOR_ARROY))?;
@@ -267,9 +256,7 @@ impl Index {
             facet_id_is_empty_docids,
             field_id_docid_facet_f64s,
             field_id_docid_facet_strings,
-            vector_id_docid,
             vector_arroy,
-            docid_vector_ids,
             embedder_category_id,
             documents,
         })
@@ -1516,30 +1503,6 @@ impl Index {
             .get(rtxn, main_key::EMBEDDING_CONFIGS)?
             .unwrap_or_default())
     }
-
-    pub(crate) fn put_unavailable_vector_ids(
-        &self,
-        wtxn: &mut RwTxn<'_>,
-        unavailable_vector_ids: RoaringBitmap,
-    ) -> heed::Result<()> {
-        self.main.remap_types::<Str, CboRoaringBitmapCodec>().put(
-            wtxn,
-            main_key::VECTOR_UNAVAILABLE_VECTOR_IDS,
-            &unavailable_vector_ids,
-        )
-    }
-
-    pub(crate) fn delete_unavailable_vector_ids(&self, wtxn: &mut RwTxn<'_>) -> heed::Result<bool> {
-        self.main.remap_key_type::<Str>().delete(wtxn, main_key::VECTOR_UNAVAILABLE_VECTOR_IDS)
-    }
-
-    pub fn unavailable_vector_ids(&self, rtxn: &RoTxn<'_>) -> Result<RoaringBitmap> {
-        Ok(self
-            .main
-            .remap_types::<Str, CboRoaringBitmapCodec>()
-            .get(rtxn, main_key::VECTOR_UNAVAILABLE_VECTOR_IDS)?
-            .unwrap_or_default())
-    }
 }
 
 #[cfg(test)]
@@ -262,6 +262,7 @@ fn get_ranking_rules_for_vector<'ctx>(
     ctx: &SearchContext<'ctx>,
     sort_criteria: &Option<Vec<AscDesc>>,
     geo_strategy: geo_sort::Strategy,
+    limit_plus_offset: usize,
     target: &[f32],
 ) -> Result<Vec<BoxRankingRule<'ctx, PlaceholderQuery>>> {
     // query graph search
@@ -283,7 +284,12 @@ fn get_ranking_rules_for_vector<'ctx>(
         | crate::Criterion::Exactness => {
             if !vector {
                 let vector_candidates = ctx.index.documents_ids(ctx.txn)?;
-                let vector_sort = VectorSort::new(ctx, target.to_vec(), vector_candidates)?;
+                let vector_sort = VectorSort::new(
+                    ctx,
+                    target.to_vec(),
+                    vector_candidates,
+                    limit_plus_offset,
+                )?;
                 ranking_rules.push(Box::new(vector_sort));
                 vector = true;
             }
@@ -509,7 +515,8 @@ pub fn execute_vector_search(
 
     /// FIXME: input universe = universe & documents_with_vectors
     // for now if we're computing embeddings for ALL documents, we can assume that this is just universe
-    let ranking_rules = get_ranking_rules_for_vector(ctx, sort_criteria, geo_strategy, vector)?;
+    let ranking_rules =
+        get_ranking_rules_for_vector(ctx, sort_criteria, geo_strategy, from + length, vector)?;
 
     let mut placeholder_search_logger = logger::DefaultSearchLogger;
     let placeholder_search_logger: &mut dyn SearchLogger<PlaceholderQuery> =
@@ -1,48 +1,83 @@
-use std::future::Future;
 use std::iter::FromIterator;
-use std::pin::Pin;
 
-use nolife::DynBoxScope;
+use ordered_float::OrderedFloat;
 use roaring::RoaringBitmap;
 
 use super::ranking_rules::{RankingRule, RankingRuleOutput, RankingRuleQueryTrait};
-use crate::distance::NDotProductPoint;
-use crate::index::Hnsw;
 use crate::score_details::{self, ScoreDetails};
-use crate::{Result, SearchContext, SearchLogger, UserError};
+use crate::{DocumentId, Result, SearchContext, SearchLogger};
 
-pub struct VectorSort<'ctx, Q: RankingRuleQueryTrait> {
+pub struct VectorSort<Q: RankingRuleQueryTrait> {
     query: Option<Q>,
     target: Vec<f32>,
    vector_candidates: RoaringBitmap,
-    reader: arroy::Reader<'ctx, arroy::distances::DotProduct>,
+    cached_sorted_docids: std::vec::IntoIter<(DocumentId, f32, Vec<f32>)>,
     limit: usize,
 }
 
-impl<'ctx, Q: RankingRuleQueryTrait> VectorSort<'ctx, Q> {
+impl<Q: RankingRuleQueryTrait> VectorSort<Q> {
     pub fn new(
-        ctx: &'ctx SearchContext,
+        _ctx: &SearchContext,
         target: Vec<f32>,
         vector_candidates: RoaringBitmap,
         limit: usize,
     ) -> Result<Self> {
-        /// FIXME? what to do in case of missing metadata
-        let reader = arroy::Reader::open(ctx.txn, 0, ctx.index.vector_arroy)?;
+        Ok(Self {
+            query: None,
+            target,
+            vector_candidates,
+            cached_sorted_docids: Default::default(),
+            limit,
+        })
+    }
 
-        let target_clone = target.clone();
+    fn fill_buffer(&mut self, ctx: &mut SearchContext<'_>) -> Result<()> {
+        let readers: std::result::Result<Vec<_>, _> = (0..=u8::MAX)
+            .map_while(|k| {
+                arroy::Reader::open(ctx.txn, k.into(), ctx.index.vector_arroy)
+                    .map(Some)
+                    .or_else(|e| match e {
+                        arroy::Error::MissingMetadata => Ok(None),
+                        e => Err(e),
+                    })
+                    .transpose()
+            })
+            .collect();
 
-        Ok(Self { query: None, target, vector_candidates, reader, limit })
+        let readers = readers?;
+
+        let target = &self.target;
+        let mut results = Vec::new();
+
+        for reader in readers.iter() {
+            let nns_by_vector = reader.nns_by_vector(
+                ctx.txn,
+                &target,
+                self.limit,
+                None,
+                Some(&self.vector_candidates),
+            )?;
+            let vectors: std::result::Result<Vec<_>, _> = nns_by_vector
+                .iter()
+                .map(|(docid, _)| reader.item_vector(ctx.txn, *docid).transpose().unwrap())
+                .collect();
+            let vectors = vectors?;
+            results.extend(nns_by_vector.into_iter().zip(vectors).map(|((x, y), z)| (x, y, z)));
+        }
+        results.sort_unstable_by_key(|(_, distance, _)| OrderedFloat(*distance));
+        self.cached_sorted_docids = results.into_iter();
+        Ok(())
     }
 }
 
-impl<'ctx, Q: RankingRuleQueryTrait> RankingRule<'ctx, Q> for VectorSort<'ctx, Q> {
+impl<'ctx, Q: RankingRuleQueryTrait> RankingRule<'ctx, Q> for VectorSort<Q> {
     fn id(&self) -> String {
         "vector_sort".to_owned()
     }
 
     fn start_iteration(
         &mut self,
-        _ctx: &mut SearchContext<'ctx>,
+        ctx: &mut SearchContext<'ctx>,
         _logger: &mut dyn SearchLogger<Q>,
         universe: &RoaringBitmap,
         query: &Q,
@ -51,7 +86,7 @@ impl<'ctx, Q: RankingRuleQueryTrait> RankingRule<'ctx, Q> for VectorSort<'ctx, Q
|
|||||||
|
|
||||||
self.query = Some(query.clone());
|
self.query = Some(query.clone());
|
||||||
self.vector_candidates &= universe;
|
self.vector_candidates &= universe;
|
||||||
|
self.fill_buffer(ctx)?;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -75,40 +110,24 @@ impl<'ctx, Q: RankingRuleQueryTrait> RankingRule<'ctx, Q> for VectorSort<'ctx, Q
                 }),
             }));
         }
-        let target = &self.target;
-        let vector_candidates = &self.vector_candidates;
 
-        let result = self.reader.nns_by_vector(ctx.txn, &target, count, search_k, candidates)
-        scope.enter(|it| {
-            for item in it.by_ref() {
-                let item: Item = item;
-                let index = item.pid.into_inner();
-                let docid = ctx.index.vector_id_docid.get(ctx.txn, &index)?.unwrap();
-
-                if vector_candidates.contains(docid) {
+        while let Some((docid, distance, vector)) = self.cached_sorted_docids.next() {
+            if self.vector_candidates.contains(docid) {
                 return Ok(Some(RankingRuleOutput {
                     query,
                     candidates: RoaringBitmap::from_iter([docid]),
                     score: ScoreDetails::Vector(score_details::Vector {
-                        target_vector: target.clone(),
-                        value_similarity: Some((
-                            item.point.clone().into_inner(),
-                            1.0 - item.distance,
-                        )),
+                        target_vector: self.target.clone(),
+                        value_similarity: Some((vector, 1.0 - distance)),
                     }),
                 }));
            }
         }
-            Ok(Some(RankingRuleOutput {
-                query,
-                candidates: universe.clone(),
-                score: ScoreDetails::Vector(score_details::Vector {
-                    target_vector: target.clone(),
-                    value_similarity: None,
-                }),
-            }))
-        })
+
+        // if we got out of this loop it means we've exhausted our cache.
+        // we need to refill it and run the function again.
+        self.fill_buffer(ctx)?;
+        self.next_bucket(ctx, _logger, universe)
     }
 
     fn end_iteration(&mut self, _ctx: &mut SearchContext<'ctx>, _logger: &mut dyn SearchLogger<Q>) {
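The rewritten VectorSort above trades the single arroy reader for a fill-and-drain cache: fill_buffer queries every available store, merges the neighbours, sorts them once by distance, and next_bucket drains that cache, refilling it when it runs out. Here is a self-contained sketch of the same buffering pattern; the `Store` and `BufferedSort` types are hypothetical stand-ins for the arroy readers and the ranking rule, not the milli API.

    /// Stand-in for one vector store: returns (docid, distance) pairs for a query.
    struct Store {
        points: Vec<(u32, f32)>,
    }

    impl Store {
        fn nearest(&self, limit: usize) -> Vec<(u32, f32)> {
            let mut points = self.points.clone();
            points.sort_unstable_by(|a, b| a.1.total_cmp(&b.1));
            points.truncate(limit);
            points
        }
    }

    struct BufferedSort {
        stores: Vec<Store>,
        limit: usize,
        cached: std::vec::IntoIter<(u32, f32)>,
    }

    impl BufferedSort {
        fn new(stores: Vec<Store>, limit: usize) -> Self {
            BufferedSort { stores, limit, cached: Vec::new().into_iter() }
        }

        /// Query every store, merge the results, and sort them once by distance.
        fn fill_buffer(&mut self) {
            let mut results: Vec<(u32, f32)> = Vec::new();
            for store in &self.stores {
                results.extend(store.nearest(self.limit));
            }
            results.sort_unstable_by(|a, b| a.1.total_cmp(&b.1));
            self.cached = results.into_iter();
        }

        /// Drain the cache one entry at a time, the way next_bucket emits one document per call.
        fn next_candidate(&mut self) -> Option<(u32, f32)> {
            self.cached.next()
        }
    }

    fn main() {
        let stores = vec![
            Store { points: vec![(1, 0.40), (2, 0.10)] },
            Store { points: vec![(3, 0.25)] },
        ];
        let mut sort = BufferedSort::new(stores, 10);
        sort.fill_buffer();
        while let Some((docid, distance)) = sort.next_candidate() {
            println!("docid {docid} at distance {distance}");
        }
    }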
@@ -42,9 +42,7 @@ impl<'t, 'i> ClearDocuments<'t, 'i> {
             facet_id_is_empty_docids,
             field_id_docid_facet_f64s,
             field_id_docid_facet_strings,
-            vector_id_docid,
             vector_arroy,
-            docid_vector_ids,
             embedder_category_id: _,
             documents,
         } = self.index;
@@ -86,8 +84,6 @@ impl<'t, 'i> ClearDocuments<'t, 'i> {
         field_id_docid_facet_strings.clear(self.wtxn)?;
         // vector
         vector_arroy.clear(self.wtxn)?;
-        vector_id_docid.clear(self.wtxn)?;
-        docid_vector_ids.clear(self.wtxn)?;
 
         documents.clear(self.wtxn)?;
 
@@ -418,7 +418,7 @@ where
         }
 
         // needs to be dropped to avoid channel waiting lock.
-        drop(lmdb_writer_sx)
+        drop(lmdb_writer_sx);
     });
 
     let index_is_empty = self.index.number_of_documents(self.wtxn)? == 0;
@@ -435,6 +435,8 @@ where
     let mut word_docids = None;
     let mut exact_word_docids = None;
 
+    let mut dimension = None;
+
     for result in lmdb_writer_rx {
         if (self.should_abort)() {
             return Err(Error::InternalError(InternalError::AbortedIndexation));
@@ -464,6 +466,20 @@ where
                 word_position_docids = Some(cloneable_chunk);
                 TypedChunk::WordPositionDocids(chunk)
             }
+            TypedChunk::VectorPoints {
+                expected_dimension,
+                remove_vectors,
+                embeddings,
+                manual_vectors,
+            } => {
+                dimension = Some(expected_dimension);
+                TypedChunk::VectorPoints {
+                    remove_vectors,
+                    embeddings,
+                    expected_dimension,
+                    manual_vectors,
+                }
+            }
             otherwise => otherwise,
         };
 
@@ -490,9 +506,6 @@ where
             }
         }
 
-        let writer = arroy::Writer::prepare(self.wtxn, self.index.vector_arroy, 0, 0)?;
-        writer.build(self.wtxn, &mut rand::rngs::StdRng::from_entropy(), None)?;
-
         // We write the field distribution into the main database
         self.index.put_field_distribution(self.wtxn, &field_distribution)?;
 
@ -500,6 +513,23 @@ where
|
|||||||
self.index.put_primary_key(self.wtxn, &primary_key)?;
|
self.index.put_primary_key(self.wtxn, &primary_key)?;
|
||||||
let number_of_documents = self.index.number_of_documents(self.wtxn)?;
|
let number_of_documents = self.index.number_of_documents(self.wtxn)?;
|
||||||
|
|
||||||
|
if let Some(dimension) = dimension {
|
||||||
|
let wtxn = &mut *self.wtxn;
|
||||||
|
let vector_arroy = self.index.vector_arroy;
|
||||||
|
pool.install(|| {
|
||||||
|
/// FIXME: do for each embedder
|
||||||
|
let mut rng = rand::rngs::StdRng::from_entropy();
|
||||||
|
for k in 0..=u8::MAX {
|
||||||
|
let writer = arroy::Writer::prepare(wtxn, vector_arroy, k.into(), dimension)?;
|
||||||
|
if writer.is_empty(wtxn)? {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
writer.build(wtxn, &mut rng, None)?;
|
||||||
|
}
|
||||||
|
Result::Ok(())
|
||||||
|
})?;
|
||||||
|
}
|
||||||
|
|
||||||
self.execute_prefix_databases(
|
self.execute_prefix_databases(
|
||||||
word_docids,
|
word_docids,
|
||||||
exact_word_docids,
|
exact_word_docids,
|
||||||
|
@@ -8,9 +8,7 @@ use charabia::{Language, Script};
 use grenad::MergerBuilder;
 use heed::types::Bytes;
 use heed::{PutFlags, RwTxn};
-use log::error;
 use obkv::{KvReader, KvWriter};
-use ordered_float::OrderedFloat;
 use roaring::RoaringBitmap;
 
 use super::helpers::{
@@ -18,16 +16,12 @@ use super::helpers::{
     valid_lmdb_key, CursorClonableMmap,
 };
 use super::{ClonableMmap, MergeFn};
-use crate::distance::NDotProductPoint;
-use crate::error::UserError;
 use crate::external_documents_ids::{DocumentOperation, DocumentOperationKind};
 use crate::facet::FacetType;
 use crate::index::db_name::DOCUMENTS;
-use crate::index::Hnsw;
 use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvReaderDelAdd};
 use crate::update::facet::FacetsUpdate;
 use crate::update::index_documents::helpers::{as_cloneable_grenad, try_split_array_at};
-use crate::update::{available_documents_ids, AvailableDocumentsIds};
 use crate::{lat_lng_to_xyz, DocumentId, FieldId, GeoPoint, Index, Result, SerializationError};
 
 pub(crate) enum TypedChunk {
@@ -374,28 +368,28 @@ pub(crate) fn write_typed_chunk_into_index(
                return Ok((RoaringBitmap::new(), is_merged_database));
            }
 
-            let mut unavailable_vector_ids = index.unavailable_vector_ids(&wtxn)?;
             /// FIXME: allow customizing distance
-            /// FIXME: allow customizing index
-            let writer = arroy::Writer::prepare(wtxn, index.vector_arroy, 0, expected_dimension)?;
+            let writers: std::result::Result<Vec<_>, _> = (0..=u8::MAX)
+                .map(|k| {
+                    /// FIXME: allow customizing index and then do index << 8 + k
+                    arroy::Writer::prepare(wtxn, index.vector_arroy, k.into(), expected_dimension)
+                })
+                .collect();
+            let writers = writers?;
 
             // remove vectors for docids we want them removed
             let mut cursor = remove_vectors.into_cursor()?;
             while let Some((key, _)) = cursor.move_on_next()? {
                 let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
 
-                let Some(to_remove_vector_ids) = index.docid_vector_ids.get(&wtxn, &docid)? else {
-                    continue;
-                };
-                unavailable_vector_ids -= to_remove_vector_ids;
-                for item in to_remove_vector_ids {
-                    writer.del_item(wtxn, item)?;
+                for writer in &writers {
+                    // Uses invariant: vectors are packed in the first writers.
+                    if !writer.del_item(wtxn, docid)? {
+                        break;
+                    }
                 }
             }
 
-            let mut available_vector_ids =
-                AvailableDocumentsIds::from_documents_ids(&unavailable_vector_ids);
             // add generated embeddings
             if let Some(embeddings) = embeddings {
                 let mut cursor = embeddings.into_cursor()?;
|
|||||||
// code error if we somehow got the wrong dimension
|
// code error if we somehow got the wrong dimension
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
let mut new_vector_ids = RoaringBitmap::new();
|
/// FIXME: detect overflow
|
||||||
for embedding in embeddings.iter() {
|
for (embedding, writer) in embeddings.iter().zip(&writers) {
|
||||||
/// FIXME: error when you get over 9000
|
writer.add_item(wtxn, docid, embedding)?;
|
||||||
let next_vector_id = available_vector_ids.next().unwrap();
|
|
||||||
unavailable_vector_ids.insert(next_vector_id);
|
|
||||||
|
|
||||||
new_vector_ids.insert(next_vector_id);
|
|
||||||
|
|
||||||
index.vector_id_docid.put(wtxn, &next_vector_id, &docid)?;
|
|
||||||
|
|
||||||
writer.add_item(wtxn, next_vector_id, embedding)?;
|
|
||||||
}
|
}
|
||||||
index.docid_vector_ids.put(wtxn, &docid, &new_vector_ids)?;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -433,44 +418,52 @@ pub(crate) fn write_typed_chunk_into_index(
 
                 let vector_deladd_obkv = KvReaderDelAdd::new(value);
                 if let Some(value) = vector_deladd_obkv.get(DelAdd::Deletion) {
-                    let vector = pod_collect_to_vec(value);
-                    let Some(mut docid_vector_ids) = index.docid_vector_ids.get(&wtxn, &docid)?
-                    else {
-                        error!("Unable to delete the vector: {:?}", vector);
-                        continue;
-                    };
-                    for item in docid_vector_ids {
-                        /// FIXME: comparing the vectors by equality is inefficient, and dangerous by perfect equality
-                        let candidate = writer.item_vector(&wtxn, item)?.expect("Inconsistent dbs");
-                        if candidate == vector {
-                            writer.del_item(wtxn, item)?;
-                            unavailable_vector_ids.remove(item);
-                            index.vector_id_docid.delete(wtxn, &item)?;
-                            docid_vector_ids.remove(item);
+                    let vector: Vec<f32> = pod_collect_to_vec(value);
+                    let mut deleted_index = None;
+                    for (index, writer) in writers.iter().enumerate() {
+                        let Some(candidate) = writer.item_vector(&wtxn, docid)? else {
+                            // uses invariant: vectors are packed in the first writers.
                             break;
+                        };
+                        if candidate == vector {
+                            writer.del_item(wtxn, docid)?;
+                            deleted_index = Some(index);
+                        }
+                    }
+
+                    // 🥲 enforce invariant: vectors are packed in the first writers.
+                    if let Some(deleted_index) = deleted_index {
+                        let mut last_index_with_a_vector = None;
+                        for (index, writer) in writers.iter().enumerate().skip(deleted_index) {
+                            let Some(candidate) = writer.item_vector(&wtxn, docid)? else {
+                                break;
+                            };
+                            last_index_with_a_vector = Some((index, candidate));
+                        }
+                        if let Some((last_index, vector)) = last_index_with_a_vector {
+                            // unwrap: computed the index from the list of writers
+                            let writer = writers.get(last_index).unwrap();
+                            writer.del_item(wtxn, docid)?;
+                            writers.get(deleted_index).unwrap().add_item(wtxn, docid, &vector)?;
                         }
                     }
-                    index.docid_vector_ids.put(wtxn, &docid, &docid_vector_ids)?;
                 }
-                let mut available_vector_ids =
-                    AvailableDocumentsIds::from_documents_ids(&unavailable_vector_ids);
 
                 if let Some(value) = vector_deladd_obkv.get(DelAdd::Addition) {
                     let vector = pod_collect_to_vec(value);
-                    let next_vector_id = available_vector_ids.next().unwrap();
-
-                    writer.add_item(wtxn, next_vector_id, &vector)?;
-                    unavailable_vector_ids.insert(next_vector_id);
-                    index.vector_id_docid.put(wtxn, &next_vector_id, &docid)?;
-                    let mut docid_vector_ids =
-                        index.docid_vector_ids.get(&wtxn, &docid)?.unwrap_or_default();
-                    docid_vector_ids.insert(next_vector_id);
-                    index.docid_vector_ids.put(wtxn, &docid, &docid_vector_ids)?;
+
+                    /// FIXME: detect overflow
+                    for writer in &writers {
+                        if !writer.contains_item(wtxn, docid)? {
+                            writer.add_item(wtxn, docid, &vector)?;
+                            break;
+                        }
+                    }
                 }
             }
 
-            log::debug!("There are {} entries in the arroy so far", unavailable_vector_ids.len());
-            index.put_unavailable_vector_ids(wtxn, unavailable_vector_ids)?;
+            log::debug!("There are 🤷‍♀️ entries in the arroy so far");
         }
         TypedChunk::ScriptLanguageDocids(sl_map) => {
             for (key, (deletion, addition)) in sl_map {
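The deletion branch above maintains a packing invariant: for a given document, its vectors always occupy a dense prefix of the arroy writers, so both deletion and lookup can stop at the first empty slot. A self-contained sketch of that compaction step over a plain `Vec<Option<Vec<f32>>>` follows; the names are hypothetical and this is not the milli code.

    /// Clear the vector in `deleted` and keep occupied slots packed at the front by moving
    /// the last occupied slot into the hole, mirroring the bookkeeping in the hunk above.
    fn delete_and_repack(slots: &mut [Option<Vec<f32>>], deleted: usize) {
        slots[deleted] = None;

        // Find the last slot after the hole that still holds a vector.
        let last_occupied = (deleted + 1..slots.len()).take_while(|&i| slots[i].is_some()).last();

        if let Some(last) = last_occupied {
            // Move it into the hole so the `Some` values stay contiguous from index 0.
            let moved = slots[last].take();
            slots[deleted] = moved;
        }
    }

    fn main() {
        let mut slots = vec![
            Some(vec![0.0, 0.1]),
            Some(vec![1.0, 1.1]),
            Some(vec![2.0, 2.1]),
            None,
        ];
        delete_and_repack(&mut slots, 0);
        // The prefix stays dense: the last vector moved into slot 0, slot 2 is now empty.
        assert!(slots[0].is_some() && slots[1].is_some() && slots[2].is_none());
        println!("{slots:?}");
    }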