Compare commits

33 Commits

Author   SHA1   Message   Date

Clément Renault   4ea62f9235   Improve geo error handling   2024-11-13 14:15:42 +01:00
ManyTheFish   b311585280   Fix facet strings   2024-11-13 13:50:10 +01:00
Clément Renault   2f8ba6944f   Only register the _geo field   2024-11-13 12:05:27 +01:00
Clément Renault   379cedb096   Make the geo visible to the facet extractor   2024-11-13 11:47:58 +01:00
ManyTheFish   94cedd22a5   Add linear facet databases   2024-11-13 11:47:58 +01:00
Clément Renault   c78f931eb5   Fix some geo issues   2024-11-13 11:47:58 +01:00
Clément Renault   931c93d334   WIP   2024-11-13 11:47:30 +01:00
Clément Renault   2dd729ff43   Accept null as a valid geo value   2024-11-13 11:47:30 +01:00
Clément Renault   19864bcff3   Extract and pack the points to inserted them in the rtree   2024-11-13 11:47:30 +01:00
Clément Renault   f9e5a06699   Always collect the geopoint even when deleting one   2024-11-13 11:47:30 +01:00
Clément Renault   408a7cec6e   Finialize the GeoExtractor   2024-11-13 11:45:19 +01:00
Clément Renault   717a69dc6e   WIP   2024-11-13 11:45:18 +01:00
Clément Renault   bf88409075   WIP   2024-11-13 11:45:18 +01:00
Louis Dureuil   a01bc7b454   Fix error_document_field_limit_reached_in_one_document test   2024-11-13 10:34:54 +01:00
Louis Dureuil   7accfea624   Don't short circuit when we encounter a semantic error while extracting fields and external docid   2024-11-13 10:33:59 +01:00
Louis Dureuil   82dcaba6ca   Fix test: somehow on main vectors where displayed even though retrieveVectors: false   2024-11-12 23:58:25 +01:00
Louis Dureuil   cb1d6613dd   Adjust snapshots   2024-11-12 23:26:30 +01:00
Louis Dureuil   3b0cb5b487   Fix vector error messages   2024-11-12 23:26:16 +01:00
Louis Dureuil   bfdcd1cf33   Space changes   2024-11-12 22:52:45 +01:00
Louis Dureuil   1d13e804f7   Adjust test snapshots   2024-11-12 22:52:41 +01:00
Louis Dureuil   c4e9f761e9   Emit better error messages when parsing vectors   2024-11-12 22:49:22 +01:00
Louis Dureuil   8a6e61c77f   InvalidVectorsEmbedderConf error takes a String rather than a deserr error   2024-11-12 22:47:57 +01:00
Louis Dureuil   68bbf674c9   Make REST mock thread independent   2024-11-12 16:31:31 +01:00
Louis Dureuil   980921e078   Vector fixes   2024-11-12 16:31:22 +01:00
Louis Dureuil   1fcd5f091e   Remove progress from task   2024-11-12 12:23:13 +01:00
Louis Dureuil   6094bb299a   Fix user_provided vectors   2024-11-12 10:15:55 +01:00
Louis Dureuil   bef8fc6cf1   Fix hf embedder   2024-11-08 13:10:17 +01:00
Louis Dureuil   e32677999f   Adapt some snapshots   2024-11-08 00:06:33 +01:00
Louis Dureuil   5185aa21b8   Know if your vectors are implicit when writing them back in documents + don't write empty _vectors   2024-11-08 00:05:36 +01:00
Louis Dureuil   8a314ab81d   Fix primary key fid order   2024-11-08 00:05:12 +01:00
Louis Dureuil   4706a0eb49   Fix vector parsing   2024-11-07 23:26:20 +01:00
Louis Dureuil   d97af4d8e6   fix field order of JSON documents   2024-11-07 22:36:52 +01:00
ManyTheFish   1f5d801271   Fix crashes in facet search indexing   2024-11-07 17:22:30 +01:00
48 changed files with 1624 additions and 461 deletions

Cargo.lock (generated)
View File

@@ -2623,6 +2623,7 @@ dependencies = [
  "meilisearch-types",
  "memmap2",
  "page_size",
+ "raw-collections",
  "rayon",
  "roaring",
  "serde",
@@ -3538,6 +3539,7 @@ version = "1.11.0"
 dependencies = [
  "actix-web",
  "anyhow",
+ "bumpalo",
  "convert_case 0.6.0",
  "csv",
  "deserr",
@@ -3550,6 +3552,7 @@ dependencies = [
  "meili-snap",
  "memmap2",
  "milli",
+ "raw-collections",
  "roaring",
  "serde",
  "serde-cs",
@@ -3661,6 +3664,7 @@ dependencies = [
  "time",
  "tokenizers",
  "tracing",
+ "uell",
  "ureq",
  "url",
  "uuid",
@@ -5789,6 +5793,15 @@ version = "0.1.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ed646292ffc8188ef8ea4d1e0e0150fb15a5c2e12ad9b8fc191ae7a8a7f3c4b9"
 
+[[package]]
+name = "uell"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "40de5982e28612e20330e77d81f1559b74f66caf3c7fc10b19ada4843f4b4fd7"
+dependencies = [
+ "bumpalo",
+]
+
 [[package]]
 name = "unescaper"
 version = "0.1.5"

View File

@@ -22,6 +22,7 @@ flate2 = "1.0.30"
 meilisearch-auth = { path = "../meilisearch-auth" }
 meilisearch-types = { path = "../meilisearch-types" }
 page_size = "0.6.0"
+raw-collections = { git = "https://github.com/dureuill/raw-collections.git", version = "0.1.0" }
 rayon = "1.10.0"
 roaring = { version = "0.10.6", features = ["serde"] }
 serde = { version = "1.0.204", features = ["derive"] }

View File

@@ -43,6 +43,7 @@ use meilisearch_types::milli::{self, Filter};
 use meilisearch_types::settings::{apply_settings_to_builder, Settings, Unchecked};
 use meilisearch_types::tasks::{Details, IndexSwap, Kind, KindWithContent, Status, Task};
 use meilisearch_types::{compression, Index, VERSION_FILE_NAME};
+use raw_collections::RawMap;
 use roaring::RoaringBitmap;
 use time::macros::format_description;
 use time::OffsetDateTime;
@@ -1318,7 +1319,12 @@
                         index,
                         &mut new_fields_ids_map,
                         primary_key.as_deref(),
-                        first_document.as_ref(),
+                        first_document
+                            .map(|raw| RawMap::from_raw_value(raw, &indexer_alloc))
+                            .transpose()
+                            .map_err(|error| {
+                                milli::Error::UserError(milli::UserError::SerdeJson(error))
+                            })?,
                     )?
                     .map_err(milli::Error::from)?;
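Editor's note: the `.map(...).transpose().map_err(...)` chain above turns an `Option` of a fallible parse into a fallible `Option`. A minimal standalone sketch of that pattern (hypothetical `parse_first` helper, using plain serde_json in place of `RawMap`):

    use serde_json::Value;

    // Option<&str> -> Option<Result<Value, _>> -> Result<Option<Value>, _>
    fn parse_first(doc: Option<&str>) -> Result<Option<Value>, serde_json::Error> {
        doc.map(serde_json::from_str) // parse only if a document is present
            .transpose()              // move the Result outside the Option
    }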

View File

@@ -148,7 +148,6 @@ pub fn snapshot_task(task: &Task) -> String {
         enqueued_at: _,
         started_at: _,
         finished_at: _,
-        progress: _,
         error,
         canceled_by,
         details,

View File

@@ -978,12 +978,7 @@ impl IndexScheduler {
         Ok((
             ret.map(|task| {
                 if processing.contains(task.uid) {
-                    Task {
-                        status: Status::Processing,
-                        progress: progress.clone(),
-                        started_at: Some(started_at),
-                        ..task
-                    }
+                    Task { status: Status::Processing, started_at: Some(started_at), ..task }
                 } else {
                     task
                 }
@@ -1025,7 +1020,6 @@
             enqueued_at: OffsetDateTime::now_utc(),
             started_at: None,
             finished_at: None,
-            progress: None,
             error: None,
             canceled_by: None,
             details: kind.default_details(),
@@ -1606,8 +1600,6 @@ impl<'a> Dump<'a> {
             enqueued_at: task.enqueued_at,
             started_at: task.started_at,
             finished_at: task.finished_at,
-            /// FIXME: should we update dump to contain progress information? 🤔
-            progress: None,
             error: task.error,
             canceled_by: task.canceled_by,
             details: task.details,

View File

@ -1,5 +1,5 @@
--- ---
source: index-scheduler/src/lib.rs source: crates/index-scheduler/src/lib.rs
--- ---
### Autobatching Enabled = true ### Autobatching Enabled = true
### Processing Tasks: ### Processing Tasks:
@ -22,7 +22,7 @@ succeeded [0,1,]
doggos [0,1,2,] doggos [0,1,2,]
---------------------------------------------------------------------- ----------------------------------------------------------------------
### Index Mapper: ### Index Mapper:
doggos: { number_of_documents: 1, field_distribution: {"_vectors": 1, "breed": 1, "doggo": 1, "id": 1} } doggos: { number_of_documents: 1, field_distribution: {"breed": 1, "doggo": 1, "id": 1} }
---------------------------------------------------------------------- ----------------------------------------------------------------------
### Canceled By: ### Canceled By:

View File

@ -1,5 +1,5 @@
--- ---
source: index-scheduler/src/lib.rs source: crates/index-scheduler/src/lib.rs
--- ---
### Autobatching Enabled = true ### Autobatching Enabled = true
### Processing Tasks: ### Processing Tasks:
@ -21,7 +21,7 @@ succeeded [0,1,]
doggos [0,1,] doggos [0,1,]
---------------------------------------------------------------------- ----------------------------------------------------------------------
### Index Mapper: ### Index Mapper:
doggos: { number_of_documents: 1, field_distribution: {"_vectors": 1, "breed": 1, "doggo": 1, "id": 1} } doggos: { number_of_documents: 1, field_distribution: {"breed": 1, "doggo": 1, "id": 1} }
---------------------------------------------------------------------- ----------------------------------------------------------------------
### Canceled By: ### Canceled By:

View File

@@ -345,8 +345,6 @@ impl IndexScheduler {
             enqueued_at,
             started_at,
             finished_at,
-            /// FIXME: assert something here? ask tamo 🤔
-            progress: _,
             error: _,
             canceled_by,
             details,

View File

@@ -13,6 +13,7 @@ license.workspace = true
 [dependencies]
 actix-web = { version = "4.8.0", default-features = false }
 anyhow = "1.0.86"
+bumpalo = "3.16.0"
 convert_case = "0.6.0"
 csv = "1.3.0"
 deserr = { version = "0.6.2", features = ["actix-web"] }
@@ -23,6 +24,7 @@ flate2 = "1.0.30"
 fst = "0.4.7"
 memmap2 = "0.9.4"
 milli = { path = "../milli" }
+raw-collections = { git = "https://github.com/dureuill/raw-collections.git", version = "0.1.0" }
 roaring = { version = "0.10.6", features = ["serde"] }
 serde = { version = "1.0.204", features = ["derive"] }
 serde-cs = "0.2.4"
@@ -70,4 +72,3 @@ swedish-recomposition = ["milli/swedish-recomposition"]
 german = ["milli/german"]
 # allow turkish normalization
 turkish = ["milli/turkish"]
-

View File

@@ -3,13 +3,16 @@ use std::fs::File;
 use std::io::{self, BufWriter};
 use std::marker::PhantomData;
 
+use bumpalo::Bump;
 use memmap2::Mmap;
 use milli::documents::Error;
 use milli::update::new::TopLevelMap;
 use milli::Object;
+use raw_collections::RawMap;
 use serde::de::{SeqAccess, Visitor};
 use serde::{Deserialize, Deserializer};
 use serde_json::error::Category;
+use serde_json::value::RawValue;
 use serde_json::{to_writer, Map, Value};
 
 use crate::error::{Code, ErrorCode};
@@ -213,10 +216,15 @@ pub fn read_json(input: &File, output: impl io::Write) -> Result<u64> {
     // We memory map to be able to deserialize into a TopLevelMap<'pl> that
     // does not allocate when possible and only materializes the first/top level.
     let input = unsafe { Mmap::map(input).map_err(DocumentFormatError::Io)? };
+    let mut doc_alloc = Bump::with_capacity(1024 * 1024 * 1024); // 1 GiB
 
     let mut out = BufWriter::new(output);
     let mut deserializer = serde_json::Deserializer::from_slice(&input);
-    let count = match array_each(&mut deserializer, |obj: TopLevelMap| to_writer(&mut out, &obj)) {
+    let count = match array_each(&mut deserializer, |obj: &RawValue| {
+        doc_alloc.reset();
+        let map = RawMap::from_raw_value(obj, &doc_alloc)?;
+        to_writer(&mut out, &map)
+    }) {
         // The json data has been deserialized and does not need to be processed again.
         // The data has been transferred to the writer during the deserialization process.
         Ok(Ok(count)) => count,
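Editor's note: a minimal standalone sketch of the arena pattern introduced here, assuming the `raw-collections` API exactly as it appears in this diff (`RawMap::from_raw_value` allocates in a `bumpalo::Bump` that is reset between documents, so per-document allocations are reused instead of freed):

    use bumpalo::Bump;
    use raw_collections::RawMap;
    use serde_json::value::RawValue;

    fn main() -> Result<(), Box<dyn std::error::Error>> {
        let docs = [r#"{"id": 1, "name": "kefir"}"#, r#"{"id": 2, "name": "intel"}"#];
        let mut alloc = Bump::new();
        let mut out = Vec::new();
        for doc in docs {
            alloc.reset(); // reuse the same arena for every document
            let raw: &RawValue = serde_json::from_str(doc)?;
            // Materialize only the top-level keys; values stay as raw JSON slices.
            let map = RawMap::from_raw_value(raw, &alloc)?;
            serde_json::to_writer(&mut out, &map)?;
        }
        Ok(())
    }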

View File

@@ -4,9 +4,7 @@ use time::{Duration, OffsetDateTime};
 
 use crate::error::ResponseError;
 use crate::settings::{Settings, Unchecked};
-use crate::tasks::{
-    serialize_duration, Details, IndexSwap, Kind, Status, Task, TaskId, TaskProgress,
-};
+use crate::tasks::{serialize_duration, Details, IndexSwap, Kind, Status, Task, TaskId};
 
 #[derive(Debug, Clone, PartialEq, Eq, Serialize)]
 #[serde(rename_all = "camelCase")]
@@ -29,8 +27,6 @@ pub struct TaskView {
     pub started_at: Option<OffsetDateTime>,
     #[serde(with = "time::serde::rfc3339::option", default)]
     pub finished_at: Option<OffsetDateTime>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub progress: Option<TaskProgress>,
 }
 
 impl TaskView {
@@ -47,7 +43,6 @@ impl TaskView {
             enqueued_at: task.enqueued_at,
             started_at: task.started_at,
             finished_at: task.finished_at,
-            progress: task.progress.clone(),
         }
     }
 }

View File

@@ -31,8 +31,6 @@ pub struct Task {
     #[serde(with = "time::serde::rfc3339::option")]
     pub finished_at: Option<OffsetDateTime>,
 
-    pub progress: Option<TaskProgress>,
-
     pub error: Option<ResponseError>,
     pub canceled_by: Option<TaskId>,
     pub details: Option<Details>,

View File

@@ -1335,7 +1335,6 @@ async fn error_add_documents_missing_document_id() {
 }
 
 #[actix_rt::test]
-#[should_panic]
 async fn error_document_field_limit_reached_in_one_document() {
     let server = Server::new().await;
     let index = server.index("test");
@@ -1352,7 +1351,7 @@ async fn error_document_field_limit_reached_in_one_document() {
     let documents = json!([big_object]);
 
     let (response, code) = index.update_documents(documents, Some("id")).await;
-    snapshot!(code, @"500 Internal Server Error");
+    snapshot!(code, @"202 Accepted");
     let response = index.wait_task(response.uid()).await;
     snapshot!(code, @"202 Accepted");
@@ -1360,16 +1359,21 @@ async fn error_document_field_limit_reached_in_one_document() {
     snapshot!(response,
         @r###"
 {
-  "uid": 1,
+  "uid": "[uid]",
   "indexUid": "test",
-  "status": "succeeded",
+  "status": "failed",
   "type": "documentAdditionOrUpdate",
   "canceledBy": null,
   "details": {
     "receivedDocuments": 1,
-    "indexedDocuments": 1
+    "indexedDocuments": 0
  },
-  "error": null,
+  "error": {
+    "message": "A document cannot contain more than 65,535 fields.",
+    "code": "max_fields_limit_exceeded",
+    "type": "invalid_request",
+    "link": "https://docs.meilisearch.com/errors#max_fields_limit_exceeded"
+  },
   "duration": "[duration]",
   "enqueuedAt": "[date]",
   "startedAt": "[date]",
@@ -1660,7 +1664,7 @@ async fn add_documents_invalid_geo_field() {
     "indexedDocuments": 0
   },
   "error": {
-    "message": "The `_geo` field in the document with the id: `11` is not an object. Was expecting an object with the `_geo.lat` and `_geo.lng` fields but instead got `\"foobar\"`.",
+    "message": "The `_geo` field in the document with the id: `\"11\"` is not an object. Was expecting an object with the `_geo.lat` and `_geo.lng` fields but instead got `\"foobar\"`.",
     "code": "invalid_document_geo_field",
     "type": "invalid_request",
     "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field"
@@ -1697,7 +1701,7 @@ async fn add_documents_invalid_geo_field() {
     "indexedDocuments": 0
   },
   "error": {
-    "message": "Could not find latitude nor longitude in the document with the id: `11`. Was expecting `_geo.lat` and `_geo.lng` fields.",
+    "message": "Could not find latitude nor longitude in the document with the id: `\"11\"`. Was expecting `_geo.lat` and `_geo.lng` fields.",
     "code": "invalid_document_geo_field",
     "type": "invalid_request",
     "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field"
@@ -1734,7 +1738,7 @@ async fn add_documents_invalid_geo_field() {
     "indexedDocuments": 0
   },
   "error": {
-    "message": "Could not find latitude nor longitude in the document with the id: `11`. Was expecting `_geo.lat` and `_geo.lng` fields.",
+    "message": "Could not find latitude nor longitude in the document with the id: `\"11\"`. Was expecting `_geo.lat` and `_geo.lng` fields.",
     "code": "invalid_document_geo_field",
     "type": "invalid_request",
     "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field"
@@ -1771,7 +1775,7 @@ async fn add_documents_invalid_geo_field() {
     "indexedDocuments": 0
   },
   "error": {
-    "message": "Could not find longitude in the document with the id: `11`. Was expecting a `_geo.lng` field.",
+    "message": "Could not find longitude in the document with the id: `\"11\"`. Was expecting a `_geo.lng` field.",
     "code": "invalid_document_geo_field",
     "type": "invalid_request",
     "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field"
@@ -1808,7 +1812,7 @@ async fn add_documents_invalid_geo_field() {
     "indexedDocuments": 0
   },
   "error": {
-    "message": "Could not find latitude in the document with the id: `11`. Was expecting a `_geo.lat` field.",
+    "message": "Could not find latitude in the document with the id: `\"11\"`. Was expecting a `_geo.lat` field.",
     "code": "invalid_document_geo_field",
     "type": "invalid_request",
     "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field"
@@ -1845,7 +1849,7 @@ async fn add_documents_invalid_geo_field() {
     "indexedDocuments": 0
   },
   "error": {
-    "message": "Could not find longitude in the document with the id: `11`. Was expecting a `_geo.lng` field.",
+    "message": "Could not find longitude in the document with the id: `\"11\"`. Was expecting a `_geo.lng` field.",
     "code": "invalid_document_geo_field",
     "type": "invalid_request",
     "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field"
@@ -1882,7 +1886,7 @@ async fn add_documents_invalid_geo_field() {
     "indexedDocuments": 0
   },
   "error": {
-    "message": "Could not find latitude in the document with the id: `11`. Was expecting a `_geo.lat` field.",
+    "message": "Could not find latitude in the document with the id: `\"11\"`. Was expecting a `_geo.lat` field.",
     "code": "invalid_document_geo_field",
     "type": "invalid_request",
     "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field"
@@ -1919,7 +1923,7 @@ async fn add_documents_invalid_geo_field() {
     "indexedDocuments": 0
   },
   "error": {
-    "message": "Could not parse latitude nor longitude in the document with the id: `11`. Was expecting finite numbers but instead got `false` and `true`.",
+    "message": "Could not parse latitude nor longitude in the document with the id: `\"11\"`. Was expecting finite numbers but instead got `false` and `true`.",
     "code": "invalid_document_geo_field",
     "type": "invalid_request",
     "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field"
@@ -1956,7 +1960,7 @@ async fn add_documents_invalid_geo_field() {
     "indexedDocuments": 0
   },
   "error": {
-    "message": "Could not find longitude in the document with the id: `11`. Was expecting a `_geo.lng` field.",
+    "message": "Could not find longitude in the document with the id: `\"11\"`. Was expecting a `_geo.lng` field.",
     "code": "invalid_document_geo_field",
     "type": "invalid_request",
     "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field"
@@ -1993,7 +1997,7 @@ async fn add_documents_invalid_geo_field() {
     "indexedDocuments": 0
   },
   "error": {
-    "message": "Could not find latitude in the document with the id: `11`. Was expecting a `_geo.lat` field.",
+    "message": "Could not find latitude in the document with the id: `\"11\"`. Was expecting a `_geo.lat` field.",
     "code": "invalid_document_geo_field",
     "type": "invalid_request",
     "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field"
@@ -2030,7 +2034,7 @@ async fn add_documents_invalid_geo_field() {
     "indexedDocuments": 0
   },
   "error": {
-    "message": "Could not parse latitude nor longitude in the document with the id: `11`. Was expecting finite numbers but instead got `\"doggo\"` and `\"doggo\"`.",
+    "message": "Could not parse latitude nor longitude in the document with the id: `\"11\"`. Was expecting finite numbers but instead got `\"doggo\"` and `\"doggo\"`.",
     "code": "invalid_document_geo_field",
     "type": "invalid_request",
     "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field"
@@ -2067,7 +2071,7 @@ async fn add_documents_invalid_geo_field() {
     "indexedDocuments": 0
   },
   "error": {
-    "message": "The `_geo` field in the document with the id: `11` contains the following unexpected fields: `{\"doggo\":\"are the best\"}`.",
+    "message": "The `_geo` field in the document with the id: `\"11\"` contains the following unexpected fields: `{\"doggo\":\"are the best\"}`.",
     "code": "invalid_document_geo_field",
     "type": "invalid_request",
     "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field"
@@ -2105,7 +2109,7 @@ async fn add_documents_invalid_geo_field() {
     "indexedDocuments": 0
   },
   "error": {
-    "message": "Could not parse longitude in the document with the id: `12`. Was expecting a finite number but instead got `null`.",
+    "message": "Could not parse longitude in the document with the id: `\"12\"`. Was expecting a finite number but instead got `null`.",
     "code": "invalid_document_geo_field",
     "type": "invalid_request",
     "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field"
@@ -2141,7 +2145,7 @@ async fn add_documents_invalid_geo_field() {
     "indexedDocuments": 0
   },
   "error": {
-    "message": "Could not parse latitude in the document with the id: `12`. Was expecting a finite number but instead got `null`.",
+    "message": "Could not parse latitude in the document with the id: `\"12\"`. Was expecting a finite number but instead got `null`.",
     "code": "invalid_document_geo_field",
     "type": "invalid_request",
     "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field"
@@ -2177,7 +2181,7 @@ async fn add_documents_invalid_geo_field() {
     "indexedDocuments": 0
   },
   "error": {
-    "message": "Could not parse latitude nor longitude in the document with the id: `13`. Was expecting finite numbers but instead got `null` and `null`.",
+    "message": "Could not parse latitude nor longitude in the document with the id: `\"13\"`. Was expecting finite numbers but instead got `null` and `null`.",
     "code": "invalid_document_geo_field",
     "type": "invalid_request",
     "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field"
@@ -2197,7 +2201,7 @@ async fn add_invalid_geo_and_then_settings() {
     let index = server.index("test");
     index.create(Some("id")).await;
 
-    // _geo is not an object
+    // _geo is not a correct object
     let documents = json!([
         {
             "id": "11",
@@ -2226,7 +2230,7 @@ async fn add_invalid_geo_and_then_settings() {
 }
 "###);
 
-    let (ret, code) = index.update_settings(json!({"sortableAttributes": ["_geo"]})).await;
+    let (ret, code) = index.update_settings(json!({ "sortableAttributes": ["_geo"] })).await;
     snapshot!(code, @"202 Accepted");
     let ret = index.wait_task(ret.uid()).await;
     snapshot!(ret, @r###"

View File

@@ -750,9 +750,9 @@ async fn test_score_details() {
         ],
         "_vectors": {
           "manual": [
-            -100.0,
-            231.0,
-            32.0
+            -100,
+            231,
+            32
           ]
         },
         "_rankingScoreDetails": {
@@ -1543,9 +1543,9 @@ async fn simple_search_with_strange_synonyms() {
         ],
         "_vectors": {
           "manual": [
-            -100.0,
-            231.0,
-            32.0
+            -100,
+            231,
+            32
           ]
         }
       }
@@ -1568,9 +1568,9 @@ async fn simple_search_with_strange_synonyms() {
         ],
         "_vectors": {
           "manual": [
-            -100.0,
-            231.0,
-            32.0
+            -100,
+            231,
+            32
           ]
         }
       }
@@ -1593,9 +1593,9 @@ async fn simple_search_with_strange_synonyms() {
         ],
         "_vectors": {
           "manual": [
-            -100.0,
-            231.0,
-            32.0
+            -100,
+            231,
+            32
           ]
         }
       }
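Editor's note: these snapshot changes follow from the new document pipeline keeping the user's raw JSON numbers instead of parsing vectors into floats and reserializing them. A hedged illustration of the underlying serde_json behavior (standalone, not from this PR):

    // Parsing `-100` into an f64 and reserializing prints `-100.0`,
    // while keeping the raw JSON number preserves `-100`.
    let parsed: f64 = serde_json::from_str("-100").unwrap();
    assert_eq!(serde_json::to_string(&parsed).unwrap(), "-100.0");
    let raw: &serde_json::value::RawValue = serde_json::from_str("-100").unwrap();
    assert_eq!(raw.get(), "-100");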

View File

@@ -113,9 +113,9 @@ async fn simple_search_single_index() {
         ],
         "_vectors": {
           "manual": [
-            -100.0,
-            340.0,
-            90.0
+            -100,
+            340,
+            90
           ]
         }
       }
@@ -138,9 +138,9 @@ async fn simple_search_single_index() {
         ],
         "_vectors": {
           "manual": [
-            1.0,
-            2.0,
-            54.0
+            1,
+            2,
+            54
           ]
         }
       }
@@ -182,9 +182,9 @@ async fn federation_single_search_single_index() {
         ],
         "_vectors": {
           "manual": [
-            -100.0,
-            340.0,
-            90.0
+            -100,
+            340,
+            90
           ]
         },
         "_federation": {
@@ -305,9 +305,9 @@ async fn federation_two_search_single_index() {
         ],
         "_vectors": {
           "manual": [
-            -100.0,
-            340.0,
-            90.0
+            -100,
+            340,
+            90
           ]
         },
         "_federation": {
@@ -325,9 +325,9 @@ async fn federation_two_search_single_index() {
         ],
         "_vectors": {
           "manual": [
-            1.0,
-            2.0,
-            54.0
+            1,
+            2,
+            54
           ]
         },
         "_federation": {
@@ -480,9 +480,9 @@ async fn simple_search_two_indexes() {
         ],
         "_vectors": {
           "manual": [
-            -100.0,
-            340.0,
-            90.0
+            -100,
+            340,
+            90
           ]
         }
       }
@@ -513,9 +513,9 @@ async fn simple_search_two_indexes() {
         "cattos": "pésti",
         "_vectors": {
           "manual": [
-            1.0,
-            2.0,
-            3.0
+            1,
+            2,
+            3
           ]
         }
       },
@@ -535,9 +535,9 @@ async fn simple_search_two_indexes() {
         ],
         "_vectors": {
           "manual": [
-            1.0,
-            2.0,
-            54.0
+            1,
+            2,
+            54
           ]
         }
       }
@@ -1393,9 +1393,9 @@ async fn federation_sort_same_indexes_same_criterion_same_direction() {
         "cattos": "pésti",
         "_vectors": {
           "manual": [
-            1.0,
-            2.0,
-            3.0
+            1,
+            2,
+            3
           ]
         },
         "_federation": {
@@ -1414,9 +1414,9 @@ async fn federation_sort_same_indexes_same_criterion_same_direction() {
         ],
         "_vectors": {
           "manual": [
-            10.0,
-            23.0,
-            32.0
+            10,
+            23,
+            32
           ]
         },
         "_federation": {
@@ -1442,9 +1442,9 @@ async fn federation_sort_same_indexes_same_criterion_same_direction() {
         ],
         "_vectors": {
           "manual": [
-            1.0,
-            2.0,
-            54.0
+            1,
+            2,
+            54
           ]
         },
         "_federation": {
@@ -1474,9 +1474,9 @@ async fn federation_sort_same_indexes_same_criterion_same_direction() {
         ],
         "_vectors": {
           "manual": [
-            10.0,
-            23.0,
-            32.0
+            10,
+            23,
+            32
           ]
         },
         "_federation": {
@@ -1716,9 +1716,9 @@ async fn federation_sort_same_indexes_different_criterion_same_direction() {
         "cattos": "pésti",
         "_vectors": {
           "manual": [
-            1.0,
-            2.0,
-            3.0
+            1,
+            2,
+            3
           ]
         },
         "_federation": {
@@ -1748,9 +1748,9 @@ async fn federation_sort_same_indexes_different_criterion_same_direction() {
         ],
         "_vectors": {
           "manual": [
-            10.0,
-            23.0,
-            32.0
+            10,
+            23,
+            32
           ]
         },
         "_federation": {
@@ -1769,9 +1769,9 @@ async fn federation_sort_same_indexes_different_criterion_same_direction() {
         ],
         "_vectors": {
           "manual": [
-            10.0,
-            23.0,
-            32.0
+            10,
+            23,
+            32
           ]
         },
         "_federation": {
@@ -1797,9 +1797,9 @@ async fn federation_sort_same_indexes_different_criterion_same_direction() {
         ],
         "_vectors": {
           "manual": [
-            1.0,
-            2.0,
-            54.0
+            1,
+            2,
+            54
           ]
         },
         "_federation": {
@@ -2103,9 +2103,9 @@ async fn federation_sort_different_indexes_same_criterion_same_direction() {
         ],
         "_vectors": {
           "manual": [
-            1.0,
-            2.0,
-            54.0
+            1,
+            2,
+            54
           ]
         },
         "_federation": {
@@ -2124,9 +2124,9 @@ async fn federation_sort_different_indexes_same_criterion_same_direction() {
         ],
         "_vectors": {
           "manual": [
-            10.0,
-            -23.0,
-            32.0
+            10,
+            -23,
+            32
           ]
         },
         "_federation": {
@@ -2145,9 +2145,9 @@ async fn federation_sort_different_indexes_same_criterion_same_direction() {
         ],
         "_vectors": {
           "manual": [
-            -100.0,
-            340.0,
-            90.0
+            -100,
+            340,
+            90
           ]
         },
         "_federation": {
@@ -2166,9 +2166,9 @@ async fn federation_sort_different_indexes_same_criterion_same_direction() {
         ],
         "_vectors": {
           "manual": [
-            -100.0,
-            231.0,
-            32.0
+            -100,
+            231,
+            32
           ]
         },
         "_federation": {
@@ -2187,9 +2187,9 @@ async fn federation_sort_different_indexes_same_criterion_same_direction() {
         ],
         "_vectors": {
           "manual": [
-            1.0,
-            2.0,
-            3.0
+            1,
+            2,
+            3
           ]
         },
         "_federation": {
@@ -2228,9 +2228,9 @@ async fn federation_sort_different_indexes_same_criterion_same_direction() {
         ],
         "_vectors": {
           "manual": [
-            1.0,
-            2.0,
-            54.0
+            1,
+            2,
+            54
           ]
         },
         "_federation": {
@@ -2415,9 +2415,9 @@ async fn federation_sort_different_ranking_rules() {
         ],
         "_vectors": {
           "manual": [
-            1.0,
-            2.0,
-            54.0
+            1,
+            2,
+            54
           ]
         },
         "_federation": {
@@ -2436,9 +2436,9 @@ async fn federation_sort_different_ranking_rules() {
         ],
         "_vectors": {
           "manual": [
-            10.0,
-            -23.0,
-            32.0
+            10,
+            -23,
+            32
           ]
         },
         "_federation": {
@@ -2457,9 +2457,9 @@ async fn federation_sort_different_ranking_rules() {
         ],
         "_vectors": {
           "manual": [
-            -100.0,
-            340.0,
-            90.0
+            -100,
+            340,
+            90
           ]
         },
         "_federation": {
@@ -2478,9 +2478,9 @@ async fn federation_sort_different_ranking_rules() {
         ],
         "_vectors": {
           "manual": [
-            -100.0,
-            231.0,
-            32.0
+            -100,
+            231,
+            32
           ]
         },
         "_federation": {
@@ -2499,9 +2499,9 @@ async fn federation_sort_different_ranking_rules() {
         ],
         "_vectors": {
           "manual": [
-            1.0,
-            2.0,
-            3.0
+            1,
+            2,
+            3
           ]
         },
         "_federation": {
@@ -2716,9 +2716,9 @@ async fn federation_sort_different_indexes_different_criterion_same_direction()
         ],
         "_vectors": {
           "manual": [
-            1.0,
-            2.0,
-            54.0
+            1,
+            2,
+            54
           ]
         },
         "_federation": {
@@ -2757,9 +2757,9 @@ async fn federation_sort_different_indexes_different_criterion_same_direction()
         ],
         "_vectors": {
           "manual": [
-            10.0,
-            -23.0,
-            32.0
+            10,
+            -23,
+            32
           ]
         },
         "_federation": {
@@ -2778,9 +2778,9 @@ async fn federation_sort_different_indexes_different_criterion_same_direction()
         ],
         "_vectors": {
           "manual": [
-            -100.0,
-            340.0,
-            90.0
+            -100,
+            340,
+            90
           ]
         },
         "_federation": {
@@ -2799,9 +2799,9 @@ async fn federation_sort_different_indexes_different_criterion_same_direction()
         ],
         "_vectors": {
           "manual": [
-            -100.0,
-            231.0,
-            32.0
+            -100,
+            231,
+            32
           ]
         },
         "_federation": {
@@ -2820,9 +2820,9 @@ async fn federation_sort_different_indexes_different_criterion_same_direction()
         ],
         "_vectors": {
           "manual": [
-            1.0,
-            2.0,
-            3.0
+            1,
+            2,
+            3
           ]
         },
         "_federation": {
@@ -2881,9 +2881,9 @@ async fn federation_sort_different_indexes_different_criterion_same_direction()
         ],
         "_vectors": {
           "manual": [
-            1.0,
-            2.0,
-            54.0
+            1,
+            2,
+            54
           ]
         },
         "_federation": {
@@ -4346,10 +4346,10 @@ async fn federation_vector_two_indexes() {
     let (response, code) = server
         .multi_search(json!({"federation": {}, "queries": [
-        {"indexUid" : "vectors-animal", "vector": [1.0, 0.0, 0.5], "hybrid": {"semanticRatio": 1.0, "embedder": "animal"}},
+        {"indexUid" : "vectors-animal", "vector": [1.0, 0.0, 0.5], "hybrid": {"semanticRatio": 1.0, "embedder": "animal"}, "retrieveVectors": true},
         // joyful and energetic first
-        {"indexUid": "vectors-sentiment", "vector": [0.8, 0.6], "hybrid": {"semanticRatio": 1.0, "embedder": "sentiment"}},
-        {"indexUid": "vectors-sentiment", "q": "dog"},
+        {"indexUid": "vectors-sentiment", "vector": [0.8, 0.6], "hybrid": {"semanticRatio": 1.0, "embedder": "sentiment"}, "retrieveVectors": true},
+        {"indexUid": "vectors-sentiment", "q": "dog", "retrieveVectors": true},
         ]}))
         .await;
 
     snapshot!(code, @"200 OK");
@@ -4364,7 +4364,16 @@ async fn federation_vector_two_indexes() {
             0.8,
             0.09,
             0.8
-          ]
+          ],
+          "sentiment": {
+            "embeddings": [
+              [
+                0.800000011920929,
+                0.30000001192092896
+              ]
+            ],
+            "regenerate": false
+          }
         },
         "_federation": {
           "indexUid": "vectors-sentiment",
@@ -4379,7 +4388,17 @@ async fn federation_vector_two_indexes() {
           "sentiment": [
             0.8,
             0.3
-          ]
+          ],
+          "animal": {
+            "embeddings": [
+              [
+                0.800000011920929,
+                0.09000000357627869,
+                0.800000011920929
+              ]
+            ],
+            "regenerate": false
+          }
         },
         "_federation": {
           "indexUid": "vectors-animal",
@@ -4394,7 +4413,17 @@ async fn federation_vector_two_indexes() {
           "sentiment": [
             -1.0,
             0.1
-          ]
+          ],
+          "animal": {
+            "embeddings": [
+              [
+                0.8500000238418579,
+                0.019999999552965164,
+                0.10000000149011612
+              ]
+            ],
+            "regenerate": false
+          }
         },
         "_federation": {
           "indexUid": "vectors-animal",
@@ -4410,7 +4439,16 @@ async fn federation_vector_two_indexes() {
             0.9,
             0.8,
             0.05
-          ]
+          ],
+          "sentiment": {
+            "embeddings": [
+              [
+                -0.10000000149011612,
+                0.550000011920929
+              ]
+            ],
+            "regenerate": false
+          }
         },
         "_federation": {
           "indexUid": "vectors-sentiment",
@@ -4426,7 +4464,16 @@ async fn federation_vector_two_indexes() {
             0.85,
             0.02,
             0.1
-          ]
+          ],
+          "sentiment": {
+            "embeddings": [
+              [
+                -1.0,
+                0.10000000149011612
+              ]
+            ],
+            "regenerate": false
+          }
         },
         "_federation": {
           "indexUid": "vectors-sentiment",
@@ -4441,7 +4488,17 @@ async fn federation_vector_two_indexes() {
           "sentiment": [
             -0.2,
             0.65
-          ]
+          ],
+          "animal": {
+            "embeddings": [
+              [
+                0.800000011920929,
+                0.8999999761581421,
+                0.5
+              ]
+            ],
+            "regenerate": false
+          }
         },
         "_federation": {
           "indexUid": "vectors-animal",
@@ -4456,7 +4513,17 @@ async fn federation_vector_two_indexes() {
           "sentiment": [
             -0.1,
             0.55
-          ]
+          ],
+          "animal": {
+            "embeddings": [
+              [
+                0.8999999761581421,
+                0.800000011920929,
+                0.05000000074505806
+              ]
+            ],
+            "regenerate": false
+          }
         },
         "_federation": {
           "indexUid": "vectors-animal",
@@ -4472,7 +4539,16 @@ async fn federation_vector_two_indexes() {
             0.8,
             0.9,
             0.5
-          ]
+          ],
+          "sentiment": {
+            "embeddings": [
+              [
+                -0.20000000298023224,
+                0.6499999761581421
+              ]
+            ],
+            "regenerate": false
+          }
         },
         "_federation": {
           "indexUid": "vectors-sentiment",
@@ -4492,8 +4568,8 @@ async fn federation_vector_two_indexes() {
     // hybrid search, distinct embedder
     let (response, code) = server
         .multi_search(json!({"federation": {}, "queries": [
-        {"indexUid" : "vectors-animal", "vector": [1.0, 0.0, 0.5], "hybrid": {"semanticRatio": 1.0, "embedder": "animal"}, "showRankingScore": true},
-        {"indexUid": "vectors-sentiment", "vector": [-1, 0.6], "q": "beagle", "hybrid": {"semanticRatio": 1.0, "embedder": "sentiment"}, "showRankingScore": true},
+        {"indexUid" : "vectors-animal", "vector": [1.0, 0.0, 0.5], "hybrid": {"semanticRatio": 1.0, "embedder": "animal"}, "showRankingScore": true, "retrieveVectors": true},
+        {"indexUid": "vectors-sentiment", "vector": [-1, 0.6], "q": "beagle", "hybrid": {"semanticRatio": 1.0, "embedder": "sentiment"}, "showRankingScore": true, "retrieveVectors": true,},
         ]}))
         .await;
     snapshot!(code, @"200 OK");
@@ -4507,7 +4583,17 @@ async fn federation_vector_two_indexes() {
           "sentiment": [
             0.8,
             0.3
-          ]
+          ],
+          "animal": {
+            "embeddings": [
+              [
+                0.800000011920929,
+                0.09000000357627869,
+                0.800000011920929
+              ]
+            ],
+            "regenerate": false
+          }
         },
         "_federation": {
           "indexUid": "vectors-animal",
@@ -4523,7 +4609,17 @@ async fn federation_vector_two_indexes() {
           "sentiment": [
             -1.0,
             0.1
-          ]
+          ],
+          "animal": {
+            "embeddings": [
+              [
+                0.8500000238418579,
+                0.019999999552965164,
+                0.10000000149011612
+              ]
+            ],
+            "regenerate": false
+          }
         },
         "_federation": {
           "indexUid": "vectors-animal",
@@ -4540,7 +4636,16 @@ async fn federation_vector_two_indexes() {
             0.85,
             0.02,
             0.1
-          ]
+          ],
+          "sentiment": {
+            "embeddings": [
+              [
+                -1.0,
+                0.10000000149011612
+              ]
+            ],
+            "regenerate": false
+          }
         },
         "_federation": {
           "indexUid": "vectors-sentiment",
@@ -4557,7 +4662,16 @@ async fn federation_vector_two_indexes() {
             0.8,
             0.9,
             0.5
-          ]
+          ],
+          "sentiment": {
+            "embeddings": [
+              [
+                -0.20000000298023224,
+                0.6499999761581421
+              ]
+            ],
+            "regenerate": false
+          }
         },
         "_federation": {
           "indexUid": "vectors-sentiment",
@@ -4573,7 +4687,17 @@ async fn federation_vector_two_indexes() {
           "sentiment": [
             -0.2,
             0.65
-          ]
+          ],
+          "animal": {
+            "embeddings": [
+              [
+                0.800000011920929,
+                0.8999999761581421,
+                0.5
+              ]
+            ],
+            "regenerate": false
+          }
         },
         "_federation": {
           "indexUid": "vectors-animal",
@@ -4589,7 +4713,17 @@ async fn federation_vector_two_indexes() {
           "sentiment": [
             -0.1,
             0.55
-          ]
+          ],
+          "animal": {
+            "embeddings": [
+              [
+                0.8999999761581421,
+                0.800000011920929,
+                0.05000000074505806
+              ]
+            ],
+            "regenerate": false
+          }
         },
         "_federation": {
           "indexUid": "vectors-animal",
@@ -4606,7 +4740,16 @@ async fn federation_vector_two_indexes() {
             0.9,
             0.8,
             0.05
-          ]
+          ],
+          "sentiment": {
+            "embeddings": [
+              [
+                -0.10000000149011612,
+                0.550000011920929
+              ]
+            ],
+            "regenerate": false
+          }
         },
         "_federation": {
           "indexUid": "vectors-sentiment",
@@ -4623,7 +4766,16 @@ async fn federation_vector_two_indexes() {
             0.8,
             0.09,
             0.8
-          ]
+          ],
+          "sentiment": {
+            "embeddings": [
+              [
+                0.800000011920929,
+                0.30000001192092896
+              ]
+            ],
+            "regenerate": false
+          }
         },
         "_federation": {
           "indexUid": "vectors-sentiment",

View File

@@ -249,7 +249,7 @@ async fn user_provided_embeddings_error() {
     "indexedDocuments": 0
   },
   "error": {
-    "message": "Bad embedder configuration in the document with id: `\"0\"`. Missing field `regenerate` inside `.manual`",
+    "message": "Bad embedder configuration in the document with id: `0`. Missing field `._vectors.manual.regenerate`\n - note: `._vectors.manual` must be an array of floats, an array of arrays of floats, or an object with field `regenerate`",
     "code": "invalid_vectors_type",
     "type": "invalid_request",
     "link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
@@ -278,7 +278,7 @@ async fn user_provided_embeddings_error() {
     "indexedDocuments": 0
   },
   "error": {
-    "message": "Bad embedder configuration in the document with id: `\"0\"`. Missing field `regenerate` inside `.manual`",
+    "message": "Bad embedder configuration in the document with id: `0`. Missing field `._vectors.manual.regenerate`\n - note: `._vectors.manual` must be an array of floats, an array of arrays of floats, or an object with field `regenerate`",
     "code": "invalid_vectors_type",
     "type": "invalid_request",
     "link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
@@ -308,7 +308,7 @@ async fn user_provided_embeddings_error() {
     "indexedDocuments": 0
   },
   "error": {
-    "message": "Bad embedder configuration in the document with id: `\"0\"`. Invalid value type at `.manual.regenerate`: expected a boolean, but found a string: `\"yes please\"`",
+    "message": "Bad embedder configuration in the document with id: `0`. Could not parse `._vectors.manual.regenerate`: invalid type: string \"yes please\", expected a boolean at line 1 column 26",
     "code": "invalid_vectors_type",
     "type": "invalid_request",
     "link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
@@ -320,8 +320,7 @@ async fn user_provided_embeddings_error() {
   }
   "###);
 
-    let documents =
-        json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "embeddings": true }}});
+    let documents = json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "embeddings": true, "regenerate": true }}});
     let (value, code) = index.add_documents(documents, None).await;
     snapshot!(code, @"202 Accepted");
     let task = index.wait_task(value.uid()).await;
@@ -337,7 +336,7 @@ async fn user_provided_embeddings_error() {
     "indexedDocuments": 0
   },
   "error": {
-    "message": "Bad embedder configuration in the document with id: `\"0\"`. Invalid value type at `.manual.embeddings`: expected null or an array, but found a boolean: `true`",
+    "message": "Bad embedder configuration in the document with id: `0`. Invalid value type at `._vectors.manual.embeddings`: expected null or an array, but found a boolean: `true`",
     "code": "invalid_vectors_type",
     "type": "invalid_request",
     "link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
@@ -349,8 +348,7 @@ async fn user_provided_embeddings_error() {
   }
   "###);
 
-    let documents =
-        json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "embeddings": [true] }}});
+    let documents = json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "embeddings": [true], "regenerate": true }}});
     let (value, code) = index.add_documents(documents, None).await;
     snapshot!(code, @"202 Accepted");
     let task = index.wait_task(value.uid()).await;
@@ -366,7 +364,7 @@ async fn user_provided_embeddings_error() {
     "indexedDocuments": 0
   },
   "error": {
-    "message": "Bad embedder configuration in the document with id: `\"0\"`. Invalid value type at `.manual.embeddings[0]`: expected a number or an array, but found a boolean: `true`",
+    "message": "Bad embedder configuration in the document with id: `0`. Invalid value type at `._vectors.manual.embeddings[0]`: expected a number or an array, but found a boolean: `true`",
     "code": "invalid_vectors_type",
     "type": "invalid_request",
     "link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
@@ -378,8 +376,7 @@ async fn user_provided_embeddings_error() {
   }
   "###);
 
-    let documents =
-        json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "embeddings": [[true]] }}});
+    let documents = json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "embeddings": [[true]], "regenerate": false }}});
     let (value, code) = index.add_documents(documents, None).await;
     snapshot!(code, @"202 Accepted");
     let task = index.wait_task(value.uid()).await;
@@ -395,7 +392,7 @@ async fn user_provided_embeddings_error() {
     "indexedDocuments": 0
   },
   "error": {
-    "message": "Bad embedder configuration in the document with id: `\"0\"`. Invalid value type at `.manual.embeddings[0][0]`: expected a number, but found a boolean: `true`",
+    "message": "Bad embedder configuration in the document with id: `0`. Invalid value type at `._vectors.manual.embeddings[0][0]`: expected a number, but found a boolean: `true`",
     "code": "invalid_vectors_type",
     "type": "invalid_request",
     "link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
@@ -436,7 +433,7 @@ async fn user_provided_embeddings_error() {
     "indexedDocuments": 0
   },
   "error": {
-    "message": "Bad embedder configuration in the document with id: `\"0\"`. Invalid value type at `.manual.embeddings[1]`: expected a number, but found an array: `[0.2,0.3]`",
+    "message": "Bad embedder configuration in the document with id: `0`. Invalid value type at `._vectors.manual.embeddings[1]`: expected a number, but found an array: `[0.2,0.3]`",
     "code": "invalid_vectors_type",
     "type": "invalid_request",
     "link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
@@ -464,7 +461,7 @@ async fn user_provided_embeddings_error() {
     "indexedDocuments": 0
   },
   "error": {
-    "message": "Bad embedder configuration in the document with id: `\"0\"`. Invalid value type at `.manual.embeddings[1]`: expected an array, but found a number: `0.3`",
+    "message": "Bad embedder configuration in the document with id: `0`. Invalid value type at `._vectors.manual.embeddings[1]`: expected an array, but found a number: `0.3`",
     "code": "invalid_vectors_type",
     "type": "invalid_request",
     "link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
@@ -492,7 +489,7 @@ async fn user_provided_embeddings_error() {
     "indexedDocuments": 0
   },
   "error": {
-    "message": "Bad embedder configuration in the document with id: `\"0\"`. Invalid value type at `.manual.embeddings[0][1]`: expected a number, but found a boolean: `true`",
+    "message": "Bad embedder configuration in the document with id: `0`. Invalid value type at `._vectors.manual.embeddings[0][1]`: expected a number, but found a boolean: `true`",
     "code": "invalid_vectors_type",
     "type": "invalid_request",
     "link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
@@ -532,7 +529,7 @@ async fn user_provided_vectors_error() {
     "indexedDocuments": 0
   },
   "error": {
-    "message": "While embedding documents for embedder `manual`: no vectors provided for document \"40\" and at least 4 other document(s)\n- Note: `manual` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.manual`.\n- Hint: opt-out for a document with `_vectors.manual: null`",
+    "message": "While embedding documents for embedder `manual`: no vectors provided for document `40` and at least 4 other document(s)\n- Note: `manual` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.manual`.\n- Hint: opt-out for a document with `_vectors.manual: null`",
     "code": "vector_embedding_error",
     "type": "invalid_request",
     "link": "https://docs.meilisearch.com/errors#vector_embedding_error"
@@ -561,7 +558,7 @@ async fn user_provided_vectors_error() {
     "indexedDocuments": 0
   },
   "error": {
-    "message": "While embedding documents for embedder `manual`: no vectors provided for document \"42\"\n- Note: `manual` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.manual`.\n- Hint: try replacing `_vector` by `_vectors` in 1 document(s).",
+    "message": "While embedding documents for embedder `manual`: no vectors provided for document `42`\n- Note: `manual` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.manual`.\n- Hint: try replacing `_vector` by `_vectors` in 1 document(s).",
     "code": "vector_embedding_error",
     "type": "invalid_request",
     "link": "https://docs.meilisearch.com/errors#vector_embedding_error"
@@ -590,7 +587,7 @@ async fn user_provided_vectors_error() {
     "indexedDocuments": 0
   },
   "error": {
-    "message": "While embedding documents for embedder `manual`: no vectors provided for document \"42\"\n- Note: `manual` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.manual`.\n- Hint: try replacing `_vectors.manaul` by `_vectors.manual` in 1 document(s).",
+    "message": "While embedding documents for embedder `manual`: no vectors provided for document `42`\n- Note: `manual` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.manual`.\n- Hint: try replacing `_vectors.manaul` by `_vectors.manual` in 1 document(s).",
     "code": "vector_embedding_error",
     "type": "invalid_request",
     "link": "https://docs.meilisearch.com/errors#vector_embedding_error"

View File

@@ -1,5 +1,4 @@
 use std::collections::BTreeMap;
-use std::sync::atomic::{AtomicUsize, Ordering};
 
 use meili_snap::{json_string, snapshot};
 use reqwest::IntoUrl;
@@ -13,13 +12,22 @@ use crate::vector::{get_server_vector, GetAllDocumentsOptions};
 async fn create_mock() -> (MockServer, Value) {
     let mock_server = MockServer::start().await;
 
-    let counter = AtomicUsize::new(0);
+    let text_to_embedding: BTreeMap<_, _> = vec![
+        // text -> embedding
+        ("kefir", [0.0, 0.0, 0.0]),
+        ("intel", [1.0, 1.0, 1.0]),
+    ]
+    // turn into btree
+    .into_iter()
+    .collect();
 
     Mock::given(method("POST"))
         .and(path("/"))
-        .respond_with(move |_req: &Request| {
-            let counter = counter.fetch_add(1, Ordering::Relaxed);
-            ResponseTemplate::new(200).set_body_json(json!({ "data": vec![counter; 3] }))
+        .respond_with(move |req: &Request| {
+            let text: String = req.body_json().unwrap();
+            ResponseTemplate::new(200).set_body_json(
+                json!({ "data": text_to_embedding.get(text.as_str()).unwrap_or(&[99., 99., 99.]) }),
+            )
         })
         .mount(&mock_server)
         .await;
@@ -32,13 +40,14 @@ async fn create_mock() -> (MockServer, Value) {
         "request": "{{text}}",
         "response": {
             "data": "{{embedding}}"
-        }
+        },
+        "documentTemplate": "{{doc.name}}",
     });
 
     (mock_server, embedder_settings)
 }
 
-async fn create_mock_map() -> (MockServer, Value) {
+async fn create_mock_default_template() -> (MockServer, Value) {
     let mock_server = MockServer::start().await;
 
     let text_to_embedding: BTreeMap<_, _> = vec![
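Editor's note: the pattern repeated throughout this file replaces a shared `AtomicUsize` counter with a deterministic text-to-embedding table, so responses no longer depend on request ordering across test threads (see the "Make REST mock thread independent" commit). A minimal standalone sketch of that pattern, assuming the wiremock API used in this file and a hypothetical `start_embedding_mock` helper name:

    use std::collections::BTreeMap;
    use wiremock::matchers::{method, path};
    use wiremock::{Mock, MockServer, Request, ResponseTemplate};

    async fn start_embedding_mock() -> MockServer {
        let server = MockServer::start().await;
        let table: BTreeMap<&str, [f32; 3]> =
            [("kefir", [0.0, 0.0, 0.0]), ("intel", [1.0, 1.0, 1.0])].into_iter().collect();
        Mock::given(method("POST"))
            .and(path("/"))
            .respond_with(move |req: &Request| {
                // Deterministic: the same input text always yields the same vector.
                let text: String = req.body_json().unwrap();
                let embedding = table.get(text.as_str()).copied().unwrap_or([99.0; 3]);
                ResponseTemplate::new(200).set_body_json(serde_json::json!({ "data": embedding }))
            })
            .mount(&server)
            .await;
        server
    }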
@@ -97,7 +106,14 @@ struct SingleResponse {
 async fn create_mock_multiple() -> (MockServer, Value) {
     let mock_server = MockServer::start().await;
 
-    let counter = AtomicUsize::new(0);
+    let text_to_embedding: BTreeMap<_, _> = vec![
+        // text -> embedding
+        ("kefir", [0.0, 0.0, 0.0]),
+        ("intel", [1.0, 1.0, 1.0]),
+    ]
+    // turn into btree
+    .into_iter()
+    .collect();
 
     Mock::given(method("POST"))
         .and(path("/"))
@@ -115,8 +131,11 @@ async fn create_mock_multiple() -> (MockServer, Value) {
                 .input
                 .into_iter()
                 .map(|text| SingleResponse {
+                    embedding: text_to_embedding
+                        .get(text.as_str())
+                        .unwrap_or(&[99., 99., 99.])
+                        .to_vec(),
                     text,
-                    embedding: vec![counter.fetch_add(1, Ordering::Relaxed) as f32; 3],
                 })
                 .collect();
@@ -142,7 +161,8 @@ async fn create_mock_multiple() -> (MockServer, Value) {
             },
             "{{..}}"
         ]
-    }
+    },
+    "documentTemplate": "{{doc.name}}"
     });
 
     (mock_server, embedder_settings)
@ -156,7 +176,14 @@ struct SingleRequest {
async fn create_mock_single_response_in_array() -> (MockServer, Value) { async fn create_mock_single_response_in_array() -> (MockServer, Value) {
let mock_server = MockServer::start().await; let mock_server = MockServer::start().await;
let counter = AtomicUsize::new(0); let text_to_embedding: BTreeMap<_, _> = vec![
// text -> embedding
("kefir", [0.0, 0.0, 0.0]),
("intel", [1.0, 1.0, 1.0]),
]
// turn into btree
.into_iter()
.collect();
Mock::given(method("POST")) Mock::given(method("POST"))
.and(path("/")) .and(path("/"))
@ -171,8 +198,11 @@ async fn create_mock_single_response_in_array() -> (MockServer, Value) {
}; };
let output = vec![SingleResponse { let output = vec![SingleResponse {
embedding: text_to_embedding
.get(req.input.as_str())
.unwrap_or(&[99., 99., 99.])
.to_vec(),
text: req.input, text: req.input,
embedding: vec![counter.fetch_add(1, Ordering::Relaxed) as f32; 3],
}]; }];
let response = MultipleResponse { output }; let response = MultipleResponse { output };
@ -196,7 +226,8 @@ async fn create_mock_single_response_in_array() -> (MockServer, Value) {
"embedding": "{{embedding}}" "embedding": "{{embedding}}"
} }
] ]
} },
"documentTemplate": "{{doc.name}}"
}); });
(mock_server, embedder_settings) (mock_server, embedder_settings)
@ -205,7 +236,14 @@ async fn create_mock_single_response_in_array() -> (MockServer, Value) {
async fn create_mock_raw_with_custom_header() -> (MockServer, Value) { async fn create_mock_raw_with_custom_header() -> (MockServer, Value) {
let mock_server = MockServer::start().await; let mock_server = MockServer::start().await;
let counter = AtomicUsize::new(0); let text_to_embedding: BTreeMap<_, _> = vec![
// text -> embedding
("kefir", [0.0, 0.0, 0.0]),
("intel", [1.0, 1.0, 1.0]),
]
// turn into btree
.into_iter()
.collect();
Mock::given(method("POST")) Mock::given(method("POST"))
.and(path("/")) .and(path("/"))
@ -223,7 +261,7 @@ async fn create_mock_raw_with_custom_header() -> (MockServer, Value) {
} }
} }
let _req: String = match req.body_json() { let req: String = match req.body_json() {
Ok(req) => req, Ok(req) => req,
Err(error) => { Err(error) => {
return ResponseTemplate::new(400).set_body_json(json!({ return ResponseTemplate::new(400).set_body_json(json!({
@ -232,7 +270,7 @@ async fn create_mock_raw_with_custom_header() -> (MockServer, Value) {
} }
}; };
let output = vec![counter.fetch_add(1, Ordering::Relaxed) as f32; 3]; let output = text_to_embedding.get(req.as_str()).unwrap_or(&[99., 99., 99.]).to_vec();
ResponseTemplate::new(200).set_body_json(output) ResponseTemplate::new(200).set_body_json(output)
}) })
@ -245,7 +283,8 @@ async fn create_mock_raw_with_custom_header() -> (MockServer, Value) {
"url": url, "url": url,
"request": "{{text}}", "request": "{{text}}",
"response": "{{embedding}}", "response": "{{embedding}}",
"headers": {"my-nonstandard-auth": "bearer of the ring"} "headers": {"my-nonstandard-auth": "bearer of the ring"},
"documentTemplate": "{{doc.name}}"
}); });
(mock_server, embedder_settings) (mock_server, embedder_settings)
@ -254,12 +293,19 @@ async fn create_mock_raw_with_custom_header() -> (MockServer, Value) {
async fn create_mock_raw() -> (MockServer, Value) { async fn create_mock_raw() -> (MockServer, Value) {
let mock_server = MockServer::start().await; let mock_server = MockServer::start().await;
let counter = AtomicUsize::new(0); let text_to_embedding: BTreeMap<_, _> = vec![
// text -> embedding
("kefir", [0.0, 0.0, 0.0]),
("intel", [1.0, 1.0, 1.0]),
]
// turn into btree
.into_iter()
.collect();
Mock::given(method("POST")) Mock::given(method("POST"))
.and(path("/")) .and(path("/"))
.respond_with(move |req: &Request| { .respond_with(move |req: &Request| {
let _req: String = match req.body_json() { let req: String = match req.body_json() {
Ok(req) => req, Ok(req) => req,
Err(error) => { Err(error) => {
return ResponseTemplate::new(400).set_body_json(json!({ return ResponseTemplate::new(400).set_body_json(json!({
@ -268,7 +314,7 @@ async fn create_mock_raw() -> (MockServer, Value) {
} }
}; };
let output = vec![counter.fetch_add(1, Ordering::Relaxed) as f32; 3]; let output = text_to_embedding.get(req.as_str()).unwrap_or(&[99., 99., 99.]).to_vec();
ResponseTemplate::new(200).set_body_json(output) ResponseTemplate::new(200).set_body_json(output)
}) })
@ -281,29 +327,30 @@ async fn create_mock_raw() -> (MockServer, Value) {
"url": url, "url": url,
"dimensions": 3, "dimensions": 3,
"request": "{{text}}", "request": "{{text}}",
"response": "{{embedding}}" "response": "{{embedding}}",
"documentTemplate": "{{doc.name}}"
}); });
(mock_server, embedder_settings) (mock_server, embedder_settings)
} }
pub async fn post<T: IntoUrl>(url: T) -> reqwest::Result<reqwest::Response> { pub async fn post<T: IntoUrl>(url: T, text: &str) -> reqwest::Result<reqwest::Response> {
reqwest::Client::builder().build()?.post(url).send().await reqwest::Client::builder().build()?.post(url).json(&json!(text)).send().await
} }
#[actix_rt::test] #[actix_rt::test]
async fn dummy_testing_the_mock() { async fn dummy_testing_the_mock() {
let (mock, _setting) = create_mock().await; let (mock, _setting) = create_mock().await;
let body = post(&mock.uri()).await.unwrap().text().await.unwrap(); let body = post(&mock.uri(), "kefir").await.unwrap().text().await.unwrap();
snapshot!(body, @r###"{"data":[0,0,0]}"###); snapshot!(body, @r###"{"data":[0.0,0.0,0.0]}"###);
let body = post(&mock.uri()).await.unwrap().text().await.unwrap(); let body = post(&mock.uri(), "intel").await.unwrap().text().await.unwrap();
snapshot!(body, @r###"{"data":[1,1,1]}"###); snapshot!(body, @r###"{"data":[1.0,1.0,1.0]}"###);
let body = post(&mock.uri()).await.unwrap().text().await.unwrap(); let body = post(&mock.uri(), "kefir").await.unwrap().text().await.unwrap();
snapshot!(body, @r###"{"data":[2,2,2]}"###); snapshot!(body, @r###"{"data":[0.0,0.0,0.0]}"###);
let body = post(&mock.uri()).await.unwrap().text().await.unwrap(); let body = post(&mock.uri(), "kefir").await.unwrap().text().await.unwrap();
snapshot!(body, @r###"{"data":[3,3,3]}"###); snapshot!(body, @r###"{"data":[0.0,0.0,0.0]}"###);
let body = post(&mock.uri()).await.unwrap().text().await.unwrap(); let body = post(&mock.uri(), "intel").await.unwrap().text().await.unwrap();
snapshot!(body, @r###"{"data":[4,4,4]}"###); snapshot!(body, @r###"{"data":[1.0,1.0,1.0]}"###);
} }
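Taken together, these changes make the REST mocks deterministic: responses are keyed on the request text instead of a shared call counter, so tests no longer depend on call order. A minimal sketch of the property the reworked dummy test relies on (hypothetical async test code, using the `post` helper as redefined above):

// Determinism sketch: the same text always yields the same vector,
// unlike the old AtomicUsize counter which depended on call order.
let first = post(&mock.uri(), "kefir").await.unwrap().text().await.unwrap();
let second = post(&mock.uri(), "kefir").await.unwrap().text().await.unwrap();
assert_eq!(first, second); // both are {"data":[0.0,0.0,0.0]}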
#[actix_rt::test] #[actix_rt::test]
@ -953,7 +1000,7 @@ async fn bad_settings() {
let (response, code) = index let (response, code) = index
.update_settings(json!({ .update_settings(json!({
"embedders": { "embedders": {
"rest": json!({ "source": "rest", "url": mock.uri(), "request": "{{text}}", "response": { "data": "{{embedding}}" }, "dimensions": 2 }), "rest": json!({ "source": "rest", "url": mock.uri(), "request": "{{text}}", "response": { "data": "{{embedding}}" }, "dimensions": 2, "documentTemplate": "{{doc.name}}" }),
}, },
})) }))
.await; .await;
@ -1920,6 +1967,7 @@ async fn server_custom_header() {
"embedders": { "embedders": {
"rest": { "rest": {
"source": "rest", "source": "rest",
"documentTemplate": "{{doc.name}}",
"url": "[url]", "url": "[url]",
"request": "{{text}}", "request": "{{text}}",
"response": "{{embedding}}", "response": "{{embedding}}",
@ -1940,7 +1988,7 @@ async fn server_custom_header() {
#[actix_rt::test] #[actix_rt::test]
async fn searchable_reindex() { async fn searchable_reindex() {
let (_mock, setting) = create_mock_map().await; let (_mock, setting) = create_mock_default_template().await;
let server = get_server_vector().await; let server = get_server_vector().await;
let index = server.index("doggo"); let index = server.index("doggo");


@ -100,6 +100,7 @@ bumpalo = "3.16.0"
thread_local = "1.1.8" thread_local = "1.1.8"
allocator-api2 = "0.2.18" allocator-api2 = "0.2.18"
rustc-hash = "2.0.0" rustc-hash = "2.0.0"
uell = "0.1.0"
[dev-dependencies] [dev-dependencies]
mimalloc = { version = "0.1.43", default-features = false } mimalloc = { version = "0.1.43", default-features = false }


@ -122,7 +122,7 @@ and can not be more than 512 bytes.", .document_id.to_string()
#[error("The `_vectors` field in the document with id: `{document_id}` is not an object. Was expecting an object with a key for each embedder with manually provided vectors, but instead got `{value}`")] #[error("The `_vectors` field in the document with id: `{document_id}` is not an object. Was expecting an object with a key for each embedder with manually provided vectors, but instead got `{value}`")]
InvalidVectorsMapType { document_id: String, value: Value }, InvalidVectorsMapType { document_id: String, value: Value },
#[error("Bad embedder configuration in the document with id: `{document_id}`. {error}")] #[error("Bad embedder configuration in the document with id: `{document_id}`. {error}")]
InvalidVectorsEmbedderConf { document_id: String, error: deserr::errors::JsonError }, InvalidVectorsEmbedderConf { document_id: String, error: String },
#[error("{0}")] #[error("{0}")]
InvalidFilter(String), InvalidFilter(String),
#[error("Invalid type for filter subexpression: expected: {}, found: {1}.", .0.join(", "))] #[error("Invalid type for filter subexpression: expected: {}, found: {1}.", .0.join(", "))]


@ -27,17 +27,34 @@ impl heed::BytesEncode<'_> for OrderedF64Codec {
fn bytes_encode(f: &Self::EItem) -> Result<Cow<'_, [u8]>, BoxedError> { fn bytes_encode(f: &Self::EItem) -> Result<Cow<'_, [u8]>, BoxedError> {
let mut buffer = [0u8; 16]; let mut buffer = [0u8; 16];
// write the globally ordered float encode_f64_into_ordered_bytes(*f, &mut buffer)?;
let bytes = f64_into_bytes(*f).ok_or(InvalidGloballyOrderedFloatError { float: *f })?;
buffer[..8].copy_from_slice(&bytes[..]);
// Then the f64 value just to be able to read it back
let bytes = f.to_be_bytes();
buffer[8..16].copy_from_slice(&bytes[..]);
Ok(Cow::Owned(buffer.to_vec())) Ok(Cow::Owned(buffer.to_vec()))
} }
} }
impl OrderedF64Codec {
pub fn serialize_into(
f: f64,
buffer: &mut [u8; 16],
) -> Result<(), InvalidGloballyOrderedFloatError> {
encode_f64_into_ordered_bytes(f, buffer)
}
}
fn encode_f64_into_ordered_bytes(
f: f64,
buffer: &mut [u8; 16],
) -> Result<(), InvalidGloballyOrderedFloatError> {
let bytes = f64_into_bytes(f).ok_or(InvalidGloballyOrderedFloatError { float: f })?;
buffer[..8].copy_from_slice(&bytes[..]);
// Then the f64 value just to be able to read it back
let bytes = f.to_be_bytes();
buffer[8..16].copy_from_slice(&bytes[..]);
Ok(())
}
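The refactor funnels both the codec and the new `serialize_into` through one helper, so the 16-byte layout is defined in a single place: the first 8 bytes sort globally, the trailing 8 bytes hold the raw big-endian f64 for read-back. A property sketch (hypothetical test code, assuming the API above):

// Lexicographic byte order must agree with numeric order, and the
// original value must round-trip from the trailing 8 bytes.
let (mut a, mut b) = ([0u8; 16], [0u8; 16]);
OrderedF64Codec::serialize_into(-1.5, &mut a).unwrap();
OrderedF64Codec::serialize_into(2.0, &mut b).unwrap();
assert!(a < b);
assert_eq!(f64::from_be_bytes(b[8..16].try_into().unwrap()), 2.0);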
#[derive(Error, Debug)] #[derive(Error, Debug)]
#[error("the float {float} cannot be converted to a globally ordered representation")] #[error("the float {float} cannot be converted to a globally ordered representation")]
pub struct InvalidGloballyOrderedFloatError { pub struct InvalidGloballyOrderedFloatError {


@ -737,7 +737,7 @@ pub(crate) fn write_typed_chunk_into_index(
} }
/// Converts the latitude and longitude back to an xyz GeoPoint. /// Converts the latitude and longitude back to an xyz GeoPoint.
fn extract_geo_point(value: &[u8], docid: DocumentId) -> GeoPoint { pub fn extract_geo_point(value: &[u8], docid: DocumentId) -> GeoPoint {
let (lat, tail) = helpers::try_split_array_at::<u8, 8>(value).unwrap(); let (lat, tail) = helpers::try_split_array_at::<u8, 8>(value).unwrap();
let (lng, _) = helpers::try_split_array_at::<u8, 8>(tail).unwrap(); let (lng, _) = helpers::try_split_array_at::<u8, 8>(tail).unwrap();
let point = [f64::from_ne_bytes(lat), f64::from_ne_bytes(lng)]; let point = [f64::from_ne_bytes(lat), f64::from_ne_bytes(lng)];


@ -2,12 +2,16 @@ use std::marker::PhantomData;
use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::atomic::{AtomicUsize, Ordering};
use crossbeam_channel::{IntoIter, Receiver, SendError, Sender}; use crossbeam_channel::{IntoIter, Receiver, SendError, Sender};
use hashbrown::HashMap;
use heed::types::Bytes; use heed::types::Bytes;
use heed::BytesDecode;
use memmap2::Mmap;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use super::extract::FacetKind; use super::extract::FacetKind;
use super::StdResult; use super::StdResult;
use crate::heed_codec::facet::{FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec};
use crate::index::main_key::{GEO_FACETED_DOCUMENTS_IDS_KEY, GEO_RTREE_KEY};
use crate::index::IndexEmbeddingConfig;
use crate::update::new::KvReaderFieldId; use crate::update::new::KvReaderFieldId;
use crate::vector::Embedding; use crate::vector::Embedding;
use crate::{DocumentId, Index}; use crate::{DocumentId, Index};
@ -26,9 +30,9 @@ pub fn extractor_writer_channel(cap: usize) -> (ExtractorSender, WriterReceiver)
) )
} }
pub struct KeyValueEntry { pub enum KeyValueEntry {
pub key_length: usize, Small { key_length: usize, data: Box<[u8]> },
pub data: Box<[u8]>, Large { key_entry: KeyEntry, data: Mmap },
} }
impl KeyValueEntry { impl KeyValueEntry {
@ -36,14 +40,25 @@ impl KeyValueEntry {
let mut data = Vec::with_capacity(key.len() + value.len()); let mut data = Vec::with_capacity(key.len() + value.len());
data.extend_from_slice(key); data.extend_from_slice(key);
data.extend_from_slice(value); data.extend_from_slice(value);
KeyValueEntry { key_length: key.len(), data: data.into_boxed_slice() } KeyValueEntry::Small { key_length: key.len(), data: data.into_boxed_slice() }
} }
fn from_large_key_value(key: &[u8], value: Mmap) -> Self {
KeyValueEntry::Large { key_entry: KeyEntry::from_key(key), data: value }
}
pub fn key(&self) -> &[u8] { pub fn key(&self) -> &[u8] {
&self.data[..self.key_length] match self {
KeyValueEntry::Small { key_length, data } => &data[..*key_length],
KeyValueEntry::Large { key_entry, data: _ } => key_entry.entry(),
}
} }
pub fn value(&self) -> &[u8] { pub fn value(&self) -> &[u8] {
&self.data[self.key_length..] match self {
KeyValueEntry::Small { key_length, data } => &data[*key_length..],
KeyValueEntry::Large { key_entry: _, data } => &data[..],
}
} }
} }
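Splitting `KeyValueEntry` into `Small` and `Large` lets big payloads (such as the serialized rtree sent through `GeoSender::set_rtree` below) stay memory-mapped instead of being copied into the channel buffer. A usage sketch for the small variant, assuming `from_small_key_value` keeps the signature implied above:

// Small entries pack key and value into one boxed slice; `key()` and
// `value()` slice it back apart using the stored key length.
let entry = KeyValueEntry::from_small_key_value(b"facet-key", b"facet-value");
assert_eq!(entry.key(), b"facet-key");
assert_eq!(entry.value(), b"facet-value");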
@ -87,7 +102,7 @@ pub enum ArroyOperation {
embedding: Embedding, embedding: Embedding,
}, },
Finish { Finish {
user_provided: HashMap<String, RoaringBitmap>, configs: Vec<IndexEmbeddingConfig>,
}, },
} }
@ -98,6 +113,7 @@ pub struct DbOperation {
#[derive(Debug)] #[derive(Debug)]
pub enum Database { pub enum Database {
Main,
Documents, Documents,
ExternalDocumentsIds, ExternalDocumentsIds,
ExactWordDocids, ExactWordDocids,
@ -111,11 +127,14 @@ pub enum Database {
FacetIdExistsDocids, FacetIdExistsDocids,
FacetIdF64NumberDocids, FacetIdF64NumberDocids,
FacetIdStringDocids, FacetIdStringDocids,
FieldIdDocidFacetStrings,
FieldIdDocidFacetF64s,
} }
impl Database { impl Database {
pub fn database(&self, index: &Index) -> heed::Database<Bytes, Bytes> { pub fn database(&self, index: &Index) -> heed::Database<Bytes, Bytes> {
match self { match self {
Database::Main => index.main.remap_types(),
Database::Documents => index.documents.remap_types(), Database::Documents => index.documents.remap_types(),
Database::ExternalDocumentsIds => index.external_documents_ids.remap_types(), Database::ExternalDocumentsIds => index.external_documents_ids.remap_types(),
Database::ExactWordDocids => index.exact_word_docids.remap_types(), Database::ExactWordDocids => index.exact_word_docids.remap_types(),
@ -129,6 +148,8 @@ impl Database {
Database::FacetIdExistsDocids => index.facet_id_exists_docids.remap_types(), Database::FacetIdExistsDocids => index.facet_id_exists_docids.remap_types(),
Database::FacetIdF64NumberDocids => index.facet_id_f64_docids.remap_types(), Database::FacetIdF64NumberDocids => index.facet_id_f64_docids.remap_types(),
Database::FacetIdStringDocids => index.facet_id_string_docids.remap_types(), Database::FacetIdStringDocids => index.facet_id_string_docids.remap_types(),
Database::FieldIdDocidFacetStrings => index.field_id_docid_facet_strings.remap_types(),
Database::FieldIdDocidFacetF64s => index.field_id_docid_facet_f64s.remap_types(),
} }
} }
} }
@ -200,6 +221,10 @@ impl ExtractorSender {
FacetDocidsSender { sender: self } FacetDocidsSender { sender: self }
} }
pub fn field_id_docid_facet_sender(&self) -> FieldIdDocidFacetSender<'_> {
FieldIdDocidFacetSender(self)
}
pub fn documents(&self) -> DocumentsSender<'_> { pub fn documents(&self) -> DocumentsSender<'_> {
DocumentsSender(self) DocumentsSender(self)
} }
@ -208,6 +233,10 @@ impl ExtractorSender {
EmbeddingSender(&self.sender) EmbeddingSender(&self.sender)
} }
pub fn geo(&self) -> GeoSender<'_> {
GeoSender(&self.sender)
}
fn send_delete_vector(&self, docid: DocumentId) -> StdResult<(), SendError<()>> { fn send_delete_vector(&self, docid: DocumentId) -> StdResult<(), SendError<()>> {
match self match self
.sender .sender
@ -332,6 +361,36 @@ impl DocidsSender for FacetDocidsSender<'_> {
} }
} }
pub struct FieldIdDocidFacetSender<'a>(&'a ExtractorSender);
impl FieldIdDocidFacetSender<'_> {
pub fn write_facet_string(&self, key: &[u8], value: &[u8]) -> StdResult<(), SendError<()>> {
debug_assert!(FieldDocIdFacetStringCodec::bytes_decode(key).is_ok());
let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value(&key, &value));
self.0
.send_db_operation(DbOperation { database: Database::FieldIdDocidFacetStrings, entry })
}
pub fn write_facet_f64(&self, key: &[u8]) -> StdResult<(), SendError<()>> {
debug_assert!(FieldDocIdFacetF64Codec::bytes_decode(key).is_ok());
let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value(&key, &[]));
self.0.send_db_operation(DbOperation { database: Database::FieldIdDocidFacetF64s, entry })
}
pub fn delete_facet_string(&self, key: &[u8]) -> StdResult<(), SendError<()>> {
debug_assert!(FieldDocIdFacetStringCodec::bytes_decode(key).is_ok());
let entry = EntryOperation::Delete(KeyEntry::from_key(key));
self.0
.send_db_operation(DbOperation { database: Database::FieldIdDocidFacetStrings, entry })
}
pub fn delete_facet_f64(&self, key: &[u8]) -> StdResult<(), SendError<()>> {
debug_assert!(FieldDocIdFacetF64Codec::bytes_decode(key).is_ok());
let entry = EntryOperation::Delete(KeyEntry::from_key(key));
self.0.send_db_operation(DbOperation { database: Database::FieldIdDocidFacetF64s, entry })
}
}
pub struct DocumentsSender<'a>(&'a ExtractorSender); pub struct DocumentsSender<'a>(&'a ExtractorSender);
impl DocumentsSender<'_> { impl DocumentsSender<'_> {
@ -418,12 +477,40 @@ impl EmbeddingSender<'_> {
} }
/// Marks all embedders as "to be built" /// Marks all embedders as "to be built"
pub fn finish( pub fn finish(self, configs: Vec<IndexEmbeddingConfig>) -> StdResult<(), SendError<()>> {
self,
user_provided: HashMap<String, RoaringBitmap>,
) -> StdResult<(), SendError<()>> {
self.0 self.0
.send(WriterOperation::ArroyOperation(ArroyOperation::Finish { user_provided })) .send(WriterOperation::ArroyOperation(ArroyOperation::Finish { configs }))
.map_err(|_| SendError(()))
}
}
pub struct GeoSender<'a>(&'a Sender<WriterOperation>);
impl GeoSender<'_> {
pub fn set_rtree(&self, value: Mmap) -> StdResult<(), SendError<()>> {
self.0
.send(WriterOperation::DbOperation(DbOperation {
database: Database::Main,
entry: EntryOperation::Write(KeyValueEntry::from_large_key_value(
GEO_RTREE_KEY.as_bytes(),
value,
)),
}))
.map_err(|_| SendError(()))
}
pub fn set_geo_faceted(&self, bitmap: &RoaringBitmap) -> StdResult<(), SendError<()>> {
let mut buffer = Vec::new();
bitmap.serialize_into(&mut buffer).unwrap();
self.0
.send(WriterOperation::DbOperation(DbOperation {
database: Database::Main,
entry: EntryOperation::Write(KeyValueEntry::from_small_key_value(
GEO_FACETED_DOCUMENTS_IDS_KEY.as_bytes(),
&buffer,
)),
}))
.map_err(|_| SendError(())) .map_err(|_| SendError(()))
} }
} }
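Both geo entries land in the `Main` database: the rtree as a memory-mapped blob under `GEO_RTREE_KEY`, and the geo-faceted docids as a serialized `RoaringBitmap` under `GEO_FACETED_DOCUMENTS_IDS_KEY`. Reading the bitmap back is the mirror of `set_geo_faceted`, roughly:

// Deserialization sketch: the payload written by set_geo_faceted
// round-trips through RoaringBitmap::deserialize_from.
let bitmap = RoaringBitmap::deserialize_from(&buffer[..]).unwrap();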


@ -332,18 +332,31 @@ where
} }
vectors.insert( vectors.insert(
name, name,
serde_json::json!({ if entry.implicit {
"regenerate": entry.regenerate, serde_json::json!(entry.embeddings)
// TODO: consider optimizing the shape of embedders here to store an array of f32 rather than a JSON object } else {
"embeddings": entry.embeddings, serde_json::json!({
}), "regenerate": entry.regenerate,
// TODO: consider optimizing the shape of embedders here to store an array of f32 rather than a JSON object
"embeddings": entry.embeddings,
})
},
); );
} }
if vectors.is_empty() {
break 'inject_vectors;
}
vectors_value = serde_json::value::to_raw_value(&vectors).unwrap(); vectors_value = serde_json::value::to_raw_value(&vectors).unwrap();
unordered_field_buffer.push((vectors_fid, &vectors_value)); unordered_field_buffer.push((vectors_fid, &vectors_value));
} }
if let Some(geo_value) = document.geo_field()? {
let fid = fields_ids_map.id_or_insert("_geo").ok_or(UserError::AttributeLimitReached)?;
unordered_field_buffer.push((fid, geo_value));
}
unordered_field_buffer.sort_by_key(|(fid, _)| *fid); unordered_field_buffer.sort_by_key(|(fid, _)| *fid);
for (fid, value) in unordered_field_buffer.iter() { for (fid, value) in unordered_field_buffer.iter() {
writer.insert(*fid, value.get().as_bytes()).unwrap(); writer.insert(*fid, value.get().as_bytes()).unwrap();
@ -398,6 +411,7 @@ impl<'doc> Versions<'doc> {
pub fn is_empty(&self) -> bool { pub fn is_empty(&self) -> bool {
self.data.is_empty() self.data.is_empty()
} }
pub fn top_level_field(&self, k: &str) -> Option<&'doc RawValue> { pub fn top_level_field(&self, k: &str) -> Option<&'doc RawValue> {
if k == RESERVED_VECTORS_FIELD_NAME || k == "_geo" { if k == RESERVED_VECTORS_FIELD_NAME || k == "_geo" {
return None; return None;


@ -97,7 +97,7 @@ impl<'doc> Insertion<'doc> {
doc_alloc: &'doc Bump, doc_alloc: &'doc Bump,
embedders: &'doc EmbeddingConfigs, embedders: &'doc EmbeddingConfigs,
) -> Result<Option<VectorDocumentFromVersions<'doc>>> { ) -> Result<Option<VectorDocumentFromVersions<'doc>>> {
VectorDocumentFromVersions::new(&self.new, doc_alloc, embedders) VectorDocumentFromVersions::new(self.external_document_id, &self.new, doc_alloc, embedders)
} }
} }
@ -169,7 +169,7 @@ impl<'doc> Update<'doc> {
doc_alloc: &'doc Bump, doc_alloc: &'doc Bump,
embedders: &'doc EmbeddingConfigs, embedders: &'doc EmbeddingConfigs,
) -> Result<Option<VectorDocumentFromVersions<'doc>>> { ) -> Result<Option<VectorDocumentFromVersions<'doc>>> {
VectorDocumentFromVersions::new(&self.new, doc_alloc, embedders) VectorDocumentFromVersions::new(self.external_document_id, &self.new, doc_alloc, embedders)
} }
pub fn merged_vectors<Mapper: FieldIdMapper>( pub fn merged_vectors<Mapper: FieldIdMapper>(
@ -181,10 +181,22 @@ impl<'doc> Update<'doc> {
embedders: &'doc EmbeddingConfigs, embedders: &'doc EmbeddingConfigs,
) -> Result<Option<MergedVectorDocument<'doc>>> { ) -> Result<Option<MergedVectorDocument<'doc>>> {
if self.has_deletion { if self.has_deletion {
MergedVectorDocument::without_db(&self.new, doc_alloc, embedders) MergedVectorDocument::without_db(
self.external_document_id,
&self.new,
doc_alloc,
embedders,
)
} else { } else {
MergedVectorDocument::with_db( MergedVectorDocument::with_db(
self.docid, index, rtxn, mapper, &self.new, doc_alloc, embedders, self.docid,
self.external_document_id,
index,
rtxn,
mapper,
&self.new,
doc_alloc,
embedders,
) )
} }
} }


@ -54,7 +54,7 @@ impl<'a, 'extractor> Extractor<'extractor> for DocumentsExtractor<'a> {
DocumentChange::Deletion(deletion) => { DocumentChange::Deletion(deletion) => {
let docid = deletion.docid(); let docid = deletion.docid();
let content = deletion.current( let content = deletion.current(
&context.txn, &context.rtxn,
context.index, context.index,
&context.db_fields_ids_map, &context.db_fields_ids_map,
)?; )?;
@ -72,7 +72,7 @@ impl<'a, 'extractor> Extractor<'extractor> for DocumentsExtractor<'a> {
DocumentChange::Update(update) => { DocumentChange::Update(update) => {
let docid = update.docid(); let docid = update.docid();
let content = let content =
update.current(&context.txn, context.index, &context.db_fields_ids_map)?; update.current(&context.rtxn, context.index, &context.db_fields_ids_map)?;
for res in content.iter_top_level_fields() { for res in content.iter_top_level_fields() {
let (f, _) = res?; let (f, _) = res?;
let entry = document_extractor_data let entry = document_extractor_data
@ -92,9 +92,9 @@ impl<'a, 'extractor> Extractor<'extractor> for DocumentsExtractor<'a> {
} }
let content = let content =
update.merged(&context.txn, context.index, &context.db_fields_ids_map)?; update.merged(&context.rtxn, context.index, &context.db_fields_ids_map)?;
let vector_content = update.merged_vectors( let vector_content = update.merged_vectors(
&context.txn, &context.rtxn,
context.index, context.index,
&context.db_fields_ids_map, &context.db_fields_ids_map,
&context.doc_alloc, &context.doc_alloc,


@ -1,16 +1,21 @@
use std::cell::RefCell; use std::cell::RefCell;
use std::collections::HashSet; use std::collections::HashSet;
use std::mem::size_of;
use std::ops::DerefMut as _; use std::ops::DerefMut as _;
use bumpalo::collections::Vec as BVec;
use bumpalo::Bump; use bumpalo::Bump;
use heed::RoTxn; use hashbrown::HashMap;
use heed::{BytesDecode, RoTxn};
use serde_json::Value; use serde_json::Value;
use super::super::cache::BalancedCaches; use super::super::cache::BalancedCaches;
use super::facet_document::extract_document_facets; use super::facet_document::extract_document_facets;
use super::FacetKind; use super::FacetKind;
use crate::facet::value_encoding::f64_into_bytes; use crate::facet::value_encoding::f64_into_bytes;
use crate::update::new::extract::DocidsExtractor; use crate::heed_codec::facet::OrderedF64Codec;
use crate::update::del_add::DelAdd;
use crate::update::new::channel::FieldIdDocidFacetSender;
use crate::update::new::indexer::document_changes::{ use crate::update::new::indexer::document_changes::{
extract, DocumentChangeContext, DocumentChanges, Extractor, FullySend, IndexingContext, extract, DocumentChangeContext, DocumentChanges, Extractor, FullySend, IndexingContext,
Progress, ThreadLocal, Progress, ThreadLocal,
@ -22,6 +27,7 @@ use crate::{DocumentId, FieldId, Index, Result, MAX_FACET_VALUE_LENGTH};
pub struct FacetedExtractorData<'a> { pub struct FacetedExtractorData<'a> {
attributes_to_extract: &'a [&'a str], attributes_to_extract: &'a [&'a str],
sender: &'a FieldIdDocidFacetSender<'a>,
grenad_parameters: GrenadParameters, grenad_parameters: GrenadParameters,
buckets: usize, buckets: usize,
} }
@ -48,6 +54,7 @@ impl<'a, 'extractor> Extractor<'extractor> for FacetedExtractorData<'a> {
context, context,
self.attributes_to_extract, self.attributes_to_extract,
change, change,
self.sender,
)? )?
} }
Ok(()) Ok(())
@ -61,12 +68,15 @@ impl FacetedDocidsExtractor {
context: &DocumentChangeContext<RefCell<BalancedCaches>>, context: &DocumentChangeContext<RefCell<BalancedCaches>>,
attributes_to_extract: &[&str], attributes_to_extract: &[&str],
document_change: DocumentChange, document_change: DocumentChange,
sender: &FieldIdDocidFacetSender,
) -> Result<()> { ) -> Result<()> {
let index = &context.index; let index = &context.index;
let rtxn = &context.txn; let rtxn = &context.rtxn;
let mut new_fields_ids_map = context.new_fields_ids_map.borrow_mut_or_yield(); let mut new_fields_ids_map = context.new_fields_ids_map.borrow_mut_or_yield();
let mut cached_sorter = context.data.borrow_mut_or_yield(); let mut cached_sorter = context.data.borrow_mut_or_yield();
match document_change { let mut del_add_facet_value = DelAddFacetValue::new(&context.doc_alloc);
let docid = document_change.docid();
let res = match document_change {
DocumentChange::Deletion(inner) => extract_document_facets( DocumentChange::Deletion(inner) => extract_document_facets(
attributes_to_extract, attributes_to_extract,
inner.current(rtxn, index, context.db_fields_ids_map)?, inner.current(rtxn, index, context.db_fields_ids_map)?,
@ -76,7 +86,9 @@ impl FacetedDocidsExtractor {
&context.doc_alloc, &context.doc_alloc,
cached_sorter.deref_mut(), cached_sorter.deref_mut(),
BalancedCaches::insert_del_u32, BalancedCaches::insert_del_u32,
inner.docid(), &mut del_add_facet_value,
DelAddFacetValue::insert_del,
docid,
fid, fid,
value, value,
) )
@ -92,7 +104,9 @@ impl FacetedDocidsExtractor {
&context.doc_alloc, &context.doc_alloc,
cached_sorter.deref_mut(), cached_sorter.deref_mut(),
BalancedCaches::insert_del_u32, BalancedCaches::insert_del_u32,
inner.docid(), &mut del_add_facet_value,
DelAddFacetValue::insert_del,
docid,
fid, fid,
value, value,
) )
@ -108,7 +122,9 @@ impl FacetedDocidsExtractor {
&context.doc_alloc, &context.doc_alloc,
cached_sorter.deref_mut(), cached_sorter.deref_mut(),
BalancedCaches::insert_add_u32, BalancedCaches::insert_add_u32,
inner.docid(), &mut del_add_facet_value,
DelAddFacetValue::insert_add,
docid,
fid, fid,
value, value,
) )
@ -124,24 +140,31 @@ impl FacetedDocidsExtractor {
&context.doc_alloc, &context.doc_alloc,
cached_sorter.deref_mut(), cached_sorter.deref_mut(),
BalancedCaches::insert_add_u32, BalancedCaches::insert_add_u32,
inner.docid(), &mut del_add_facet_value,
DelAddFacetValue::insert_add,
docid,
fid, fid,
value, value,
) )
}, },
), ),
} };
del_add_facet_value.send_data(docid, sender, &context.doc_alloc).unwrap();
res
} }
fn facet_fn_with_options<'extractor>( fn facet_fn_with_options<'extractor, 'doc>(
doc_alloc: &Bump, doc_alloc: &'doc Bump,
cached_sorter: &mut BalancedCaches<'extractor>, cached_sorter: &mut BalancedCaches<'extractor>,
cache_fn: impl Fn(&mut BalancedCaches<'extractor>, &[u8], u32) -> Result<()>, cache_fn: impl Fn(&mut BalancedCaches<'extractor>, &[u8], u32) -> Result<()>,
del_add_facet_value: &mut DelAddFacetValue<'doc>,
facet_fn: impl Fn(&mut DelAddFacetValue<'doc>, FieldId, BVec<'doc, u8>, FacetKind),
docid: DocumentId, docid: DocumentId,
fid: FieldId, fid: FieldId,
value: &Value, value: &Value,
) -> Result<()> { ) -> Result<()> {
let mut buffer = bumpalo::collections::Vec::new_in(doc_alloc); let mut buffer = BVec::new_in(doc_alloc);
// Exists // Exists
// key: fid // key: fid
buffer.push(FacetKind::Exists as u8); buffer.push(FacetKind::Exists as u8);
@ -152,15 +175,21 @@ impl FacetedDocidsExtractor {
// Number // Number
// key: fid - level - orderedf64 - originalf64 // key: fid - level - orderedf64 - originalf64
Value::Number(number) => { Value::Number(number) => {
if let Some((n, ordered)) = let mut ordered = [0u8; 16];
number.as_f64().and_then(|n| f64_into_bytes(n).map(|ordered| (n, ordered))) if number
.as_f64()
.and_then(|n| OrderedF64Codec::serialize_into(n, &mut ordered).ok())
.is_some()
{ {
let mut number = BVec::with_capacity_in(16, doc_alloc);
number.extend_from_slice(&ordered);
facet_fn(del_add_facet_value, fid, number, FacetKind::Number);
buffer.clear(); buffer.clear();
buffer.push(FacetKind::Number as u8); buffer.push(FacetKind::Number as u8);
buffer.extend_from_slice(&fid.to_be_bytes()); buffer.extend_from_slice(&fid.to_be_bytes());
buffer.push(0); // level 0 buffer.push(0); // level 0
buffer.extend_from_slice(&ordered); buffer.extend_from_slice(&ordered);
buffer.extend_from_slice(&n.to_be_bytes());
cache_fn(cached_sorter, &buffer, docid) cache_fn(cached_sorter, &buffer, docid)
} else { } else {
Ok(()) Ok(())
@ -169,6 +198,10 @@ impl FacetedDocidsExtractor {
// String // String
// key: fid - level - truncated_string // key: fid - level - truncated_string
Value::String(s) => { Value::String(s) => {
let mut string = BVec::new_in(doc_alloc);
string.extend_from_slice(s.as_bytes());
facet_fn(del_add_facet_value, fid, string, FacetKind::String);
let normalized = crate::normalize_facet(s); let normalized = crate::normalize_facet(s);
let truncated = truncate_str(&normalized); let truncated = truncate_str(&normalized);
buffer.clear(); buffer.clear();
@ -211,6 +244,83 @@ impl FacetedDocidsExtractor {
} }
} }
struct DelAddFacetValue<'doc> {
strings: HashMap<(FieldId, BVec<'doc, u8>), DelAdd, hashbrown::DefaultHashBuilder, &'doc Bump>,
f64s: HashMap<(FieldId, BVec<'doc, u8>), DelAdd, hashbrown::DefaultHashBuilder, &'doc Bump>,
}
impl<'doc> DelAddFacetValue<'doc> {
fn new(doc_alloc: &'doc Bump) -> Self {
Self { strings: HashMap::new_in(doc_alloc), f64s: HashMap::new_in(doc_alloc) }
}
fn insert_add(&mut self, fid: FieldId, value: BVec<'doc, u8>, kind: FacetKind) {
let cache = match kind {
FacetKind::String => &mut self.strings,
FacetKind::Number => &mut self.f64s,
_ => return,
};
let key = (fid, value);
if let Some(DelAdd::Deletion) = cache.get(&key) {
cache.remove(&key);
} else {
cache.insert(key, DelAdd::Addition);
}
}
fn insert_del(&mut self, fid: FieldId, value: BVec<'doc, u8>, kind: FacetKind) {
let cache = match kind {
FacetKind::String => &mut self.strings,
FacetKind::Number => &mut self.f64s,
_ => return,
};
let key = (fid, value);
if let Some(DelAdd::Addition) = cache.get(&key) {
cache.remove(&key);
} else {
cache.insert(key, DelAdd::Deletion);
}
}
fn send_data(
self,
docid: DocumentId,
sender: &FieldIdDocidFacetSender,
doc_alloc: &Bump,
) -> std::result::Result<(), crossbeam_channel::SendError<()>> {
let mut buffer = bumpalo::collections::Vec::new_in(doc_alloc);
for ((fid, value), deladd) in self.strings {
if let Ok(s) = std::str::from_utf8(&value) {
buffer.clear();
buffer.extend_from_slice(&fid.to_be_bytes());
buffer.extend_from_slice(&docid.to_be_bytes());
let normalized = crate::normalize_facet(s);
let truncated = truncate_str(&normalized);
buffer.extend_from_slice(truncated.as_bytes());
match deladd {
DelAdd::Deletion => sender.delete_facet_string(&buffer)?,
DelAdd::Addition => sender.write_facet_string(&buffer, &value)?,
}
}
}
for ((fid, value), deladd) in self.f64s {
buffer.clear();
buffer.extend_from_slice(&fid.to_be_bytes());
buffer.extend_from_slice(&docid.to_be_bytes());
buffer.extend_from_slice(&value);
match deladd {
DelAdd::Deletion => sender.delete_facet_f64(&buffer)?,
DelAdd::Addition => sender.write_facet_f64(&buffer)?,
}
}
Ok(())
}
}
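`DelAddFacetValue` nets deletions and additions for the same `(fid, value)` within one document change, so a value removed and re-added in the same update never reaches the writer. A behavioral sketch (the methods are private to this module; illustrative only):

// A del followed by an add of the same bytes cancels to a no-op.
let mut cache = DelAddFacetValue::new(&doc_alloc);
let mut del = BVec::new_in(&doc_alloc);
del.extend_from_slice(b"blue");
cache.insert_del(0, del, FacetKind::String);
let mut add = BVec::new_in(&doc_alloc);
add.extend_from_slice(b"blue");
cache.insert_add(0, add, FacetKind::String); // removes the pending Deletion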
/// Truncates a string to the biggest valid LMDB key size. /// Truncates a string to the biggest valid LMDB key size.
fn truncate_str(s: &str) -> &str { fn truncate_str(s: &str) -> &str {
let index = s let index = s
@ -223,13 +333,23 @@ fn truncate_str(s: &str) -> &str {
&s[..index.unwrap_or(0)] &s[..index.unwrap_or(0)]
} }
impl DocidsExtractor for FacetedDocidsExtractor { impl FacetedDocidsExtractor {
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract::faceted")] #[tracing::instrument(level = "trace", skip_all, target = "indexing::extract::faceted")]
fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP, SP>( pub fn run_extraction<
'pl,
'fid,
'indexer,
'index,
'extractor,
DC: DocumentChanges<'pl>,
MSP,
SP,
>(
grenad_parameters: GrenadParameters, grenad_parameters: GrenadParameters,
document_changes: &DC, document_changes: &DC,
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>, indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>,
extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>, extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
sender: &FieldIdDocidFacetSender,
finished_steps: u16, finished_steps: u16,
total_steps: u16, total_steps: u16,
step_name: &'static str, step_name: &'static str,
@ -254,6 +374,7 @@ impl DocidsExtractor for FacetedDocidsExtractor {
attributes_to_extract: &attributes_to_extract, attributes_to_extract: &attributes_to_extract,
grenad_parameters, grenad_parameters,
buckets: rayon::current_num_threads(), buckets: rayon::current_num_threads(),
sender,
}; };
extract( extract(
document_changes, document_changes,


@ -10,7 +10,8 @@ pub fn extract_document_facets<'doc>(
field_id_map: &mut GlobalFieldsIdsMap, field_id_map: &mut GlobalFieldsIdsMap,
facet_fn: &mut impl FnMut(FieldId, &Value) -> Result<()>, facet_fn: &mut impl FnMut(FieldId, &Value) -> Result<()>,
) -> Result<()> { ) -> Result<()> {
for res in document.iter_top_level_fields() { let geo = document.geo_field().transpose().map(|res| res.map(|rval| ("_geo", rval)));
for res in document.iter_top_level_fields().chain(geo) {
let (field_name, value) = res?; let (field_name, value) = res?;
let mut tokenize_field = |name: &str, value: &Value| match field_id_map.id_or_insert(name) { let mut tokenize_field = |name: &str, value: &Value| match field_id_map.id_or_insert(name) {


@ -28,7 +28,6 @@ impl From<u8> for FacetKind {
impl FacetKind { impl FacetKind {
pub fn extract_from_key(key: &[u8]) -> (FacetKind, &[u8]) { pub fn extract_from_key(key: &[u8]) -> (FacetKind, &[u8]) {
debug_assert!(key.len() > 3);
(FacetKind::from(key[0]), &key[1..]) (FacetKind::from(key[0]), &key[1..])
} }
} }


@ -0,0 +1,320 @@
use std::cell::RefCell;
use std::fs::File;
use std::io::{self, BufReader, BufWriter, ErrorKind, Read, Write as _};
use std::{iter, mem, result};
use bumpalo::Bump;
use bytemuck::{bytes_of, from_bytes, pod_read_unaligned, Pod, Zeroable};
use heed::RoTxn;
use serde_json::value::RawValue;
use serde_json::Value;
use crate::error::GeoError;
use crate::update::new::document::Document;
use crate::update::new::indexer::document_changes::{DocumentChangeContext, Extractor, MostlySend};
use crate::update::new::ref_cell_ext::RefCellExt as _;
use crate::update::new::DocumentChange;
use crate::update::GrenadParameters;
use crate::{lat_lng_to_xyz, DocumentId, GeoPoint, Index, InternalError, Object, Result};
pub struct GeoExtractor {
grenad_parameters: GrenadParameters,
}
impl GeoExtractor {
pub fn new(
rtxn: &RoTxn,
index: &Index,
grenad_parameters: GrenadParameters,
) -> Result<Option<Self>> {
let is_sortable = index.sortable_fields(rtxn)?.contains("_geo");
let is_filterable = index.filterable_fields(rtxn)?.contains("_geo");
if is_sortable || is_filterable {
Ok(Some(GeoExtractor { grenad_parameters }))
} else {
Ok(None)
}
}
}
#[derive(Pod, Zeroable, Copy, Clone)]
#[repr(C, packed)]
pub struct ExtractedGeoPoint {
pub docid: DocumentId,
pub lat_lng: [f64; 2],
}
impl From<ExtractedGeoPoint> for GeoPoint {
/// Converts the latitude and longitude back to an xyz GeoPoint.
fn from(value: ExtractedGeoPoint) -> Self {
let [lat, lng] = value.lat_lng;
let point = [lat, lng];
let xyz_point = lat_lng_to_xyz(&point);
GeoPoint::new(xyz_point, (value.docid, point))
}
}
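Because `ExtractedGeoPoint` is `#[repr(C, packed)]` and derives `Pod`, spill files can store it as raw bytes with no serialization framework, which is exactly what the spill reader below does with `pod_read_unaligned`. A round-trip sketch:

// 20 bytes per point: a u32 docid followed by two packed f64 coordinates.
let point = ExtractedGeoPoint { docid: 42, lat_lng: [48.86, 2.35] };
let bytes = bytemuck::bytes_of(&point);
assert_eq!(bytes.len(), std::mem::size_of::<ExtractedGeoPoint>());
let back: ExtractedGeoPoint = bytemuck::pod_read_unaligned(bytes);
assert_eq!({ back.docid }, 42); // braces copy the packed field before use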
pub struct GeoExtractorData<'extractor> {
/// The set of document ids that were removed. If a document sees its geo
/// point updated, it is first pushed to `removed` and then to `inserted`.
removed: bumpalo::collections::Vec<'extractor, ExtractedGeoPoint>,
inserted: bumpalo::collections::Vec<'extractor, ExtractedGeoPoint>,
/// Spill file for removed geo points, used once the memory budget is exceeded.
spilled_removed: Option<BufWriter<File>>,
/// Spill file for inserted geo points, used once the memory budget is exceeded.
spilled_inserted: Option<BufWriter<File>>,
}
impl<'extractor> GeoExtractorData<'extractor> {
pub fn freeze(self) -> Result<FrozenGeoExtractorData<'extractor>> {
let GeoExtractorData { removed, inserted, spilled_removed, spilled_inserted } = self;
Ok(FrozenGeoExtractorData {
removed: removed.into_bump_slice(),
inserted: inserted.into_bump_slice(),
spilled_removed: spilled_removed
.map(|bw| bw.into_inner().map(BufReader::new).map_err(|iie| iie.into_error()))
.transpose()?,
spilled_inserted: spilled_inserted
.map(|bw| bw.into_inner().map(BufReader::new).map_err(|iie| iie.into_error()))
.transpose()?,
})
}
}
unsafe impl MostlySend for GeoExtractorData<'_> {}
pub struct FrozenGeoExtractorData<'extractor> {
pub removed: &'extractor [ExtractedGeoPoint],
pub inserted: &'extractor [ExtractedGeoPoint],
pub spilled_removed: Option<BufReader<File>>,
pub spilled_inserted: Option<BufReader<File>>,
}
impl<'extractor> FrozenGeoExtractorData<'extractor> {
pub fn iter_and_clear_removed(
&mut self,
) -> impl IntoIterator<Item = io::Result<ExtractedGeoPoint>> + '_ {
mem::take(&mut self.removed)
.iter()
.copied()
.map(Ok)
.chain(iterator_over_spilled_geopoints(&mut self.spilled_removed))
}
pub fn iter_and_clear_inserted(
&mut self,
) -> impl IntoIterator<Item = io::Result<ExtractedGeoPoint>> + '_ {
mem::take(&mut self.inserted)
.iter()
.copied()
.map(Ok)
.chain(iterator_over_spilled_geopoints(&mut self.spilled_inserted))
}
}
fn iterator_over_spilled_geopoints(
spilled: &mut Option<BufReader<File>>,
) -> impl IntoIterator<Item = io::Result<ExtractedGeoPoint>> + '_ {
let mut spilled = spilled.take();
iter::from_fn(move || match &mut spilled {
Some(file) => {
let geopoint_bytes = &mut [0u8; mem::size_of::<ExtractedGeoPoint>()];
match file.read_exact(geopoint_bytes) {
Ok(()) => Some(Ok(pod_read_unaligned(geopoint_bytes))),
Err(e) if e.kind() == ErrorKind::UnexpectedEof => None,
Err(e) => Some(Err(e)),
}
}
None => None,
})
}
impl<'extractor> Extractor<'extractor> for GeoExtractor {
type Data = RefCell<GeoExtractorData<'extractor>>;
fn init_data<'doc>(&'doc self, extractor_alloc: &'extractor Bump) -> Result<Self::Data> {
Ok(RefCell::new(GeoExtractorData {
removed: bumpalo::collections::Vec::new_in(extractor_alloc),
// inserted: Uell::new_in(extractor_alloc),
inserted: bumpalo::collections::Vec::new_in(extractor_alloc),
spilled_inserted: None,
spilled_removed: None,
}))
}
fn process<'doc>(
&'doc self,
changes: impl Iterator<Item = Result<DocumentChange<'doc>>>,
context: &'doc DocumentChangeContext<Self::Data>,
) -> Result<()> {
let rtxn = &context.rtxn;
let index = context.index;
let max_memory = self.grenad_parameters.max_memory;
let db_fields_ids_map = context.db_fields_ids_map;
let mut data_ref = context.data.borrow_mut_or_yield();
for change in changes {
if max_memory.map_or(false, |mm| context.extractor_alloc.allocated_bytes() >= mm) {
// We must spill as we allocated too much memory
data_ref.spilled_removed = tempfile::tempfile().map(BufWriter::new).map(Some)?;
data_ref.spilled_inserted = tempfile::tempfile().map(BufWriter::new).map(Some)?;
}
match change? {
DocumentChange::Deletion(deletion) => {
let docid = deletion.docid();
let external_id = deletion.external_document_id();
let current = deletion.current(rtxn, index, db_fields_ids_map)?;
let current_geo = current
.geo_field()?
.map(|geo| extract_geo_coordinates(external_id, geo))
.transpose()?;
if let Some(lat_lng) = current_geo.flatten() {
let geopoint = ExtractedGeoPoint { docid, lat_lng };
match &mut data_ref.spilled_removed {
Some(file) => file.write_all(bytes_of(&geopoint))?,
None => data_ref.removed.push(geopoint),
}
}
}
DocumentChange::Update(update) => {
let current = update.current(rtxn, index, db_fields_ids_map)?;
let external_id = update.external_document_id();
let docid = update.docid();
let current_geo = current
.geo_field()?
.map(|geo| extract_geo_coordinates(external_id, geo))
.transpose()?;
let updated_geo = update
.updated()
.geo_field()?
.map(|geo| extract_geo_coordinates(external_id, geo))
.transpose()?;
if current_geo != updated_geo {
// If the current and new geo points are different it means that
// we need to replace the current point with the new one and therefore
// delete the current point from the RTree.
if let Some(lat_lng) = current_geo.flatten() {
let geopoint = ExtractedGeoPoint { docid, lat_lng };
match &mut data_ref.spilled_removed {
Some(file) => file.write_all(bytes_of(&geopoint))?,
None => data_ref.removed.push(geopoint),
}
}
if let Some(lat_lng) = updated_geo.flatten() {
let geopoint = ExtractedGeoPoint { docid, lat_lng };
match &mut data_ref.spilled_inserted {
Some(file) => file.write_all(bytes_of(&geopoint))?,
None => data_ref.inserted.push(geopoint),
}
}
}
}
DocumentChange::Insertion(insertion) => {
let external_id = insertion.external_document_id();
let docid = insertion.docid();
let inserted_geo = insertion
.inserted()
.geo_field()?
.map(|geo| extract_geo_coordinates(external_id, geo))
.transpose()?;
if let Some(lat_lng) = inserted_geo.flatten() {
let geopoint = ExtractedGeoPoint { docid, lat_lng };
match &mut data_ref.spilled_inserted {
Some(file) => file.write_all(bytes_of(&geopoint))?,
None => data_ref.inserted.push(geopoint),
}
}
}
}
}
Ok(())
}
}
/// Extracts and validates the latitude and longitude from a document's geo field.
///
/// It can be of the form `{ "lat": 0.0, "lng": "1.0" }`.
fn extract_geo_coordinates(external_id: &str, raw_value: &RawValue) -> Result<Option<[f64; 2]>> {
let mut geo = match serde_json::from_str(raw_value.get()).map_err(InternalError::SerdeJson)? {
Value::Null => return Ok(None),
Value::Object(map) => map,
value => {
return Err(
GeoError::NotAnObject { document_id: Value::from(external_id), value }.into()
)
}
};
let [lat, lng] = match (geo.remove("lat"), geo.remove("lng")) {
(Some(lat), Some(lng)) => {
if geo.is_empty() {
[lat, lng]
} else {
return Err(GeoError::UnexpectedExtraFields {
document_id: Value::from(external_id),
value: Value::from(geo),
}
.into());
}
}
(Some(_), None) => {
return Err(GeoError::MissingLongitude { document_id: Value::from(external_id) }.into())
}
(None, Some(_)) => {
return Err(GeoError::MissingLatitude { document_id: Value::from(external_id) }.into())
}
(None, None) => {
return Err(GeoError::MissingLatitudeAndLongitude {
document_id: Value::from(external_id),
}
.into())
}
};
match (extract_finite_float_from_value(lat), extract_finite_float_from_value(lng)) {
(Ok(lat), Ok(lng)) => Ok(Some([lat, lng])),
(Ok(_), Err(value)) => {
Err(GeoError::BadLongitude { document_id: Value::from(external_id), value }.into())
}
(Err(value), Ok(_)) => {
Err(GeoError::BadLatitude { document_id: Value::from(external_id), value }.into())
}
(Err(lat), Err(lng)) => Err(GeoError::BadLatitudeAndLongitude {
document_id: Value::from(external_id),
lat,
lng,
}
.into()),
}
}
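The accepted shapes follow from the match arms above; notably `null` is now a valid opt-out rather than an error. A sketch of the contract (`raw` is a hypothetical helper building a boxed `RawValue` for the example):

// fn raw(s: &str) -> Box<serde_json::value::RawValue> {
//     serde_json::value::RawValue::from_string(s.to_owned()).unwrap()
// }
assert_eq!(extract_geo_coordinates("doc-1", &raw("null")).unwrap(), None);
assert_eq!(
    extract_geo_coordinates("doc-1", &raw(r#"{ "lat": 0.0, "lng": "1.0" }"#)).unwrap(),
    Some([0.0, 1.0]),
);
assert!(extract_geo_coordinates("doc-1", &raw(r#"{ "lat": 0.0 }"#)).is_err());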
/// Extracts and validates that a serde JSON Value is actually a finite f64.
pub fn extract_finite_float_from_value(value: Value) -> result::Result<f64, Value> {
let number = match value {
Value::Number(ref n) => match n.as_f64() {
Some(number) => number,
None => return Err(value),
},
Value::String(ref s) => match s.parse::<f64>() {
Ok(number) => number,
Err(_) => return Err(value),
},
value => return Err(value),
};
if number.is_finite() {
Ok(number)
} else {
Err(value)
}
}
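Accepted numeric inputs mirror the match above: JSON numbers and numeric strings both parse, while non-finite results hand the offending value back unchanged. A quick sketch using `serde_json::json!`:

assert_eq!(extract_finite_float_from_value(json!(1.5)), Ok(1.5));
assert_eq!(extract_finite_float_from_value(json!("2.5")), Ok(2.5)); // strings parse
assert!(extract_finite_float_from_value(json!("inf")).is_err()); // finite only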


@ -1,6 +1,7 @@
mod cache; mod cache;
mod documents; mod documents;
mod faceted; mod faceted;
mod geo;
mod searchable; mod searchable;
mod vectors; mod vectors;
@ -8,6 +9,7 @@ use bumpalo::Bump;
pub use cache::{merge_caches, transpose_and_freeze_caches, BalancedCaches, DelAddRoaringBitmap}; pub use cache::{merge_caches, transpose_and_freeze_caches, BalancedCaches, DelAddRoaringBitmap};
pub use documents::*; pub use documents::*;
pub use faceted::*; pub use faceted::*;
pub use geo::*;
pub use searchable::*; pub use searchable::*;
pub use vectors::EmbeddingExtractor; pub use vectors::EmbeddingExtractor;


@ -326,7 +326,7 @@ impl WordDocidsExtractors {
document_change: DocumentChange, document_change: DocumentChange,
) -> Result<()> { ) -> Result<()> {
let index = &context.index; let index = &context.index;
let rtxn = &context.txn; let rtxn = &context.rtxn;
let mut cached_sorter_ref = context.data.borrow_mut_or_yield(); let mut cached_sorter_ref = context.data.borrow_mut_or_yield();
let cached_sorter = cached_sorter_ref.as_mut().unwrap(); let cached_sorter = cached_sorter_ref.as_mut().unwrap();
let mut new_fields_ids_map = context.new_fields_ids_map.borrow_mut_or_yield(); let mut new_fields_ids_map = context.new_fields_ids_map.borrow_mut_or_yield();


@ -39,7 +39,7 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor {
let doc_alloc = &context.doc_alloc; let doc_alloc = &context.doc_alloc;
let index = context.index; let index = context.index;
let rtxn = &context.txn; let rtxn = &context.rtxn;
let mut key_buffer = bumpalo::collections::Vec::new_in(doc_alloc); let mut key_buffer = bumpalo::collections::Vec::new_in(doc_alloc);
let mut del_word_pair_proximity = bumpalo::collections::Vec::new_in(doc_alloc); let mut del_word_pair_proximity = bumpalo::collections::Vec::new_in(doc_alloc);


@ -2,13 +2,13 @@ use std::cell::RefCell;
use bumpalo::collections::Vec as BVec; use bumpalo::collections::Vec as BVec;
use bumpalo::Bump; use bumpalo::Bump;
use hashbrown::HashMap; use hashbrown::{DefaultHashBuilder, HashMap};
use super::cache::DelAddRoaringBitmap; use super::cache::DelAddRoaringBitmap;
use crate::error::FaultSource; use crate::error::FaultSource;
use crate::prompt::Prompt; use crate::prompt::Prompt;
use crate::update::new::channel::EmbeddingSender; use crate::update::new::channel::EmbeddingSender;
use crate::update::new::indexer::document_changes::{Extractor, MostlySend}; use crate::update::new::indexer::document_changes::{DocumentChangeContext, Extractor, MostlySend};
use crate::update::new::vector_document::VectorDocument; use crate::update::new::vector_document::VectorDocument;
use crate::update::new::DocumentChange; use crate::update::new::DocumentChange;
use crate::vector::error::{ use crate::vector::error::{
@ -37,7 +37,7 @@ impl<'a> EmbeddingExtractor<'a> {
} }
pub struct EmbeddingExtractorData<'extractor>( pub struct EmbeddingExtractorData<'extractor>(
pub HashMap<String, DelAddRoaringBitmap, hashbrown::DefaultHashBuilder, &'extractor Bump>, pub HashMap<String, DelAddRoaringBitmap, DefaultHashBuilder, &'extractor Bump>,
); );
unsafe impl MostlySend for EmbeddingExtractorData<'_> {} unsafe impl MostlySend for EmbeddingExtractorData<'_> {}
@ -52,9 +52,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> {
fn process<'doc>( fn process<'doc>(
&'doc self, &'doc self,
changes: impl Iterator<Item = crate::Result<DocumentChange<'doc>>>, changes: impl Iterator<Item = crate::Result<DocumentChange<'doc>>>,
context: &'doc crate::update::new::indexer::document_changes::DocumentChangeContext< context: &'doc DocumentChangeContext<Self::Data>,
Self::Data,
>,
) -> crate::Result<()> { ) -> crate::Result<()> {
let embedders = self.embedders.inner_as_ref(); let embedders = self.embedders.inner_as_ref();
let mut unused_vectors_distribution = let mut unused_vectors_distribution =
@ -63,7 +61,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> {
let mut all_chunks = BVec::with_capacity_in(embedders.len(), &context.doc_alloc); let mut all_chunks = BVec::with_capacity_in(embedders.len(), &context.doc_alloc);
for (embedder_name, (embedder, prompt, _is_quantized)) in embedders { for (embedder_name, (embedder, prompt, _is_quantized)) in embedders {
let embedder_id = let embedder_id =
context.index.embedder_category_id.get(&context.txn, embedder_name)?.ok_or_else( context.index.embedder_category_id.get(&context.rtxn, embedder_name)?.ok_or_else(
|| InternalError::DatabaseMissingEntry { || InternalError::DatabaseMissingEntry {
db_name: "embedder_category_id", db_name: "embedder_category_id",
key: None, key: None,
@ -85,12 +83,17 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> {
for change in changes { for change in changes {
let change = change?; let change = change?;
match change { match change {
DocumentChange::Deletion(_deletion) => { DocumentChange::Deletion(deletion) => {
// handled by document sender // vector deletion is handled by document sender,
// we still need to accommodate deletion from user_provided
for chunks in &mut all_chunks {
// regenerate: true means we delete from user_provided
chunks.set_regenerate(deletion.docid(), true);
}
} }
DocumentChange::Update(update) => { DocumentChange::Update(update) => {
let old_vectors = update.current_vectors( let old_vectors = update.current_vectors(
&context.txn, &context.rtxn,
context.index, context.index,
context.db_fields_ids_map, context.db_fields_ids_map,
&context.doc_alloc, &context.doc_alloc,
@ -110,11 +113,8 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> {
new_vectors.vectors_for_key(embedder_name).transpose() new_vectors.vectors_for_key(embedder_name).transpose()
}) { }) {
let new_vectors = new_vectors?; let new_vectors = new_vectors?;
match (old_vectors.regenerate, new_vectors.regenerate) { if old_vectors.regenerate != new_vectors.regenerate {
(true, true) | (false, false) => todo!(), chunks.set_regenerate(update.docid(), new_vectors.regenerate);
_ => {
chunks.set_regenerate(update.docid(), new_vectors.regenerate);
}
} }
// do we have set embeddings? // do we have set embeddings?
if let Some(embeddings) = new_vectors.embeddings { if let Some(embeddings) = new_vectors.embeddings {
@ -124,13 +124,13 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> {
.into_vec(&context.doc_alloc, embedder_name) .into_vec(&context.doc_alloc, embedder_name)
.map_err(|error| UserError::InvalidVectorsEmbedderConf { .map_err(|error| UserError::InvalidVectorsEmbedderConf {
document_id: update.external_document_id().to_string(), document_id: update.external_document_id().to_string(),
error, error: error.to_string(),
})?, })?,
); );
} else if new_vectors.regenerate { } else if new_vectors.regenerate {
let new_rendered = prompt.render_document( let new_rendered = prompt.render_document(
update.current( update.current(
&context.txn, &context.rtxn,
context.index, context.index,
context.db_fields_ids_map, context.db_fields_ids_map,
)?, )?,
@ -139,7 +139,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> {
)?; )?;
let old_rendered = prompt.render_document( let old_rendered = prompt.render_document(
update.merged( update.merged(
&context.txn, &context.rtxn,
context.index, context.index,
context.db_fields_ids_map, context.db_fields_ids_map,
)?, )?,
@ -149,6 +149,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> {
if new_rendered != old_rendered { if new_rendered != old_rendered {
chunks.set_autogenerated( chunks.set_autogenerated(
update.docid(), update.docid(),
+                            update.external_document_id(),
                             new_rendered,
                             &unused_vectors_distribution,
                         )?;
@@ -157,7 +158,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> {
                 } else if old_vectors.regenerate {
                     let old_rendered = prompt.render_document(
                         update.current(
-                            &context.txn,
+                            &context.rtxn,
                             context.index,
                             context.db_fields_ids_map,
                         )?,
@@ -166,7 +167,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> {
                     )?;
                     let new_rendered = prompt.render_document(
                         update.merged(
-                            &context.txn,
+                            &context.rtxn,
                             context.index,
                             context.db_fields_ids_map,
                         )?,
@@ -176,6 +177,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> {
                     if new_rendered != old_rendered {
                         chunks.set_autogenerated(
                             update.docid(),
+                            update.external_document_id(),
                             new_rendered,
                             &unused_vectors_distribution,
                         )?;
@@ -208,7 +210,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> {
                                     document_id: insertion
                                         .external_document_id()
                                         .to_string(),
-                                    error,
+                                    error: error.to_string(),
                                 })?,
                         );
                     } else if new_vectors.regenerate {
@@ -219,6 +221,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> {
                         )?;
                         chunks.set_autogenerated(
                             insertion.docid(),
+                            insertion.external_document_id(),
                             rendered,
                             &unused_vectors_distribution,
                         )?;
@@ -231,6 +234,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> {
                     )?;
                     chunks.set_autogenerated(
                         insertion.docid(),
+                        insertion.external_document_id(),
                         rendered,
                         &unused_vectors_distribution,
                     )?;
@@ -266,6 +270,7 @@ struct Chunks<'a, 'extractor> {
     user_provided: &'a RefCell<EmbeddingExtractorData<'extractor>>,
     threads: &'a ThreadPoolNoAbort,
     sender: &'a EmbeddingSender<'a>,
+    has_manual_generation: Option<&'a str>,
 }

 impl<'a, 'extractor> Chunks<'a, 'extractor> {
@@ -295,15 +300,22 @@ impl<'a, 'extractor> Chunks<'a, 'extractor> {
             embedder_id,
             embedder_name,
             user_provided,
+            has_manual_generation: None,
         }
     }

     pub fn set_autogenerated(
         &mut self,
         docid: DocumentId,
+        external_docid: &'a str,
         rendered: &'a str,
         unused_vectors_distribution: &UnusedVectorsDistributionBump,
     ) -> Result<()> {
+        let is_manual = matches!(&self.embedder, &Embedder::UserProvided(_));
+        if is_manual {
+            self.has_manual_generation.get_or_insert(external_docid);
+        }
+
         if self.texts.len() < self.texts.capacity() {
             self.texts.push(rendered);
             self.ids.push(docid);
@@ -320,6 +332,7 @@ impl<'a, 'extractor> Chunks<'a, 'extractor> {
             unused_vectors_distribution,
             self.threads,
             self.sender,
+            self.has_manual_generation.take(),
         )
     }
@@ -337,6 +350,7 @@ impl<'a, 'extractor> Chunks<'a, 'extractor> {
             unused_vectors_distribution,
             self.threads,
             self.sender,
+            self.has_manual_generation,
         );
         // optimization: don't run bvec dtors as they only contain bumpalo allocated stuff
         std::mem::forget(self);
@@ -354,7 +368,46 @@ impl<'a, 'extractor> Chunks<'a, 'extractor> {
         unused_vectors_distribution: &UnusedVectorsDistributionBump,
         threads: &ThreadPoolNoAbort,
         sender: &EmbeddingSender<'a>,
+        has_manual_generation: Option<&'a str>,
     ) -> Result<()> {
+        if let Some(external_docid) = has_manual_generation {
+            let mut msg = format!(
+                r"While embedding documents for embedder `{embedder_name}`: no vectors provided for document `{}`{}",
+                external_docid,
+                if ids.len() > 1 {
+                    format!(" and at least {} other document(s)", ids.len() - 1)
+                } else {
+                    "".to_string()
+                }
+            );
+
+            msg += &format!("\n- Note: `{embedder_name}` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.{embedder_name}`.");
+
+            let mut hint_count = 0;
+
+            for (vector_misspelling, count) in possible_embedding_mistakes.vector_mistakes().take(2)
+            {
+                msg += &format!("\n- Hint: try replacing `{vector_misspelling}` by `_vectors` in {count} document(s).");
+                hint_count += 1;
+            }
+
+            for (embedder_misspelling, count) in possible_embedding_mistakes
+                .embedder_mistakes_bump(embedder_name, unused_vectors_distribution)
+                .take(2)
+            {
+                msg += &format!("\n- Hint: try replacing `_vectors.{embedder_misspelling}` by `_vectors.{embedder_name}` in {count} document(s).");
+                hint_count += 1;
+            }
+
+            if hint_count == 0 {
+                msg += &format!(
+                    "\n- Hint: opt-out for a document with `_vectors.{embedder_name}: null`"
+                );
+            }
+
+            return Err(crate::Error::UserError(crate::UserError::DocumentEmbeddingError(msg)));
+        }
+
         let res = match embedder.embed_chunks_ref(texts.as_slice(), threads) {
             Ok(embeddings) => {
                 for (docid, embedding) in ids.into_iter().zip(embeddings) {
@@ -423,9 +476,9 @@ impl<'a, 'extractor> Chunks<'a, 'extractor> {
         let user_provided = user_provided.0.entry_ref(self.embedder_name).or_default();
         if regenerate {
             // regenerate == !user_provided
-            user_provided.del.get_or_insert(Default::default()).insert(docid);
+            user_provided.insert_del_u32(docid);
         } else {
-            user_provided.add.get_or_insert(Default::default()).insert(docid);
+            user_provided.insert_add_u32(docid);
         }
     }
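Editor's note: a minimal sketch of the `has_manual_generation` guard introduced above, using stand-in types rather than milli's real ones. Asking a `userProvided` embedder to autogenerate records the first offending external docid, and flushing the chunk turns that into a user-facing error instead of a silent failure:

enum Embedder {
    UserProvided,
    Rest,
}

struct Chunks<'a> {
    embedder: &'a Embedder,
    // First external docid that requested autogeneration although the
    // embedder cannot generate; kept until the chunk is flushed.
    has_manual_generation: Option<&'a str>,
    pending: usize,
}

impl<'a> Chunks<'a> {
    fn set_autogenerated(&mut self, external_docid: &'a str) {
        if matches!(self.embedder, Embedder::UserProvided) {
            // Keep only the first offender; the rest are summarized by count.
            self.has_manual_generation.get_or_insert(external_docid);
        }
        self.pending += 1;
    }

    fn flush(&mut self) -> Result<(), String> {
        if let Some(docid) = self.has_manual_generation.take() {
            let others = self.pending.saturating_sub(1);
            let mut msg = format!("no vectors provided for document `{docid}`");
            if others > 0 {
                msg += &format!(" and at least {others} other document(s)");
            }
            return Err(msg);
        }
        Ok(())
    }
}

fn main() {
    let embedder = Embedder::UserProvided;
    let mut chunks = Chunks { embedder: &embedder, has_manual_generation: None, pending: 0 };
    chunks.set_autogenerated("doc-1");
    chunks.set_autogenerated("doc-2");
    assert!(chunks.flush().is_err());
}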

View File

@@ -54,23 +54,12 @@ impl<'indexer> FacetSearchBuilder<'indexer> {
         }
     }

-    fn extract_key_data<'k>(&self, key: &'k [u8]) -> Result<Option<FacetGroupKey<&'k str>>> {
-        match FacetKind::from(key[0]) {
-            // Only strings are searchable
-            FacetKind::String => Ok(Some(
-                FacetGroupKeyCodec::<StrRefCodec>::bytes_decode(&key[1..])
-                    .map_err(heed::Error::Encoding)?,
-            )),
-            _ => Ok(None),
-        }
-    }
-
-    pub fn register_from_key(&mut self, deladd: DelAdd, facet_key: &[u8]) -> Result<()> {
-        let Some(FacetGroupKey { field_id, level: _level, left_bound }) =
-            self.extract_key_data(facet_key)?
-        else {
-            return Ok(());
-        };
+    pub fn register_from_key(
+        &mut self,
+        deladd: DelAdd,
+        facet_key: FacetGroupKey<&str>,
+    ) -> Result<()> {
+        let FacetGroupKey { field_id, level: _level, left_bound } = facet_key;

         if deladd == DelAdd::Addition {
             self.registered_facets.entry(field_id).and_modify(|count| *count += 1).or_insert(1);
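Editor's note: a sketch of the new typed entry point, with simplified stand-in types. Callers now decode the facet key once (with the heed codec) and hand the builder a `FacetGroupKey<&str>`, so the builder no longer re-parses raw bytes:

use std::collections::BTreeMap;

struct FacetGroupKey<T> {
    field_id: u16,
    level: u8,
    left_bound: T,
}

#[derive(PartialEq)]
enum DelAdd {
    Deletion,
    Addition,
}

#[derive(Default)]
struct FacetSearchBuilder {
    registered_facets: BTreeMap<u16, usize>,
}

impl FacetSearchBuilder {
    fn register_from_key(&mut self, deladd: DelAdd, facet_key: FacetGroupKey<&str>) {
        // No decoding here: the type already proves this is a string facet key.
        let FacetGroupKey { field_id, level: _level, left_bound } = facet_key;
        if deladd == DelAdd::Addition {
            *self.registered_facets.entry(field_id).or_insert(0) += 1;
        }
        let _ = left_bound; // would feed the facet-search FST in the real code
    }
}

fn main() {
    let mut builder = FacetSearchBuilder::default();
    builder.register_from_key(
        DelAdd::Addition,
        FacetGroupKey { field_id: 0, level: 0, left_bound: "blue" },
    );
    assert_eq!(builder.registered_facets.get(&0), Some(&1));
}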

View File

@@ -41,6 +41,11 @@ impl<'de, 'p, 'indexer: 'de, Mapper: MutFieldIdMapper> Visitor<'de>
     where
         A: serde::de::MapAccess<'de>,
     {
+        // We need to remember if we encountered a semantic error, because raw values don't like to be parsed partially
+        // (trying to do so results in parsing errors).
+        // So we'll exhaust all keys and values even if we encounter an error, and we'll then return any error we detected.
+        let mut attribute_limit_reached = false;
+        let mut document_id_extraction_error = None;
         let mut docid = None;

         while let Some(((level_name, right), (fid, fields_ids_map))) =
@@ -49,20 +54,36 @@ impl<'de, 'p, 'indexer: 'de, Mapper: MutFieldIdMapper> Visitor<'de>
                 visitor: MutFieldIdMapVisitor(self.fields_ids_map),
             })?
         {
-            let Some(_fid) = fid else {
-                return Ok(Err(crate::UserError::AttributeLimitReached));
-            };
             self.fields_ids_map = fields_ids_map;
             let value: &'de RawValue = map.next_value()?;

+            if attribute_limit_reached || document_id_extraction_error.is_some() {
+                continue;
+            }
+
+            let Some(_fid) = fid else {
+                attribute_limit_reached = true;
+                continue;
+            };
+
             match match_component(level_name, right, value, self.indexer, &mut docid) {
                 ControlFlow::Continue(()) => continue,
                 ControlFlow::Break(Err(err)) => return Err(serde::de::Error::custom(err)),
-                ControlFlow::Break(Ok(err)) => return Ok(Ok(Err(err))),
+                ControlFlow::Break(Ok(err)) => {
+                    document_id_extraction_error = Some(err);
+                    continue;
+                }
             }
         }

+        // return previously detected errors
+        if attribute_limit_reached {
+            return Ok(Err(UserError::AttributeLimitReached));
+        }
+        if let Some(document_id_extraction_error) = document_id_extraction_error {
+            return Ok(Ok(Err(document_id_extraction_error)));
+        }
+
         Ok(Ok(match docid {
             Some(docid) => Ok(docid),
             None => Err(DocumentIdExtractionError::MissingDocumentId),
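Editor's note: a self-contained sketch of the "drain the map, report later" pattern above, with simplified stand-in types. serde_json's `RawValue` cannot be skipped halfway through a map, so the visitor keeps consuming keys and values after detecting a problem and only surfaces the first semantic error once the map is exhausted; the serde error channel stays reserved for malformed JSON:

use serde::de::{Deserializer, MapAccess, Visitor};
use serde_json::value::RawValue;

struct LimitedVisitor {
    max_fields: usize,
}

impl<'de> Visitor<'de> for LimitedVisitor {
    // Ok(Err(_)) is a semantic error; Err(_) means the JSON itself is broken.
    type Value = Result<usize, String>;

    fn expecting(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        write!(f, "a map with at most {} fields", self.max_fields)
    }

    fn visit_map<A: MapAccess<'de>>(self, mut map: A) -> Result<Self::Value, A::Error> {
        let mut limit_reached = false;
        let mut seen = 0;
        while let Some(_key) = map.next_key::<&str>()? {
            // Always consume the value, even once an error was detected.
            let _value: &RawValue = map.next_value()?;
            seen += 1;
            if seen > self.max_fields {
                limit_reached = true;
            }
        }
        if limit_reached {
            return Ok(Err("attribute limit reached".into()));
        }
        Ok(Ok(seen))
    }
}

fn main() {
    let mut de = serde_json::Deserializer::from_str(r#"{"a":1,"b":2,"c":3}"#);
    let outcome = de.deserialize_map(LimitedVisitor { max_fields: 2 }).unwrap();
    assert_eq!(outcome, Err("attribute limit reached".into()));
}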

View File

@@ -197,7 +197,7 @@ pub struct DocumentChangeContext<
     /// inside of the DB.
     pub db_fields_ids_map: &'indexer FieldsIdsMap,
     /// A transaction providing data from the DB before all indexing operations
-    pub txn: RoTxn<'indexer>,
+    pub rtxn: RoTxn<'indexer>,

     /// Global field id map that is up to date with the current state of the indexing process.
     ///
@@ -255,7 +255,7 @@ impl<
         let txn = index.read_txn()?;
         Ok(DocumentChangeContext {
             index,
-            txn,
+            rtxn: txn,
             db_fields_ids_map,
             new_fields_ids_map: fields_ids_map,
             doc_alloc,

View File

@@ -63,7 +63,7 @@ impl<'pl> DocumentChanges<'pl> for DocumentDeletionChanges<'pl> {
     where
         'pl: 'doc, // the payload must survive the process calls
     {
-        let current = context.index.document(&context.txn, *docid)?;
+        let current = context.index.document(&context.rtxn, *docid)?;

         let external_document_id = self.primary_key.extract_docid_from_db(
             current,

View File

@@ -12,6 +12,7 @@ use heed::{RoTxn, RwTxn};
 use itertools::{merge_join_by, EitherOrBoth};
 pub use partial_dump::PartialDump;
 use rand::SeedableRng as _;
+use raw_collections::RawMap;
 use rayon::ThreadPool;
 use time::OffsetDateTime;
 pub use update_by_function::UpdateByFunction;
@@ -24,7 +25,7 @@ use super::word_fst_builder::{PrefixData, PrefixDelta, WordFstBuilder};
 use super::words_prefix_docids::{
     compute_word_prefix_docids, compute_word_prefix_fid_docids, compute_word_prefix_position_docids,
 };
-use super::{StdResult, TopLevelMap};
+use super::StdResult;
 use crate::documents::{PrimaryKey, DEFAULT_PRIMARY_KEY};
 use crate::facet::FacetType;
 use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder};
@@ -32,6 +33,7 @@ use crate::index::main_key::{WORDS_FST_KEY, WORDS_PREFIXES_FST_KEY};
 use crate::proximity::ProximityPrecision;
 use crate::update::del_add::DelAdd;
 use crate::update::new::extract::EmbeddingExtractor;
+use crate::update::new::merger::merge_and_send_rtree;
 use crate::update::new::words_prefix_docids::compute_exact_word_prefix_docids;
 use crate::update::new::{merge_and_send_docids, merge_and_send_facet_docids, FacetDatabases};
 use crate::update::settings::InnerIndexSettings;
@@ -56,6 +58,7 @@ mod steps {
         "extracting words",
         "extracting word proximity",
         "extracting embeddings",
+        "writing geo points",
         "writing to database",
         "writing embeddings to database",
         "waiting for extractors",
@@ -92,29 +95,33 @@ mod steps {
         step(4)
     }

-    pub const fn write_db() -> (u16, &'static str) {
+    pub const fn extract_geo_points() -> (u16, &'static str) {
         step(5)
     }

-    pub const fn write_embedding_db() -> (u16, &'static str) {
+    pub const fn write_db() -> (u16, &'static str) {
         step(6)
     }

-    pub const fn waiting_extractors() -> (u16, &'static str) {
+    pub const fn write_embedding_db() -> (u16, &'static str) {
         step(7)
     }

-    pub const fn post_processing_facets() -> (u16, &'static str) {
+    pub const fn waiting_extractors() -> (u16, &'static str) {
         step(8)
     }

-    pub const fn post_processing_words() -> (u16, &'static str) {
+    pub const fn post_processing_facets() -> (u16, &'static str) {
         step(9)
     }

-    pub const fn finalizing() -> (u16, &'static str) {
+    pub const fn post_processing_words() -> (u16, &'static str) {
         step(10)
     }
+
+    pub const fn finalizing() -> (u16, &'static str) {
+        step(11)
+    }
 }
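Editor's note: a sketch of how the steps module stays consistent after inserting "writing geo points": each step is a (position, name) pair derived from one array, so adding a name in the middle shifts every later position by one. Only the middle of the array is visible in the hunk above; the first entries and the post-processing names below are assumptions:

mod steps {
    pub const STEPS: [&str; 12] = [
        "extracting documents",      // assumed
        "extracting facets",         // assumed
        "extracting words",
        "extracting word proximity",
        "extracting embeddings",
        "writing geo points",        // the new step 5
        "writing to database",
        "writing embeddings to database",
        "waiting for extractors",
        "post-processing facets",    // assumed
        "post-processing words",     // assumed
        "finalizing",                // assumed
    ];

    const fn step(step: u16) -> (u16, &'static str) {
        (step, STEPS[step as usize])
    }

    pub const fn extract_geo_points() -> (u16, &'static str) {
        step(5)
    }

    pub const fn finalizing() -> (u16, &'static str) {
        step(11)
    }
}

fn main() {
    assert_eq!(steps::extract_geo_points(), (5, "writing geo points"));
    assert_eq!(steps::finalizing(), (11, "finalizing"));
}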
 /// This is the main function of this crate.
@@ -143,11 +150,8 @@ where
     let (extractor_sender, writer_receiver) = extractor_writer_channel(10_000);

     let metadata_builder = MetadataBuilder::from_index(index, wtxn)?;
     let new_fields_ids_map = FieldIdMapWithMetadata::new(new_fields_ids_map, metadata_builder);
     let new_fields_ids_map = RwLock::new(new_fields_ids_map);
     let fields_ids_map_store = ThreadLocal::with_capacity(pool.current_num_threads());
     let mut extractor_allocs = ThreadLocal::with_capacity(pool.current_num_threads());
     let doc_allocs = ThreadLocal::with_capacity(pool.current_num_threads());
@@ -198,7 +202,7 @@ where
         document_extractor_data.docids_delta.apply_to(document_ids);
     }

-    field_distribution.retain(|_, v| *v == 0);
+    field_distribution.retain(|_, v| *v != 0);

     const TEN_GIB: usize = 10 * 1024 * 1024 * 1024;
     let current_num_threads = rayon::current_num_threads();
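Editor's note: the `retain` predicate keeps the entries for which it returns true, so the corrected closure keeps fields that still occur in at least one document and drops the ones whose count fell to zero; the previous `*v == 0` kept exactly the wrong set. A minimal check of that behavior:

use std::collections::HashMap;

fn main() {
    let mut field_distribution: HashMap<&str, u64> =
        HashMap::from([("title", 2), ("removed_field", 0)]);
    field_distribution.retain(|_, v| *v != 0);
    assert_eq!(field_distribution.len(), 1);
    assert!(field_distribution.contains_key("title"));
}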
@@ -219,7 +223,7 @@ where
                 let (finished_steps, step_name) = steps::extract_facets();

                 facet_field_ids_delta = merge_and_send_facet_docids(
-                    FacetedDocidsExtractor::run_extraction(grenad_parameters, document_changes, indexing_context, &mut extractor_allocs, finished_steps, total_steps, step_name)?,
+                    FacetedDocidsExtractor::run_extraction(grenad_parameters, document_changes, indexing_context, &mut extractor_allocs, &extractor_sender.field_id_docid_facet_sender(), finished_steps, total_steps, step_name)?,
                     FacetDatabases::new(index),
                     index,
                     extractor_sender.facet_docids(),
@@ -327,7 +331,15 @@ where
                 let (finished_steps, step_name) = steps::extract_word_proximity();

-                let caches = <WordPairProximityDocidsExtractor as DocidsExtractor>::run_extraction(grenad_parameters, document_changes, indexing_context, &mut extractor_allocs, finished_steps, total_steps, step_name)?;
+                let caches = <WordPairProximityDocidsExtractor as DocidsExtractor>::run_extraction(grenad_parameters,
+                    document_changes,
+                    indexing_context,
+                    &mut extractor_allocs,
+                    finished_steps,
+                    total_steps,
+                    step_name,
+                )?;

                 merge_and_send_docids(
                     caches,
                     index.word_pair_proximity_docids.remap_types(),
@@ -341,35 +353,55 @@ where
                 let span = tracing::trace_span!(target: "indexing::documents::extract", "vectors");
                 let _entered = span.enter();

-                let index_embeddings = index.embedding_configs(&rtxn)?;
+                let mut index_embeddings = index.embedding_configs(&rtxn)?;
                 if index_embeddings.is_empty() {
                     break 'vectors;
                 }

                 let embedding_sender = extractor_sender.embeddings();
                 let extractor = EmbeddingExtractor::new(embedders, &embedding_sender, field_distribution, request_threads());
-                let datastore = ThreadLocal::with_capacity(pool.current_num_threads());
+                let mut datastore = ThreadLocal::with_capacity(pool.current_num_threads());
                 let (finished_steps, step_name) = steps::extract_embeddings();

                 extract(document_changes, &extractor, indexing_context, &mut extractor_allocs, &datastore, finished_steps, total_steps, step_name)?;

-                let mut user_provided = HashMap::new();
-                for data in datastore {
-                    let data = data.into_inner().0;
-                    for (embedder, deladd) in data.into_iter() {
-                        let user_provided = user_provided.entry(embedder).or_insert(Default::default());
-                        if let Some(del) = deladd.del {
-                            *user_provided -= del;
-                        }
-                        if let Some(add) = deladd.add {
-                            *user_provided |= add;
-                        }
+                for config in &mut index_embeddings {
+                    'data: for data in datastore.iter_mut() {
+                        let data = &mut data.get_mut().0;
+                        let Some(deladd) = data.remove(&config.name) else { continue 'data; };
+                        deladd.apply_to(&mut config.user_provided);
                     }
                 }

-                embedding_sender.finish(user_provided).unwrap();
+                embedding_sender.finish(index_embeddings).unwrap();
             }

+            'geo: {
+                let span = tracing::trace_span!(target: "indexing::documents::extract", "geo");
+                let _entered = span.enter();
+
+                // let geo_sender = extractor_sender.geo_points();
+                let Some(extractor) = GeoExtractor::new(&rtxn, index, grenad_parameters)? else {
+                    break 'geo;
+                };
+                let datastore = ThreadLocal::with_capacity(pool.current_num_threads());
+                let (finished_steps, step_name) = steps::extract_geo_points();
+
+                extract(document_changes,
+                    &extractor,
+                    indexing_context,
+                    &mut extractor_allocs,
+                    &datastore,
+                    finished_steps,
+                    total_steps,
+                    step_name,
+                )?;
+
+                merge_and_send_rtree(
+                    datastore,
+                    &rtxn,
+                    index,
+                    extractor_sender.geo(),
+                    &indexing_context.must_stop_processing,
+                )?;
+            }
         }
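Editor's note: the new 'geo stage uses a labeled block so an optional pipeline stage can bail out early without nesting its whole body in an `if`; a minimal sketch of that pattern, with a string standing in for the extractor:

fn main() {
    let maybe_extractor: Option<&str> = None; // stand-in for GeoExtractor::new(..)?
    'geo: {
        let Some(extractor) = maybe_extractor else {
            // Nothing to do: `_geo` is not filterable/sortable, skip the stage.
            break 'geo;
        };
        println!("running geo extraction with {extractor}");
    }
    println!("continuing with the next stage");
}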
         // TODO THIS IS TOO MUCH
@@ -471,7 +503,7 @@ where
                             writer.del_items(wtxn, *dimensions, docid)?;
                             writer.add_item(wtxn, docid, &embedding)?;
                         }
-                        ArroyOperation::Finish { mut user_provided } => {
+                        ArroyOperation::Finish { configs } => {
                             let span = tracing::trace_span!(target: "indexing::vectors", parent: &indexer_span, "build");
                             let _entered = span.enter();
@@ -496,14 +528,6 @@ where
                                 )?;
                             }

-                            let mut configs = index.embedding_configs(wtxn)?;
-
-                            for config in &mut configs {
-                                if let Some(user_provided) = user_provided.remove(&config.name) {
-                                    config.user_provided = user_provided;
-                                }
-                            }
-
                             index.put_embedding_configs(wtxn, configs)?;
                         }
                     },
@@ -681,13 +705,11 @@ fn compute_facet_search_database(
             }
             EitherOrBoth::Left(result) => {
                 let (key, _) = result?;
-                facet_search_builder
-                    .register_from_key(DelAdd::Deletion, key.left_bound.as_ref())?;
+                facet_search_builder.register_from_key(DelAdd::Deletion, key)?;
             }
             EitherOrBoth::Right(result) => {
                 let (key, _) = result?;
-                facet_search_builder
-                    .register_from_key(DelAdd::Addition, key.left_bound.as_ref())?;
+                facet_search_builder.register_from_key(DelAdd::Addition, key)?;
             }
         }
     }
@@ -735,7 +757,7 @@ pub fn retrieve_or_guess_primary_key<'a>(
     index: &Index,
     new_fields_ids_map: &mut FieldsIdsMap,
     primary_key_from_op: Option<&'a str>,
-    first_document: Option<&'a TopLevelMap<'a>>,
+    first_document: Option<RawMap<'a>>,
 ) -> Result<StdResult<(PrimaryKey<'a>, bool), UserError>> {
     // make sure that we have a declared primary key, either fetching it from the index or attempting to guess it.
@@ -771,12 +793,18 @@ pub fn retrieve_or_guess_primary_key<'a>(
         None => return Ok(Err(UserError::NoPrimaryKeyCandidateFound)),
     };

-    let mut guesses: Vec<&str> = first_document
+    let guesses: Result<Vec<&str>> = first_document
         .keys()
-        .map(AsRef::as_ref)
-        .filter(|name| name.to_lowercase().ends_with(DEFAULT_PRIMARY_KEY))
+        .filter_map(|name| {
+            let Some(_) = new_fields_ids_map.insert(name) else {
+                return Some(Err(UserError::AttributeLimitReached.into()));
+            };
+            name.to_lowercase().ends_with(DEFAULT_PRIMARY_KEY).then_some(Ok(name))
+        })
         .collect();

+    let mut guesses = guesses?;
+
     // sort the keys in lexicographical order, so that fields are always in the same order.
     guesses.sort_unstable();
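Editor's note: a sketch of the guessing rule above with stand-in types: keep the keys whose lowercased name ends with the default primary key ("id"), while still registering every key so that hitting the field limit surfaces as an error instead of a silent skip. The `insert` closure stands in for `FieldsIdsMap::insert`:

const DEFAULT_PRIMARY_KEY: &str = "id";

fn guess_candidates<'a>(
    keys: impl Iterator<Item = &'a str>,
    mut insert: impl FnMut(&'a str) -> Option<u16>,
) -> Result<Vec<&'a str>, &'static str> {
    let guesses: Result<Vec<&'a str>, _> = keys
        .filter_map(|name| {
            // Registering can fail once the attribute limit is reached.
            let Some(_fid) = insert(name) else {
                return Some(Err("attribute limit reached"));
            };
            name.to_lowercase().ends_with(DEFAULT_PRIMARY_KEY).then_some(Ok(name))
        })
        .collect();
    let mut guesses = guesses?;
    // Sort so the guess is deterministic regardless of document key order.
    guesses.sort_unstable();
    Ok(guesses)
}

fn main() {
    let keys = ["title", "uid", "body"];
    let mut next = 0u16;
    let guesses = guess_candidates(keys.into_iter(), |_| {
        next += 1;
        Some(next)
    })
    .unwrap();
    assert_eq!(guesses, vec!["uid"]);
}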

View File

@@ -93,7 +93,7 @@ impl<'index> DocumentChanges<'index> for UpdateByFunctionChanges<'index> {
         let DocumentChangeContext {
             index,
             db_fields_ids_map,
-            txn,
+            rtxn: txn,
             new_fields_ids_map,
             doc_alloc,
             ..

View File

@ -1,68 +1,63 @@
use std::io::{self}; use std::cell::RefCell;
use std::io;
use bincode::ErrorKind;
use hashbrown::HashSet; use hashbrown::HashSet;
use heed::types::Bytes; use heed::types::Bytes;
use heed::{Database, RoTxn}; use heed::{Database, RoTxn};
use memmap2::Mmap;
use rayon::iter::{IntoParallelIterator, ParallelIterator}; use rayon::iter::{IntoParallelIterator, ParallelIterator};
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use super::channel::*; use super::channel::*;
use super::extract::{ use super::extract::{
merge_caches, transpose_and_freeze_caches, BalancedCaches, DelAddRoaringBitmap, FacetKind, merge_caches, transpose_and_freeze_caches, BalancedCaches, DelAddRoaringBitmap, FacetKind,
GeoExtractorData,
}; };
use super::DocumentChange; use crate::{CboRoaringBitmapCodec, FieldId, GeoPoint, Index, InternalError, Result};
use crate::{
CboRoaringBitmapCodec, Error, FieldId, GeoPoint, GlobalFieldsIdsMap, Index, InternalError,
Result,
};
pub struct GeoExtractor { #[tracing::instrument(level = "trace", skip_all, target = "indexing::merge")]
rtree: Option<rstar::RTree<GeoPoint>>, pub fn merge_and_send_rtree<'extractor, MSP>(
} datastore: impl IntoIterator<Item = RefCell<GeoExtractorData<'extractor>>>,
rtxn: &RoTxn,
index: &Index,
geo_sender: GeoSender<'_>,
must_stop_processing: &MSP,
) -> Result<()>
where
MSP: Fn() -> bool + Sync,
{
let mut rtree = index.geo_rtree(rtxn)?.unwrap_or_default();
let mut faceted = index.geo_faceted_documents_ids(rtxn)?;
impl GeoExtractor { for data in datastore {
pub fn new(rtxn: &RoTxn, index: &Index) -> Result<Option<Self>> { if must_stop_processing() {
let is_sortable = index.sortable_fields(rtxn)?.contains("_geo"); return Err(InternalError::AbortedIndexation.into());
let is_filterable = index.filterable_fields(rtxn)?.contains("_geo"); }
if is_sortable || is_filterable {
Ok(Some(GeoExtractor { rtree: index.geo_rtree(rtxn)? })) let mut frozen = data.into_inner().freeze()?;
} else { for result in frozen.iter_and_clear_removed() {
Ok(None) let extracted_geo_point = result?;
debug_assert!(rtree.remove(&GeoPoint::from(extracted_geo_point)).is_some());
debug_assert!(faceted.remove(extracted_geo_point.docid));
}
for result in frozen.iter_and_clear_inserted() {
let extracted_geo_point = result?;
rtree.insert(GeoPoint::from(extracted_geo_point));
debug_assert!(faceted.insert(extracted_geo_point.docid));
} }
} }
pub fn manage_change( let mut file = tempfile::tempfile()?;
&mut self, /// manage error
fidmap: &mut GlobalFieldsIdsMap, bincode::serialize_into(&mut file, &rtree).unwrap();
change: &DocumentChange, file.sync_all()?;
) -> Result<()> {
match change {
DocumentChange::Deletion(_) => todo!(),
DocumentChange::Update(_) => todo!(),
DocumentChange::Insertion(_) => todo!(),
}
}
pub fn serialize_rtree<W: io::Write>(self, writer: &mut W) -> Result<bool> { let rtree_mmap = unsafe { Mmap::map(&file)? };
match self.rtree { geo_sender.set_rtree(rtree_mmap).unwrap();
Some(rtree) => { geo_sender.set_geo_faceted(&faceted).unwrap();
// TODO What should I do?
bincode::serialize_into(writer, &rtree).map(|_| true).map_err(|e| match *e { Ok(())
ErrorKind::Io(e) => Error::IoError(e),
ErrorKind::InvalidUtf8Encoding(_) => todo!(),
ErrorKind::InvalidBoolEncoding(_) => todo!(),
ErrorKind::InvalidCharEncoding => todo!(),
ErrorKind::InvalidTagEncoding(_) => todo!(),
ErrorKind::DeserializeAnyNotSupported => todo!(),
ErrorKind::SizeLimit => todo!(),
ErrorKind::SequenceMustHaveLength => todo!(),
ErrorKind::Custom(_) => todo!(),
})
}
None => Ok(false),
}
}
} }
#[tracing::instrument(level = "trace", skip_all, target = "indexing::merge")] #[tracing::instrument(level = "trace", skip_all, target = "indexing::merge")]
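Editor's note: a sketch of the tempfile + mmap hand-off above, under the assumption that the receiving end only needs a read-only byte view of the serialized rtree: write with bincode, sync, then map the file instead of cloning the tree in memory:

use memmap2::Mmap;

fn snapshot<T: serde::Serialize>(value: &T) -> Result<Mmap, Box<dyn std::error::Error>> {
    let mut file = tempfile::tempfile()?;
    bincode::serialize_into(&mut file, value)?;
    // Make sure every byte reached the file before mapping it.
    file.sync_all()?;
    // Safety: the anonymous temp file is exclusively owned and never resized
    // while the map is alive.
    let mmap = unsafe { Mmap::map(&file)? };
    Ok(mmap)
}

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let mmap = snapshot(&vec![1u32, 2, 3])?;
    assert!(!mmap.is_empty());
    Ok(())
}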

View File

@@ -12,7 +12,7 @@ use super::indexer::de::DeserrRawValue;
 use crate::documents::FieldIdMapper;
 use crate::index::IndexEmbeddingConfig;
 use crate::vector::parsed_vectors::{
-    RawVectors, VectorOrArrayOfVectors, RESERVED_VECTORS_FIELD_NAME,
+    RawVectors, RawVectorsError, VectorOrArrayOfVectors, RESERVED_VECTORS_FIELD_NAME,
 };
 use crate::vector::{ArroyWrapper, Embedding, EmbeddingConfigs};
 use crate::{DocumentId, Index, InternalError, Result, UserError};
@@ -71,6 +71,7 @@ pub struct VectorEntry<'doc> {
     pub has_configured_embedder: bool,
     pub embeddings: Option<Embeddings<'doc>>,
     pub regenerate: bool,
+    pub implicit: bool,
 }

 pub trait VectorDocument<'doc> {
@@ -125,6 +126,7 @@ impl<'t> VectorDocumentFromDb<'t> {
             has_configured_embedder: true,
             embeddings: Some(Embeddings::FromDb(vectors)),
             regenerate: !config.user_provided.contains(self.docid),
+            implicit: false,
         })
     }
 }
@@ -141,7 +143,14 @@ impl<'t> VectorDocument<'t> for VectorDocumentFromDb<'t> {
                 Ok((&*config_name, entry))
             })
             .chain(self.vectors_field.iter().flat_map(|map| map.iter()).map(|(name, value)| {
-                Ok((name, entry_from_raw_value(value, false).map_err(InternalError::SerdeJson)?))
+                Ok((
+                    name,
+                    entry_from_raw_value(value, false).map_err(|_| {
+                        InternalError::Serialization(crate::SerializationError::Decoding {
+                            db_name: Some(crate::index::db_name::VECTOR_ARROY),
+                        })
+                    })?,
+                ))
             }))
     }
@@ -153,43 +162,71 @@ impl<'t> VectorDocument<'t> for VectorDocumentFromDb<'t> {
                 Some(self.entry_from_db(embedder_id, config)?)
             }
             None => match self.vectors_field.as_ref().and_then(|obkv| obkv.get(key)) {
-                Some(embedding_from_doc) => Some(
-                    entry_from_raw_value(embedding_from_doc, false)
-                        .map_err(InternalError::SerdeJson)?,
-                ),
+                Some(embedding_from_doc) => {
+                    Some(entry_from_raw_value(embedding_from_doc, false).map_err(|_| {
+                        InternalError::Serialization(crate::SerializationError::Decoding {
+                            db_name: Some(crate::index::db_name::VECTOR_ARROY),
+                        })
+                    })?)
+                }
                 None => None,
             },
         })
     }
 }

+fn entry_from_raw_value_user<'doc>(
+    external_docid: &str,
+    embedder_name: &str,
+    value: &'doc RawValue,
+    has_configured_embedder: bool,
+) -> Result<VectorEntry<'doc>> {
+    entry_from_raw_value(value, has_configured_embedder).map_err(|error| {
+        UserError::InvalidVectorsEmbedderConf {
+            document_id: external_docid.to_string(),
+            error: error.msg(embedder_name),
+        }
+        .into()
+    })
+}
+
 fn entry_from_raw_value(
     value: &RawValue,
     has_configured_embedder: bool,
-) -> std::result::Result<VectorEntry<'_>, serde_json::Error> {
-    let value: RawVectors = serde_json::from_str(value.get())?;
+) -> std::result::Result<VectorEntry<'_>, RawVectorsError> {
+    let value: RawVectors = RawVectors::from_raw_value(value)?;
+
     Ok(match value {
         RawVectors::Explicit(raw_explicit_vectors) => VectorEntry {
             has_configured_embedder,
             embeddings: raw_explicit_vectors.embeddings.map(Embeddings::FromJsonExplicit),
             regenerate: raw_explicit_vectors.regenerate,
+            implicit: false,
         },
         RawVectors::ImplicitlyUserProvided(value) => VectorEntry {
             has_configured_embedder,
-            embeddings: Some(Embeddings::FromJsonImplicityUserProvided(value)),
+            // implicitly user provided always provide embeddings
+            // `None` here means that there are no embeddings
+            embeddings: Some(
+                value
+                    .map(Embeddings::FromJsonImplicityUserProvided)
+                    .unwrap_or(Embeddings::FromDb(Default::default())),
+            ),
             regenerate: false,
+            implicit: true,
         },
     })
 }

 pub struct VectorDocumentFromVersions<'doc> {
+    external_document_id: &'doc str,
     vectors: RawMap<'doc>,
     embedders: &'doc EmbeddingConfigs,
 }

 impl<'doc> VectorDocumentFromVersions<'doc> {
     pub fn new(
+        external_document_id: &'doc str,
         versions: &Versions<'doc>,
         bump: &'doc Bump,
         embedders: &'doc EmbeddingConfigs,
@@ -198,7 +235,7 @@ impl<'doc> VectorDocumentFromVersions<'doc> {
         if let Some(vectors_field) = document.vectors_field()? {
             let vectors =
                 RawMap::from_raw_value(vectors_field, bump).map_err(UserError::SerdeJson)?;
-            Ok(Some(Self { vectors, embedders }))
+            Ok(Some(Self { external_document_id, vectors, embedders }))
         } else {
             Ok(None)
         }
@@ -208,16 +245,24 @@ impl<'doc> VectorDocumentFromVersions<'doc> {
 impl<'doc> VectorDocument<'doc> for VectorDocumentFromVersions<'doc> {
     fn iter_vectors(&self) -> impl Iterator<Item = Result<(&'doc str, VectorEntry<'doc>)>> {
         self.vectors.iter().map(|(embedder, vectors)| {
-            let vectors = entry_from_raw_value(vectors, self.embedders.contains(embedder))
-                .map_err(UserError::SerdeJson)?;
+            let vectors = entry_from_raw_value_user(
+                self.external_document_id,
+                embedder,
+                vectors,
+                self.embedders.contains(embedder),
+            )?;
             Ok((embedder, vectors))
         })
     }

     fn vectors_for_key(&self, key: &str) -> Result<Option<VectorEntry<'doc>>> {
         let Some(vectors) = self.vectors.get(key) else { return Ok(None) };
-        let vectors = entry_from_raw_value(vectors, self.embedders.contains(key))
-            .map_err(UserError::SerdeJson)?;
+        let vectors = entry_from_raw_value_user(
+            self.external_document_id,
+            key,
+            vectors,
+            self.embedders.contains(key),
+        )?;
         Ok(Some(vectors))
     }
 }
@@ -228,8 +273,10 @@ pub struct MergedVectorDocument<'doc> {
 }

 impl<'doc> MergedVectorDocument<'doc> {
+    #[allow(clippy::too_many_arguments)]
     pub fn with_db<Mapper: FieldIdMapper>(
         docid: DocumentId,
+        external_document_id: &'doc str,
         index: &'doc Index,
         rtxn: &'doc RoTxn,
         db_fields_ids_map: &'doc Mapper,
@@ -238,16 +285,20 @@ impl<'doc> MergedVectorDocument<'doc> {
         embedders: &'doc EmbeddingConfigs,
     ) -> Result<Option<Self>> {
         let db = VectorDocumentFromDb::new(docid, index, rtxn, db_fields_ids_map, doc_alloc)?;
-        let new_doc = VectorDocumentFromVersions::new(versions, doc_alloc, embedders)?;
+        let new_doc =
+            VectorDocumentFromVersions::new(&external_document_id, versions, doc_alloc, embedders)?;
         Ok(if db.is_none() && new_doc.is_none() { None } else { Some(Self { new_doc, db }) })
     }

     pub fn without_db(
+        external_document_id: &'doc str,
         versions: &Versions<'doc>,
         doc_alloc: &'doc Bump,
         embedders: &'doc EmbeddingConfigs,
     ) -> Result<Option<Self>> {
-        let Some(new_doc) = VectorDocumentFromVersions::new(versions, doc_alloc, embedders)? else {
+        let Some(new_doc) =
+            VectorDocumentFromVersions::new(external_document_id, versions, doc_alloc, embedders)?
+        else {
             return Ok(None);
         };
         Ok(Some(Self { new_doc: Some(new_doc), db: None }))
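Editor's note: the two `_vectors` shapes this file now distinguishes, shown as JSON payloads. The embedder name `myEmbedder` and the float values are made up for illustration; the field names come from the parser:

use serde_json::json;

fn main() {
    // Implicit form: a bare array of floats (or an array of arrays, or null).
    // It is treated as user-provided, with `regenerate: false`, and flagged
    // `implicit: true` when writing vectors back into documents.
    let implicit = json!({ "_vectors": { "myEmbedder": [0.1, 0.2, 0.3] } });

    // Explicit form: an object where `regenerate` is mandatory, `embeddings`
    // is optional, and any other field is rejected with a hint.
    let explicit = json!({
        "_vectors": {
            "myEmbedder": { "regenerate": false, "embeddings": [0.1, 0.2, 0.3] }
        }
    });

    println!("{implicit}\n{explicit}");
}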

View File

@@ -183,14 +183,17 @@ impl Embedder {
         let token_ids = if token_ids.len() > 512 { &token_ids[..512] } else { token_ids };
         let token_ids =
             Tensor::new(token_ids, &self.model.device).map_err(EmbedError::tensor_shape)?;
+        let token_ids = Tensor::stack(&[token_ids], 0).map_err(EmbedError::tensor_shape)?;
         let token_type_ids = token_ids.zeros_like().map_err(EmbedError::tensor_shape)?;
         let embeddings =
             self.model.forward(&token_ids, &token_type_ids).map_err(EmbedError::model_forward)?;

         // Apply some avg-pooling by taking the mean embedding value for all tokens (including padding)
-        let (n_tokens, _hidden_size) = embeddings.dims2().map_err(EmbedError::tensor_shape)?;
-        let embedding = (embeddings.sum(0).map_err(EmbedError::tensor_value)? / (n_tokens as f64))
+        let (_n_sentence, n_tokens, _hidden_size) =
+            embeddings.dims3().map_err(EmbedError::tensor_shape)?;
+        let embedding = (embeddings.sum(1).map_err(EmbedError::tensor_value)? / (n_tokens as f64))
             .map_err(EmbedError::tensor_shape)?;
+        let embedding = embedding.squeeze(0).map_err(EmbedError::tensor_shape)?;
         let embedding: Embedding = embedding.to_vec1().map_err(EmbedError::tensor_shape)?;
         Ok(embedding)
     }
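Editor's note: a shape walk-through of the fix above, assuming candle_core (the tensor crate the Hugging Face embedder uses). The model now receives a batched [1, n_tokens] input and returns [1, n_tokens, hidden_size], so mean pooling sums over dim 1 and then squeezes the batch dim:

use candle_core::{DType, Device, Tensor};

fn mean_pool(hidden_states: &Tensor) -> candle_core::Result<Vec<f32>> {
    // hidden_states: [n_sentence, n_tokens, hidden_size]
    let (_n_sentence, n_tokens, _hidden_size) = hidden_states.dims3()?;
    // Sum over the token dimension, then divide: [n_sentence, hidden_size]
    let pooled = (hidden_states.sum(1)? / (n_tokens as f64))?;
    // Drop the batch dimension for the single-sentence case: [hidden_size]
    pooled.squeeze(0)?.to_vec1()
}

fn main() -> candle_core::Result<()> {
    let h = Tensor::zeros((1, 4, 8), DType::F32, &Device::Cpu)?;
    let v = mean_pool(&h)?;
    assert_eq!(v.len(), 8);
    Ok(())
}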

View File

@@ -648,7 +648,7 @@ impl Embedder {
             Embedder::HuggingFace(embedder) => embedder.chunk_count_hint(),
             Embedder::OpenAi(embedder) => embedder.chunk_count_hint(),
             Embedder::Ollama(embedder) => embedder.chunk_count_hint(),
-            Embedder::UserProvided(_) => 1,
+            Embedder::UserProvided(_) => 100,
             Embedder::Rest(embedder) => embedder.chunk_count_hint(),
         }
     }

View File

@@ -113,7 +113,7 @@ impl Embedder {
         threads
             .install(move || {
                 let embeddings: Result<Vec<Vec<Embedding>>, _> = texts
-                    .par_chunks(self.chunk_count_hint())
+                    .par_chunks(self.prompt_count_in_chunk_hint())
                     .map(move |chunk| self.embed(chunk))
                     .collect();
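Editor's note: the argument to `par_chunks(n)` is the size of each slice, so `n` must be "prompts per request" (`prompt_count_in_chunk_hint`), not the number of concurrent chunks (`chunk_count_hint`) as before. A minimal rayon sketch of the corrected batching, with a stand-in embed function:

use rayon::prelude::*;

fn embed(chunk: &[&str]) -> Vec<Vec<f32>> {
    chunk.iter().map(|_| vec![0.0; 3]).collect() // stand-in embedding
}

fn main() {
    let texts: Vec<&str> = vec!["a", "b", "c", "d", "e"];
    let prompt_count_in_chunk_hint = 2; // prompts sent per request
    let embeddings: Vec<Vec<Vec<f32>>> =
        texts.par_chunks(prompt_count_in_chunk_hint).map(embed).collect();
    // 5 texts in batches of 2 -> 3 requests
    assert_eq!(embeddings.len(), 3);
}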

View File

@@ -266,7 +266,7 @@ impl Embedder {
         threads
             .install(move || {
                 let embeddings: Result<Vec<Vec<Embedding>>, _> = texts
-                    .par_chunks(self.chunk_count_hint())
+                    .par_chunks(self.prompt_count_in_chunk_hint())
                     .map(move |chunk| self.embed(chunk))
                     .collect();

View File

@@ -12,11 +12,248 @@ use crate::{DocumentId, FieldId, InternalError, UserError};
 pub const RESERVED_VECTORS_FIELD_NAME: &str = "_vectors";

-#[derive(serde::Serialize, serde::Deserialize, Debug)]
+#[derive(serde::Serialize, Debug)]
 #[serde(untagged)]
 pub enum RawVectors<'doc> {
     Explicit(#[serde(borrow)] RawExplicitVectors<'doc>),
-    ImplicitlyUserProvided(#[serde(borrow)] &'doc RawValue),
+    ImplicitlyUserProvided(#[serde(borrow)] Option<&'doc RawValue>),
 }
pub enum RawVectorsError {
DeserializeSeq { index: usize, error: String },
DeserializeKey { error: String },
DeserializeRegenerate { error: String },
DeserializeEmbeddings { error: String },
UnknownField { field: String },
MissingRegenerate,
WrongKind { kind: &'static str, value: String },
Parsing(serde_json::Error),
}
impl RawVectorsError {
pub fn msg(self, embedder_name: &str) -> String {
match self {
RawVectorsError::DeserializeSeq { index, error } => format!(
"Could not parse `._vectors.{embedder_name}[{index}]`: {error}"
),
RawVectorsError::DeserializeKey { error } => format!(
"Could not parse a field at `._vectors.{embedder_name}`: {error}"
),
RawVectorsError::DeserializeRegenerate { error } => format!(
"Could not parse `._vectors.{embedder_name}.regenerate`: {error}"
),
RawVectorsError::DeserializeEmbeddings { error } => format!(
"Could not parse `._vectors.{embedder_name}.embeddings`: {error}"
),
RawVectorsError::UnknownField { field } => format!(
"Unexpected field `._vectors.{embedder_name}.{field}`\n \
- note: the allowed fields are `regenerate` and `embeddings`"
),
RawVectorsError::MissingRegenerate => format!(
"Missing field `._vectors.{embedder_name}.regenerate`\n \
- note: `._vectors.{embedder_name}` must be an array of floats, an array of arrays of floats, or an object with field `regenerate`"
),
RawVectorsError::WrongKind { kind, value } => format!(
"Expected `._vectors.{embedder_name}` to be an array of floats, an array of arrays of floats, or an object with at least the field `regenerate`, but got the {kind} `{value}`"
),
RawVectorsError::Parsing(error) => format!(
"Could not parse `._vectors.{embedder_name}`: {error}"
),
}
}
}
impl<'doc> RawVectors<'doc> {
pub fn from_raw_value(raw: &'doc RawValue) -> Result<Self, RawVectorsError> {
use serde::de::Deserializer as _;
Ok(match raw.deserialize_any(RawVectorsVisitor).map_err(RawVectorsError::Parsing)?? {
RawVectorsVisitorValue::ImplicitNone => RawVectors::ImplicitlyUserProvided(None),
RawVectorsVisitorValue::Implicit => RawVectors::ImplicitlyUserProvided(Some(raw)),
RawVectorsVisitorValue::Explicit { regenerate, embeddings } => {
RawVectors::Explicit(RawExplicitVectors { embeddings, regenerate })
}
})
}
}
struct RawVectorsVisitor;
enum RawVectorsVisitorValue<'doc> {
ImplicitNone,
Implicit,
Explicit { regenerate: bool, embeddings: Option<&'doc RawValue> },
}
impl<'doc> serde::de::Visitor<'doc> for RawVectorsVisitor {
type Value = std::result::Result<RawVectorsVisitorValue<'doc>, RawVectorsError>;
fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
write!(formatter, "a map containing at least `regenerate`, or an array of floats`")
}
fn visit_none<E>(self) -> Result<Self::Value, E>
where
E: serde::de::Error,
{
Ok(Ok(RawVectorsVisitorValue::ImplicitNone))
}
fn visit_some<D>(self, deserializer: D) -> Result<Self::Value, D::Error>
where
D: serde::Deserializer<'doc>,
{
deserializer.deserialize_any(self)
}
fn visit_unit<E>(self) -> Result<Self::Value, E>
where
E: serde::de::Error,
{
Ok(Ok(RawVectorsVisitorValue::ImplicitNone))
}
fn visit_seq<A>(self, mut seq: A) -> Result<Self::Value, A::Error>
where
A: serde::de::SeqAccess<'doc>,
{
let mut index = 0;
// must consume all elements or parsing fails
loop {
match seq.next_element::<&RawValue>() {
Ok(Some(_)) => index += 1,
Err(error) => {
return Ok(Err(RawVectorsError::DeserializeSeq {
index,
error: error.to_string(),
}))
}
Ok(None) => break,
};
}
Ok(Ok(RawVectorsVisitorValue::Implicit))
}
fn visit_map<A>(self, mut map: A) -> Result<Self::Value, A::Error>
where
A: serde::de::MapAccess<'doc>,
{
let mut regenerate = None;
let mut embeddings = None;
loop {
match map.next_key::<&str>() {
Ok(Some("regenerate")) => {
let value: bool = match map.next_value() {
Ok(value) => value,
Err(error) => {
return Ok(Err(RawVectorsError::DeserializeRegenerate {
error: error.to_string(),
}))
}
};
regenerate = Some(value);
}
Ok(Some("embeddings")) => {
let value: &RawValue = match map.next_value() {
Ok(value) => value,
Err(error) => {
return Ok(Err(RawVectorsError::DeserializeEmbeddings {
error: error.to_string(),
}))
}
};
embeddings = Some(value);
}
Ok(Some(other)) => {
return Ok(Err(RawVectorsError::UnknownField { field: other.to_string() }))
}
Ok(None) => break,
Err(error) => {
return Ok(Err(RawVectorsError::DeserializeKey { error: error.to_string() }))
}
}
}
let Some(regenerate) = regenerate else {
return Ok(Err(RawVectorsError::MissingRegenerate));
};
Ok(Ok(RawVectorsVisitorValue::Explicit { regenerate, embeddings }))
}
fn visit_bool<E>(self, v: bool) -> Result<Self::Value, E>
where
E: serde::de::Error,
{
Ok(Err(RawVectorsError::WrongKind { kind: "boolean", value: v.to_string() }))
}
fn visit_i64<E>(self, v: i64) -> Result<Self::Value, E>
where
E: serde::de::Error,
{
Ok(Err(RawVectorsError::WrongKind { kind: "integer", value: v.to_string() }))
}
fn visit_i128<E>(self, v: i128) -> Result<Self::Value, E>
where
E: serde::de::Error,
{
Ok(Err(RawVectorsError::WrongKind { kind: "integer", value: v.to_string() }))
}
fn visit_u64<E>(self, v: u64) -> Result<Self::Value, E>
where
E: serde::de::Error,
{
Ok(Err(RawVectorsError::WrongKind { kind: "integer", value: v.to_string() }))
}
fn visit_u128<E>(self, v: u128) -> Result<Self::Value, E>
where
E: serde::de::Error,
{
Ok(Err(RawVectorsError::WrongKind { kind: "integer", value: v.to_string() }))
}
fn visit_f64<E>(self, v: f64) -> Result<Self::Value, E>
where
E: serde::de::Error,
{
Ok(Err(RawVectorsError::WrongKind { kind: "number", value: v.to_string() }))
}
fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
where
E: serde::de::Error,
{
Ok(Err(RawVectorsError::WrongKind { kind: "string", value: v.to_string() }))
}
fn visit_string<E>(self, v: String) -> Result<Self::Value, E>
where
E: serde::de::Error,
{
Ok(Err(RawVectorsError::WrongKind { kind: "string", value: v }))
}
fn visit_bytes<E>(self, v: &[u8]) -> Result<Self::Value, E>
where
E: serde::de::Error,
{
Ok(Err(RawVectorsError::WrongKind { kind: "bytes", value: format!("{v:?}") }))
}
fn visit_newtype_struct<D>(self, deserializer: D) -> Result<Self::Value, D::Error>
where
D: serde::Deserializer<'doc>,
{
deserializer.deserialize_any(self)
}
fn visit_enum<A>(self, _data: A) -> Result<Self::Value, A::Error>
where
A: serde::de::EnumAccess<'doc>,
{
Ok(Err(RawVectorsError::WrongKind { kind: "enum", value: "a variant".to_string() }))
}
 }

 #[derive(serde::Serialize, Debug)]
@@ -86,7 +323,7 @@ impl<'doc> RawVectors<'doc> {
     }
     pub fn embeddings(&self) -> Option<&'doc RawValue> {
         match self {
-            RawVectors::ImplicitlyUserProvided(embeddings) => Some(embeddings),
+            RawVectors::ImplicitlyUserProvided(embeddings) => *embeddings,
             RawVectors::Explicit(RawExplicitVectors { embeddings, regenerate: _ }) => *embeddings,
         }
     }
@@ -258,7 +495,7 @@ impl Error {
             Error::InvalidEmbedderConf { error } => {
                 crate::Error::UserError(UserError::InvalidVectorsEmbedderConf {
                     document_id,
-                    error,
+                    error: error.to_string(),
                 })
             }
             Error::InternalSerdeJson(error) => {
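Editor's note: a usage sketch of the double-`Result` pattern that `RawVectors::from_raw_value` relies on, with simplified stand-in types. The outer `Result` carries serde_json parse failures, the inner one carries semantic errors produced by the visitor, and `??` unwraps both layers:

use serde::de::{Deserializer, Visitor};

struct KindVisitor;

impl<'de> Visitor<'de> for KindVisitor {
    type Value = Result<&'static str, String>;

    fn expecting(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        write!(f, "an array or a map")
    }

    fn visit_seq<A: serde::de::SeqAccess<'de>>(self, mut seq: A) -> Result<Self::Value, A::Error> {
        // Consume every element so the parser stays in sync.
        while seq.next_element::<serde_json::Value>()?.is_some() {}
        Ok(Ok("array"))
    }

    fn visit_bool<E: serde::de::Error>(self, v: bool) -> Result<Self::Value, E> {
        // Semantic error: well-formed JSON of the wrong kind.
        Ok(Err(format!("expected an array or a map, got the boolean `{v}`")))
    }
}

fn classify(raw: &str) -> Result<&'static str, String> {
    let mut de = serde_json::Deserializer::from_str(raw);
    // First `?`: malformed JSON; second `?`: wrong kind.
    let kind = de.deserialize_any(KindVisitor).map_err(|e| e.to_string())??;
    Ok(kind)
}

fn main() {
    assert_eq!(classify("[1, 2]"), Ok("array"));
    assert!(classify("true").is_err());
    assert!(classify("{ not json").is_err());
}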

View File

@@ -193,7 +193,7 @@ impl Embedder {
         threads
             .install(move || {
                 let embeddings: Result<Vec<Vec<Embedding>>, _> = texts
-                    .par_chunks(self.chunk_count_hint())
+                    .par_chunks(self.prompt_count_in_chunk_hint())
                     .map(move |chunk| self.embed_ref(chunk))
                     .collect();