Mirror of https://github.com/meilisearch/meilisearch.git (synced 2024-11-23 02:27:40 +08:00)

Compare commits: ab2c83f868...323ecbb885 (13 commits)

Commits:
323ecbb885
ffb60cb885
dcc3caef0d
221e547e86
61d0615253
5727e00374
9b60843831
36962b943b
32bcacefd5
4ed195426c
ff38f29981
94b260fd25
03ab6b39e7
Cargo.lock (generated): 2 changed lines

@@ -4483,7 +4483,7 @@ dependencies = [
 [[package]]
 name = "raw-collections"
 version = "0.1.0"
-source = "git+https://github.com/dureuill/raw-collections.git#15e5d7bdebc0c149b2a28b2454f307c717d07f8a"
+source = "git+https://github.com/meilisearch/raw-collections.git#15e5d7bdebc0c149b2a28b2454f307c717d07f8a"
 dependencies = [
 "allocator-api2",
 "bitpacking",
@@ -22,7 +22,7 @@ flate2 = "1.0.30"
 meilisearch-auth = { path = "../meilisearch-auth" }
 meilisearch-types = { path = "../meilisearch-types" }
 page_size = "0.6.0"
-raw-collections = { git = "https://github.com/dureuill/raw-collections.git", version = "0.1.0" }
+raw-collections = { git = "https://github.com/meilisearch/raw-collections.git", version = "0.1.0" }
 rayon = "1.10.0"
 roaring = { version = "0.10.6", features = ["serde"] }
 serde = { version = "1.0.204", features = ["derive"] }
@@ -1239,7 +1239,7 @@ impl IndexScheduler {

 let started_processing_at = std::time::Instant::now();
 let secs_since_started_processing_at = AtomicU64::new(0);
-const PRINT_SECS_DELTA: u64 = 1;
+const PRINT_SECS_DELTA: u64 = 5;

 let processing_tasks = self.processing_tasks.clone();
 let must_stop_processing = self.must_stop_processing.clone();
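For context on the constant above: the scheduler keeps the elapsed seconds since processing started in an AtomicU64 and only reports progress when at least PRINT_SECS_DELTA seconds have passed, so raising the delta from 1 to 5 makes that reporting less chatty. A minimal sketch of such a ticker pattern, with hypothetical names and logging (not the scheduler's actual code):

use std::sync::atomic::{AtomicU64, Ordering};
use std::time::Instant;

const PRINT_SECS_DELTA: u64 = 5;

fn maybe_report(started_processing_at: Instant, last_reported_secs: &AtomicU64) {
    // Seconds elapsed since processing started.
    let now = started_processing_at.elapsed().as_secs();
    // Seconds recorded at the previous report.
    let prev = last_reported_secs.load(Ordering::Relaxed);
    // Report only once the configured delta has elapsed since the last report.
    if now.saturating_sub(prev) >= PRINT_SECS_DELTA {
        last_reported_secs.store(now, Ordering::Relaxed);
        println!("still processing, {now}s elapsed");
    }
}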
@@ -1329,7 +1329,6 @@ impl IndexScheduler {
 DocumentOperation::Add(_content_uuid) => {
 let mmap = content_files_iter.next().unwrap();
 indexer.add_documents(mmap)?;
-// builder = builder.with_embedders(embedders.clone());
 }
 DocumentOperation::Delete(document_ids) => {
 let document_ids: bumpalo::collections::vec::Vec<_> = document_ids
@@ -1412,17 +1411,6 @@ impl IndexScheduler {

 tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done");
 }
-// else if primary_key_has_been_set {
-// // Everything failed but we've set a primary key.
-// // We need to remove it.
-// let mut builder =
-// milli::update::Settings::new(index_wtxn, index, indexer_config);
-// builder.reset_primary_key();
-// builder.execute(
-// |indexing_step| tracing::trace!(update = ?indexing_step),
-// || must_stop_processing.clone().get(),
-// )?;
-// }

 Ok(tasks)
 }
@@ -1365,6 +1365,7 @@ impl IndexScheduler {
 let ProcessingTasks { batch, processing, progress } =
 self.processing_tasks.read().map_err(|_| Error::CorruptedTaskQueue)?.clone();

+// ignored for now, might be added to batch details later
 let _ = progress;

 let ret = tasks.into_iter();
@@ -5198,11 +5199,9 @@ mod tests {
 handle.advance_one_successful_batch();
 snapshot!(snapshot_index_scheduler(&index_scheduler), name: "only_first_task_succeed");

-// The second batch should fail.
 handle.advance_one_successful_batch();
 snapshot!(snapshot_index_scheduler(&index_scheduler), name: "second_task_fails");

-// The second batch should fail.
 handle.advance_one_successful_batch();
 snapshot!(snapshot_index_scheduler(&index_scheduler), name: "third_task_fails");

@@ -5263,7 +5262,6 @@ mod tests {
 handle.advance_one_successful_batch();
 snapshot!(snapshot_index_scheduler(&index_scheduler), name: "only_first_task_succeed");

-// The second batch should fail and contains two tasks.
 handle.advance_one_successful_batch();
 snapshot!(snapshot_index_scheduler(&index_scheduler), name: "second_and_third_tasks_fails");

@@ -11,6 +11,7 @@ edition.workspace = true
 license.workspace = true

 [dependencies]
+# fixed version due to format breakages in v1.40
 insta = { version = "=1.39.0", features = ["json", "redactions"] }
 md5 = "0.7.0"
 once_cell = "1.19"
@@ -24,7 +24,7 @@ flate2 = "1.0.30"
 fst = "0.4.7"
 memmap2 = "0.9.4"
 milli = { path = "../milli" }
-raw-collections = { git = "https://github.com/dureuill/raw-collections.git", version = "0.1.0" }
+raw-collections = { git = "https://github.com/meilisearch/raw-collections.git", version = "0.1.0" }
 roaring = { version = "0.10.6", features = ["serde"] }
 serde = { version = "1.0.204", features = ["derive"] }
 serde-cs = "0.2.4"
@@ -6,7 +6,6 @@ use std::marker::PhantomData;
 use bumpalo::Bump;
 use memmap2::Mmap;
 use milli::documents::Error;
-use milli::update::new::TopLevelMap;
 use milli::Object;
 use raw_collections::RawMap;
 use serde::de::{SeqAccess, Visitor};
@@ -128,7 +127,6 @@ impl ErrorCode for DocumentFormatError {
 }
 }

-// TODO remove that from the place I've borrowed it
 #[derive(Debug)]
 enum AllowedType {
 String,
@@ -213,7 +211,7 @@ pub fn read_csv(input: &File, output: impl io::Write, delimiter: u8) -> Result<u

 /// Reads JSON from file and write it in NDJSON in a file checking it along the way.
 pub fn read_json(input: &File, output: impl io::Write) -> Result<u64> {
-// We memory map to be able to deserailize into a TopLevelMap<'pl> that
+// We memory map to be able to deserialize into a RawMap that
 // does not allocate when possible and only materialize the first/top level.
 let input = unsafe { Mmap::map(input).map_err(DocumentFormatError::Io)? };
 let mut doc_alloc = Bump::with_capacity(1024 * 1024 * 1024); // 1MiB
@@ -254,16 +252,23 @@ pub fn read_json(input: &File, output: impl io::Write) -> Result<u64> {

 /// Reads NDJSON from file and write it in NDJSON in a file checking it along the way.
 pub fn read_ndjson(input: &File, output: impl io::Write) -> Result<u64> {
-// We memory map to be able to deserailize into a TopLevelMap<'pl> that
+// We memory map to be able to deserialize into a RawMap that
 // does not allocate when possible and only materialize the first/top level.
 let input = unsafe { Mmap::map(input).map_err(DocumentFormatError::Io)? };
 let mut output = BufWriter::new(output);

+let mut bump = Bump::with_capacity(1024 * 1024);
+
 let mut count = 0;
 for result in serde_json::Deserializer::from_slice(&input).into_iter() {
+bump.reset();
 count += 1;
 result
-.and_then(|map: TopLevelMap| to_writer(&mut output, &map))
+.and_then(|raw: &RawValue| {
+// try to deserialize as a map
+let map = RawMap::from_raw_value(raw, &bump)?;
+to_writer(&mut output, &map)
+})
 .map_err(|e| DocumentFormatError::from((PayloadType::Ndjson, e)))?;
 }

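The change above switches read_ndjson from milli's TopLevelMap to raw-collections' RawMap: each NDJSON line is deserialized into a borrowed &RawValue, only the top level is materialized into a map backed by a bump arena that is reset between documents, and the value is then rewritten to the output. A rough standalone sketch of the same validate-and-copy idea using only serde_json (no raw-collections or bumpalo); the names and error handling here are illustrative, not the crate's actual API:

use std::io::{self, BufWriter, Write};

use serde_json::value::RawValue;
use serde_json::{Map, Value};

/// Checks that every NDJSON line is a JSON object and copies it to `output`.
fn copy_ndjson(input: &[u8], output: impl Write) -> io::Result<u64> {
    let mut output = BufWriter::new(output);
    let mut count = 0;
    for result in serde_json::Deserializer::from_slice(input).into_iter::<&RawValue>() {
        let raw = result.map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
        // Parse the line to check that it is a JSON object; the real code only
        // materializes the top level, this sketch parses the whole value.
        let _object: Map<String, Value> = serde_json::from_str(raw.get())
            .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
        output.write_all(raw.get().as_bytes())?;
        output.write_all(b"\n")?;
        count += 1;
    }
    output.flush()?;
    Ok(count)
}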
@@ -172,7 +172,7 @@ async fn create_mock_with_template(
 }))
 }
 }
-// 3. check API key
+// 2. check API key
 match req.headers.get("Authorization") {
 Some(api_key) if api_key == API_KEY_BEARER => {
 {}
@@ -95,7 +95,7 @@ ureq = { version = "2.10.0", features = ["json"] }
 url = "2.5.2"
 rayon-par-bridge = "0.1.0"
 hashbrown = "0.15.0"
-raw-collections = { git = "https://github.com/dureuill/raw-collections.git", version = "0.1.0" }
+raw-collections = { git = "https://github.com/meilisearch/raw-collections.git", version = "0.1.0" }
 bumpalo = "3.16.0"
 thread_local = "1.1.8"
 allocator-api2 = "0.2.18"
@@ -2,7 +2,7 @@ use std::io::{self, Write};

 use grenad::{CompressionType, WriterBuilder};
 use serde::de::Deserializer;
-use serde_json::{to_writer, Value};
+use serde_json::to_writer;

 use super::{DocumentsBatchIndex, Error, DOCUMENTS_BATCH_INDEX_KEY};
 use crate::documents::serde_impl::DocumentVisitor;
@@ -87,95 +87,6 @@ impl<W: Write> DocumentsBatchBuilder<W> {
 de.deserialize_any(&mut visitor)?
 }

-/// Appends a new CSV file into the batch and updates the `DocumentsBatchIndex` accordingly.
-pub fn append_csv<R: io::Read>(&mut self, mut reader: csv::Reader<R>) -> Result<(), Error> {
-// Make sure that we insert the fields ids in order as the obkv writer has this requirement.
-let mut typed_fields_ids: Vec<_> = reader
-.headers()?
-.into_iter()
-.map(parse_csv_header)
-.map(|(k, t)| (self.fields_index.insert(k), t))
-.enumerate()
-.collect();
-// Make sure that we insert the fields ids in order as the obkv writer has this requirement.
-typed_fields_ids.sort_unstable_by_key(|(_, (fid, _))| *fid);
-
-let mut record = csv::StringRecord::new();
-let mut line = 0;
-while reader.read_record(&mut record)? {
-// We increment here and not at the end of the while loop to take
-// the header offset into account.
-line += 1;
-
-self.obkv_buffer.clear();
-let mut writer = obkv::KvWriter::new(&mut self.obkv_buffer);
-
-for (i, (field_id, type_)) in typed_fields_ids.iter() {
-self.value_buffer.clear();
-
-let value = &record[*i];
-let trimmed_value = value.trim();
-match type_ {
-AllowedType::Number => {
-if trimmed_value.is_empty() {
-to_writer(&mut self.value_buffer, &Value::Null)?;
-} else if let Ok(integer) = trimmed_value.parse::<i64>() {
-to_writer(&mut self.value_buffer, &integer)?;
-} else {
-match trimmed_value.parse::<f64>() {
-Ok(float) => {
-to_writer(&mut self.value_buffer, &float)?;
-}
-Err(error) => {
-return Err(Error::ParseFloat {
-error,
-line,
-value: value.to_string(),
-});
-}
-}
-}
-}
-AllowedType::Boolean => {
-if trimmed_value.is_empty() {
-to_writer(&mut self.value_buffer, &Value::Null)?;
-} else {
-match trimmed_value.parse::<bool>() {
-Ok(bool) => {
-to_writer(&mut self.value_buffer, &bool)?;
-}
-Err(error) => {
-return Err(Error::ParseBool {
-error,
-line,
-value: value.to_string(),
-});
-}
-}
-}
-}
-AllowedType::String => {
-if value.is_empty() {
-to_writer(&mut self.value_buffer, &Value::Null)?;
-} else {
-to_writer(&mut self.value_buffer, value)?;
-}
-}
-}
-
-// We insert into the obkv writer the value buffer that has been filled just above.
-writer.insert(*field_id, &self.value_buffer)?;
-}
-
-let internal_id = self.documents_count.to_be_bytes();
-let document_bytes = writer.into_inner()?;
-self.writer.insert(internal_id, &document_bytes)?;
-self.documents_count += 1;
-}
-
-Ok(())
-}
-
 /// Flushes the content on disk and stores the final version of the `DocumentsBatchIndex`.
 pub fn into_inner(mut self) -> io::Result<W> {
 let DocumentsBatchBuilder { mut writer, fields_index, .. } = self;
@@ -189,35 +100,12 @@ impl<W: Write> DocumentsBatchBuilder<W> {
 }
 }

-#[derive(Debug)]
-enum AllowedType {
-String,
-Boolean,
-Number,
-}
-
-fn parse_csv_header(header: &str) -> (&str, AllowedType) {
-// if there are several separators we only split on the last one.
-match header.rsplit_once(':') {
-Some((field_name, field_type)) => match field_type {
-"string" => (field_name, AllowedType::String),
-"boolean" => (field_name, AllowedType::Boolean),
-"number" => (field_name, AllowedType::Number),
-// if the pattern isn't recognized, we keep the whole field.
-_otherwise => (header, AllowedType::String),
-},
-None => (header, AllowedType::String),
-}
-}
-
 #[cfg(test)]
 mod test {
 use std::io::Cursor;

-use serde_json::json;
-
 use super::*;
-use crate::documents::{obkv_to_object, DocumentsBatchReader};
+use crate::documents::DocumentsBatchReader;

 #[test]
 fn add_single_documents_json() {
@@ -253,348 +141,4 @@ mod test {

 assert!(cursor.next_document().unwrap().is_none());
 }
-
-#[test]
-fn add_documents_csv() {
-let csv_content = "id:number,field:string\n1,hello!\n2,blabla";
-let csv = csv::Reader::from_reader(Cursor::new(csv_content));
-
-let mut builder = DocumentsBatchBuilder::new(Vec::new());
-builder.append_csv(csv).unwrap();
-assert_eq!(builder.documents_count(), 2);
-let vector = builder.into_inner().unwrap();
-
-let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
-.unwrap()
-.into_cursor_and_fields_index();
-assert_eq!(index.len(), 2);
-
-let document = cursor.next_document().unwrap().unwrap();
-assert_eq!(document.iter().count(), 2);
-
-let document = cursor.next_document().unwrap().unwrap();
-assert_eq!(document.iter().count(), 2);
-
-assert!(cursor.next_document().unwrap().is_none());
-}
-
-#[test]
-fn simple_csv_document() {
-let csv_content = r#"city,country,pop
-"Boston","United States","4628910""#;
-let csv = csv::Reader::from_reader(Cursor::new(csv_content));
-
-let mut builder = DocumentsBatchBuilder::new(Vec::new());
-builder.append_csv(csv).unwrap();
-let vector = builder.into_inner().unwrap();
-
-let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
-.unwrap()
-.into_cursor_and_fields_index();
-let doc = cursor.next_document().unwrap().unwrap();
-let val = obkv_to_object(doc, &index).map(Value::from).unwrap();
-
-assert_eq!(
-val,
-json!({
-"city": "Boston",
-"country": "United States",
-"pop": "4628910",
-})
-);
-
-assert!(cursor.next_document().unwrap().is_none());
-}
-
-#[test]
-fn coma_in_field() {
-let csv_content = r#"city,country,pop
-"Boston","United, States","4628910""#;
-let csv = csv::Reader::from_reader(Cursor::new(csv_content));
-
-let mut builder = DocumentsBatchBuilder::new(Vec::new());
-builder.append_csv(csv).unwrap();
-let vector = builder.into_inner().unwrap();
-
-let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
-.unwrap()
-.into_cursor_and_fields_index();
-
-let doc = cursor.next_document().unwrap().unwrap();
-let val = obkv_to_object(doc, &index).map(Value::from).unwrap();
-
-assert_eq!(
-val,
-json!({
-"city": "Boston",
-"country": "United, States",
-"pop": "4628910",
-})
-);
-}
-
-#[test]
-fn quote_in_field() {
-let csv_content = r#"city,country,pop
-"Boston","United"" States","4628910""#;
-let csv = csv::Reader::from_reader(Cursor::new(csv_content));
-
-let mut builder = DocumentsBatchBuilder::new(Vec::new());
-builder.append_csv(csv).unwrap();
-let vector = builder.into_inner().unwrap();
-
-let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
-.unwrap()
-.into_cursor_and_fields_index();
-
-let doc = cursor.next_document().unwrap().unwrap();
-let val = obkv_to_object(doc, &index).map(Value::from).unwrap();
-
-assert_eq!(
-val,
-json!({
-"city": "Boston",
-"country": "United\" States",
-"pop": "4628910",
-})
-);
-}
-
-#[test]
-fn integer_in_field() {
-let csv_content = r#"city,country,pop:number
-"Boston","United States","4628910""#;
-let csv = csv::Reader::from_reader(Cursor::new(csv_content));
-
-let mut builder = DocumentsBatchBuilder::new(Vec::new());
-builder.append_csv(csv).unwrap();
-let vector = builder.into_inner().unwrap();
-
-let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
-.unwrap()
-.into_cursor_and_fields_index();
-
-let doc = cursor.next_document().unwrap().unwrap();
-let val = obkv_to_object(doc, &index).map(Value::from).unwrap();
-
-assert_eq!(
-val,
-json!({
-"city": "Boston",
-"country": "United States",
-"pop": 4628910,
-})
-);
-}
-
-#[test]
-fn integer_as_id() {
-let csv_content = r#""id:number","title:string","comment:string"
-"1239","Pride and Prejudice","A great book""#;
-let csv = csv::Reader::from_reader(Cursor::new(csv_content));
-
-let mut builder = DocumentsBatchBuilder::new(Vec::new());
-builder.append_csv(csv).unwrap();
-let vector = builder.into_inner().unwrap();
-
-let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
-.unwrap()
-.into_cursor_and_fields_index();
-
-let doc = cursor.next_document().unwrap().unwrap();
-let val = obkv_to_object(doc, &index).map(Value::from).unwrap();
-
-assert_eq!(
-val,
-json!({
-"id": 1239,
-"title": "Pride and Prejudice",
-"comment": "A great book",
-})
-);
-}
-
-#[test]
-fn float_in_field() {
-let csv_content = r#"city,country,pop:number
-"Boston","United States","4628910.01""#;
-let csv = csv::Reader::from_reader(Cursor::new(csv_content));
-
-let mut builder = DocumentsBatchBuilder::new(Vec::new());
-builder.append_csv(csv).unwrap();
-let vector = builder.into_inner().unwrap();
-
-let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
-.unwrap()
-.into_cursor_and_fields_index();
-
-let doc = cursor.next_document().unwrap().unwrap();
-let val = obkv_to_object(doc, &index).map(Value::from).unwrap();
-
-assert_eq!(
-val,
-json!({
-"city": "Boston",
-"country": "United States",
-"pop": 4628910.01,
-})
-);
-}
-
-#[test]
-fn several_colon_in_header() {
-let csv_content = r#"city:love:string,country:state,pop
-"Boston","United States","4628910""#;
-let csv = csv::Reader::from_reader(Cursor::new(csv_content));
-
-let mut builder = DocumentsBatchBuilder::new(Vec::new());
-builder.append_csv(csv).unwrap();
-let vector = builder.into_inner().unwrap();
-
-let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
-.unwrap()
-.into_cursor_and_fields_index();
-
-let doc = cursor.next_document().unwrap().unwrap();
-let val = obkv_to_object(doc, &index).map(Value::from).unwrap();
-
-assert_eq!(
-val,
-json!({
-"city:love": "Boston",
-"country:state": "United States",
-"pop": "4628910",
-})
-);
-}
-
-#[test]
-fn ending_by_colon_in_header() {
-let csv_content = r#"city:,country,pop
-"Boston","United States","4628910""#;
-let csv = csv::Reader::from_reader(Cursor::new(csv_content));
-
-let mut builder = DocumentsBatchBuilder::new(Vec::new());
-builder.append_csv(csv).unwrap();
-let vector = builder.into_inner().unwrap();
-
-let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
-.unwrap()
-.into_cursor_and_fields_index();
-
-let doc = cursor.next_document().unwrap().unwrap();
-let val = obkv_to_object(doc, &index).map(Value::from).unwrap();
-
-assert_eq!(
-val,
-json!({
-"city:": "Boston",
-"country": "United States",
-"pop": "4628910",
-})
-);
-}
-
-#[test]
-fn starting_by_colon_in_header() {
-let csv_content = r#":city,country,pop
-"Boston","United States","4628910""#;
-let csv = csv::Reader::from_reader(Cursor::new(csv_content));
-
-let mut builder = DocumentsBatchBuilder::new(Vec::new());
-builder.append_csv(csv).unwrap();
-let vector = builder.into_inner().unwrap();
-
-let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
-.unwrap()
-.into_cursor_and_fields_index();
-
-let doc = cursor.next_document().unwrap().unwrap();
-let val = obkv_to_object(doc, &index).map(Value::from).unwrap();
-
-assert_eq!(
-val,
-json!({
-":city": "Boston",
-"country": "United States",
-"pop": "4628910",
-})
-);
-}
-
-#[ignore]
-#[test]
-fn starting_by_colon_in_header2() {
-let csv_content = r#":string,country,pop
-"Boston","United States","4628910""#;
-let csv = csv::Reader::from_reader(Cursor::new(csv_content));
-
-let mut builder = DocumentsBatchBuilder::new(Vec::new());
-builder.append_csv(csv).unwrap();
-let vector = builder.into_inner().unwrap();
-
-let (mut cursor, _) = DocumentsBatchReader::from_reader(Cursor::new(vector))
-.unwrap()
-.into_cursor_and_fields_index();
-
-assert!(cursor.next_document().is_err());
-}
-
-#[test]
-fn double_colon_in_header() {
-let csv_content = r#"city::string,country,pop
-"Boston","United States","4628910""#;
-let csv = csv::Reader::from_reader(Cursor::new(csv_content));
-
-let mut builder = DocumentsBatchBuilder::new(Vec::new());
-builder.append_csv(csv).unwrap();
-let vector = builder.into_inner().unwrap();
-
-let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
-.unwrap()
-.into_cursor_and_fields_index();
-
-let doc = cursor.next_document().unwrap().unwrap();
-let val = obkv_to_object(doc, &index).map(Value::from).unwrap();
-
-assert_eq!(
-val,
-json!({
-"city:": "Boston",
-"country": "United States",
-"pop": "4628910",
-})
-);
-}
-
-#[test]
-fn bad_type_in_header() {
-let csv_content = r#"city,country:number,pop
-"Boston","United States","4628910""#;
-let csv = csv::Reader::from_reader(Cursor::new(csv_content));
-
-let mut builder = DocumentsBatchBuilder::new(Vec::new());
-assert!(builder.append_csv(csv).is_err());
-}
-
-#[test]
-fn bad_column_count1() {
-let csv_content = r#"city,country,pop
-"Boston","United States","4628910", "too much
-let csv = csv::Reader::from_reader(Cursor::new(csv_content"#;
-let csv = csv::Reader::from_reader(Cursor::new(csv_content));
-
-let mut builder = DocumentsBatchBuilder::new(Vec::new());
-assert!(builder.append_csv(csv).is_err());
-}
-
-#[test]
-fn bad_column_count2() {
-let csv_content = r#"city,country,pop
-"Boston","United States""#;
-let csv = csv::Reader::from_reader(Cursor::new(csv_content));
-
-let mut builder = DocumentsBatchBuilder::new(Vec::new());
-assert!(builder.append_csv(csv).is_err());
-}
 }
@@ -253,33 +253,4 @@ mod test {
 {"id": 2,"a": 0,"b": 0},
 ]);
 }
-
-#[test]
-fn csv_types_dont_panic() {
-let csv1_content =
-"id:number,b:boolean,c,d:number\n1,,,\n2,true,doggo,2\n3,false,the best doggo,-2\n4,,\"Hello, World!\",2.5";
-let csv1 = csv::Reader::from_reader(Cursor::new(csv1_content));
-
-let mut builder = DocumentsBatchBuilder::new(Vec::new());
-builder.append_csv(csv1).unwrap();
-let vector = builder.into_inner().unwrap();
-
-DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap();
-}
-
-#[test]
-fn out_of_order_csv_fields() {
-let csv1_content = "id:number,b\n1,0";
-let csv1 = csv::Reader::from_reader(Cursor::new(csv1_content));
-
-let csv2_content = "id:number,a,b\n2,0,0";
-let csv2 = csv::Reader::from_reader(Cursor::new(csv2_content));
-
-let mut builder = DocumentsBatchBuilder::new(Vec::new());
-builder.append_csv(csv1).unwrap();
-builder.append_csv(csv2).unwrap();
-let vector = builder.into_inner().unwrap();
-
-DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap();
-}
 }
@@ -3,7 +3,6 @@ use std::sync::RwLock;

 use super::metadata::{FieldIdMapWithMetadata, Metadata};
 use super::MutFieldIdMapper;
-use crate::documents::FieldIdMapper;
 use crate::FieldId;

 /// A fields ids map that can be globally updated to add fields
@@ -14,22 +13,12 @@ pub struct GlobalFieldsIdsMap<'indexing> {
 }

 #[derive(Debug, Clone)]
-pub struct LocalFieldsIdsMap {
+struct LocalFieldsIdsMap {
 names_ids: BTreeMap<String, FieldId>,
 ids_names: BTreeMap<FieldId, String>,
 metadata: BTreeMap<FieldId, Metadata>,
 }

-impl FieldIdMapper for LocalFieldsIdsMap {
-fn id(&self, name: &str) -> Option<FieldId> {
-self.id(name)
-}
-
-fn name(&self, id: FieldId) -> Option<&str> {
-self.name(id)
-}
-}
-
 impl LocalFieldsIdsMap {
 fn new(global: &RwLock<FieldIdMapWithMetadata>) -> Self {
 let global = global.read().unwrap();
@@ -116,10 +105,6 @@ impl<'indexing> GlobalFieldsIdsMap<'indexing> {

 self.local.name(id)
 }
-
-pub fn local_map(&self) -> &LocalFieldsIdsMap {
-&self.local
-}
 }

 impl<'indexing> MutFieldIdMapper for GlobalFieldsIdsMap<'indexing> {
@@ -102,6 +102,7 @@ impl Metadata {
 rules: &'rules [LocalizedAttributesRule],
 ) -> Option<&'rules [Language]> {
 let localized_attributes_rule_id = self.localized_attributes_rule_id?.get();
+// - 1: `localized_attributes_rule_id` is NonZero
 let rule = rules.get((localized_attributes_rule_id - 1) as usize).unwrap();
 Some(rule.locales())
 }
@@ -160,6 +161,7 @@ impl MetadataBuilder {
 .iter()
 .flat_map(|v| v.iter())
 .position(|rule| rule.match_str(field))
+// saturating_add(1): make `id` `NonZero`
 .map(|id| NonZeroU16::new(id.saturating_add(1).try_into().unwrap()).unwrap());

 Metadata { searchable, filterable, sortable, localized_attributes_rule_id }
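The two comments added above document one convention: the 0-based position of the matching rule is stored shifted by one so it fits in a NonZeroU16 (letting Option<NonZeroU16> stay two bytes through niche optimization), and readers subtract one to recover the index. A tiny self-contained illustration of that encoding, with hypothetical helper names (the real code does this inline, as shown in the hunks above):

use std::num::NonZeroU16;

fn encode_rule_id(position: usize) -> NonZeroU16 {
    // position 0 becomes 1, position 1 becomes 2, ...
    NonZeroU16::new(position.saturating_add(1).try_into().unwrap()).unwrap()
}

fn decode_rule_position(id: NonZeroU16) -> usize {
    // Undo the shift applied by `encode_rule_id`.
    (id.get() - 1) as usize
}

fn main() {
    assert_eq!(decode_rule_position(encode_rule_id(0)), 0);
    assert_eq!(decode_rule_position(encode_rule_id(41)), 41);
    // Option<NonZeroU16> is the same size as u16 thanks to niche optimization.
    assert_eq!(std::mem::size_of::<Option<NonZeroU16>>(), 2);
}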
@@ -46,6 +46,7 @@ fn encode_f64_into_ordered_bytes(
 f: f64,
 buffer: &mut [u8; 16],
 ) -> Result<(), InvalidGloballyOrderedFloatError> {
+// write the globally ordered float
 let bytes = f64_into_bytes(f).ok_or(InvalidGloballyOrderedFloatError { float: f })?;
 buffer[..8].copy_from_slice(&bytes[..]);
 // Then the f64 value just to be able to read it back
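For background on the helper used above: a "globally ordered" float encoding maps an f64 to 8 bytes whose lexicographic (big-endian) order matches the numeric order, so the bytes can serve as a sortable key prefix; the raw f64 is then appended so the value can be read back. A sketch of the usual bit-twiddling technique, assuming finite, non-NaN inputs (milli's f64_into_bytes helper may differ in details such as which values it rejects):

fn f64_to_ordered_bytes(f: f64) -> [u8; 8] {
    let bits = f.to_bits();
    // Negative floats: flipping every bit reverses their order and places them
    // below all positives. Positive floats: flipping only the sign bit moves
    // them above the negatives while keeping their relative order.
    let ordered = if bits >> 63 == 1 { !bits } else { bits ^ (1 << 63) };
    ordered.to_be_bytes()
}

fn main() {
    assert!(f64_to_ordered_bytes(-10.5) < f64_to_ordered_bytes(-0.25));
    assert!(f64_to_ordered_bytes(-0.25) < f64_to_ordered_bytes(0.0));
    assert!(f64_to_ordered_bytes(0.0) < f64_to_ordered_bytes(3.75));
}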
@@ -160,7 +160,7 @@ impl<'doc, D: DocumentTrait<'doc> + Debug> ObjectView for ParseableDocument<'doc
 }

 fn size(&self) -> i64 {
-self.document.len() as i64
+self.document.top_level_fields_count() as i64
 }

 fn keys<'k>(&'k self) -> Box<dyn Iterator<Item = KStringCow<'k>> + 'k> {
@@ -87,23 +87,10 @@ pub enum WriterOperation {
 }

 pub enum ArroyOperation {
-/// TODO: call when deleting regular documents
-DeleteVectors {
-docid: DocumentId,
-},
-SetVectors {
-docid: DocumentId,
-embedder_id: u8,
-embeddings: Vec<Embedding>,
-},
-SetVector {
-docid: DocumentId,
-embedder_id: u8,
-embedding: Embedding,
-},
-Finish {
-configs: Vec<IndexEmbeddingConfig>,
-},
+DeleteVectors { docid: DocumentId },
+SetVectors { docid: DocumentId, embedder_id: u8, embeddings: Vec<Embedding> },
+SetVector { docid: DocumentId, embedder_id: u8, embedding: Embedding },
+Finish { configs: Vec<IndexEmbeddingConfig> },
 }

 pub struct DbOperation {
@@ -334,7 +321,6 @@ impl DocidsSender for FacetDocidsSender<'_> {
 fn write(&self, key: &[u8], value: &[u8]) -> StdResult<(), SendError<()>> {
 let (facet_kind, key) = FacetKind::extract_from_key(key);
 let database = Database::from(facet_kind);
-// let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value(key, value));
 let entry = match facet_kind {
 // skip level group size
 FacetKind::String | FacetKind::Number => {
@@ -21,11 +21,8 @@ pub trait Document<'doc> {
 /// - The `_vectors` and `_geo` fields are **ignored** by this method, meaning they are **not returned** by this method.
 fn iter_top_level_fields(&self) -> impl Iterator<Item = Result<(&'doc str, &'doc RawValue)>>;

-fn len(&self) -> usize;
-
-fn is_empty(&self) -> bool {
-self.len() == 0
-}
+/// Number of top level fields, **excluding** `_vectors` and `_geo`
+fn top_level_fields_count(&self) -> usize;

 /// Get the **top-level** with the specified name, if exists.
 ///
@@ -105,8 +102,15 @@ impl<'t, Mapper: FieldIdMapper> Document<'t> for DocumentFromDb<'t, Mapper> {
 self.field("_geo")
 }

-fn len(&self) -> usize {
-self.content.iter().count()
+fn top_level_fields_count(&self) -> usize {
+let has_vectors_field = self.vectors_field().unwrap_or(None).is_some();
+let has_geo_field = self.geo_field().unwrap_or(None).is_some();
+let count = self.content.iter().count();
+match (has_vectors_field, has_geo_field) {
+(true, true) => count - 2,
+(true, false) | (false, true) => count - 1,
+(false, false) => count,
+}
 }

 fn top_level_field(&self, k: &str) -> Result<Option<&'t RawValue>> {
@@ -162,8 +166,15 @@ impl<'a, 'doc> Document<'doc> for DocumentFromVersions<'a, 'doc> {
 Ok(self.versions.geo_field())
 }

-fn len(&self) -> usize {
-self.versions.len()
+fn top_level_fields_count(&self) -> usize {
+let has_vectors_field = self.vectors_field().unwrap_or(None).is_some();
+let has_geo_field = self.geo_field().unwrap_or(None).is_some();
+let count = self.versions.len();
+match (has_vectors_field, has_geo_field) {
+(true, true) => count - 2,
+(true, false) | (false, true) => count - 1,
+(false, false) => count,
+}
 }

 fn top_level_field(&self, k: &str) -> Result<Option<&'doc RawValue>> {
@@ -243,7 +254,7 @@ impl<'d, 'doc: 'd, 't: 'd, Mapper: FieldIdMapper> Document<'d>
 db.geo_field()
 }

-fn len(&self) -> usize {
+fn top_level_fields_count(&self) -> usize {
 self.iter_top_level_fields().count()
 }

@@ -274,8 +285,8 @@ where
 D::geo_field(self)
 }

-fn len(&self) -> usize {
-D::len(self)
+fn top_level_fields_count(&self) -> usize {
+D::top_level_fields_count(self)
 }

 fn top_level_field(&self, k: &str) -> Result<Option<&'doc RawValue>> {
@@ -241,6 +241,7 @@ impl<'extractor> BalancedCaches<'extractor> {
 }
 }

+/// SAFETY: No Thread-Local inside
 unsafe impl MostlySend for BalancedCaches<'_> {}

 struct NormalCaches<'extractor> {
@@ -140,7 +140,6 @@ impl<'a, 'extractor> Extractor<'extractor> for DocumentsExtractor<'a> {
 )?;
 document_extractor_data.docids_delta.insert_add_u32(docid);
 self.document_sender.uncompressed(docid, external_docid, content).unwrap();
-// extracted_dictionary_sender.send(self, dictionary: &[u8]);
 }
 }
 }
@@ -60,9 +60,11 @@ pub struct GeoExtractorData<'extractor> {
 /// point being updated, we first put it in the deleted and then in the inserted.
 removed: bumpalo::collections::Vec<'extractor, ExtractedGeoPoint>,
 inserted: bumpalo::collections::Vec<'extractor, ExtractedGeoPoint>,
-/// TODO Do the doc
+/// Contains a packed list of `ExtractedGeoPoint` of the inserted geo points
+/// data structures if we have spilled to disk.
 spilled_removed: Option<BufWriter<File>>,
-/// TODO Do the doc
+/// Contains a packed list of `ExtractedGeoPoint` of the inserted geo points
+/// data structures if we have spilled to disk.
 spilled_inserted: Option<BufWriter<File>>,
 }

@@ -137,7 +139,6 @@ impl<'extractor> Extractor<'extractor> for GeoExtractor {
 fn init_data<'doc>(&'doc self, extractor_alloc: &'extractor Bump) -> Result<Self::Data> {
 Ok(RefCell::new(GeoExtractorData {
 removed: bumpalo::collections::Vec::new_in(extractor_alloc),
-// inserted: Uell::new_in(extractor_alloc),
 inserted: bumpalo::collections::Vec::new_in(extractor_alloc),
 spilled_inserted: None,
 spilled_removed: None,
@@ -242,7 +243,7 @@ impl<'extractor> Extractor<'extractor> for GeoExtractor {
 }
 }

-/// Extracts and validate the latitude and latitude from a document geo field.
+/// Extracts and validates the latitude and latitude from a document geo field.
 ///
 /// It can be of the form `{ "lat": 0.0, "lng": "1.0" }`.
 pub fn extract_geo_coordinates(
@@ -35,7 +35,6 @@ pub struct WordDocidsBalancedCaches<'extractor> {
 unsafe impl<'extractor> MostlySend for WordDocidsBalancedCaches<'extractor> {}

 impl<'extractor> WordDocidsBalancedCaches<'extractor> {
-/// TODO Make sure to give the same max_memory to all of them, without splitting it
 pub fn new_in(buckets: usize, max_memory: Option<usize>, alloc: &'extractor Bump) -> Self {
 Self {
 word_fid_docids: BalancedCaches::new_in(buckets, max_memory, alloc),
@@ -416,6 +415,6 @@ impl WordDocidsExtractors {
 }

 fn attributes_to_skip<'a>(_rtxn: &'a RoTxn, _index: &'a Index) -> Result<Vec<&'a str>> {
-Ok(vec!["_geo"])
+Ok(Vec::new())
 }
 }
@@ -25,7 +25,7 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor {
 }

 fn attributes_to_skip<'a>(_rtxn: &'a RoTxn, _index: &'a Index) -> Result<Vec<&'a str>> {
-Ok(vec!["_geo"])
+Ok(Vec::new())
 }

 // This method is reimplemented to count the number of words in the document in each field
@@ -43,6 +43,7 @@ impl<'pl> DocumentOperation<'pl> {
 }

 #[allow(clippy::too_many_arguments)]
+#[tracing::instrument(level = "trace", skip_all, target = "indexing::document_operation")]
 pub fn into_changes<MSP, SP>(
 self,
 indexer: &'pl Bump,
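The attribute added above wraps the whole function in a tracing span, so every event emitted while it runs is attached to that span; skip_all keeps the (potentially large) arguments out of the recorded span fields. A minimal illustration of the same attribute on a stand-in function (hypothetical body; requires the tracing crate and an installed subscriber to actually see output):

use tracing::info;

#[tracing::instrument(level = "trace", skip_all, target = "indexing::document_operation")]
fn into_changes(payload_count: usize) {
    // This event is recorded inside the span created by the attribute above.
    info!(payload_count, "computing document changes");
}

fn main() {
    // With e.g. tracing_subscriber::fmt() installed, the span and event are printed.
    into_changes(3);
}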
@@ -103,7 +103,6 @@ where
 // prevent moving the field_distribution and document_ids in the inner closure...
 let field_distribution = &mut field_distribution;
 let document_ids = &mut document_ids;
-// TODO manage the errors correctly
 let extractor_handle = Builder::new().name(S("indexer-extractors")).spawn_scoped(s, move || {
 let span = tracing::trace_span!(target: "indexing::documents", parent: &indexer_span, "extract");
 let _entered = span.enter();
@@ -176,7 +175,6 @@ where
 )?;

 // TODO Word Docids Merger
-// extractor_sender.send_searchable::<WordDocids>(word_docids).unwrap();
 {
 let span = tracing::trace_span!(target: "indexing::documents::merge", "word_docids");
 let _entered = span.enter();
@@ -190,7 +188,6 @@ where
 }

 // Word Fid Docids Merging
-// extractor_sender.send_searchable::<WordFidDocids>(word_fid_docids).unwrap();
 {
 let span = tracing::trace_span!(target: "indexing::documents::merge", "word_fid_docids");
 let _entered = span.enter();
@@ -204,7 +201,6 @@ where
 }

 // Exact Word Docids Merging
-// extractor_sender.send_searchable::<ExactWordDocids>(exact_word_docids).unwrap();
 {
 let span = tracing::trace_span!(target: "indexing::documents::merge", "exact_word_docids");
 let _entered = span.enter();
@@ -218,7 +214,6 @@ where
 }

 // Word Position Docids Merging
-// extractor_sender.send_searchable::<WordPositionDocids>(word_position_docids).unwrap();
 {
 let span = tracing::trace_span!(target: "indexing::documents::merge", "word_position_docids");
 let _entered = span.enter();
@@ -232,7 +227,6 @@ where
 }

 // Fid Word Count Docids Merging
-// extractor_sender.send_searchable::<FidWordCountDocids>(fid_word_count_docids).unwrap();
 {
 let span = tracing::trace_span!(target: "indexing::documents::merge", "fid_word_count_docids");
 let _entered = span.enter();
@@ -521,9 +515,6 @@ fn compute_word_fst(index: &Index, wtxn: &mut RwTxn) -> Result<Option<PrefixDelt
 }
 }

-let span = tracing::trace_span!(target: "indexing::documents::merge", "words_fst");
-let _entered = span.enter();
-
 let (word_fst_mmap, prefix_data) = word_fst_builder.build(index, &rtxn)?;
 index.main.remap_types::<Str, Bytes>().put(wtxn, WORDS_FST_KEY, &word_fst_mmap)?;
 if let Some(PrefixData { prefixes_fst_mmap, prefix_delta }) = prefix_data {
@@ -2,7 +2,6 @@ pub use document_change::{Deletion, DocumentChange, Insertion, Update};
 pub use merger::{
 merge_and_send_docids, merge_and_send_facet_docids, FacetDatabases, FacetFieldIdsDelta,
 };
-pub use top_level_map::{CowStr, TopLevelMap};

 use super::del_add::DelAdd;
 use crate::FieldId;
@@ -19,7 +18,6 @@ mod parallel_iterator_ext;
 mod ref_cell_ext;
 pub(crate) mod steps;
 pub(crate) mod thread_local;
-mod top_level_map;
 pub mod vector_document;
 mod word_fst_builder;
 mod words_prefix_docids;
@@ -1,66 +0,0 @@
-use std::borrow::{Borrow, Cow};
-use std::collections::BTreeMap;
-use std::{fmt, ops};
-
-use serde::{Deserialize, Serialize};
-use serde_json::value::RawValue;
-use serde_json::{Map, Value};
-
-#[derive(Deserialize, Serialize)]
-pub struct TopLevelMap<'p>(#[serde(borrow)] pub BTreeMap<CowStr<'p>, &'p RawValue>);
-
-impl TryFrom<&'_ TopLevelMap<'_>> for Map<String, Value> {
-type Error = serde_json::Error;
-
-fn try_from(tlmap: &TopLevelMap<'_>) -> Result<Self, Self::Error> {
-let mut object = Map::new();
-for (k, v) in &tlmap.0 {
-let value = serde_json::from_str(v.get())?;
-object.insert(k.to_string(), value);
-}
-Ok(object)
-}
-}
-
-impl TryFrom<TopLevelMap<'_>> for Map<String, Value> {
-type Error = serde_json::Error;
-
-fn try_from(tlmap: TopLevelMap<'_>) -> Result<Self, Self::Error> {
-TryFrom::try_from(&tlmap)
-}
-}
-
-impl<'p> ops::Deref for TopLevelMap<'p> {
-type Target = BTreeMap<CowStr<'p>, &'p RawValue>;
-
-fn deref(&self) -> &Self::Target {
-&self.0
-}
-}
-
-impl ops::DerefMut for TopLevelMap<'_> {
-fn deref_mut(&mut self) -> &mut Self::Target {
-&mut self.0
-}
-}
-
-#[derive(Deserialize, Serialize, PartialEq, Eq, PartialOrd, Ord, Hash, Clone)]
-pub struct CowStr<'p>(#[serde(borrow)] pub Cow<'p, str>);
-
-impl fmt::Display for CowStr<'_> {
-fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-fmt::Display::fmt(&self.0, f)
-}
-}
-
-impl AsRef<str> for CowStr<'_> {
-fn as_ref(&self) -> &str {
-self.0.as_ref()
-}
-}
-
-impl<'doc> Borrow<str> for CowStr<'doc> {
-fn borrow(&self) -> &str {
-self.0.borrow()
-}
-}
@@ -113,7 +113,10 @@ pub fn to_call_stats<R: std::io::Read>(

 let span = *span;
 if let Some(parent_id) = span.parent_id {
-let (_, _, parent_self_time) = spans.get_mut(&parent_id).unwrap();
+let Some((_, _, parent_self_time)) = spans.get_mut(&parent_id) else {
+let (c, _) = calls.get_mut(&span.call_id).unwrap();
+panic!("parent span not found for span: module_path: {:?}, name: {:?}, target: {:?}", c.module_path.as_deref().unwrap_or_default(), c.name, c.target);
+};
 parent_self_time.add_child_range(self_range.clone())
 }
 total_self_time.add_child_range(self_range);
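The change above swaps a bare .unwrap() for Rust's let-else form so the failure case can build a descriptive panic message before diverging. A compact stand-alone illustration of the pattern with simplified, hypothetical types:

use std::collections::HashMap;

fn parent_self_time<'a>(spans: &'a mut HashMap<u64, u64>, parent_id: u64, span_name: &str) -> &'a mut u64 {
    // The else branch must diverge (panic, return, continue, ...), which lets us
    // attach context that a plain `.unwrap()` would not carry.
    let Some(self_time) = spans.get_mut(&parent_id) else {
        panic!("parent span not found for span {span_name:?} (parent id {parent_id})");
    };
    self_time
}

fn main() {
    let mut spans = HashMap::from([(7u64, 125u64)]);
    *parent_self_time(&mut spans, 7, "extract") += 10;
    assert_eq!(spans[&7], 135);
}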
@@ -1,6 +1,6 @@
 {
 "name": "movies.json",
-"run_count": 1,
+"run_count": 10,
 "extra_cli_args": [],
 "assets": {
 "movies.json": {