mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-11-30 00:55:00 +08:00
Merge #458
458: Nested fields r=Kerollmops a=irevoire

For the following document:

```json
{
  "id": 1,
  "person": {
    "name": "tamo",
    "age": 25
  }
}
```

Suppose the user sets `person` as a filterable attribute. We obviously need to store `person` in the filterable attributes, but we also need to keep track of `person.name` and `person.age` somewhere. That's where I changed the logic of the engine a little bit.

Currently, we have a function called `faceted_field` that returns the union of the filterable and sortable fields. I renamed this function to `user_defined_faceted_field`. Now, when we finish indexing documents, we look at all the fields and check whether they "match" a `user_defined_faceted_field`. So in our case:
- does `id` match `person`: 🔴
- does `person.name` match `person`: 🟢
- does `person.age` match `person`: 🟢

Thus, we insert the following faceted fields in the database: `person`, `person.name`, `person.age`. The good thing about this solution is that we generate everything during the indexing phase, so during the search we can access our fields without recomputing too much globbing.

-----

Now the bad thing is that I had to create a new db. If it were only one db, that would be OK, but I actually need to do the same for the:
- Displayed attributes
- Attributes to retrieve
- Attributes to highlight
- Attribute to crop

`@Kerollmops` Do you think there is a better way to do it? Apart from all the code, can we have a problem because we have too many dbs?

Co-authored-by: Irevoire <tamo@meilisearch.com>
Co-authored-by: Tamo <tamo@meilisearch.com>
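For readers skimming the diff below: the "matching" rule sketched above is implemented by the `is_faceted_by` helper added next to `lat_lng_to_xyz` further down in this PR. A minimal, self-contained sketch of it (the `main` harness here is only illustrative and is not part of the PR):

```rust
/// A field "matches" a faceted field when it is that field itself
/// or is nested under it (the next character after the prefix is a dot).
pub fn is_faceted_by(field: &str, facet: &str) -> bool {
    field.starts_with(facet)
        && field[facet.len()..].chars().next().map(|c| c == '.').unwrap_or(true)
}

fn main() {
    // The example from the description, with `person` declared filterable:
    assert!(!is_faceted_by("id", "person"));          // 🔴
    assert!(is_faceted_by("person.name", "person"));  // 🟢
    assert!(is_faceted_by("person.age", "person"));   // 🟢
}
```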
commit 80ae020bee
@@ -1,6 +1,6 @@
 [workspace]
 resolver = "2"
-members = ["milli", "filter-parser", "http-ui", "benchmarks", "infos", "helpers", "cli"]
+members = ["milli", "filter-parser", "flatten-serde-json", "http-ui", "benchmarks", "infos", "helpers", "cli"]
 default-members = ["milli"]
 
 [profile.dev]
@@ -70,7 +70,8 @@ fn indexing_songs_default(c: &mut Criterion) {
         let indexing_config = IndexDocumentsConfig::default();
         let mut wtxn = index.write_txn().unwrap();
         let mut builder =
-            IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ());
+            IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
+                .unwrap();
 
         let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv");
         builder.add_documents(documents).unwrap();
@@ -120,7 +121,8 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) {
         let config = IndexerConfig::default();
         let indexing_config = IndexDocumentsConfig::default();
         let mut builder =
-            IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ());
+            IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
+                .unwrap();
         let documents = utils::documents_from(datasets_paths::SMOL_SONGS_1_2, "csv");
         builder.add_documents(documents).unwrap();
         builder.execute().unwrap();
@@ -134,14 +136,16 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) {
         let indexing_config = IndexDocumentsConfig::default();
         let mut wtxn = index.write_txn().unwrap();
         let mut builder =
-            IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ());
+            IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
+                .unwrap();
         let documents = utils::documents_from(datasets_paths::SMOL_SONGS_3_4, "csv");
         builder.add_documents(documents).unwrap();
         builder.execute().unwrap();
 
         let indexing_config = IndexDocumentsConfig::default();
         let mut builder =
-            IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ());
+            IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
+                .unwrap();
         let documents = utils::documents_from(datasets_paths::SMOL_SONGS_4_4, "csv");
         builder.add_documents(documents).unwrap();
         builder.execute().unwrap();
@@ -190,7 +194,8 @@ fn indexing_songs_without_faceted_numbers(c: &mut Criterion) {
         let indexing_config = IndexDocumentsConfig::default();
         let mut wtxn = index.write_txn().unwrap();
         let mut builder =
-            IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ());
+            IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
+                .unwrap();
 
         let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv");
 
@@ -236,7 +241,8 @@ fn indexing_songs_without_faceted_fields(c: &mut Criterion) {
         let indexing_config = IndexDocumentsConfig::default();
         let mut wtxn = index.write_txn().unwrap();
         let mut builder =
-            IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ());
+            IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
+                .unwrap();
 
         let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv");
         builder.add_documents(documents).unwrap();
@@ -281,7 +287,8 @@ fn indexing_wiki(c: &mut Criterion) {
             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
         let mut wtxn = index.write_txn().unwrap();
         let mut builder =
-            IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ());
+            IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
+                .unwrap();
 
         let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv");
         builder.add_documents(documents).unwrap();
@@ -323,7 +330,8 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) {
         let indexing_config =
             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
         let mut builder =
-            IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ());
+            IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
+                .unwrap();
         let documents =
             utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES_1_2, "csv");
         builder.add_documents(documents).unwrap();
@@ -339,7 +347,8 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) {
             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
         let mut wtxn = index.write_txn().unwrap();
         let mut builder =
-            IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ());
+            IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
+                .unwrap();
 
         let documents =
             utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES_3_4, "csv");
@@ -349,7 +358,8 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) {
         let indexing_config =
             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
         let mut builder =
-            IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ());
+            IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
+                .unwrap();
 
         let documents =
             utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES_4_4, "csv");
@@ -400,7 +410,8 @@ fn indexing_movies_default(c: &mut Criterion) {
         let indexing_config = IndexDocumentsConfig::default();
         let mut wtxn = index.write_txn().unwrap();
         let mut builder =
-            IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ());
+            IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
+                .unwrap();
 
         let documents = utils::documents_from(datasets_paths::MOVIES, "json");
         builder.add_documents(documents).unwrap();
@@ -447,7 +458,8 @@ fn indexing_movies_in_three_batches(c: &mut Criterion) {
         let config = IndexerConfig::default();
         let indexing_config = IndexDocumentsConfig::default();
         let mut builder =
-            IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ());
+            IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
+                .unwrap();
 
         let documents = utils::documents_from(datasets_paths::MOVIES_1_2, "json");
         builder.add_documents(documents).unwrap();
@@ -462,7 +474,8 @@ fn indexing_movies_in_three_batches(c: &mut Criterion) {
         let indexing_config = IndexDocumentsConfig::default();
         let mut wtxn = index.write_txn().unwrap();
         let mut builder =
-            IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ());
+            IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
+                .unwrap();
 
         let documents = utils::documents_from(datasets_paths::MOVIES_3_4, "json");
         builder.add_documents(documents).unwrap();
@@ -470,7 +483,8 @@ fn indexing_movies_in_three_batches(c: &mut Criterion) {
 
         let indexing_config = IndexDocumentsConfig::default();
         let mut builder =
-            IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ());
+            IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
+                .unwrap();
 
         let documents = utils::documents_from(datasets_paths::MOVIES_4_4, "json");
         builder.add_documents(documents).unwrap();
@@ -525,7 +539,8 @@ fn indexing_geo(c: &mut Criterion) {
         let indexing_config = IndexDocumentsConfig::default();
         let mut wtxn = index.write_txn().unwrap();
         let mut builder =
-            IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ());
+            IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
+                .unwrap();
 
         let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "jsonl");
         builder.add_documents(documents).unwrap();
@@ -96,7 +96,8 @@ pub fn base_setup(conf: &Conf) -> Index {
         update_method: IndexDocumentsMethod::ReplaceDocuments,
         ..Default::default()
     };
-    let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ());
+    let mut builder =
+        IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
     let documents = documents_from(conf.dataset, conf.dataset_format);
 
     builder.add_documents(documents).unwrap();
@@ -261,7 +261,8 @@ impl Performer for DocumentAddition {
             &config,
             indexing_config,
            |step| indexing_callback(step, &bars),
-        );
+        )
+        .unwrap();
         addition.add_documents(reader)?;
 
         std::thread::spawn(move || {
flatten-serde-json/Cargo.toml (new file, 15 lines)
@@ -0,0 +1,15 @@
+[package]
+name = "flatten-serde-json"
+version = "0.1.0"
+edition = "2021"
+description = "Flatten serde-json objects like elastic search"
+readme = "README.md"
+author = ["Tamo tamo@meilisearch.com"]
+repository = "https://github.com/irevoire/flatten-serde-json"
+keywords = ["json", "flatten"]
+categories = ["command-line-utilities"]
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+serde_json = "1.0"
flatten-serde-json/README.md (new file, 153 lines)
@@ -0,0 +1,153 @@
+# Flatten serde Json
+
+This crate flatten [`serde_json`](https://docs.rs/serde_json/latest/serde_json/) `Object` in a format
+similar to [elastic search](https://www.elastic.co/guide/en/elasticsearch/reference/current/nested.html).
+
+## Examples
+
+### There is nothing to do
+
+```json
+{
+  "id": "287947",
+  "title": "Shazam!",
+  "release_date": 1553299200,
+  "genres": [
+    "Action",
+    "Comedy",
+    "Fantasy"
+  ]
+}
+```
+
+Flattens to:
+```json
+{
+  "id": "287947",
+  "title": "Shazam!",
+  "release_date": 1553299200,
+  "genres": [
+    "Action",
+    "Comedy",
+    "Fantasy"
+  ]
+}
+```
+
+------------
+
+### Objects
+
+```json
+{
+  "a": {
+    "b": "c",
+    "d": "e",
+    "f": "g"
+  }
+}
+```
+
+Flattens to:
+```json
+{
+  "a.b": "c",
+  "a.d": "e",
+  "a.f": "g"
+}
+```
+
+------------
+
+### Array of objects
+
+```json
+{
+  "a": [
+    { "b": "c" },
+    { "b": "d" },
+    { "b": "e" },
+  ]
+}
+```
+
+Flattens to:
+```json
+{
+  "a.b": ["c", "d", "e"],
+}
+```
+
+------------
+
+### Array of objects with normal value in the array
+
+```json
+{
+  "a": [
+    42,
+    { "b": "c" },
+    { "b": "d" },
+    { "b": "e" },
+  ]
+}
+```
+
+Flattens to:
+```json
+{
+  "a": 42,
+  "a.b": ["c", "d", "e"],
+}
+```
+
+------------
+
+### Array of objects of array of objects of ...
+
+```json
+{
+  "a": [
+    "b",
+    ["c", "d"],
+    { "e": ["f", "g"] },
+    [
+      { "h": "i" },
+      { "e": ["j", { "z": "y" }] },
+    ],
+    ["l"],
+    "m",
+  ]
+}
+```
+
+Flattens to:
+```json
+{
+  "a": ["b", "c", "d", "l", "m"],
+  "a.e": ["f", "g", "j"],
+  "a.h": "i",
+  "a.e.z": "y",
+}
+```
+
+------------
+
+### Collision between a generated field name and an already existing field
+
+```json
+{
+  "a": {
+    "b": "c",
+  },
+  "a.b": "d",
+}
+```
+
+Flattens to:
+```json
+{
+  "a.b": ["c", "d"],
+}
+```
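As a quick illustration of the README above, here is a small usage sketch of the crate's `flatten` function; the document literal is made up for illustration, while the call pattern mirrors the `src/main.rs` and tests shown further down in this diff:

```rust
use flatten_serde_json::flatten;
use serde_json::{json, Map, Value};

fn main() {
    let mut doc: Value = json!({
        "id": 1,
        "person": { "name": "tamo", "age": 25 }
    });
    // `flatten` takes a JSON object (a serde_json Map) and returns a new, flat one.
    let object: Map<String, Value> = std::mem::take(doc.as_object_mut().unwrap());
    let flat = flatten(&object);

    // The nested keys are now joined with dots: "id", "person.name", "person.age".
    println!("{}", serde_json::to_string_pretty(&flat).unwrap());
}
```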
flatten-serde-json/fuzz/Cargo.toml (new file, 26 lines)
@@ -0,0 +1,26 @@
+[package]
+name = "flatten_serde_json-fuzz"
+version = "0.0.0"
+authors = ["Automatically generated"]
+publish = false
+edition = "2018"
+
+[package.metadata]
+cargo-fuzz = true
+
+[dependencies]
+libfuzzer-sys = "0.4"
+arbitrary-json = "0.1.1"
+
+[dependencies.flatten_serde_json]
+path = ".."
+
+# Prevent this from interfering with workspaces
+[workspace]
+members = ["."]
+
+[[bin]]
+name = "flatten"
+path = "fuzz_targets/flatten.rs"
+test = false
+doc = false
flatten-serde-json/fuzz/fuzz_targets/flatten.rs (new file, 8 lines)
@@ -0,0 +1,8 @@
+#![no_main]
+use arbitrary_json::ArbitraryObject;
+use flatten_serde_json::flatten;
+use libfuzzer_sys::fuzz_target;
+
+fuzz_target!(|object: ArbitraryObject| {
+    let _ = flatten(&object);
+});
flatten-serde-json/src/lib.rs (new file, 264 lines)
@@ -0,0 +1,264 @@
+#![doc = include_str!("../README.md")]
+
+use serde_json::{json, Map, Value};
+
+pub fn flatten(json: &Map<String, Value>) -> Map<String, Value> {
+    let mut obj = Map::new();
+    insert_object(&mut obj, None, json);
+    obj
+}
+
+fn insert_object(
+    base_json: &mut Map<String, Value>,
+    base_key: Option<&str>,
+    object: &Map<String, Value>,
+) {
+    for (key, value) in object {
+        let new_key = base_key.map_or_else(|| key.clone(), |base_key| format!("{base_key}.{key}"));
+
+        if let Some(array) = value.as_array() {
+            insert_array(base_json, &new_key, array);
+        } else if let Some(object) = value.as_object() {
+            insert_object(base_json, Some(&new_key), object);
+        } else {
+            insert_value(base_json, &new_key, value.clone());
+        }
+    }
+}
+
+fn insert_array(base_json: &mut Map<String, Value>, base_key: &str, array: &Vec<Value>) {
+    for value in array {
+        if let Some(object) = value.as_object() {
+            insert_object(base_json, Some(base_key), object);
+        } else if let Some(sub_array) = value.as_array() {
+            insert_array(base_json, base_key, sub_array);
+        } else {
+            insert_value(base_json, base_key, value.clone());
+        }
+    }
+}
+
+fn insert_value(base_json: &mut Map<String, Value>, key: &str, to_insert: Value) {
+    debug_assert!(!to_insert.is_object());
+    debug_assert!(!to_insert.is_array());
+
+    // does the field aleardy exists?
+    if let Some(value) = base_json.get_mut(key) {
+        // is it already an array
+        if let Some(array) = value.as_array_mut() {
+            array.push(to_insert);
+        // or is there a collision
+        } else {
+            let value = std::mem::take(value);
+            base_json[key] = json!([value, to_insert]);
+        }
+        // if it does not exist we can push the value untouched
+    } else {
+        base_json.insert(key.to_string(), json!(to_insert));
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn no_flattening() {
+        let mut base: Value = json!({
+            "id": "287947",
+            "title": "Shazam!",
+            "release_date": 1553299200,
+            "genres": [
+                "Action",
+                "Comedy",
+                "Fantasy"
+            ]
+        });
+        let json = std::mem::take(base.as_object_mut().unwrap());
+        let flat = flatten(&json);
+
+        println!(
+            "got:\n{}\nexpected:\n{}\n",
+            serde_json::to_string_pretty(&flat).unwrap(),
+            serde_json::to_string_pretty(&json).unwrap()
+        );
+
+        assert_eq!(flat, json);
+    }
+
+    #[test]
+    fn flatten_object() {
+        let mut base: Value = json!({
+            "a": {
+                "b": "c",
+                "d": "e",
+                "f": "g"
+            }
+        });
+        let json = std::mem::take(base.as_object_mut().unwrap());
+        let flat = flatten(&json);
+
+        assert_eq!(
+            &flat,
+            json!({
+                "a.b": "c",
+                "a.d": "e",
+                "a.f": "g"
+            })
+            .as_object()
+            .unwrap()
+        );
+    }
+
+    #[test]
+    fn flatten_array() {
+        let mut base: Value = json!({
+            "a": [
+                { "b": "c" },
+                { "b": "d" },
+                { "b": "e" },
+            ]
+        });
+        let json = std::mem::take(base.as_object_mut().unwrap());
+        let flat = flatten(&json);
+
+        assert_eq!(
+            &flat,
+            json!({
+                "a.b": ["c", "d", "e"],
+            })
+            .as_object()
+            .unwrap()
+        );
+
+        // here we must keep 42 in "a"
+        let mut base: Value = json!({
+            "a": [
+                42,
+                { "b": "c" },
+                { "b": "d" },
+                { "b": "e" },
+            ]
+        });
+        let json = std::mem::take(base.as_object_mut().unwrap());
+        let flat = flatten(&json);
+
+        assert_eq!(
+            &flat,
+            json!({
+                "a": 42,
+                "a.b": ["c", "d", "e"],
+            })
+            .as_object()
+            .unwrap()
+        );
+    }
+
+    #[test]
+    fn collision_with_object() {
+        let mut base: Value = json!({
+            "a": {
+                "b": "c",
+            },
+            "a.b": "d",
+        });
+        let json = std::mem::take(base.as_object_mut().unwrap());
+        let flat = flatten(&json);
+
+        assert_eq!(
+            &flat,
+            json!({
+                "a.b": ["c", "d"],
+            })
+            .as_object()
+            .unwrap()
+        );
+    }
+
+    #[test]
+    fn collision_with_array() {
+        let mut base: Value = json!({
+            "a": [
+                { "b": "c" },
+                { "b": "d", "c": "e" },
+                [35],
+            ],
+            "a.b": "f",
+        });
+        let json = std::mem::take(base.as_object_mut().unwrap());
+        let flat = flatten(&json);
+
+        assert_eq!(
+            &flat,
+            json!({
+                "a.b": ["c", "d", "f"],
+                "a.c": "e",
+                "a": 35,
+            })
+            .as_object()
+            .unwrap()
+        );
+    }
+
+    #[test]
+    fn flatten_nested_arrays() {
+        let mut base: Value = json!({
+            "a": [
+                ["b", "c"],
+                { "d": "e" },
+                ["f", "g"],
+                [
+                    { "h": "i" },
+                    { "d": "j" },
+                ],
+                ["k", "l"],
+            ]
+        });
+        let json = std::mem::take(base.as_object_mut().unwrap());
+        let flat = flatten(&json);
+
+        assert_eq!(
+            &flat,
+            json!({
+                "a": ["b", "c", "f", "g", "k", "l"],
+                "a.d": ["e", "j"],
+                "a.h": "i",
+            })
+            .as_object()
+            .unwrap()
+        );
+    }
+
+    #[test]
+    fn flatten_nested_arrays_and_objects() {
+        let mut base: Value = json!({
+            "a": [
+                "b",
+                ["c", "d"],
+                { "e": ["f", "g"] },
+                [
+                    { "h": "i" },
+                    { "e": ["j", { "z": "y" }] },
+                ],
+                ["l"],
+                "m",
+            ]
+        });
+        let json = std::mem::take(base.as_object_mut().unwrap());
+        let flat = flatten(&json);
+
+        println!("{}", serde_json::to_string_pretty(&flat).unwrap());
+
+        assert_eq!(
+            &flat,
+            json!({
+                "a": ["b", "c", "d", "l", "m"],
+                "a.e": ["f", "g", "j"],
+                "a.h": "i",
+                "a.e.z": "y",
+            })
+            .as_object()
+            .unwrap()
+        );
+    }
+}
flatten-serde-json/src/main.rs (new file, 11 lines)
@@ -0,0 +1,11 @@
+use std::io::stdin;
+
+use flatten_serde_json::flatten;
+use serde_json::{Map, Value};
+
+fn main() {
+    let json: Map<String, Value> = serde_json::from_reader(stdin()).unwrap();
+
+    let result = flatten(&json);
+    println!("{}", serde_json::to_string_pretty(&result).unwrap());
+}
@@ -410,7 +410,7 @@ async fn main() -> anyhow::Result<()> {
                 GLOBAL_CONFIG.get().unwrap(),
                 indexing_config,
                 indexing_callback,
-            );
+            )?;
 
             let reader = match encoding.as_deref() {
                 Some("gzip") => Box::new(GzDecoder::new(content)),
@@ -14,6 +14,7 @@ crossbeam-channel = "0.5.2"
 either = "1.6.1"
 fst = "0.4.7"
 fxhash = "0.2.1"
+flatten-serde-json = { path = "../flatten-serde-json" }
 grenad = { version = "0.4.1", default-features = false, features = ["tempfile"] }
 geoutils = "0.4.1"
 heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] }
@@ -49,6 +49,24 @@ impl DocumentsBatchIndex {
     pub fn name(&self, id: FieldId) -> Option<&String> {
         self.0.get_by_left(&id)
     }
+
+    pub fn recreate_json(
+        &self,
+        document: &obkv::KvReaderU16,
+    ) -> Result<serde_json::Map<String, serde_json::Value>, crate::Error> {
+        let mut map = serde_json::Map::new();
+
+        for (k, v) in document.iter() {
+            // TODO: TAMO: update the error type
+            let key =
+                self.0.get_by_left(&k).ok_or(crate::error::InternalError::DatabaseClosing)?.clone();
+            let value = serde_json::from_slice::<serde_json::Value>(v)
+                .map_err(crate::error::InternalError::SerdeJson)?;
+            map.insert(key, value);
+        }
+
+        Ok(map)
+    }
 }
 
 #[derive(Debug, Serialize, Deserialize)]
@@ -27,6 +27,7 @@ pub enum InternalError {
     DatabaseClosing,
     DatabaseMissingEntry { db_name: &'static str, key: Option<&'static str> },
     FieldIdMapMissingEntry(FieldIdMapMissingEntry),
+    FieldIdMappingMissingEntry { key: FieldId },
     Fst(fst::Error),
     GrenadInvalidCompressionType,
     GrenadInvalidFormatVersion,
@@ -59,7 +60,7 @@ pub enum UserError {
     DocumentLimitReached,
     InvalidDocumentId { document_id: Value },
     InvalidFacetsDistribution { invalid_facets_name: BTreeSet<String> },
-    InvalidGeoField { document_id: Value, object: Value },
+    InvalidGeoField { document_id: Value },
     InvalidFilter(String),
     InvalidSortableAttribute { field: String, valid_fields: BTreeSet<String> },
     SortRankingRuleMissing,
@@ -187,6 +188,9 @@ impl fmt::Display for InternalError {
                 write!(f, "Missing {} in the {} database.", key.unwrap_or("key"), db_name)
             }
             Self::FieldIdMapMissingEntry(error) => error.fmt(f),
+            Self::FieldIdMappingMissingEntry { key } => {
+                write!(f, "Missing {} in the field id mapping.", key)
+            }
             Self::Fst(error) => error.fmt(f),
             Self::GrenadInvalidCompressionType => {
                 f.write_str("Invalid compression type have been specified to grenad.")
@@ -226,19 +230,15 @@ impl fmt::Display for UserError {
                     name_list
                 )
             }
-            Self::InvalidGeoField { document_id, object } => {
+            Self::InvalidGeoField { document_id } => {
                 let document_id = match document_id {
                     Value::String(id) => id.clone(),
                     _ => document_id.to_string(),
                 };
-                let object = match object {
-                    Value::String(id) => id.clone(),
-                    _ => object.to_string(),
-                };
                 write!(
                     f,
-                    "The document with the id: `{}` contains an invalid _geo field: `{}`.",
-                    document_id, object
+                    "The document with the id: `{}` contains an invalid `_geo` field.",
+                    document_id
                 )
             },
             Self::InvalidDocumentId { document_id } => {
@@ -31,6 +31,7 @@ pub mod main_key {
     pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields";
     pub const DISTINCT_FIELD_KEY: &str = "distinct-field-key";
     pub const DOCUMENTS_IDS_KEY: &str = "documents-ids";
+    pub const HIDDEN_FACETED_FIELDS_KEY: &str = "hidden-faceted-fields";
     pub const FILTERABLE_FIELDS_KEY: &str = "filterable-fields";
     pub const SORTABLE_FIELDS_KEY: &str = "sortable-fields";
     pub const FIELD_DISTRIBUTION_KEY: &str = "fields-distribution";
@@ -567,12 +568,46 @@ impl Index {
         Ok(fields.into_iter().filter_map(|name| fields_ids_map.id(&name)).collect())
     }
 
-    /* faceted documents ids */
+    /* faceted fields */
 
+    /// Writes the faceted fields in the database.
+    pub(crate) fn put_faceted_fields(
+        &self,
+        wtxn: &mut RwTxn,
+        fields: &HashSet<String>,
+    ) -> heed::Result<()> {
+        self.main.put::<_, Str, SerdeJson<_>>(wtxn, main_key::HIDDEN_FACETED_FIELDS_KEY, fields)
+    }
+
     /// Returns the faceted fields names.
+    pub fn faceted_fields(&self, rtxn: &RoTxn) -> heed::Result<HashSet<String>> {
+        Ok(self
+            .main
+            .get::<_, Str, SerdeJson<_>>(rtxn, main_key::HIDDEN_FACETED_FIELDS_KEY)?
+            .unwrap_or_default())
+    }
+
+    /// Identical to `faceted_fields`, but returns ids instead.
+    pub fn faceted_fields_ids(&self, rtxn: &RoTxn) -> Result<HashSet<FieldId>> {
+        let fields = self.faceted_fields(rtxn)?;
+        let fields_ids_map = self.fields_ids_map(rtxn)?;
+
+        let mut fields_ids = HashSet::new();
+        for name in fields {
+            if let Some(field_id) = fields_ids_map.id(&name) {
+                fields_ids.insert(field_id);
+            }
+        }
+
+        Ok(fields_ids)
+    }
+
+    /* faceted documents ids */
+
+    /// Returns the user defined faceted fields names.
     ///
-    /// Faceted fields are the union of all the filterable, sortable, distinct, and Asc/Desc fields.
-    pub fn faceted_fields(&self, rtxn: &RoTxn) -> Result<HashSet<String>> {
+    /// The user faceted fields are the union of all the filterable, sortable, distinct, and Asc/Desc fields.
+    pub fn user_defined_faceted_fields(&self, rtxn: &RoTxn) -> Result<HashSet<String>> {
         let filterable_fields = self.filterable_fields(rtxn)?;
         let sortable_fields = self.sortable_fields(rtxn)?;
         let distinct_field = self.distinct_field(rtxn)?;
@@ -592,8 +627,8 @@ impl Index {
         Ok(faceted_fields)
     }
 
-    /// Identical to `faceted_fields`, but returns ids instead.
-    pub fn faceted_fields_ids(&self, rtxn: &RoTxn) -> Result<HashSet<FieldId>> {
+    /// Identical to `user_defined_faceted_fields`, but returns ids instead.
+    pub fn user_defined_faceted_fields_ids(&self, rtxn: &RoTxn) -> Result<HashSet<FieldId>> {
         let fields = self.faceted_fields(rtxn)?;
         let fields_ids_map = self.fields_ids_map(rtxn)?;
 
@@ -1040,13 +1075,14 @@ pub(crate) mod tests {
         let content = documents!([
             { "id": 1, "name": "kevin" },
            { "id": 2, "name": "bob", "age": 20 },
-            { "id": 2, "name": "bob", "age": 20 }
+            { "id": 2, "name": "bob", "age": 20 },
         ]);
 
         let config = IndexerConfig::default();
         let indexing_config = IndexDocumentsConfig::default();
         let mut builder =
-            IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ());
+            IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
+                .unwrap();
         builder.add_documents(content).unwrap();
         builder.execute().unwrap();
         wtxn.commit().unwrap();
@@ -1067,11 +1103,12 @@ pub(crate) mod tests {
         // field_distribution in the end
         let mut wtxn = index.write_txn().unwrap();
         let mut builder =
-            IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ());
+            IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
+                .unwrap();
         let content = documents!([
            { "id": 1, "name": "kevin" },
            { "id": 2, "name": "bob", "age": 20 },
-            { "id": 2, "name": "bob", "age": 20 }
+            { "id": 2, "name": "bob", "age": 20 },
         ]);
         builder.add_documents(content).unwrap();
         builder.execute().unwrap();
@@ -1097,7 +1134,8 @@ pub(crate) mod tests {
 
         let mut wtxn = index.write_txn().unwrap();
         let mut builder =
-            IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ());
+            IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
+                .unwrap();
         builder.add_documents(content).unwrap();
         builder.execute().unwrap();
         wtxn.commit().unwrap();
@@ -183,6 +183,43 @@ pub fn lat_lng_to_xyz(coord: &[f64; 2]) -> [f64; 3] {
     [x, y, z]
 }
 
+/// Returns `true` if the field match one of the faceted fields.
+/// See the function [`is_faceted_by`] below to see what “matching” means.
+pub fn is_faceted(field: &str, faceted_fields: impl IntoIterator<Item = impl AsRef<str>>) -> bool {
+    faceted_fields.into_iter().find(|facet| is_faceted_by(field, facet.as_ref())).is_some()
+}
+
+/// Returns `true` if the field match the facet.
+/// ```
+/// use milli::is_faceted_by;
+/// // -- the valid basics
+/// assert!(is_faceted_by("animaux", "animaux"));
+/// assert!(is_faceted_by("animaux.chien", "animaux"));
+/// assert!(is_faceted_by("animaux.chien.race.bouvier bernois.fourrure.couleur", "animaux"));
+/// assert!(is_faceted_by("animaux.chien.race.bouvier bernois.fourrure.couleur", "animaux.chien"));
+/// assert!(is_faceted_by("animaux.chien.race.bouvier bernois.fourrure.couleur", "animaux.chien.race.bouvier bernois"));
+/// assert!(is_faceted_by("animaux.chien.race.bouvier bernois.fourrure.couleur", "animaux.chien.race.bouvier bernois.fourrure"));
+/// assert!(is_faceted_by("animaux.chien.race.bouvier bernois.fourrure.couleur", "animaux.chien.race.bouvier bernois.fourrure.couleur"));
+///
+/// // -- the wrongs
+/// assert!(!is_faceted_by("chien", "chat"));
+/// assert!(!is_faceted_by("animaux", "animaux.chien"));
+/// assert!(!is_faceted_by("animaux.chien", "animaux.chat"));
+///
+/// // -- the strange edge cases
+/// assert!(!is_faceted_by("animaux.chien", "anima"));
+/// assert!(!is_faceted_by("animaux.chien", "animau"));
+/// assert!(!is_faceted_by("animaux.chien", "animaux."));
+/// assert!(!is_faceted_by("animaux.chien", "animaux.c"));
+/// assert!(!is_faceted_by("animaux.chien", "animaux.ch"));
+/// assert!(!is_faceted_by("animaux.chien", "animaux.chi"));
+/// assert!(!is_faceted_by("animaux.chien", "animaux.chie"));
+/// ```
+pub fn is_faceted_by(field: &str, facet: &str) -> bool {
+    field.starts_with(facet)
+        && field[facet.len()..].chars().next().map(|c| c == '.').unwrap_or(true)
+}
+
 #[cfg(test)]
 mod tests {
     use serde_json::json;
@@ -97,7 +97,8 @@ mod test {
             update_method: IndexDocumentsMethod::ReplaceDocuments,
             ..Default::default()
         };
-        let mut addition = IndexDocuments::new(&mut txn, &index, &config, indexing_config, |_| ());
+        let mut addition =
+            IndexDocuments::new(&mut txn, &index, &config, indexing_config, |_| ()).unwrap();
 
         let reader =
             crate::documents::DocumentBatchReader::from_reader(Cursor::new(&*JSON)).unwrap();
|
@ -220,9 +220,13 @@ impl<'a> FacetDistribution<'a> {
|
|||||||
pub fn execute(&self) -> Result<BTreeMap<String, BTreeMap<String, u64>>> {
|
pub fn execute(&self) -> Result<BTreeMap<String, BTreeMap<String, u64>>> {
|
||||||
let fields_ids_map = self.index.fields_ids_map(self.rtxn)?;
|
let fields_ids_map = self.index.fields_ids_map(self.rtxn)?;
|
||||||
let filterable_fields = self.index.filterable_fields(self.rtxn)?;
|
let filterable_fields = self.index.filterable_fields(self.rtxn)?;
|
||||||
|
|
||||||
let fields = match self.facets {
|
let fields = match self.facets {
|
||||||
Some(ref facets) => {
|
Some(ref facets) => {
|
||||||
let invalid_fields: HashSet<_> = facets.difference(&filterable_fields).collect();
|
let invalid_fields: HashSet<_> = facets
|
||||||
|
.iter()
|
||||||
|
.filter(|facet| !crate::is_faceted(facet, &filterable_fields))
|
||||||
|
.collect();
|
||||||
if !invalid_fields.is_empty() {
|
if !invalid_fields.is_empty() {
|
||||||
return Err(UserError::InvalidFacetsDistribution {
|
return Err(UserError::InvalidFacetsDistribution {
|
||||||
invalid_facets_name: invalid_fields.into_iter().cloned().collect(),
|
invalid_facets_name: invalid_fields.into_iter().cloned().collect(),
|
||||||
@ -236,10 +240,12 @@ impl<'a> FacetDistribution<'a> {
|
|||||||
};
|
};
|
||||||
|
|
||||||
let mut distribution = BTreeMap::new();
|
let mut distribution = BTreeMap::new();
|
||||||
for name in fields {
|
for (fid, name) in fields_ids_map.iter() {
|
||||||
if let Some(fid) = fields_ids_map.id(&name) {
|
if crate::is_faceted(name, &fields) {
|
||||||
let values = self.facet_values(fid)?;
|
let values = self.facet_values(fid)?;
|
||||||
distribution.insert(name, values);
|
if !values.is_empty() {
|
||||||
|
distribution.insert(name.to_string(), values);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -353,7 +353,8 @@ impl<'a> Filter<'a> {
         match &self.condition {
             FilterCondition::Condition { fid, op } => {
                 let filterable_fields = index.filterable_fields(rtxn)?;
-                if filterable_fields.contains(fid.value()) {
+
+                if crate::is_faceted(fid.value(), &filterable_fields) {
                     let field_ids_map = index.fields_ids_map(rtxn)?;
                     if let Some(fid) = field_ids_map.id(fid.value()) {
                         Self::evaluate_operator(rtxn, index, numbers_db, strings_db, fid, &op)
@@ -549,7 +550,6 @@ mod tests {
             Filter::from_str("channel = gotaga AND (timestamp = 44 OR channel != ponce)")
                 .unwrap()
                 .unwrap();
-        println!("\nExpecting: {:#?}\nGot: {:#?}\n", expected, condition);
         assert_eq!(condition, expected);
     }
 
|
@ -159,7 +159,7 @@ impl<'a> Search<'a> {
|
|||||||
let sortable_fields = self.index.sortable_fields(self.rtxn)?;
|
let sortable_fields = self.index.sortable_fields(self.rtxn)?;
|
||||||
for asc_desc in sort_criteria {
|
for asc_desc in sort_criteria {
|
||||||
match asc_desc.member() {
|
match asc_desc.member() {
|
||||||
Member::Field(ref field) if !sortable_fields.contains(field) => {
|
Member::Field(ref field) if !crate::is_faceted(field, &sortable_fields) => {
|
||||||
return Err(UserError::InvalidSortableAttribute {
|
return Err(UserError::InvalidSortableAttribute {
|
||||||
field: field.to_string(),
|
field: field.to_string(),
|
||||||
valid_fields: sortable_fields.into_iter().collect(),
|
valid_fields: sortable_fields.into_iter().collect(),
|
||||||
|
@ -98,7 +98,8 @@ mod tests {
|
|||||||
]);
|
]);
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ());
|
let mut builder =
|
||||||
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
builder.add_documents(content).unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
|
|
||||||
@ -110,7 +111,8 @@ mod tests {
|
|||||||
|
|
||||||
let rtxn = index.read_txn().unwrap();
|
let rtxn = index.read_txn().unwrap();
|
||||||
|
|
||||||
assert_eq!(index.fields_ids_map(&rtxn).unwrap().len(), 5);
|
// the value is 7 because there is `[id, name, age, country, _geo, _geo.lng, _geo.lat]`
|
||||||
|
assert_eq!(index.fields_ids_map(&rtxn).unwrap().len(), 7);
|
||||||
|
|
||||||
assert!(index.words_fst(&rtxn).unwrap().is_empty());
|
assert!(index.words_fst(&rtxn).unwrap().is_empty());
|
||||||
assert!(index.words_prefixes_fst(&rtxn).unwrap().is_empty());
|
assert!(index.words_prefixes_fst(&rtxn).unwrap().is_empty());
|
||||||
|
@ -647,7 +647,8 @@ mod tests {
|
|||||||
]);
|
]);
|
||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ());
|
let mut builder =
|
||||||
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
builder.add_documents(content).unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
|
|
||||||
@ -681,7 +682,8 @@ mod tests {
|
|||||||
|
|
||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ());
|
let mut builder =
|
||||||
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
builder.add_documents(content).unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
|
|
||||||
@ -733,7 +735,8 @@ mod tests {
|
|||||||
|
|
||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ());
|
let mut builder =
|
||||||
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
builder.add_documents(content).unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
|
|
||||||
@ -790,7 +793,8 @@ mod tests {
|
|||||||
|
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
|
|
||||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ());
|
let mut builder =
|
||||||
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
builder.add_documents(content).unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
|
|
||||||
|
@@ -2,7 +2,6 @@ use std::fs::File;
 use std::io;
 
 use concat_arrays::concat_arrays;
-use serde_json::Value;
 
 use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
 use crate::{FieldId, InternalError, Result, UserError};
@@ -14,7 +13,7 @@ pub fn extract_geo_points<R: io::Read + io::Seek>(
     obkv_documents: grenad::Reader<R>,
     indexer: GrenadParameters,
     primary_key_id: FieldId,
-    geo_field_id: FieldId,
+    (lat_fid, lng_fid): (FieldId, FieldId),
 ) -> Result<grenad::Reader<File>> {
     let mut writer = create_writer(
         indexer.chunk_compression_type,
@@ -25,22 +24,18 @@ pub fn extract_geo_points<R: io::Read + io::Seek>(
     let mut cursor = obkv_documents.into_cursor()?;
     while let Some((docid_bytes, value)) = cursor.move_on_next()? {
         let obkv = obkv::KvReader::new(value);
-        let point: Value = match obkv.get(geo_field_id) {
-            Some(point) => serde_json::from_slice(point).map_err(InternalError::SerdeJson)?,
-            None => continue,
-        };
-
-        if let Some((lat, lng)) = point["lat"].as_f64().zip(point["lng"].as_f64()) {
-            // this will create an array of 16 bytes (two 8 bytes floats)
-            let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
-            writer.insert(docid_bytes, bytes)?;
-        } else {
-            // All document must have a primary key so we can unwrap safely here
+        let (lat, lng) = obkv.get(lat_fid).zip(obkv.get(lng_fid)).ok_or_else(|| {
             let primary_key = obkv.get(primary_key_id).unwrap();
-            let primary_key =
-                serde_json::from_slice(primary_key).map_err(InternalError::SerdeJson)?;
-            Err(UserError::InvalidGeoField { document_id: primary_key, object: point })?
-        }
+            let primary_key = serde_json::from_slice(primary_key).unwrap();
+            UserError::InvalidGeoField { document_id: primary_key }
+        })?;
+        let (lat, lng): (f64, f64) = (
+            serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?,
+            serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?,
+        );
+
+        let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
+        writer.insert(docid_bytes, bytes)?;
     }
 
     Ok(writer_into_reader(writer)?)
@ -34,28 +34,36 @@ use crate::{FieldId, Result};
|
|||||||
/// Extract data for each databases from obkv documents in parallel.
|
/// Extract data for each databases from obkv documents in parallel.
|
||||||
/// Send data in grenad file over provided Sender.
|
/// Send data in grenad file over provided Sender.
|
||||||
pub(crate) fn data_from_obkv_documents(
|
pub(crate) fn data_from_obkv_documents(
|
||||||
obkv_chunks: impl Iterator<Item = Result<grenad::Reader<File>>> + Send,
|
original_obkv_chunks: impl Iterator<Item = Result<grenad::Reader<File>>> + Send,
|
||||||
|
flattened_obkv_chunks: impl Iterator<Item = Result<grenad::Reader<File>>> + Send,
|
||||||
indexer: GrenadParameters,
|
indexer: GrenadParameters,
|
||||||
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
||||||
searchable_fields: Option<HashSet<FieldId>>,
|
searchable_fields: Option<HashSet<FieldId>>,
|
||||||
faceted_fields: HashSet<FieldId>,
|
faceted_fields: HashSet<FieldId>,
|
||||||
primary_key_id: FieldId,
|
primary_key_id: FieldId,
|
||||||
geo_field_id: Option<FieldId>,
|
geo_fields_ids: Option<(FieldId, FieldId)>,
|
||||||
stop_words: Option<fst::Set<&[u8]>>,
|
stop_words: Option<fst::Set<&[u8]>>,
|
||||||
max_positions_per_attributes: Option<u32>,
|
max_positions_per_attributes: Option<u32>,
|
||||||
exact_attributes: HashSet<FieldId>,
|
exact_attributes: HashSet<FieldId>,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
let result: Result<(Vec<_>, (Vec<_>, Vec<_>))> = obkv_chunks
|
original_obkv_chunks
|
||||||
.par_bridge()
|
.par_bridge()
|
||||||
.map(|result| {
|
.map(|original_documents_chunk| {
|
||||||
extract_documents_data(
|
send_original_documents_data(original_documents_chunk, lmdb_writer_sx.clone())
|
||||||
result,
|
})
|
||||||
|
.collect::<Result<()>>()?;
|
||||||
|
|
||||||
|
let result: Result<(Vec<_>, (Vec<_>, Vec<_>))> = flattened_obkv_chunks
|
||||||
|
.par_bridge()
|
||||||
|
.map(|flattened_obkv_chunks| {
|
||||||
|
send_and_extract_flattened_documents_data(
|
||||||
|
flattened_obkv_chunks,
|
||||||
indexer,
|
indexer,
|
||||||
lmdb_writer_sx.clone(),
|
lmdb_writer_sx.clone(),
|
||||||
&searchable_fields,
|
&searchable_fields,
|
||||||
&faceted_fields,
|
&faceted_fields,
|
||||||
primary_key_id,
|
primary_key_id,
|
||||||
geo_field_id,
|
geo_fields_ids,
|
||||||
&stop_words,
|
&stop_words,
|
||||||
max_positions_per_attributes,
|
max_positions_per_attributes,
|
||||||
)
|
)
|
||||||
@@ -170,36 +178,48 @@ fn spawn_extraction_task<FE, FS, M>(
         });
 }
 
-/// Extract chuncked data and send it into lmdb_writer_sx sender:
+/// Extract chunked data and send it into lmdb_writer_sx sender:
 /// - documents
+fn send_original_documents_data(
+    original_documents_chunk: Result<grenad::Reader<File>>,
+    lmdb_writer_sx: Sender<Result<TypedChunk>>,
+) -> Result<()> {
+    let original_documents_chunk =
+        original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;
+
+    // TODO: create a custom internal error
+    lmdb_writer_sx.send(Ok(TypedChunk::Documents(original_documents_chunk))).unwrap();
+    Ok(())
+}
+
+/// Extract chunked data and send it into lmdb_writer_sx sender:
 /// - documents_ids
 /// - docid_word_positions
 /// - docid_fid_facet_numbers
 /// - docid_fid_facet_strings
-fn extract_documents_data(
-    documents_chunk: Result<grenad::Reader<File>>,
+fn send_and_extract_flattened_documents_data(
+    flattened_documents_chunk: Result<grenad::Reader<File>>,
     indexer: GrenadParameters,
     lmdb_writer_sx: Sender<Result<TypedChunk>>,
     searchable_fields: &Option<HashSet<FieldId>>,
     faceted_fields: &HashSet<FieldId>,
     primary_key_id: FieldId,
-    geo_field_id: Option<FieldId>,
+    geo_fields_ids: Option<(FieldId, FieldId)>,
     stop_words: &Option<fst::Set<&[u8]>>,
     max_positions_per_attributes: Option<u32>,
 ) -> Result<(
     grenad::Reader<CursorClonableMmap>,
     (grenad::Reader<CursorClonableMmap>, grenad::Reader<CursorClonableMmap>),
 )> {
-    let documents_chunk = documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;
+    let flattened_documents_chunk =
+        flattened_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;
 
-    let _ = lmdb_writer_sx.send(Ok(TypedChunk::Documents(documents_chunk.clone())));
-
-    if let Some(geo_field_id) = geo_field_id {
-        let documents_chunk_cloned = documents_chunk.clone();
+    if let Some(geo_fields_ids) = geo_fields_ids {
+        let documents_chunk_cloned = flattened_documents_chunk.clone();
         let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
         rayon::spawn(move || {
             let result =
-                extract_geo_points(documents_chunk_cloned, indexer, primary_key_id, geo_field_id);
+                extract_geo_points(documents_chunk_cloned, indexer, primary_key_id, geo_fields_ids);
             let _ = match result {
                 Ok(geo_points) => lmdb_writer_sx_cloned.send(Ok(TypedChunk::GeoPoints(geo_points))),
                 Err(error) => lmdb_writer_sx_cloned.send(Err(error)),
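The shape above (clone a `Sender`, `rayon::spawn` the extraction, then forward either the produced chunk or the error through the channel so the single LMDB writer stays the only writer) repeats for every extractor. A reduced sketch of that pattern follows; it assumes `crossbeam_channel` for the channel type and uses toy stand-ins for the chunk and error types, since the real `Sender` alias and `TypedChunk` are defined elsewhere in the crate.

```rust
use crossbeam_channel::{unbounded, Sender};

// Stand-ins for the real TypedChunk / error types, purely for illustration.
type Chunk = Vec<u8>;
type ExtractResult = Result<Chunk, String>;

fn spawn_extractor(sender: Sender<ExtractResult>, input: Vec<u8>) {
    rayon::spawn(move || {
        // Pretend extraction: fails on empty input, succeeds otherwise.
        let result = if input.is_empty() { Err("empty chunk".to_string()) } else { Ok(input) };
        // Whatever happens, the outcome is forwarded to the writer side;
        // a send error only means the receiver already hung up.
        let _ = sender.send(result);
    });
}

fn main() {
    let (tx, rx) = unbounded();
    spawn_extractor(tx.clone(), vec![1, 2, 3]);
    spawn_extractor(tx, vec![]);
    for outcome in rx.iter().take(2) {
        println!("{outcome:?}");
    }
}
```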
@@ -211,7 +231,7 @@ fn extract_documents_data(
     rayon::join(
         || {
             let (documents_ids, docid_word_positions_chunk) = extract_docid_word_positions(
-                documents_chunk.clone(),
+                flattened_documents_chunk.clone(),
                 indexer.clone(),
                 searchable_fields,
                 stop_words.as_ref(),
@@ -232,7 +252,7 @@ fn extract_documents_data(
         || {
             let (docid_fid_facet_numbers_chunk, docid_fid_facet_strings_chunk) =
                 extract_fid_docid_facet_values(
-                    documents_chunk.clone(),
+                    flattened_documents_chunk.clone(),
                     indexer.clone(),
                     faceted_fields,
                 )?;
@@ -30,7 +30,7 @@ use crate::update::{
     self, Facets, IndexerConfig, UpdateIndexingStep, WordPrefixDocids,
     WordPrefixPairProximityDocids, WordPrefixPositionDocids, WordsPrefixesFst,
 };
-use crate::{Index, Result, RoaringBitmapCodec};
+use crate::{Index, Result, RoaringBitmapCodec, UserError};
 
 static MERGED_DATABASE_COUNT: usize = 7;
 static PREFIX_DATABASE_COUNT: usize = 5;
@@ -94,15 +94,16 @@ where
         indexer_config: &'a IndexerConfig,
         config: IndexDocumentsConfig,
         progress: F,
-    ) -> IndexDocuments<'t, 'u, 'i, 'a, F> {
+    ) -> Result<IndexDocuments<'t, 'u, 'i, 'a, F>> {
         let transform = Some(Transform::new(
+            wtxn,
             &index,
             indexer_config,
             config.update_method,
             config.autogenerate_docids,
-        ));
+        )?);
 
-        IndexDocuments {
+        Ok(IndexDocuments {
             transform,
             config,
             indexer_config,
@@ -110,7 +111,7 @@ where
             wtxn,
             index,
             added_documents: 0,
-        }
+        })
     }
 
     /// Adds a batch of documents to the current builder.
@@ -151,6 +152,10 @@ where
             .take()
             .expect("Invalid document addition state")
             .output_from_sorter(self.wtxn, &self.progress)?;
 
+        let new_facets = output.compute_real_facets(self.wtxn, self.index)?;
+        self.index.put_faceted_fields(self.wtxn, &new_facets)?;
+
         let indexed_documents = output.documents_count as u64;
         let number_of_documents = self.execute_raw(output)?;
 
@@ -171,7 +176,8 @@ where
             new_documents_ids,
             replaced_documents_ids,
             documents_count,
-            documents_file,
+            original_documents,
+            flattened_documents,
         } = output;
 
         // The fields_ids_map is put back to the store now so the rest of the transaction sees an
@@ -197,7 +203,8 @@ where
             }
         };
 
-        let documents_file = grenad::Reader::new(documents_file)?;
+        let original_documents = grenad::Reader::new(original_documents)?;
+        let flattened_documents = grenad::Reader::new(flattened_documents)?;
 
         // create LMDB writer channel
         let (lmdb_writer_sx, lmdb_writer_rx): (
@@ -213,13 +220,20 @@ where
             self.index.searchable_fields_ids(self.wtxn)?.map(HashSet::from_iter);
         // get filterable fields for facet databases
         let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?;
-        // get the fid of the `_geo` field.
-        let geo_field_id = match self.index.fields_ids_map(self.wtxn)?.id("_geo") {
+        // get the fid of the `_geo.lat` and `_geo.lng` fields.
+        let geo_fields_ids = match self.index.fields_ids_map(self.wtxn)?.id("_geo") {
            Some(gfid) => {
                let is_sortable = self.index.sortable_fields_ids(self.wtxn)?.contains(&gfid);
                let is_filterable = self.index.filterable_fields_ids(self.wtxn)?.contains(&gfid);
+                // if `_geo` is faceted then we get the `lat` and `lng`
                if is_sortable || is_filterable {
-                    Some(gfid)
+                    let field_ids = self
+                        .index
+                        .fields_ids_map(self.wtxn)?
+                        .insert("_geo.lat")
+                        .zip(self.index.fields_ids_map(self.wtxn)?.insert("_geo.lng"))
+                        .ok_or(UserError::AttributeLimitReached)?;
+                    Some(field_ids)
                } else {
                    None
                }
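The hunk above reserves both coordinate fields in one step: each `insert` on the fields-id map returns an `Option`, and zipping the two means the whole lookup collapses to `None` if either reservation fails, which `ok_or` then turns into the attribute-limit error. A tiny self-contained illustration of that `Option::zip` + `ok_or` shape, with a toy map standing in for the real fields-ids map (names and the limit behaviour here are assumptions for the sketch only):

```rust
use std::collections::HashMap;

// Toy stand-in for the fields-ids map: hands out sequential u16 ids, None once full.
struct ToyFieldsIdsMap {
    ids: HashMap<String, u16>,
}

impl ToyFieldsIdsMap {
    fn insert(&mut self, name: &str) -> Option<u16> {
        if let Some(id) = self.ids.get(name) {
            return Some(*id);
        }
        let next = u16::try_from(self.ids.len()).ok()?;
        self.ids.insert(name.to_string(), next);
        Some(next)
    }
}

fn geo_fields_ids(map: &mut ToyFieldsIdsMap) -> Result<(u16, u16), &'static str> {
    // Both inserts must succeed, otherwise the whole pair is reported as one error.
    map.insert("_geo.lat")
        .zip(map.insert("_geo.lng"))
        .ok_or("attribute limit reached")
}

fn main() {
    let mut map = ToyFieldsIdsMap { ids: HashMap::new() };
    assert_eq!(geo_fields_ids(&mut map), Ok((0, 1)));
}
```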
@@ -239,28 +253,38 @@ where
             max_nb_chunks: self.indexer_config.max_nb_chunks, // default value, may be chosen.
         };
 
-        // split obkv file into several chuncks
-        let chunk_iter = grenad_obkv_into_chunks(
-            documents_file,
+        // split obkv file into several chunks
+        let original_chunk_iter = grenad_obkv_into_chunks(
+            original_documents,
             params.clone(),
             self.indexer_config.documents_chunk_size.unwrap_or(1024 * 1024 * 4), // 4MiB
         );
 
-        let result = chunk_iter.map(|chunk_iter| {
-            // extract all databases from the chunked obkv douments
-            extract::data_from_obkv_documents(
-                chunk_iter,
-                params,
-                lmdb_writer_sx.clone(),
-                searchable_fields,
-                faceted_fields,
-                primary_key_id,
-                geo_field_id,
-                stop_words,
-                self.indexer_config.max_positions_per_attributes,
-                exact_attributes,
-            )
-        });
+        // split obkv file into several chunks
+        let flattened_chunk_iter = grenad_obkv_into_chunks(
+            flattened_documents,
+            params.clone(),
+            self.indexer_config.documents_chunk_size.unwrap_or(1024 * 1024 * 4), // 4MiB
+        );
+
+        let result = original_chunk_iter
+            .and_then(|original_chunk_iter| Ok((original_chunk_iter, flattened_chunk_iter?)))
+            .map(|(original_chunk, flattened_chunk)| {
+                // extract all databases from the chunked obkv douments
+                extract::data_from_obkv_documents(
+                    original_chunk,
+                    flattened_chunk,
+                    params,
+                    lmdb_writer_sx.clone(),
+                    searchable_fields,
+                    faceted_fields,
+                    primary_key_id,
+                    geo_fields_ids,
+                    stop_words,
+                    self.indexer_config.max_positions_per_attributes,
+                    exact_attributes,
+                )
+            });
 
         if let Err(e) = result {
             let _ = lmdb_writer_sx.send(Err(e));
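Both `grenad_obkv_into_chunks` calls above return a `Result` around the chunk iterator, and the `and_then`/`?` combination simply merges the two results so a failure building either iterator is reported once through the writer channel instead of starting a half-configured extraction. The same shape in isolation, with illustrative types only:

```rust
// Mirrors `a.and_then(|a| Ok((a, b?)))`: the first error wins, otherwise both values are kept.
fn pair<A, B, E>(a: Result<A, E>, b: Result<B, E>) -> Result<(A, B), E> {
    a.and_then(|a| Ok((a, b?)))
}

fn main() {
    let ok: Result<(i32, i32), String> = pair(Ok(1), Ok(2));
    assert_eq!(ok, Ok((1, 2)));

    let err: Result<(i32, i32), String> = pair(Ok(1), Err("boom".to_string()));
    assert_eq!(err, Err("boom".to_string()));
}
```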
@ -550,6 +574,7 @@ mod tests {
|
|||||||
|
|
||||||
use big_s::S;
|
use big_s::S;
|
||||||
use heed::EnvOpenOptions;
|
use heed::EnvOpenOptions;
|
||||||
|
use maplit::hashset;
|
||||||
|
|
||||||
use super::*;
|
use super::*;
|
||||||
use crate::documents::DocumentBatchBuilder;
|
use crate::documents::DocumentBatchBuilder;
|
||||||
@ -574,7 +599,8 @@ mod tests {
|
|||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut builder =
|
let mut builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ());
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
builder.add_documents(content).unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
@ -589,7 +615,8 @@ mod tests {
|
|||||||
let mut wtxn = index.write_txn().unwrap();
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
let content = documents!([ { "id": 1, "name": "updated kevin" } ]);
|
let content = documents!([ { "id": 1, "name": "updated kevin" } ]);
|
||||||
let mut builder =
|
let mut builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ());
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
builder.add_documents(content).unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
@ -607,7 +634,8 @@ mod tests {
|
|||||||
{ "id": 2, "name": "updated kevina" },
|
{ "id": 2, "name": "updated kevina" },
|
||||||
{ "id": 3, "name": "updated benoit" }
|
{ "id": 3, "name": "updated benoit" }
|
||||||
]);
|
]);
|
||||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ());
|
let mut builder =
|
||||||
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
builder.add_documents(content).unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
@ -639,7 +667,8 @@ mod tests {
|
|||||||
..Default::default()
|
..Default::default()
|
||||||
};
|
};
|
||||||
let mut builder =
|
let mut builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ());
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
builder.add_documents(content).unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
@ -665,7 +694,8 @@ mod tests {
|
|||||||
// Second we send 1 document with id 1, to force it to be merged with the previous one.
|
// Second we send 1 document with id 1, to force it to be merged with the previous one.
|
||||||
let mut wtxn = index.write_txn().unwrap();
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
let content = documents!([ { "id": 1, "age": 25 } ]);
|
let content = documents!([ { "id": 1, "age": 25 } ]);
|
||||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ());
|
let mut builder =
|
||||||
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
builder.add_documents(content).unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
@ -706,7 +736,8 @@ mod tests {
|
|||||||
]);
|
]);
|
||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ());
|
let mut builder =
|
||||||
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
|
||||||
assert!(builder.add_documents(content).is_err());
|
assert!(builder.add_documents(content).is_err());
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
@ -735,7 +766,8 @@ mod tests {
|
|||||||
let indexing_config =
|
let indexing_config =
|
||||||
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
||||||
let mut builder =
|
let mut builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ());
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
builder.add_documents(content).unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
@ -753,7 +785,8 @@ mod tests {
|
|||||||
// Second we send 1 document with the generated uuid, to erase the previous ones.
|
// Second we send 1 document with the generated uuid, to erase the previous ones.
|
||||||
let mut wtxn = index.write_txn().unwrap();
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
let content = documents!([ { "name": "updated kevin", "id": kevin_uuid } ]);
|
let content = documents!([ { "name": "updated kevin", "id": kevin_uuid } ]);
|
||||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ());
|
let mut builder =
|
||||||
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
builder.add_documents(content).unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
@ -793,7 +826,8 @@ mod tests {
|
|||||||
]);
|
]);
|
||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ());
|
let mut builder =
|
||||||
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
builder.add_documents(content).unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
@ -809,7 +843,8 @@ mod tests {
|
|||||||
let content = documents!([ { "name": "new kevin" } ]);
|
let content = documents!([ { "name": "new kevin" } ]);
|
||||||
let indexing_config =
|
let indexing_config =
|
||||||
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
||||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ());
|
let mut builder =
|
||||||
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
builder.add_documents(content).unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
@ -833,7 +868,8 @@ mod tests {
|
|||||||
let content = documents!([]);
|
let content = documents!([]);
|
||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ());
|
let mut builder =
|
||||||
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
builder.add_documents(content).unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
@ -859,7 +895,8 @@ mod tests {
|
|||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut builder =
|
let mut builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ());
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
|
.unwrap();
|
||||||
assert!(builder.add_documents(content).is_err());
|
assert!(builder.add_documents(content).is_err());
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
@ -867,7 +904,8 @@ mod tests {
|
|||||||
let mut wtxn = index.write_txn().unwrap();
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
// There is a space in the document id.
|
// There is a space in the document id.
|
||||||
let content = documents!([ { "id": 32, "name": "kevin" } ]);
|
let content = documents!([ { "id": 32, "name": "kevin" } ]);
|
||||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ());
|
let mut builder =
|
||||||
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
builder.add_documents(content).unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
@ -895,7 +933,8 @@ mod tests {
|
|||||||
]);
|
]);
|
||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ());
|
let mut builder =
|
||||||
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
builder.add_documents(content).unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
@ -912,7 +951,7 @@ mod tests {
|
|||||||
assert_eq!(result.documents_ids, vec![1]);
|
assert_eq!(result.documents_ids, vec![1]);
|
||||||
|
|
||||||
// Search for a sub array sub object key
|
// Search for a sub array sub object key
|
||||||
let result = index.search(&rtxn).query(r#""wow""#).execute().unwrap();
|
let result = index.search(&rtxn).query(r#""amazing""#).execute().unwrap();
|
||||||
assert_eq!(result.documents_ids, vec![2]);
|
assert_eq!(result.documents_ids, vec![2]);
|
||||||
|
|
||||||
drop(rtxn);
|
drop(rtxn);
|
||||||
@ -940,7 +979,8 @@ mod tests {
|
|||||||
update_method: IndexDocumentsMethod::ReplaceDocuments,
|
update_method: IndexDocumentsMethod::ReplaceDocuments,
|
||||||
..Default::default()
|
..Default::default()
|
||||||
};
|
};
|
||||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ());
|
let mut builder =
|
||||||
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
|
||||||
builder.add_documents(documents).unwrap();
|
builder.add_documents(documents).unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
@ -950,7 +990,8 @@ mod tests {
|
|||||||
update_method: IndexDocumentsMethod::UpdateDocuments,
|
update_method: IndexDocumentsMethod::UpdateDocuments,
|
||||||
..Default::default()
|
..Default::default()
|
||||||
};
|
};
|
||||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ());
|
let mut builder =
|
||||||
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
|
||||||
let documents = documents!([
|
let documents = documents!([
|
||||||
{
|
{
|
||||||
"id": 2,
|
"id": 2,
|
||||||
@ -981,7 +1022,8 @@ mod tests {
|
|||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut builder =
|
let mut builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ());
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
builder.add_documents(content).unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
|
|
||||||
@ -1000,7 +1042,8 @@ mod tests {
|
|||||||
]);
|
]);
|
||||||
|
|
||||||
let mut builder =
|
let mut builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ());
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
builder.add_documents(content).unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
let external_documents_ids = index.external_documents_ids(&wtxn).unwrap();
|
let external_documents_ids = index.external_documents_ids(&wtxn).unwrap();
|
||||||
@ -1011,7 +1054,8 @@ mod tests {
|
|||||||
]);
|
]);
|
||||||
|
|
||||||
let mut builder =
|
let mut builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ());
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
builder.add_documents(content).unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
|
|
||||||
@ -1046,7 +1090,8 @@ mod tests {
|
|||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut builder =
|
let mut builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ());
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
builder.add_documents(content).unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
|
|
||||||
@ -1080,7 +1125,8 @@ mod tests {
|
|||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut builder =
|
let mut builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ());
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
builder.add_documents(content).unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
|
|
||||||
@ -1137,13 +1183,333 @@ mod tests {
|
|||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut builder =
|
let mut builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ());
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
builder.add_documents(content).unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
|
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn index_documents_with_nested_fields() {
|
||||||
|
let path = tempfile::tempdir().unwrap();
|
||||||
|
let mut options = EnvOpenOptions::new();
|
||||||
|
options.map_size(10 * 1024 * 1024); // 10 MB
|
||||||
|
let index = Index::new(options, &path).unwrap();
|
||||||
|
|
||||||
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
|
let content = documents!([
|
||||||
|
{
|
||||||
|
"id": 0,
|
||||||
|
"title": "The zeroth document",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 1,
|
||||||
|
"title": "The first document",
|
||||||
|
"nested": {
|
||||||
|
"object": "field",
|
||||||
|
"machin": "bidule",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 2,
|
||||||
|
"title": "The second document",
|
||||||
|
"nested": [
|
||||||
|
"array",
|
||||||
|
{
|
||||||
|
"object": "field",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"prout": "truc",
|
||||||
|
"machin": "lol",
|
||||||
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 3,
|
||||||
|
"title": "The third document",
|
||||||
|
"nested": "I lied",
|
||||||
|
},
|
||||||
|
]);
|
||||||
|
|
||||||
|
let config = IndexerConfig::default();
|
||||||
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
|
let mut builder =
|
||||||
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
|
.unwrap();
|
||||||
|
builder.add_documents(content).unwrap();
|
||||||
|
builder.execute().unwrap();
|
||||||
|
|
||||||
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
|
let mut builder = update::Settings::new(&mut wtxn, &index, &config);
|
||||||
|
|
||||||
|
let searchable_fields = vec![S("title"), S("nested.object"), S("nested.machin")];
|
||||||
|
builder.set_searchable_fields(searchable_fields);
|
||||||
|
|
||||||
|
let faceted_fields = hashset!(S("title"), S("nested.object"), S("nested.machin"));
|
||||||
|
builder.set_filterable_fields(faceted_fields);
|
||||||
|
builder.execute(|_| ()).unwrap();
|
||||||
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
|
let rtxn = index.read_txn().unwrap();
|
||||||
|
|
||||||
|
let facets = index.faceted_fields(&rtxn).unwrap();
|
||||||
|
assert_eq!(facets, hashset!(S("title"), S("nested.object"), S("nested.machin")));
|
||||||
|
|
||||||
|
// testing the simple query search
|
||||||
|
let mut search = crate::Search::new(&rtxn, &index);
|
||||||
|
search.query("document");
|
||||||
|
search.authorize_typos(true);
|
||||||
|
search.optional_words(true);
|
||||||
|
// all documents should be returned
|
||||||
|
let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
|
||||||
|
assert_eq!(documents_ids.len(), 4);
|
||||||
|
|
||||||
|
search.query("zeroth");
|
||||||
|
let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
|
||||||
|
assert_eq!(documents_ids, vec![0]);
|
||||||
|
search.query("first");
|
||||||
|
let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
|
||||||
|
assert_eq!(documents_ids, vec![1]);
|
||||||
|
search.query("second");
|
||||||
|
let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
|
||||||
|
assert_eq!(documents_ids, vec![2]);
|
||||||
|
search.query("third");
|
||||||
|
let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
|
||||||
|
assert_eq!(documents_ids, vec![3]);
|
||||||
|
|
||||||
|
search.query("field");
|
||||||
|
let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
|
||||||
|
assert_eq!(documents_ids, vec![1, 2]);
|
||||||
|
|
||||||
|
search.query("lol");
|
||||||
|
let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
|
||||||
|
assert_eq!(documents_ids, vec![2]);
|
||||||
|
|
||||||
|
search.query("object");
|
||||||
|
let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
|
||||||
|
assert!(documents_ids.is_empty());
|
||||||
|
|
||||||
|
search.query("array");
|
||||||
|
let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
|
||||||
|
assert!(documents_ids.is_empty()); // nested is not searchable
|
||||||
|
|
||||||
|
search.query("lied");
|
||||||
|
let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
|
||||||
|
assert!(documents_ids.is_empty()); // nested is not searchable
|
||||||
|
|
||||||
|
// testing the filters
|
||||||
|
let mut search = crate::Search::new(&rtxn, &index);
|
||||||
|
search.filter(crate::Filter::from_str(r#"title = "The first document""#).unwrap().unwrap());
|
||||||
|
let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
|
||||||
|
assert_eq!(documents_ids, vec![1]);
|
||||||
|
|
||||||
|
search.filter(crate::Filter::from_str(r#"nested.object = field"#).unwrap().unwrap());
|
||||||
|
let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
|
||||||
|
assert_eq!(documents_ids, vec![1, 2]);
|
||||||
|
|
||||||
|
search.filter(crate::Filter::from_str(r#"nested.machin = bidule"#).unwrap().unwrap());
|
||||||
|
let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
|
||||||
|
assert_eq!(documents_ids, vec![1]);
|
||||||
|
|
||||||
|
search.filter(crate::Filter::from_str(r#"nested = array"#).unwrap().unwrap());
|
||||||
|
let error = search.execute().map(|_| unreachable!()).unwrap_err(); // nested is not filterable
|
||||||
|
assert!(matches!(error, crate::Error::UserError(crate::UserError::InvalidFilter(_))));
|
||||||
|
|
||||||
|
search.filter(crate::Filter::from_str(r#"nested = "I lied""#).unwrap().unwrap());
|
||||||
|
let error = search.execute().map(|_| unreachable!()).unwrap_err(); // nested is not filterable
|
||||||
|
assert!(matches!(error, crate::Error::UserError(crate::UserError::InvalidFilter(_))));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn index_documents_with_nested_primary_key() {
|
||||||
|
let path = tempfile::tempdir().unwrap();
|
||||||
|
let mut options = EnvOpenOptions::new();
|
||||||
|
options.map_size(10 * 1024 * 1024); // 10 MB
|
||||||
|
let index = Index::new(options, &path).unwrap();
|
||||||
|
let config = IndexerConfig::default();
|
||||||
|
|
||||||
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
|
let mut builder = update::Settings::new(&mut wtxn, &index, &config);
|
||||||
|
builder.set_primary_key("nested.id".to_owned());
|
||||||
|
builder.execute(|_| ()).unwrap();
|
||||||
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
|
let content = documents!([
|
||||||
|
{
|
||||||
|
"nested": {
|
||||||
|
"id": 0,
|
||||||
|
},
|
||||||
|
"title": "The zeroth document",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"nested": {
|
||||||
|
"id": 1,
|
||||||
|
},
|
||||||
|
"title": "The first document",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"nested": {
|
||||||
|
"id": 2,
|
||||||
|
},
|
||||||
|
"title": "The second document",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"nested.id": 3,
|
||||||
|
"title": "The third document",
|
||||||
|
},
|
||||||
|
]);
|
||||||
|
|
||||||
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
|
let mut builder =
|
||||||
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
|
.unwrap();
|
||||||
|
builder.add_documents(content).unwrap();
|
||||||
|
builder.execute().unwrap();
|
||||||
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
|
let rtxn = index.read_txn().unwrap();
|
||||||
|
|
||||||
|
// testing the simple query search
|
||||||
|
let mut search = crate::Search::new(&rtxn, &index);
|
||||||
|
search.query("document");
|
||||||
|
search.authorize_typos(true);
|
||||||
|
search.optional_words(true);
|
||||||
|
// all documents should be returned
|
||||||
|
let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
|
||||||
|
assert_eq!(documents_ids.len(), 4);
|
||||||
|
|
||||||
|
search.query("zeroth");
|
||||||
|
let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
|
||||||
|
assert_eq!(documents_ids, vec![0]);
|
||||||
|
search.query("first");
|
||||||
|
let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
|
||||||
|
assert_eq!(documents_ids, vec![1]);
|
||||||
|
search.query("second");
|
||||||
|
let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
|
||||||
|
assert_eq!(documents_ids, vec![2]);
|
||||||
|
search.query("third");
|
||||||
|
let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
|
||||||
|
assert_eq!(documents_ids, vec![3]);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_facets_generation() {
|
||||||
|
let path = tempfile::tempdir().unwrap();
|
||||||
|
let mut options = EnvOpenOptions::new();
|
||||||
|
options.map_size(10 * 1024 * 1024); // 10 MB
|
||||||
|
let index = Index::new(options, &path).unwrap();
|
||||||
|
|
||||||
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
|
let content = documents!([
|
||||||
|
{
|
||||||
|
"id": 0,
|
||||||
|
"dog": {
|
||||||
|
"race": {
|
||||||
|
"bernese mountain": "zeroth",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 1,
|
||||||
|
"dog.race": {
|
||||||
|
"bernese mountain": "first",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 2,
|
||||||
|
"dog.race.bernese mountain": "second",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 3,
|
||||||
|
"dog": {
|
||||||
|
"race.bernese mountain": "third"
|
||||||
|
},
|
||||||
|
},
|
||||||
|
]);
|
||||||
|
|
||||||
|
// index the documents
|
||||||
|
let config = IndexerConfig::default();
|
||||||
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
|
let mut builder =
|
||||||
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
|
.unwrap();
|
||||||
|
builder.add_documents(content).unwrap();
|
||||||
|
builder.execute().unwrap();
|
||||||
|
|
||||||
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
|
// ---- ADD THE SETTING TO TEST THE FILTERABLE
|
||||||
|
|
||||||
|
// add the settings
|
||||||
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
|
let mut builder = update::Settings::new(&mut wtxn, &index, &config);
|
||||||
|
|
||||||
|
builder.set_filterable_fields(hashset!(String::from("dog")));
|
||||||
|
|
||||||
|
builder.execute(|_| ()).unwrap();
|
||||||
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
|
let rtxn = index.read_txn().unwrap();
|
||||||
|
|
||||||
|
let hidden = index.faceted_fields(&rtxn).unwrap();
|
||||||
|
|
||||||
|
assert_eq!(hidden, hashset!(S("dog"), S("dog.race"), S("dog.race.bernese mountain")));
|
||||||
|
|
||||||
|
for (s, i) in [("zeroth", 0), ("first", 1), ("second", 2), ("third", 3)] {
|
||||||
|
let mut search = crate::Search::new(&rtxn, &index);
|
||||||
|
let filter = format!(r#""dog.race.bernese mountain" = {s}"#);
|
||||||
|
search.filter(crate::Filter::from_str(&filter).unwrap().unwrap());
|
||||||
|
let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
|
||||||
|
assert_eq!(documents_ids, vec![i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- RESET THE SETTINGS
|
||||||
|
|
||||||
|
// update the settings
|
||||||
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
|
let mut builder = update::Settings::new(&mut wtxn, &index, &config);
|
||||||
|
|
||||||
|
builder.reset_filterable_fields();
|
||||||
|
|
||||||
|
builder.execute(|_| ()).unwrap();
|
||||||
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
|
let rtxn = index.read_txn().unwrap();
|
||||||
|
|
||||||
|
let facets = index.faceted_fields(&rtxn).unwrap();
|
||||||
|
|
||||||
|
assert_eq!(facets, hashset!());
|
||||||
|
|
||||||
|
// ---- UPDATE THE SETTINGS TO TEST THE SORTABLE
|
||||||
|
|
||||||
|
// update the settings
|
||||||
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
|
let mut builder = update::Settings::new(&mut wtxn, &index, &config);
|
||||||
|
|
||||||
|
builder.set_sortable_fields(hashset!(S("dog.race")));
|
||||||
|
|
||||||
|
builder.execute(|_| ()).unwrap();
|
||||||
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
|
let rtxn = index.read_txn().unwrap();
|
||||||
|
|
||||||
|
let facets = index.faceted_fields(&rtxn).unwrap();
|
||||||
|
|
||||||
|
assert_eq!(facets, hashset!(S("dog.race"), S("dog.race.bernese mountain")));
|
||||||
|
|
||||||
|
let mut search = crate::Search::new(&rtxn, &index);
|
||||||
|
search.sort_criteria(vec![crate::AscDesc::Asc(crate::Member::Field(S(
|
||||||
|
"dog.race.bernese mountain",
|
||||||
|
)))]);
|
||||||
|
let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
|
||||||
|
assert_eq!(documents_ids, vec![1, 2, 3, 0]);
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn index_2_times_documents_split_by_zero_document_indexation() {
|
fn index_2_times_documents_split_by_zero_document_indexation() {
|
||||||
let path = tempfile::tempdir().unwrap();
|
let path = tempfile::tempdir().unwrap();
|
||||||
@ -1162,7 +1528,8 @@ mod tests {
|
|||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut builder =
|
let mut builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ());
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
builder.add_documents(content).unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
@ -1178,7 +1545,8 @@ mod tests {
|
|||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut builder =
|
let mut builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ());
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
builder.add_documents(content).unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
@ -1199,7 +1567,8 @@ mod tests {
|
|||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut builder =
|
let mut builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ());
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
builder.add_documents(content).unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
@ -1226,7 +1595,8 @@ mod tests {
|
|||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut builder =
|
let mut builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ());
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
builder.add_documents(content).unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
@@ -1,24 +1,27 @@
 use std::borrow::Cow;
-use std::collections::btree_map::Entry;
-use std::collections::HashMap;
+use std::collections::hash_map::Entry;
+use std::collections::{HashMap, HashSet};
 use std::fs::File;
 use std::io::{Read, Seek, SeekFrom};
-use std::time::Instant;
 
+use byteorder::ReadBytesExt;
+use fxhash::FxHashMap;
+use heed::RoTxn;
 use itertools::Itertools;
-use log::info;
+use obkv::{KvReader, KvWriter};
 use roaring::RoaringBitmap;
 use serde_json::{Map, Value};
 
-use super::helpers::{
-    create_sorter, create_writer, keep_latest_obkv, merge_obkvs, merge_two_obkvs, MergeFn,
-};
+use super::helpers::{create_sorter, create_writer, keep_latest_obkv, merge_obkvs, MergeFn};
 use super::{IndexDocumentsMethod, IndexerConfig};
 use crate::documents::{DocumentBatchReader, DocumentsBatchIndex};
 use crate::error::{Error, InternalError, UserError};
 use crate::index::db_name;
 use crate::update::{AvailableDocumentsIds, UpdateIndexingStep};
-use crate::{ExternalDocumentsIds, FieldDistribution, FieldId, FieldsIdsMap, Index, Result, BEU32};
+use crate::{
+    ExternalDocumentsIds, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index,
+    Result, BEU32,
+};
 
 const DEFAULT_PRIMARY_KEY_NAME: &str = "id";
 
@@ -30,7 +33,8 @@ pub struct TransformOutput {
     pub new_documents_ids: RoaringBitmap,
     pub replaced_documents_ids: RoaringBitmap,
     pub documents_count: usize,
-    pub documents_file: File,
+    pub original_documents: File,
+    pub flattened_documents: File,
 }
 
 /// Extract the external ids, deduplicate and compute the new internal documents ids
@@ -41,11 +45,17 @@ pub struct TransformOutput {
 /// containing all those documents.
 pub struct Transform<'a, 'i> {
     pub index: &'i Index,
+    fields_ids_map: FieldsIdsMap,
+
     indexer_settings: &'a IndexerConfig,
     pub autogenerate_docids: bool,
     pub index_documents_method: IndexDocumentsMethod,
-    sorter: grenad::Sorter<MergeFn>,
+
+    original_sorter: grenad::Sorter<MergeFn>,
+    flattened_sorter: grenad::Sorter<MergeFn>,
+    replaced_documents_ids: RoaringBitmap,
+    new_documents_ids: RoaringBitmap,
+    new_external_documents_ids_builder: FxHashMap<Vec<u8>, u64>,
     documents_count: usize,
 }
 
@@ -72,6 +82,9 @@ fn create_fields_mapping(
         .collect()
 }
 
+/// Look for a key containing the [DEFAULT_PRIMARY_KEY_NAME] in the fields.
+/// It doesn't look in the subfield because we don't want to enable the
+/// primary key inference on nested objects.
 fn find_primary_key(index: &DocumentsBatchIndex) -> Option<&str> {
     index
         .iter()
@@ -83,11 +96,12 @@ fn find_primary_key(index: &DocumentsBatchIndex) -> Option<&str> {
 
 impl<'a, 'i> Transform<'a, 'i> {
     pub fn new(
+        wtxn: &mut heed::RwTxn,
         index: &'i Index,
         indexer_settings: &'a IndexerConfig,
         index_documents_method: IndexDocumentsMethod,
         autogenerate_docids: bool,
-    ) -> Self {
+    ) -> Result<Self> {
         // We must choose the appropriate merge function for when two or more documents
         // with the same user id must be merged or fully replaced in the same batch.
         let merge_function = match index_documents_method {
@@ -96,22 +110,36 @@ impl<'a, 'i> Transform<'a, 'i> {
         };
 
         // We initialize the sorter with the user indexing settings.
-        let sorter = create_sorter(
+        let original_sorter = create_sorter(
             merge_function,
             indexer_settings.chunk_compression_type,
             indexer_settings.chunk_compression_level,
             indexer_settings.max_nb_chunks,
-            indexer_settings.max_memory,
+            indexer_settings.max_memory.map(|mem| mem / 2),
         );
 
-        Transform {
+        // We initialize the sorter with the user indexing settings.
+        let flattened_sorter = create_sorter(
+            merge_function,
+            indexer_settings.chunk_compression_type,
+            indexer_settings.chunk_compression_level,
+            indexer_settings.max_nb_chunks,
+            indexer_settings.max_memory.map(|mem| mem / 2),
+        );
+
+        Ok(Transform {
             index,
+            fields_ids_map: index.fields_ids_map(wtxn)?,
             indexer_settings,
             autogenerate_docids,
-            sorter,
-            documents_count: 0,
+            original_sorter,
+            flattened_sorter,
             index_documents_method,
-        }
+            replaced_documents_ids: RoaringBitmap::new(),
+            new_documents_ids: RoaringBitmap::new(),
+            new_external_documents_ids_builder: FxHashMap::default(),
+            documents_count: 0,
+        })
     }
 
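The second sorter introduced above holds a flattened view of each document, in which nested objects are exposed under dot-joined keys (the tests earlier in this diff show `dog` expanding into `dog.race` and `dog.race.bernese mountain`, for instance). The crate has its own flattening rules for arrays and mixed shapes that are not reproduced here; the sketch below is only a minimal illustration of the key shape produced for plain nested objects, built with `serde_json`.

```rust
use serde_json::{json, Map, Value};

// Minimal illustration: nested objects become top-level keys joined with '.'.
// The real flattening also covers arrays and collisions, which this sketch ignores.
fn flatten(prefix: Option<&str>, object: &Map<String, Value>, out: &mut Map<String, Value>) {
    for (key, value) in object {
        let key = match prefix {
            Some(prefix) => format!("{prefix}.{key}"),
            None => key.clone(),
        };
        match value {
            Value::Object(inner) => flatten(Some(&key), inner, out),
            other => {
                out.insert(key, other.clone());
            }
        }
    }
}

fn main() {
    let doc = json!({ "id": 1, "nested": { "object": "field", "machin": "bidule" } });
    let mut flat = Map::new();
    flatten(None, doc.as_object().unwrap(), &mut flat);
    assert_eq!(flat.get("nested.object"), Some(&json!("field")));
    assert_eq!(flat.get("nested.machin"), Some(&json!("bidule")));
}
```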
     pub fn read_documents<R, F>(
@@ -125,8 +153,11 @@ impl<'a, 'i> Transform<'a, 'i> {
         F: Fn(UpdateIndexingStep) + Sync,
     {
         let fields_index = reader.index();
-        let mut fields_ids_map = self.index.fields_ids_map(wtxn)?;
-        let mapping = create_fields_mapping(&mut fields_ids_map, fields_index)?;
+        let external_documents_ids = self.index.external_documents_ids(wtxn)?;
+        let documents_ids = self.index.documents_ids(wtxn)?;
+        let mut available_documents_ids = AvailableDocumentsIds::from_documents_ids(&documents_ids);
+
+        let mapping = create_fields_mapping(&mut self.fields_ids_map, fields_index)?;
 
         let alternative_name = self
             .index
@@ -136,15 +167,19 @@ impl<'a, 'i> Transform<'a, 'i> {
 
         let (primary_key_id, primary_key_name) = compute_primary_key_pair(
             self.index.primary_key(wtxn)?,
-            &mut fields_ids_map,
+            &mut self.fields_ids_map,
             alternative_name,
             self.autogenerate_docids,
         )?;
 
+        let primary_key_id_nested = primary_key_name.contains('.');
+
+        let mut flattened_document = None;
         let mut obkv_buffer = Vec::new();
+        let mut flattened_obkv_buffer = Vec::new();
         let mut documents_count = 0;
         let mut external_id_buffer = Vec::new();
-        let mut field_buffer: Vec<(u16, &[u8])> = Vec::new();
+        let mut field_buffer: Vec<(u16, Cow<[u8]>)> = Vec::new();
         while let Some((addition_index, document)) = reader.next_document_with_index()? {
             let mut field_buffer_cache = drop_and_reuse(field_buffer);
             if self.indexer_settings.log_every_n.map_or(false, |len| documents_count % len == 0) {
@@ -154,8 +189,9 @@ impl<'a, 'i> Transform<'a, 'i> {
             }
 
             for (k, v) in document.iter() {
-                let mapped_id = *mapping.get(&k).unwrap();
-                field_buffer_cache.push((mapped_id, v));
+                let mapped_id =
+                    *mapping.get(&k).ok_or(InternalError::FieldIdMappingMissingEntry { key: k })?;
+                field_buffer_cache.push((mapped_id, Cow::from(v)));
             }
 
             // We need to make sure that every document has a primary key. After we have remapped
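A primary key such as `nested.id` only exists as a real key once the document has been flattened, which is why the code above records whether the configured primary key contains a `.` before entering the per-document loop. A hedged sketch of that decision follows; the real code works on obkv buffers and field ids rather than JSON maps, so the two-map lookup here is an illustration only.

```rust
use serde_json::{json, Map, Value};

// Nested keys (containing a '.') only exist in the flattened view of a document,
// plain keys can be read from the raw one.
fn primary_key_value<'a>(
    primary_key: &str,
    raw: &'a Map<String, Value>,
    flattened: &'a Map<String, Value>,
) -> Option<&'a Value> {
    if primary_key.contains('.') {
        flattened.get(primary_key)
    } else {
        raw.get(primary_key)
    }
}

fn main() {
    let raw = json!({ "nested": { "id": 0 }, "title": "The zeroth document" });
    let flattened = json!({ "nested.id": 0, "title": "The zeroth document" });
    let value = primary_key_value(
        "nested.id",
        raw.as_object().unwrap(),
        flattened.as_object().unwrap(),
    );
    assert_eq!(value, Some(&json!(0)));
}
```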
@ -164,87 +200,125 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
// document. If none is found, and we were told to generate missing document ids, then
|
// document. If none is found, and we were told to generate missing document ids, then
|
||||||
// we create the missing field, and update the new document.
|
// we create the missing field, and update the new document.
|
||||||
let mut uuid_buffer = [0; uuid::adapter::Hyphenated::LENGTH];
|
let mut uuid_buffer = [0; uuid::adapter::Hyphenated::LENGTH];
|
||||||
let external_id =
|
let external_id = if primary_key_id_nested {
|
||||||
match field_buffer_cache.iter_mut().find(|(id, _)| *id == primary_key_id) {
|
let mut field_buffer_cache = field_buffer_cache.clone();
|
||||||
Some((_, bytes)) => {
|
self.flatten_from_field_mapping(
|
||||||
let value = match serde_json::from_slice(bytes).unwrap() {
|
&mapping,
|
||||||
Value::String(string) => match validate_document_id(&string) {
|
&document,
|
||||||
Some(s) if s.len() == string.len() => string,
|
&mut flattened_obkv_buffer,
|
||||||
Some(s) => s.to_string(),
|
&mut field_buffer_cache,
|
||||||
None => {
|
)?;
|
||||||
return Err(UserError::InvalidDocumentId {
|
flattened_document = Some(&flattened_obkv_buffer);
|
||||||
document_id: Value::String(string),
|
let document = KvReader::new(&flattened_obkv_buffer);
|
||||||
}
|
|
||||||
.into())
|
|
||||||
}
|
|
||||||
},
|
|
||||||
Value::Number(number) => number.to_string(),
|
|
||||||
content => {
|
|
||||||
return Err(UserError::InvalidDocumentId {
|
|
||||||
document_id: content.clone(),
|
|
||||||
}
|
|
||||||
.into())
|
|
||||||
}
|
|
||||||
};
|
|
||||||
serde_json::to_writer(&mut external_id_buffer, &value).unwrap();
|
|
||||||
Cow::Owned(value)
|
|
||||||
}
|
|
||||||
None => {
|
|
||||||
if !self.autogenerate_docids {
|
|
||||||
let mut json = Map::new();
|
|
||||||
for (key, value) in document.iter() {
|
|
||||||
let key = addition_index.name(key).cloned();
|
|
||||||
let value = serde_json::from_slice::<Value>(&value).ok();
|
|
||||||
|
|
||||||
if let Some((k, v)) = key.zip(value) {
|
update_primary_key(
|
||||||
json.insert(k, v);
|
document,
|
||||||
}
|
&addition_index,
|
||||||
}
|
primary_key_id,
|
||||||
|
&primary_key_name,
|
||||||
return Err(UserError::MissingDocumentId {
|
&mut uuid_buffer,
|
||||||
primary_key: primary_key_name,
|
&mut field_buffer_cache,
|
||||||
document: json,
|
&mut external_id_buffer,
|
||||||
}
|
self.autogenerate_docids,
|
||||||
.into());
|
)?
|
||||||
}
|
} else {
|
||||||
|
update_primary_key(
|
||||||
let uuid =
|
document,
|
||||||
uuid::Uuid::new_v4().to_hyphenated().encode_lower(&mut uuid_buffer);
|
&addition_index,
|
||||||
serde_json::to_writer(&mut external_id_buffer, &uuid).unwrap();
|
primary_key_id,
|
||||||
field_buffer_cache.push((primary_key_id, &external_id_buffer));
|
&primary_key_name,
|
||||||
Cow::Borrowed(&*uuid)
|
&mut uuid_buffer,
|
||||||
}
|
&mut field_buffer_cache,
|
||||||
};
|
&mut external_id_buffer,
|
||||||
|
self.autogenerate_docids,
|
||||||
|
)?
|
||||||
|
};
|
||||||
|
|
||||||
// Insertion in a obkv need to be done with keys ordered. For now they are ordered
|
// Insertion in a obkv need to be done with keys ordered. For now they are ordered
|
||||||
// according to the document addition key order, so we sort it according to the
|
// according to the document addition key order, so we sort it according to the
|
||||||
// fieldids map keys order.
|
// fieldids map keys order.
|
||||||
field_buffer_cache.sort_unstable_by(|(f1, _), (f2, _)| f1.cmp(&f2));
|
field_buffer_cache.sort_unstable_by(|(f1, _), (f2, _)| f1.cmp(&f2));
|
||||||
|
|
||||||
// The last step is to build the new obkv document, and insert it in the sorter.
|
// Build the new obkv document.
|
||||||
let mut writer = obkv::KvWriter::new(&mut obkv_buffer);
|
let mut writer = obkv::KvWriter::new(&mut obkv_buffer);
|
||||||
for (k, v) in field_buffer_cache.iter() {
|
for (k, v) in field_buffer_cache.iter() {
|
||||||
writer.insert(*k, v)?;
|
writer.insert(*k, v)?;
|
||||||
}
|
}
|
||||||
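For readers unfamiliar with the obkv buffers manipulated throughout this hunk: the comment above states that keys must be inserted in ascending order, which is why `field_buffer_cache` is sorted by field id first. Below is a minimal sketch of that write/read round trip, assuming the obkv crate behaves as the calls in this diff suggest; the field ids and JSON payloads are invented for illustration.

```rust
use obkv::{KvReader, KvWriter};

fn main() {
    // Keys must be inserted in ascending order, exactly like the sorted
    // `field_buffer_cache` above; the ids 0 and 1 are illustrative only.
    let mut buffer = Vec::new();
    let mut writer = KvWriter::new(&mut buffer);
    writer.insert(0u16, br#""1""#).unwrap(); // a primary key, already serialized as JSON
    writer.insert(1u16, br#"{"age": 23}"#).unwrap(); // any other field, as raw JSON bytes
    drop(writer); // the writer borrows `buffer`, release it before reading

    // Reading back yields (field id, raw bytes) pairs in key order.
    let reader = KvReader::new(&buffer);
    for (field_id, value) in reader.iter() {
        let field_id: u16 = field_id; // same key type as milli's FieldId
        let value: serde_json::Value = serde_json::from_slice(value).unwrap();
        println!("{} => {}", field_id, value);
    }
}
```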
|
|
||||||
|
let (docid, should_insert_original_document) =
|
||||||
|
match external_documents_ids.get(&*external_id) {
|
||||||
|
// if the document is in the db but has already been inserted
|
||||||
|
// (ie: already exists in the list of replaced documents ids),
|
||||||
|
// we should not add the original document a second time.
|
||||||
|
Some(docid) => (docid, !self.replaced_documents_ids.contains(docid)),
|
||||||
|
None => {
|
||||||
|
// if the document has already been inserted in this
|
||||||
|
// batch we need to get its docid
|
||||||
|
match self
|
||||||
|
.new_external_documents_ids_builder
|
||||||
|
.entry(external_id.as_bytes().to_vec())
|
||||||
|
{
|
||||||
|
Entry::Occupied(entry) => (*entry.get() as u32, false),
|
||||||
|
// if the document has never been encountered we give it a new docid
|
||||||
|
// and push this new docid to the external documents ids builder
|
||||||
|
Entry::Vacant(entry) => {
|
||||||
|
let new_docid = available_documents_ids
|
||||||
|
.next()
|
||||||
|
.ok_or(UserError::DocumentLimitReached)?;
|
||||||
|
entry.insert(new_docid as u64);
|
||||||
|
(new_docid, false)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
if should_insert_original_document {
|
||||||
|
self.replaced_documents_ids.insert(docid);
|
||||||
|
|
||||||
|
let key = BEU32::new(docid);
|
||||||
|
let base_obkv = self
|
||||||
|
.index
|
||||||
|
.documents
|
||||||
|
.remap_data_type::<heed::types::ByteSlice>()
|
||||||
|
.get(wtxn, &key)?
|
||||||
|
.ok_or(InternalError::DatabaseMissingEntry {
|
||||||
|
db_name: db_name::DOCUMENTS,
|
||||||
|
key: None,
|
||||||
|
})?;
|
||||||
|
|
||||||
|
self.original_sorter.insert(&docid.to_be_bytes(), base_obkv)?;
|
||||||
|
let buffer = self.flatten_from_fields_ids_map(KvReader::new(&base_obkv))?;
|
||||||
|
|
||||||
|
self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?;
|
||||||
|
} else {
|
||||||
|
self.new_documents_ids.insert(docid);
|
||||||
|
}
|
||||||
|
|
||||||
// We use the extracted/generated user id as the key for this document.
|
// We use the extracted/generated user id as the key for this document.
|
||||||
self.sorter.insert(&external_id.as_ref().as_bytes(), &obkv_buffer)?;
|
self.original_sorter.insert(&docid.to_be_bytes(), obkv_buffer.clone())?;
|
||||||
documents_count += 1;
|
documents_count += 1;
|
||||||
|
|
||||||
|
if let Some(flatten) = flattened_document {
|
||||||
|
self.flattened_sorter.insert(docid.to_be_bytes(), &flatten)?;
|
||||||
|
} else {
|
||||||
|
let buffer = self.flatten_from_fields_ids_map(KvReader::new(&obkv_buffer))?;
|
||||||
|
self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?;
|
||||||
|
}
|
||||||
|
|
||||||
progress_callback(UpdateIndexingStep::RemapDocumentAddition {
|
progress_callback(UpdateIndexingStep::RemapDocumentAddition {
|
||||||
documents_seen: documents_count,
|
documents_seen: documents_count,
|
||||||
});
|
});
|
||||||
|
|
||||||
obkv_buffer.clear();
|
|
||||||
field_buffer = drop_and_reuse(field_buffer_cache);
|
field_buffer = drop_and_reuse(field_buffer_cache);
|
||||||
external_id_buffer.clear();
|
external_id_buffer.clear();
|
||||||
|
obkv_buffer.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
progress_callback(UpdateIndexingStep::RemapDocumentAddition {
|
progress_callback(UpdateIndexingStep::RemapDocumentAddition {
|
||||||
documents_seen: documents_count,
|
documents_seen: documents_count,
|
||||||
});
|
});
|
||||||
|
|
||||||
self.index.put_fields_ids_map(wtxn, &fields_ids_map)?;
|
self.index.put_fields_ids_map(wtxn, &self.fields_ids_map)?;
|
||||||
self.index.put_primary_key(wtxn, &primary_key_name)?;
|
self.index.put_primary_key(wtxn, &primary_key_name)?;
|
||||||
self.documents_count += documents_count;
|
self.documents_count += documents_count;
|
||||||
// Now that we have a valid sorter that contains the user id and the obkv we
|
// Now that we have a valid sorter that contains the user id and the obkv we
|
||||||
@ -252,6 +326,87 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
Ok(documents_count)
|
Ok(documents_count)
|
||||||
}
|
}
|
||||||

    // Flatten a document from the fields ids map contained in self and insert the new
    // created fields.
    fn flatten_from_fields_ids_map(&mut self, obkv: KvReader<FieldId>) -> Result<Vec<u8>> {
        let mut doc = serde_json::Map::new();

        for (k, v) in obkv.iter() {
            let key = self.fields_ids_map.name(k).ok_or(FieldIdMapMissingEntry::FieldId {
                field_id: k,
                process: "Flatten from fields ids map.",
            })?;
            let value = serde_json::from_slice::<serde_json::Value>(v)
                .map_err(crate::error::InternalError::SerdeJson)?;
            doc.insert(key.to_string(), value);
        }

        let flattened = flatten_serde_json::flatten(&doc);

        // Once we have the flattened version we can convert it back to obkv and
        // insert all the new generated fields_ids (if any) in the fields ids map.
        let mut buffer: Vec<u8> = Vec::new();
        let mut writer = KvWriter::new(&mut buffer);
        let mut flattened: Vec<_> = flattened.into_iter().collect();
        // we reorder the field to get all the known field first
        flattened
            .sort_unstable_by_key(|(key, _)| self.fields_ids_map.id(&key).unwrap_or(FieldId::MAX));

        for (key, value) in flattened {
            let fid = self.fields_ids_map.insert(&key).ok_or(UserError::AttributeLimitReached)?;
            let value = serde_json::to_vec(&value).map_err(InternalError::SerdeJson)?;
            writer.insert(fid, &value)?;
        }

        Ok(buffer)
    }

    // Flatten a document from a field mapping generated by [create_fields_mapping]
    fn flatten_from_field_mapping(
        &mut self,
        mapping: &HashMap<FieldId, FieldId>,
        obkv: &KvReader<FieldId>,
        output_buffer: &mut Vec<u8>,
        field_buffer_cache: &mut Vec<(u16, Cow<[u8]>)>,
    ) -> Result<()> {
        // if the primary_key is nested we need to flatten the document before being able to do anything
        let mut doc = serde_json::Map::new();

        for (k, v) in obkv.iter() {
            let key =
                mapping.get(&k).ok_or(InternalError::FieldIdMappingMissingEntry { key: k })?;
            let key = self.fields_ids_map.name(*key).ok_or(FieldIdMapMissingEntry::FieldId {
                field_id: *key,
                process: "Flatten from field mapping.",
            })?;
            let value =
                serde_json::from_slice::<serde_json::Value>(v).map_err(InternalError::SerdeJson)?;
            doc.insert(key.to_string(), value);
        }

        let flattened = flatten_serde_json::flatten(&doc);

        // Once we have the flattened version we can convert it back to obkv and
        // insert all the new generated fields_ids (if any) in the fields ids map.
        output_buffer.clear();
        let mut writer = KvWriter::new(output_buffer);
        let mut flattened: Vec<_> = flattened.into_iter().collect();
        // we reorder the field to get all the known field first
        flattened
            .sort_unstable_by_key(|(key, _)| self.fields_ids_map.id(&key).unwrap_or(FieldId::MAX));

        for (key, value) in flattened {
            let fid = self.fields_ids_map.insert(&key).ok_or(UserError::AttributeLimitReached)?;
            let value = serde_json::to_vec(&value).map_err(InternalError::SerdeJson)?;
            writer.insert(fid, &value)?;
            if field_buffer_cache.iter().find(|(id, _)| *id == fid).is_none() {
                field_buffer_cache.push((fid, value.into()));
            }
        }

        Ok(())
    }
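Neither method above shows what `flatten_serde_json::flatten` actually produces; only its call sites are part of this diff. The sketch below, written against plain `serde_json`, illustrates the behaviour they depend on. It is an assumption-laden approximation: the dotted-key convention is inferred from the nested-field tests later in this change, and the real crate also has to deal with arrays and intermediate keys, which this sketch ignores.

```rust
use serde_json::{json, Map, Value};

// Illustrative only: turn nested objects into top-level keys joined by dots,
// e.g. { "person": { "age": 23 } } becomes { "person.age": 23 }.
fn flatten_sketch(prefix: Option<&str>, object: &Map<String, Value>, out: &mut Map<String, Value>) {
    for (key, value) in object {
        let key = match prefix {
            Some(prefix) => format!("{}.{}", prefix, key),
            None => key.clone(),
        };
        match value {
            // Nested objects are recursed into, everything else is kept as a leaf.
            Value::Object(map) => flatten_sketch(Some(&key), map, out),
            other => {
                out.insert(key, other.clone());
            }
        }
    }
}

fn main() {
    let doc = json!({ "id": 1, "person": { "name": "kevin", "age": 23 } });
    let mut flat = Map::new();
    flatten_sketch(None, doc.as_object().unwrap(), &mut flat);
    // The flattened map now exposes "id", "person.name" and "person.age",
    // which is what lets the transform register field ids for nested fields.
    assert!(flat.contains_key("person.age"));
}
```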
|
|
||||||
/// Generate the `TransformOutput` based on the given sorter that can be generated from any
|
/// Generate the `TransformOutput` based on the given sorter that can be generated from any
|
||||||
/// format like CSV, JSON or JSON stream. This sorter must contain a key that is the document
|
/// format like CSV, JSON or JSON stream. This sorter must contain a key that is the document
|
||||||
/// id for the user side and the value must be an obkv where keys are valid fields ids.
|
/// id for the user side and the value must be an obkv where keys are valid fields ids.
|
||||||
@ -268,110 +423,8 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
.primary_key(&wtxn)?
|
.primary_key(&wtxn)?
|
||||||
.ok_or(Error::UserError(UserError::MissingPrimaryKey))?
|
.ok_or(Error::UserError(UserError::MissingPrimaryKey))?
|
||||||
.to_string();
|
.to_string();
|
||||||
let fields_ids_map = self.index.fields_ids_map(wtxn)?;
|
|
||||||
let approximate_number_of_documents = self.documents_count;
|
|
||||||
|
|
||||||
let mut external_documents_ids = self.index.external_documents_ids(wtxn).unwrap();
|
let mut external_documents_ids = self.index.external_documents_ids(wtxn)?;
|
||||||
let documents_ids = self.index.documents_ids(wtxn)?;
|
|
||||||
let mut field_distribution = self.index.field_distribution(wtxn)?;
|
|
||||||
let mut available_documents_ids = AvailableDocumentsIds::from_documents_ids(&documents_ids);
|
|
||||||
|
|
||||||
// consume sorter, in order to free the internal allocation, before creating a new one.
|
|
||||||
let mut iter = self.sorter.into_stream_merger_iter()?;
|
|
||||||
|
|
||||||
// Once we have sort and deduplicated the documents we write them into a final file.
|
|
||||||
let mut final_sorter = create_sorter(
|
|
||||||
|_id, obkvs| {
|
|
||||||
if obkvs.len() == 1 {
|
|
||||||
Ok(obkvs[0].clone())
|
|
||||||
} else {
|
|
||||||
Err(InternalError::IndexingMergingKeys { process: "documents" }.into())
|
|
||||||
}
|
|
||||||
},
|
|
||||||
self.indexer_settings.chunk_compression_type,
|
|
||||||
self.indexer_settings.chunk_compression_level,
|
|
||||||
self.indexer_settings.max_nb_chunks,
|
|
||||||
self.indexer_settings.max_memory,
|
|
||||||
);
|
|
||||||
let mut new_external_documents_ids_builder = fst::MapBuilder::memory();
|
|
||||||
let mut replaced_documents_ids = RoaringBitmap::new();
|
|
||||||
let mut new_documents_ids = RoaringBitmap::new();
|
|
||||||
let mut obkv_buffer = Vec::new();
|
|
||||||
|
|
||||||
// While we write into final file we get or generate the internal documents ids.
|
|
||||||
let mut documents_count = 0;
|
|
||||||
while let Some((external_id, update_obkv)) = iter.next()? {
|
|
||||||
if self.indexer_settings.log_every_n.map_or(false, |len| documents_count % len == 0) {
|
|
||||||
progress_callback(UpdateIndexingStep::ComputeIdsAndMergeDocuments {
|
|
||||||
documents_seen: documents_count,
|
|
||||||
total_documents: approximate_number_of_documents,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
let (docid, obkv) = match external_documents_ids.get(external_id) {
|
|
||||||
Some(docid) => {
|
|
||||||
// If we find the user id in the current external documents ids map
|
|
||||||
// we use it and insert it in the list of replaced documents.
|
|
||||||
replaced_documents_ids.insert(docid);
|
|
||||||
|
|
||||||
let key = BEU32::new(docid);
|
|
||||||
let base_obkv = self.index.documents.get(wtxn, &key)?.ok_or(
|
|
||||||
InternalError::DatabaseMissingEntry {
|
|
||||||
db_name: db_name::DOCUMENTS,
|
|
||||||
key: None,
|
|
||||||
},
|
|
||||||
)?;
|
|
||||||
|
|
||||||
// we remove all the fields that were already counted
|
|
||||||
for (field_id, _) in base_obkv.iter() {
|
|
||||||
let field_name = fields_ids_map.name(field_id).unwrap();
|
|
||||||
if let Entry::Occupied(mut entry) =
|
|
||||||
field_distribution.entry(field_name.to_string())
|
|
||||||
{
|
|
||||||
match entry.get().checked_sub(1) {
|
|
||||||
Some(0) | None => entry.remove(),
|
|
||||||
Some(count) => entry.insert(count),
|
|
||||||
};
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Depending on the update indexing method we will merge
|
|
||||||
// the document update with the current document or not.
|
|
||||||
match self.index_documents_method {
|
|
||||||
IndexDocumentsMethod::ReplaceDocuments => (docid, update_obkv),
|
|
||||||
IndexDocumentsMethod::UpdateDocuments => {
|
|
||||||
let update_obkv = obkv::KvReader::new(update_obkv);
|
|
||||||
merge_two_obkvs(base_obkv, update_obkv, &mut obkv_buffer);
|
|
||||||
(docid, obkv_buffer.as_slice())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
None => {
|
|
||||||
// If this user id is new we add it to the external documents ids map
|
|
||||||
// for new ids and into the list of new documents.
|
|
||||||
let new_docid =
|
|
||||||
available_documents_ids.next().ok_or(UserError::DocumentLimitReached)?;
|
|
||||||
new_external_documents_ids_builder.insert(external_id, new_docid as u64)?;
|
|
||||||
new_documents_ids.insert(new_docid);
|
|
||||||
(new_docid, update_obkv)
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
// We insert the document under the documents ids map into the final file.
|
|
||||||
final_sorter.insert(docid.to_be_bytes(), obkv)?;
|
|
||||||
documents_count += 1;
|
|
||||||
|
|
||||||
let reader = obkv::KvReader::new(obkv);
|
|
||||||
for (field_id, _) in reader.iter() {
|
|
||||||
let field_name = fields_ids_map.name(field_id).unwrap();
|
|
||||||
*field_distribution.entry(field_name.to_string()).or_default() += 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
progress_callback(UpdateIndexingStep::ComputeIdsAndMergeDocuments {
|
|
||||||
documents_seen: documents_count,
|
|
||||||
total_documents: documents_count,
|
|
||||||
});
|
|
||||||
|
|
||||||
// We create a final writer to write the new documents in order from the sorter.
|
// We create a final writer to write the new documents in order from the sorter.
|
||||||
let mut writer = create_writer(
|
let mut writer = create_writer(
|
||||||
@ -380,28 +433,103 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
tempfile::tempfile()?,
|
tempfile::tempfile()?,
|
||||||
);
|
);
|
||||||

        // Once we have all the documents in the sorter, we write the documents
        // in the writer. We also generate the field distribution.
        let mut field_distribution = self.index.field_distribution(wtxn)?;
        let mut iter = self.original_sorter.into_stream_merger_iter()?;
        // used only for the callback
        let mut documents_count = 0;

        while let Some((key, val)) = iter.next()? {
            // send a callback to show at which step we are
            documents_count += 1;
            progress_callback(UpdateIndexingStep::ComputeIdsAndMergeDocuments {
                documents_seen: documents_count,
                total_documents: self.documents_count,
            });

            let u32_key = key.clone().read_u32::<byteorder::BigEndian>()?;
            // if the document was already in the db we remove all of its field
            // from the field distribution.
            if self.replaced_documents_ids.contains(u32_key) {
                let obkv = self.index.documents.get(wtxn, &BEU32::new(u32_key))?.ok_or(
                    InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None },
                )?;

                for (key, _) in obkv.iter() {
                    let name =
                        self.fields_ids_map.name(key).ok_or(FieldIdMapMissingEntry::FieldId {
                            field_id: key,
                            process: "Computing field distribution in transform.",
                        })?;
                    // We checked that the document was in the db earlier. If we can't find it it means
                    // there is an inconsistency between the field distribution and the field id map.
                    let field = field_distribution.get_mut(name).ok_or(
                        FieldIdMapMissingEntry::FieldId {
                            field_id: key,
                            process: "Accessing field distribution in transform.",
                        },
                    )?;
                    *field -= 1;
                    if *field == 0 {
                        // since we were able to get the field right before it's safe to unwrap here
                        field_distribution.remove(name).unwrap();
                    }
                }
            }

            // We increment all the field of the current document in the field distribution.
            let obkv = KvReader::new(val);

            for (key, _) in obkv.iter() {
                let name =
                    self.fields_ids_map.name(key).ok_or(FieldIdMapMissingEntry::FieldId {
                        field_id: key,
                        process: "Computing field distribution in transform.",
                    })?;
                *field_distribution.entry(name.to_string()).or_insert(0) += 1;
            }
            writer.insert(key, val)?;
        }

        let mut original_documents = writer.into_inner()?;
        // We then extract the file and reset the seek to be able to read it again.
        original_documents.seek(SeekFrom::Start(0))?;

        // We create a final writer to write the new documents in order from the sorter.
        let mut writer = create_writer(
            self.indexer_settings.chunk_compression_type,
            self.indexer_settings.chunk_compression_level,
            tempfile::tempfile()?,
        );
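The bookkeeping above boils down to a reference-counted map of field name to document count: the fields of a replaced document are decremented (and dropped when they reach zero) before the fields of the incoming version are incremented. A tiny self-contained sketch of that invariant, with invented field names and counts:

```rust
use std::collections::BTreeMap;

fn main() {
    let mut field_distribution: BTreeMap<String, u64> = BTreeMap::new();

    // Indexing a document that contains `id` and `person.age`.
    for field in ["id", "person.age"] {
        *field_distribution.entry(field.to_string()).or_insert(0) += 1;
    }

    // Replacing it with a version that no longer has `person.age`:
    // decrement the old fields first, dropping entries that reach zero...
    for field in ["id", "person.age"] {
        if let Some(count) = field_distribution.get_mut(field) {
            *count -= 1;
            if *count == 0 {
                field_distribution.remove(field);
            }
        }
    }
    // ...then increment the fields of the new version of the document.
    for field in ["id"] {
        *field_distribution.entry(field.to_string()).or_insert(0) += 1;
    }

    assert_eq!(field_distribution.get("id"), Some(&1));
    assert_eq!(field_distribution.get("person.age"), None);
}
```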
||||||
// Once we have written all the documents into the final sorter, we write the documents
|
// Once we have written all the documents into the final sorter, we write the documents
|
||||||
// into this writer, extract the file and reset the seek to be able to read it again.
|
// into this writer, extract the file and reset the seek to be able to read it again.
|
||||||
final_sorter.write_into_stream_writer(&mut writer)?;
|
self.flattened_sorter.write_into_stream_writer(&mut writer)?;
|
||||||
let mut documents_file = writer.into_inner()?;
|
let mut flattened_documents = writer.into_inner()?;
|
||||||
documents_file.seek(SeekFrom::Start(0))?;
|
flattened_documents.seek(SeekFrom::Start(0))?;
|
||||||
|
|
||||||
let before_docids_merging = Instant::now();
|
let mut new_external_documents_ids_builder: Vec<_> =
|
||||||
// We merge the new external ids with existing external documents ids.
|
self.new_external_documents_ids_builder.into_iter().collect();
|
||||||
let new_external_documents_ids = new_external_documents_ids_builder.into_map();
|
|
||||||
|
new_external_documents_ids_builder
|
||||||
|
.sort_unstable_by(|(left, _), (right, _)| left.cmp(&right));
|
||||||
|
let mut fst_new_external_documents_ids_builder = fst::MapBuilder::memory();
|
||||||
|
new_external_documents_ids_builder.into_iter().try_for_each(|(key, value)| {
|
||||||
|
fst_new_external_documents_ids_builder.insert(key, value)
|
||||||
|
})?;
|
||||||
|
let new_external_documents_ids = fst_new_external_documents_ids_builder.into_map();
|
||||||
external_documents_ids.insert_ids(&new_external_documents_ids)?;
|
external_documents_ids.insert_ids(&new_external_documents_ids)?;
|
||||||
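The extra sort right before building the map is not cosmetic: `fst::MapBuilder` only accepts keys in ascending lexicographic order and errors out otherwise, so the collected `(external id, docid)` pairs have to be ordered first. A minimal sketch of that constraint, with invented keys and ids:

```rust
use fst::MapBuilder;

fn main() -> Result<(), fst::Error> {
    let mut pairs = vec![("doc-b".as_bytes(), 2u64), ("doc-a".as_bytes(), 1u64)];
    // Without this sort, the second insert below would fail with an
    // out-of-order key error.
    pairs.sort_unstable_by(|(left, _), (right, _)| left.cmp(right));

    let mut builder = MapBuilder::memory();
    for (key, value) in pairs {
        builder.insert(key, value)?;
    }
    let map = builder.into_map();
    assert_eq!(map.get("doc-a"), Some(1));
    Ok(())
}
```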
|
|
||||||
info!("Documents external merging took {:.02?}", before_docids_merging.elapsed());
|
|
||||||
|
|
||||||
Ok(TransformOutput {
|
Ok(TransformOutput {
|
||||||
primary_key,
|
primary_key,
|
||||||
fields_ids_map,
|
fields_ids_map: self.fields_ids_map,
|
||||||
field_distribution,
|
field_distribution,
|
||||||
external_documents_ids: external_documents_ids.into_static(),
|
external_documents_ids: external_documents_ids.into_static(),
|
||||||
new_documents_ids,
|
new_documents_ids: self.new_documents_ids,
|
||||||
replaced_documents_ids,
|
replaced_documents_ids: self.replaced_documents_ids,
|
||||||
documents_count,
|
documents_count: self.documents_count,
|
||||||
documents_file,
|
original_documents,
|
||||||
|
flattened_documents,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -412,7 +540,7 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
self,
|
self,
|
||||||
wtxn: &mut heed::RwTxn,
|
wtxn: &mut heed::RwTxn,
|
||||||
old_fields_ids_map: FieldsIdsMap,
|
old_fields_ids_map: FieldsIdsMap,
|
||||||
new_fields_ids_map: FieldsIdsMap,
|
mut new_fields_ids_map: FieldsIdsMap,
|
||||||
) -> Result<TransformOutput> {
|
) -> Result<TransformOutput> {
|
||||||
// There already has been a document addition, the primary key should be set by now.
|
// There already has been a document addition, the primary key should be set by now.
|
||||||
let primary_key =
|
let primary_key =
|
||||||
@ -423,7 +551,14 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
let documents_count = documents_ids.len() as usize;
|
let documents_count = documents_ids.len() as usize;
|
||||||
|
|
||||||
// We create a final writer to write the new documents in order from the sorter.
|
// We create a final writer to write the new documents in order from the sorter.
|
||||||
let mut writer = create_writer(
|
let mut original_writer = create_writer(
|
||||||
|
self.indexer_settings.chunk_compression_type,
|
||||||
|
self.indexer_settings.chunk_compression_level,
|
||||||
|
tempfile::tempfile()?,
|
||||||
|
);
|
||||||
|
|
||||||
|
// We create a final writer to write the new documents in order from the sorter.
|
||||||
|
let mut flattened_writer = create_writer(
|
||||||
self.indexer_settings.chunk_compression_type,
|
self.indexer_settings.chunk_compression_type,
|
||||||
self.indexer_settings.chunk_compression_level,
|
self.indexer_settings.chunk_compression_level,
|
||||||
tempfile::tempfile()?,
|
tempfile::tempfile()?,
|
||||||
@ -445,13 +580,51 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
let buffer = obkv_writer.into_inner()?;
|
let buffer = obkv_writer.into_inner()?;
|
||||||
writer.insert(docid.to_be_bytes(), buffer)?;
|
original_writer.insert(docid.to_be_bytes(), &buffer)?;
|
||||||
|
|
||||||
|
// Once we have the document. We're going to flatten it
|
||||||
|
// and insert it in the flattened sorter.
|
||||||
|
let mut doc = serde_json::Map::new();
|
||||||
|
|
||||||
|
let reader = obkv::KvReader::new(buffer);
|
||||||
|
for (k, v) in reader.iter() {
|
||||||
|
let key = new_fields_ids_map.name(k).ok_or(FieldIdMapMissingEntry::FieldId {
|
||||||
|
field_id: k,
|
||||||
|
process: "Accessing field distribution in transform.",
|
||||||
|
})?;
|
||||||
|
let value = serde_json::from_slice::<serde_json::Value>(v)
|
||||||
|
.map_err(InternalError::SerdeJson)?;
|
||||||
|
doc.insert(key.to_string(), value);
|
||||||
|
}
|
||||||
|
|
||||||
|
let flattened = flatten_serde_json::flatten(&doc);
|
||||||
|
|
||||||
|
// Once we have the flattened version we can convert it back to obkv and
|
||||||
|
// insert all the new generated fields_ids (if any) in the fields ids map.
|
||||||
|
let mut buffer: Vec<u8> = Vec::new();
|
||||||
|
let mut writer = KvWriter::new(&mut buffer);
|
||||||
|
let mut flattened: Vec<_> = flattened.into_iter().collect();
|
||||||
|
// we reorder the field to get all the known field first
|
||||||
|
flattened.sort_unstable_by_key(|(key, _)| {
|
||||||
|
new_fields_ids_map.id(&key).unwrap_or(FieldId::MAX)
|
||||||
|
});
|
||||||
|
|
||||||
|
for (key, value) in flattened {
|
||||||
|
let fid =
|
||||||
|
new_fields_ids_map.insert(&key).ok_or(UserError::AttributeLimitReached)?;
|
||||||
|
let value = serde_json::to_vec(&value).map_err(InternalError::SerdeJson)?;
|
||||||
|
writer.insert(fid, &value)?;
|
||||||
|
}
|
||||||
|
flattened_writer.insert(docid.to_be_bytes(), &buffer)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Once we have written all the documents, we extract
|
// Once we have written all the documents, we extract
|
||||||
// the file and reset the seek to be able to read it again.
|
// the file and reset the seek to be able to read it again.
|
||||||
let mut documents_file = writer.into_inner()?;
|
let mut original_documents = original_writer.into_inner()?;
|
||||||
documents_file.seek(SeekFrom::Start(0))?;
|
original_documents.seek(SeekFrom::Start(0))?;
|
||||||
|
|
||||||
|
let mut flattened_documents = flattened_writer.into_inner()?;
|
||||||
|
flattened_documents.seek(SeekFrom::Start(0))?;
|
||||||
|
|
||||||
Ok(TransformOutput {
|
Ok(TransformOutput {
|
||||||
primary_key,
|
primary_key,
|
||||||
@ -461,7 +634,8 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
new_documents_ids: documents_ids,
|
new_documents_ids: documents_ids,
|
||||||
replaced_documents_ids: RoaringBitmap::default(),
|
replaced_documents_ids: RoaringBitmap::default(),
|
||||||
documents_count,
|
documents_count,
|
||||||
documents_file,
|
original_documents,
|
||||||
|
flattened_documents,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -521,11 +695,84 @@ fn drop_and_reuse<U, T>(mut vec: Vec<U>) -> Vec<T> {
|
|||||||
vec.into_iter().map(|_| unreachable!()).collect()
|
vec.into_iter().map(|_| unreachable!()).collect()
|
||||||
}
|
}
|
||||||

fn update_primary_key<'a>(
    document: KvReader<'a, FieldId>,
    addition_index: &DocumentsBatchIndex,
    primary_key_id: FieldId,
    primary_key_name: &str,
    uuid_buffer: &'a mut [u8; uuid::adapter::Hyphenated::LENGTH],
    field_buffer_cache: &mut Vec<(u16, Cow<'a, [u8]>)>,
    mut external_id_buffer: &'a mut Vec<u8>,
    autogenerate_docids: bool,
) -> Result<Cow<'a, str>> {
    match field_buffer_cache.iter_mut().find(|(id, _)| *id == primary_key_id) {
        Some((_, bytes)) => {
            let value = match serde_json::from_slice(bytes).map_err(InternalError::SerdeJson)? {
                Value::String(string) => match validate_document_id(&string) {
                    Some(s) if s.len() == string.len() => string,
                    Some(s) => s.to_string(),
                    None => {
                        return Err(UserError::InvalidDocumentId {
                            document_id: Value::String(string),
                        }
                        .into())
                    }
                },
                Value::Number(number) => number.to_string(),
                content => {
                    return Err(UserError::InvalidDocumentId { document_id: content.clone() }.into())
                }
            };
            serde_json::to_writer(external_id_buffer, &value).map_err(InternalError::SerdeJson)?;
            Ok(Cow::Owned(value))
        }
        None if autogenerate_docids => {
            let uuid = uuid::Uuid::new_v4().to_hyphenated().encode_lower(uuid_buffer);
            serde_json::to_writer(&mut external_id_buffer, &uuid)
                .map_err(InternalError::SerdeJson)?;
            field_buffer_cache.push((primary_key_id, external_id_buffer.as_slice().into()));
            Ok(Cow::Borrowed(&*uuid))
        }
        None => {
            let mut json = Map::new();
            for (key, value) in document.iter() {
                let key = addition_index.name(key).cloned();
                let value = serde_json::from_slice::<Value>(&value).ok();

                if let Some((k, v)) = key.zip(value) {
                    json.insert(k, v);
                }
            }

            Err(UserError::MissingDocumentId {
                primary_key: primary_key_name.to_string(),
                document: json,
            })?
        }
    }
}

impl TransformOutput {
    // find and insert the new field ids
    pub fn compute_real_facets(&self, rtxn: &RoTxn, index: &Index) -> Result<HashSet<String>> {
        let user_defined_facets = index.user_defined_faceted_fields(rtxn)?;

        Ok(self
            .fields_ids_map
            .names()
            .filter(|&field| crate::is_faceted(field, &user_defined_facets))
            .map(|field| field.to_string())
            .collect())
    }
}

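`compute_real_facets` leans on `crate::is_faceted`, whose implementation is not part of this hunk. Purely as a labelled assumption: with nested fields flattened to dot-separated names, such a predicate plausibly accepts a field when it is a user-defined facet or one of its dotted descendants, along the lines of the hypothetical helper below (the name and exact rules are not taken from this diff).

```rust
// Hypothetical stand-in for the predicate used by `compute_real_facets`:
// a flattened field matches a user-defined facet when it is the facet itself
// or one of its dot-separated descendants.
fn is_faceted_sketch(field: &str, facet: &str) -> bool {
    field == facet
        || (field.starts_with(facet) && field[facet.len()..].starts_with('.'))
}

fn main() {
    assert!(is_faceted_sketch("person.age", "person"));
    assert!(!is_faceted_sketch("person_count", "person"));
}
```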
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod test {
|
mod test {
|
||||||
use super::*;
|
use super::*;
|
||||||
|
|
||||||
mod compute_primary_key {
|
mod compute_primary_key {
|
||||||
|
use big_s::S;
|
||||||
|
|
||||||
use super::{compute_primary_key_pair, FieldsIdsMap};
|
use super::{compute_primary_key_pair, FieldsIdsMap};
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
@ -540,6 +787,18 @@ mod test {
|
|||||||
);
|
);
|
||||||
assert_eq!(result.unwrap(), (0, "toto".to_string()));
|
assert_eq!(result.unwrap(), (0, "toto".to_string()));
|
||||||
assert_eq!(fields_map.len(), 1);
|
assert_eq!(fields_map.len(), 1);
|
||||||
|
|
||||||
|
// and with nested fields
|
||||||
|
let mut fields_map = FieldsIdsMap::new();
|
||||||
|
fields_map.insert("toto.tata").unwrap();
|
||||||
|
let result = compute_primary_key_pair(
|
||||||
|
Some("toto.tata"),
|
||||||
|
&mut fields_map,
|
||||||
|
Some(S("titi")),
|
||||||
|
false,
|
||||||
|
);
|
||||||
|
assert_eq!(result.unwrap(), (0, "toto.tata".to_string()));
|
||||||
|
assert_eq!(fields_map.len(), 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
@ -547,7 +806,7 @@ mod test {
|
|||||||
let mut fields_map = FieldsIdsMap::new();
|
let mut fields_map = FieldsIdsMap::new();
|
||||||
let result =
|
let result =
|
||||||
compute_primary_key_pair(None, &mut fields_map, Some("tata".to_string()), false);
|
compute_primary_key_pair(None, &mut fields_map, Some("tata".to_string()), false);
|
||||||
assert_eq!(result.unwrap(), (0, "tata".to_string()));
|
assert_eq!(result.unwrap(), (0, S("tata")));
|
||||||
assert_eq!(fields_map.len(), 1);
|
assert_eq!(fields_map.len(), 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -555,7 +814,7 @@ mod test {
|
|||||||
fn should_return_default_if_both_are_none() {
|
fn should_return_default_if_both_are_none() {
|
||||||
let mut fields_map = FieldsIdsMap::new();
|
let mut fields_map = FieldsIdsMap::new();
|
||||||
let result = compute_primary_key_pair(None, &mut fields_map, None, true);
|
let result = compute_primary_key_pair(None, &mut fields_map, None, true);
|
||||||
assert_eq!(result.unwrap(), (0, "id".to_string()));
|
assert_eq!(result.unwrap(), (0, S("id")));
|
||||||
assert_eq!(fields_map.len(), 1);
|
assert_eq!(fields_map.len(), 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -569,6 +828,7 @@ mod test {
|
|||||||
}
|
}
|
||||||
|
|
||||||
mod primary_key_inference {
|
mod primary_key_inference {
|
||||||
|
use big_s::S;
|
||||||
use bimap::BiHashMap;
|
use bimap::BiHashMap;
|
||||||
|
|
||||||
use crate::documents::DocumentsBatchIndex;
|
use crate::documents::DocumentsBatchIndex;
|
||||||
@ -579,11 +839,11 @@ mod test {
|
|||||||
// We run the test multiple times to change the order in which the fields are iterated upon.
|
// We run the test multiple times to change the order in which the fields are iterated upon.
|
||||||
for _ in 1..50 {
|
for _ in 1..50 {
|
||||||
let mut map = BiHashMap::new();
|
let mut map = BiHashMap::new();
|
||||||
map.insert(1, "fakeId".to_string());
|
map.insert(1, S("fakeId"));
|
||||||
map.insert(2, "fakeId".to_string());
|
map.insert(2, S("fakeId"));
|
||||||
map.insert(3, "fakeId".to_string());
|
map.insert(3, S("fakeId"));
|
||||||
map.insert(4, "fakeId".to_string());
|
map.insert(4, S("fakeId"));
|
||||||
map.insert(0, "realId".to_string());
|
map.insert(0, S("realId"));
|
||||||
|
|
||||||
assert_eq!(find_primary_key(&DocumentsBatchIndex(map)), Some("realId"));
|
assert_eq!(find_primary_key(&DocumentsBatchIndex(map)), Some("realId"));
|
||||||
}
|
}
|
||||||
|
@ -249,11 +249,12 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
let transform = Transform::new(
|
let transform = Transform::new(
|
||||||
|
self.wtxn,
|
||||||
&self.index,
|
&self.index,
|
||||||
&self.indexer_config,
|
&self.indexer_config,
|
||||||
IndexDocumentsMethod::ReplaceDocuments,
|
IndexDocumentsMethod::ReplaceDocuments,
|
||||||
false,
|
false,
|
||||||
);
|
)?;
|
||||||
|
|
||||||
// We remap the documents fields based on the new `FieldsIdsMap`.
|
// We remap the documents fields based on the new `FieldsIdsMap`.
|
||||||
let output = transform.remap_index_documents(
|
let output = transform.remap_index_documents(
|
||||||
@ -262,6 +263,9 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
|||||||
fields_ids_map.clone(),
|
fields_ids_map.clone(),
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
|
let new_facets = output.compute_real_facets(self.wtxn, self.index)?;
|
||||||
|
self.index.put_faceted_fields(self.wtxn, &new_facets)?;
|
||||||
|
|
||||||
// We clear the full database (words-fst, documents ids and documents content).
|
// We clear the full database (words-fst, documents ids and documents content).
|
||||||
ClearDocuments::new(self.wtxn, self.index).execute()?;
|
ClearDocuments::new(self.wtxn, self.index).execute()?;
|
||||||
|
|
||||||
@ -273,7 +277,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
|||||||
&self.indexer_config,
|
&self.indexer_config,
|
||||||
IndexDocumentsConfig::default(),
|
IndexDocumentsConfig::default(),
|
||||||
&cb,
|
&cb,
|
||||||
);
|
)?;
|
||||||
indexing_builder.execute_raw(output)?;
|
indexing_builder.execute_raw(output)?;
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
@ -583,7 +587,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
|||||||
{
|
{
|
||||||
self.index.set_updated_at(self.wtxn, &OffsetDateTime::now_utc())?;
|
self.index.set_updated_at(self.wtxn, &OffsetDateTime::now_utc())?;
|
||||||
|
|
||||||
let old_faceted_fields = self.index.faceted_fields(&self.wtxn)?;
|
let old_faceted_fields = self.index.user_defined_faceted_fields(&self.wtxn)?;
|
||||||
let old_fields_ids_map = self.index.fields_ids_map(&self.wtxn)?;
|
let old_fields_ids_map = self.index.fields_ids_map(&self.wtxn)?;
|
||||||
|
|
||||||
self.update_displayed()?;
|
self.update_displayed()?;
|
||||||
@ -599,7 +603,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
|||||||
// If there is new faceted fields we indicate that we must reindex as we must
|
// If there is new faceted fields we indicate that we must reindex as we must
|
||||||
// index new fields as facets. It means that the distinct attribute,
|
// index new fields as facets. It means that the distinct attribute,
|
||||||
// an Asc/Desc criterion or a filtered attribute as be added or removed.
|
// an Asc/Desc criterion or a filtered attribute as be added or removed.
|
||||||
let new_faceted_fields = self.index.faceted_fields(&self.wtxn)?;
|
let new_faceted_fields = self.index.user_defined_faceted_fields(&self.wtxn)?;
|
||||||
let faceted_updated = old_faceted_fields != new_faceted_fields;
|
let faceted_updated = old_faceted_fields != new_faceted_fields;
|
||||||
|
|
||||||
let stop_words_updated = self.update_stop_words()?;
|
let stop_words_updated = self.update_stop_words()?;
|
||||||
@ -651,7 +655,8 @@ mod tests {
|
|||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut builder =
|
let mut builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ());
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
builder.add_documents(content).unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
@ -713,7 +718,8 @@ mod tests {
|
|||||||
let indexing_config =
|
let indexing_config =
|
||||||
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
||||||
let mut builder =
|
let mut builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ());
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
builder.add_documents(content).unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
@ -764,7 +770,8 @@ mod tests {
|
|||||||
let indexing_config =
|
let indexing_config =
|
||||||
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
||||||
let mut builder =
|
let mut builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ());
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
builder.add_documents(content).unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
@ -793,7 +800,8 @@ mod tests {
|
|||||||
let indexing_config =
|
let indexing_config =
|
||||||
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
||||||
let mut builder =
|
let mut builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ());
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
builder.add_documents(content).unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
|
|
||||||
@ -846,7 +854,8 @@ mod tests {
|
|||||||
let indexing_config =
|
let indexing_config =
|
||||||
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
||||||
let mut builder =
|
let mut builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ());
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
builder.add_documents(content).unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
@ -858,7 +867,6 @@ mod tests {
|
|||||||
// Only count the field_id 0 and level 0 facet values.
|
// Only count the field_id 0 and level 0 facet values.
|
||||||
// TODO we must support typed CSVs for numbers to be understood.
|
// TODO we must support typed CSVs for numbers to be understood.
|
||||||
let fidmap = index.fields_ids_map(&rtxn).unwrap();
|
let fidmap = index.fields_ids_map(&rtxn).unwrap();
|
||||||
println!("fidmap: {:?}", fidmap);
|
|
||||||
for document in index.all_documents(&rtxn).unwrap() {
|
for document in index.all_documents(&rtxn).unwrap() {
|
||||||
let document = document.unwrap();
|
let document = document.unwrap();
|
||||||
let json = crate::obkv_to_json(&fidmap.ids().collect::<Vec<_>>(), &fidmap, document.1)
|
let json = crate::obkv_to_json(&fidmap.ids().collect::<Vec<_>>(), &fidmap, document.1)
|
||||||
@ -886,7 +894,8 @@ mod tests {
|
|||||||
let indexing_config =
|
let indexing_config =
|
||||||
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
||||||
let mut builder =
|
let mut builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ());
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
builder.add_documents(content).unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
@ -927,7 +936,8 @@ mod tests {
|
|||||||
let indexing_config =
|
let indexing_config =
|
||||||
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
||||||
let mut builder =
|
let mut builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ());
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
builder.add_documents(content).unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
@ -977,7 +987,51 @@ mod tests {
|
|||||||
let indexing_config =
|
let indexing_config =
|
||||||
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
||||||
let mut builder =
|
let mut builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ());
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
        .unwrap();
    builder.add_documents(content).unwrap();
    builder.execute().unwrap();
    wtxn.commit().unwrap();

    // Run an empty query just to ensure that the search results are ordered.
    let rtxn = index.read_txn().unwrap();
    let SearchResult { documents_ids, .. } = index.search(&rtxn).execute().unwrap();

    // There must be at least one document with a 34 as the age.
    assert_eq!(documents_ids.len(), 3);
}

#[test]
fn set_nested_distinct_field() {
    let path = tempfile::tempdir().unwrap();
    let mut options = EnvOpenOptions::new();
    options.map_size(10 * 1024 * 1024); // 10 MB
    let index = Index::new(options, &path).unwrap();
    let config = IndexerConfig::default();

    // Set the filterable fields to be the age.
    let mut wtxn = index.write_txn().unwrap();
    let mut builder = Settings::new(&mut wtxn, &index, &config);
    // Don't display the generated `id` field.
    builder.set_displayed_fields(vec![S("person")]);
    builder.set_distinct_field(S("person.age"));
    builder.execute(|_| ()).unwrap();

    // Then index some documents.
    let content = documents!([
        { "person": { "name": "kevin", "age": 23 }},
        { "person": { "name": "kevina", "age": 21 }},
        { "person": { "name": "benoit", "age": 34 }},
        { "person": { "name": "bernard", "age": 34 }},
        { "person": { "name": "bertrand", "age": 34 }},
        { "person": { "name": "bernie", "age": 34 }},
        { "person": { "name": "ben", "age": 34 }}
    ]);
    let indexing_config =
        IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
    let mut builder =
        IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
            .unwrap();
builder.add_documents(content).unwrap();
|
builder.add_documents(content).unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
@ -1008,7 +1062,8 @@ mod tests {
|
|||||||
let indexing_config =
|
let indexing_config =
|
||||||
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
||||||
let mut builder =
|
let mut builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ());
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
builder.add_documents(content).unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
@ -1037,7 +1092,8 @@ mod tests {
|
|||||||
let indexing_config =
|
let indexing_config =
|
||||||
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
||||||
let mut builder =
|
let mut builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ());
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
builder.add_documents(content).unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
|
|
||||||
@ -1115,7 +1171,8 @@ mod tests {
|
|||||||
let indexing_config =
|
let indexing_config =
|
||||||
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
||||||
let mut builder =
|
let mut builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ());
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
builder.add_documents(content).unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
|
|
||||||
@ -1252,7 +1309,8 @@ mod tests {
|
|||||||
let indexing_config =
|
let indexing_config =
|
||||||
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
||||||
let mut builder =
|
let mut builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ());
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
builder.add_documents(content).unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
@ -1314,7 +1372,8 @@ mod tests {
|
|||||||
let indexing_config =
|
let indexing_config =
|
||||||
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
||||||
let mut builder =
|
let mut builder =
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ());
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
|
.unwrap();
|
||||||
builder.add_documents(content).unwrap();
|
builder.add_documents(content).unwrap();
|
||||||
builder.execute().unwrap();
|
builder.execute().unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
@ -59,7 +59,8 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
|
|||||||
let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() };
|
let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() };
|
||||||
let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
||||||
|
|
||||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ());
|
let mut builder =
|
||||||
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
|
||||||
let mut cursor = Cursor::new(Vec::new());
|
let mut cursor = Cursor::new(Vec::new());
|
||||||
let mut documents_builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
|
let mut documents_builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
|
||||||
let reader = Cursor::new(CONTENT.as_bytes());
|
let reader = Cursor::new(CONTENT.as_bytes());
|
||||||
|
@ -390,7 +390,8 @@ fn criteria_ascdesc() {
|
|||||||
// index documents
|
// index documents
|
||||||
let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() };
|
let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() };
|
||||||
let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
||||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ());
|
let mut builder =
|
||||||
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
|
||||||
|
|
||||||
let mut cursor = Cursor::new(Vec::new());
|
let mut cursor = Cursor::new(Vec::new());
|
||||||
let mut batch_builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
|
let mut batch_builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
|
||||||
|
@ -130,7 +130,8 @@ fn test_typo_disabled_on_word() {
|
|||||||
let mut txn = index.write_txn().unwrap();
|
let mut txn = index.write_txn().unwrap();
|
||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
let mut builder = IndexDocuments::new(&mut txn, &index, &config, indexing_config, |_| ());
|
let mut builder =
|
||||||
|
IndexDocuments::new(&mut txn, &index, &config, indexing_config, |_| ()).unwrap();
|
||||||
|
|
||||||
builder.add_documents(documents).unwrap();
|
builder.add_documents(documents).unwrap();
|
||||||
|
|
||||||
|