diff --git a/.github/workflows/benchmarks-pr.yml b/.github/workflows/benchmarks-pr.yml
index aa784296a..30baa294e 100644
--- a/.github/workflows/benchmarks-pr.yml
+++ b/.github/workflows/benchmarks-pr.yml
@@ -90,7 +90,8 @@ jobs:
set -x
export base_ref=$(git merge-base origin/main ${{ steps.comment-branch.outputs.head_ref }} | head -c8)
export base_filename=$(echo ${{ steps.command.outputs.command-arguments }}_main_${base_ref}.json)
- echo 'Here are your benchmarks diff 👊' >> body.txt
+ export bench_name=$(echo ${{ steps.command.outputs.command-arguments }})
+ echo "Here are your $bench_name benchmarks diff 👊" >> body.txt
echo '```' >> body.txt
./benchmarks/scripts/compare.sh $base_filename ${{ steps.file.outputs.basename }}.json >> body.txt
echo '```' >> body.txt
diff --git a/.github/workflows/publish-apt-brew-pkg.yml b/.github/workflows/publish-apt-brew-pkg.yml
index 452776e38..11893bae0 100644
--- a/.github/workflows/publish-apt-brew-pkg.yml
+++ b/.github/workflows/publish-apt-brew-pkg.yml
@@ -50,7 +50,7 @@ jobs:
needs: check-version
steps:
- name: Create PR to Homebrew
- uses: mislav/bump-homebrew-formula-action@v2
+ uses: mislav/bump-homebrew-formula-action@v3
with:
formula-name: meilisearch
formula-path: Formula/m/meilisearch.rb
diff --git a/.github/workflows/publish-docker-images.yml b/.github/workflows/publish-docker-images.yml
index 051fb105d..1ee8ba4d0 100644
--- a/.github/workflows/publish-docker-images.yml
+++ b/.github/workflows/publish-docker-images.yml
@@ -63,7 +63,7 @@ jobs:
uses: docker/setup-buildx-action@v3
- name: Login to Docker Hub
- uses: docker/login-action@v2
+ uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}
diff --git a/.github/workflows/sdks-tests.yml b/.github/workflows/sdks-tests.yml
index 05cf6b91c..7b6ea74de 100644
--- a/.github/workflows/sdks-tests.yml
+++ b/.github/workflows/sdks-tests.yml
@@ -160,7 +160,7 @@ jobs:
with:
repository: meilisearch/meilisearch-js
- name: Setup node
- uses: actions/setup-node@v3
+ uses: actions/setup-node@v4
with:
cache: 'yarn'
- name: Install dependencies
@@ -318,7 +318,7 @@ jobs:
with:
repository: meilisearch/meilisearch-js-plugins
- name: Setup node
- uses: actions/setup-node@v3
+ uses: actions/setup-node@v4
with:
cache: yarn
- name: Install dependencies
diff --git a/.github/workflows/test-suite.yml b/.github/workflows/test-suite.yml
index a44a843fe..ed9cafa79 100644
--- a/.github/workflows/test-suite.yml
+++ b/.github/workflows/test-suite.yml
@@ -43,7 +43,7 @@ jobs:
toolchain: nightly
override: true
- name: Cache dependencies
- uses: Swatinem/rust-cache@v2.6.2
+ uses: Swatinem/rust-cache@v2.7.1
- name: Run cargo check without any default features
uses: actions-rs/cargo@v1
with:
@@ -65,7 +65,7 @@ jobs:
steps:
- uses: actions/checkout@v3
- name: Cache dependencies
- uses: Swatinem/rust-cache@v2.6.2
+ uses: Swatinem/rust-cache@v2.7.1
- name: Run cargo check without any default features
uses: actions-rs/cargo@v1
with:
@@ -149,7 +149,7 @@ jobs:
toolchain: stable
override: true
- name: Cache dependencies
- uses: Swatinem/rust-cache@v2.6.2
+ uses: Swatinem/rust-cache@v2.7.1
- name: Run tests in debug
uses: actions-rs/cargo@v1
with:
@@ -168,7 +168,7 @@ jobs:
override: true
components: clippy
- name: Cache dependencies
- uses: Swatinem/rust-cache@v2.6.2
+ uses: Swatinem/rust-cache@v2.7.1
- name: Run cargo clippy
uses: actions-rs/cargo@v1
with:
@@ -187,7 +187,7 @@ jobs:
override: true
components: rustfmt
- name: Cache dependencies
- uses: Swatinem/rust-cache@v2.6.2
+ uses: Swatinem/rust-cache@v2.7.1
- name: Run cargo fmt
# Since we never ran the `build.rs` script in the benchmark directory we are missing one auto-generated import file.
# Since we want to trigger (and fail) this action as fast as possible, instead of building the benchmark crate
diff --git a/Cargo.lock b/Cargo.lock
index 017257512..75d8463e7 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1731,12 +1731,13 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b"
[[package]]
name = "grenad"
-version = "0.4.4"
+version = "0.4.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5232b2d157b7bf63d7abe1b12177039e58db2f29e377517c0cdee1578cca4c93"
+checksum = "6a007932af5475ebb5c63bef8812bb1c36f317983bb4ca663e9d6dd58d6a0f8c"
dependencies = [
"bytemuck",
"byteorder",
+ "rayon",
"tempfile",
]
@@ -3281,6 +3282,7 @@ dependencies = [
"logging_timer",
"maplit",
"md5",
+ "meili-snap",
"memmap2",
"mimalloc",
"obkv",
@@ -3443,9 +3445,9 @@ dependencies = [
[[package]]
name = "obkv"
-version = "0.2.0"
+version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f69e48cd7c8e5bb52a1da1287fdbfd877c32673176583ce664cd63b201aba385"
+checksum = "6c459142426056c639ff88d053ebaaaeca0ee1411c94362892398ef4ccd81080"
[[package]]
name = "once_cell"
diff --git a/README.md b/README.md
index cb9475dea..88621729d 100644
--- a/README.md
+++ b/README.md
@@ -25,12 +25,6 @@
⚡ A lightning-fast search engine that fits effortlessly into your apps, websites, and workflow 🔍
----
-
-### 🔥 On November 2nd, we are hosting our first-ever live demo and product updates for [Meilisearch Cloud](https://www.meilisearch.com/cloud?utm_campaign=oss&utm_source=github&utm_medium=meilisearch). Make sure to [register here](https://us06web.zoom.us/meeting/register/tZMlc-mqrjIsH912-HTRe-AaT-pp41bDe81a#/registration) and bring your questions for live Q&A!
-
----
-
Meilisearch helps you shape a delightful search experience in a snap, offering features that work out-of-the-box to speed up your workflow.
diff --git a/benchmarks/benches/indexing.rs b/benchmarks/benches/indexing.rs
index 9446c0b0f..65f581b93 100644
--- a/benchmarks/benches/indexing.rs
+++ b/benchmarks/benches/indexing.rs
@@ -6,9 +6,7 @@ use std::path::Path;
use criterion::{criterion_group, criterion_main, Criterion};
use milli::heed::{EnvOpenOptions, RwTxn};
-use milli::update::{
- DeleteDocuments, IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings,
-};
+use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
use milli::Index;
use rand::seq::SliceRandom;
use rand_chacha::rand_core::SeedableRng;
@@ -266,17 +264,7 @@ fn deleting_songs_in_batches_default(c: &mut Criterion) {
(index, document_ids_to_delete)
},
move |(index, document_ids_to_delete)| {
- let mut wtxn = index.write_txn().unwrap();
-
- for ids in document_ids_to_delete {
- let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
- builder.delete_documents(&ids);
- builder.execute().unwrap();
- }
-
- wtxn.commit().unwrap();
-
- index.prepare_for_closing().wait();
+ delete_documents_from_ids(index, document_ids_to_delete)
},
)
});
@@ -613,17 +601,7 @@ fn deleting_wiki_in_batches_default(c: &mut Criterion) {
(index, document_ids_to_delete)
},
move |(index, document_ids_to_delete)| {
- let mut wtxn = index.write_txn().unwrap();
-
- for ids in document_ids_to_delete {
- let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
- builder.delete_documents(&ids);
- builder.execute().unwrap();
- }
-
- wtxn.commit().unwrap();
-
- index.prepare_for_closing().wait();
+ delete_documents_from_ids(index, document_ids_to_delete)
},
)
});
@@ -875,22 +853,31 @@ fn deleting_movies_in_batches_default(c: &mut Criterion) {
(index, document_ids_to_delete)
},
move |(index, document_ids_to_delete)| {
- let mut wtxn = index.write_txn().unwrap();
-
- for ids in document_ids_to_delete {
- let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
- builder.delete_documents(&ids);
- builder.execute().unwrap();
- }
-
- wtxn.commit().unwrap();
-
- index.prepare_for_closing().wait();
+ delete_documents_from_ids(index, document_ids_to_delete)
},
)
});
}
+fn delete_documents_from_ids(index: Index, document_ids_to_delete: Vec<RoaringBitmap>) {
+ let mut wtxn = index.write_txn().unwrap();
+
+ let indexer_config = IndexerConfig::default();
+ for ids in document_ids_to_delete {
+ let config = IndexDocumentsConfig::default();
+
+ let mut builder =
+ IndexDocuments::new(&mut wtxn, &index, &indexer_config, config, |_| (), || false)
+ .unwrap();
+ (builder, _) = builder.remove_documents_from_db_no_batch(&ids).unwrap();
+ builder.execute().unwrap();
+ }
+
+ wtxn.commit().unwrap();
+
+ index.prepare_for_closing().wait();
+}
+
fn indexing_movies_in_three_batches(c: &mut Criterion) {
let mut group = c.benchmark_group("indexing");
group.sample_size(BENCHMARK_ITERATION);
@@ -1112,17 +1099,7 @@ fn deleting_nested_movies_in_batches_default(c: &mut Criterion) {
(index, document_ids_to_delete)
},
move |(index, document_ids_to_delete)| {
- let mut wtxn = index.write_txn().unwrap();
-
- for ids in document_ids_to_delete {
- let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
- builder.delete_documents(&ids);
- builder.execute().unwrap();
- }
-
- wtxn.commit().unwrap();
-
- index.prepare_for_closing().wait();
+ delete_documents_from_ids(index, document_ids_to_delete)
},
)
});
@@ -1338,17 +1315,7 @@ fn deleting_geo_in_batches_default(c: &mut Criterion) {
(index, document_ids_to_delete)
},
move |(index, document_ids_to_delete)| {
- let mut wtxn = index.write_txn().unwrap();
-
- for ids in document_ids_to_delete {
- let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
- builder.delete_documents(&ids);
- builder.execute().unwrap();
- }
-
- wtxn.commit().unwrap();
-
- index.prepare_for_closing().wait();
+ delete_documents_from_ids(index, document_ids_to_delete)
},
)
});
diff --git a/dump/src/reader/mod.rs b/dump/src/reader/mod.rs
index af02888d2..603c557d6 100644
--- a/dump/src/reader/mod.rs
+++ b/dump/src/reader/mod.rs
@@ -526,12 +526,12 @@ pub(crate) mod test {
assert!(indexes.is_empty());
// products
- insta::assert_json_snapshot!(products.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
+ insta::assert_json_snapshot!(products.metadata(), @r###"
{
"uid": "products",
"primaryKey": "sku",
- "createdAt": "[now]",
- "updatedAt": "[now]"
+ "createdAt": "2022-10-09T20:27:22.688964637Z",
+ "updatedAt": "2022-10-09T20:27:23.951017769Z"
}
"###);
@@ -541,12 +541,12 @@ pub(crate) mod test {
meili_snap::snapshot_hash!(format!("{:#?}", documents), @"548284a84de510f71e88e6cdea495cf5");
// movies
- insta::assert_json_snapshot!(movies.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
+ insta::assert_json_snapshot!(movies.metadata(), @r###"
{
"uid": "movies",
"primaryKey": "id",
- "createdAt": "[now]",
- "updatedAt": "[now]"
+ "createdAt": "2022-10-09T20:27:22.197788495Z",
+ "updatedAt": "2022-10-09T20:28:01.93111053Z"
}
"###);
@@ -571,12 +571,12 @@ pub(crate) mod test {
meili_snap::snapshot_hash!(format!("{:#?}", documents), @"d751713988987e9331980363e24189ce");
// spells
- insta::assert_json_snapshot!(spells.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
+ insta::assert_json_snapshot!(spells.metadata(), @r###"
{
"uid": "dnd_spells",
"primaryKey": "index",
- "createdAt": "[now]",
- "updatedAt": "[now]"
+ "createdAt": "2022-10-09T20:27:24.242683494Z",
+ "updatedAt": "2022-10-09T20:27:24.312809641Z"
}
"###);
@@ -617,12 +617,12 @@ pub(crate) mod test {
assert!(indexes.is_empty());
// products
- insta::assert_json_snapshot!(products.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
+ insta::assert_json_snapshot!(products.metadata(), @r###"
{
"uid": "products",
"primaryKey": "sku",
- "createdAt": "[now]",
- "updatedAt": "[now]"
+ "createdAt": "2023-01-30T16:25:56.595257Z",
+ "updatedAt": "2023-01-30T16:25:58.70348Z"
}
"###);
@@ -632,12 +632,12 @@ pub(crate) mod test {
meili_snap::snapshot_hash!(format!("{:#?}", documents), @"548284a84de510f71e88e6cdea495cf5");
// movies
- insta::assert_json_snapshot!(movies.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
+ insta::assert_json_snapshot!(movies.metadata(), @r###"
{
"uid": "movies",
"primaryKey": "id",
- "createdAt": "[now]",
- "updatedAt": "[now]"
+ "createdAt": "2023-01-30T16:25:56.192178Z",
+ "updatedAt": "2023-01-30T16:25:56.455714Z"
}
"###);
@@ -647,12 +647,12 @@ pub(crate) mod test {
meili_snap::snapshot_hash!(format!("{:#?}", documents), @"0227598af846e574139ee0b80e03a720");
// spells
- insta::assert_json_snapshot!(spells.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
+ insta::assert_json_snapshot!(spells.metadata(), @r###"
{
"uid": "dnd_spells",
"primaryKey": "index",
- "createdAt": "[now]",
- "updatedAt": "[now]"
+ "createdAt": "2023-01-30T16:25:58.876405Z",
+ "updatedAt": "2023-01-30T16:25:59.079906Z"
}
"###);
diff --git a/dump/src/reader/snapshots/dump__reader__test__import_dump_v1-11.snap b/dump/src/reader/snapshots/dump__reader__test__import_dump_v1-11.snap
deleted file mode 100644
index 92fc61d72..000000000
--- a/dump/src/reader/snapshots/dump__reader__test__import_dump_v1-11.snap
+++ /dev/null
@@ -1,24 +0,0 @@
----
-source: dump/src/reader/mod.rs
-expression: spells.settings().unwrap()
----
-{
- "displayedAttributes": [
- "*"
- ],
- "searchableAttributes": [
- "*"
- ],
- "filterableAttributes": [],
- "sortableAttributes": [],
- "rankingRules": [
- "typo",
- "words",
- "proximity",
- "attribute",
- "exactness"
- ],
- "stopWords": [],
- "synonyms": {},
- "distinctAttribute": null
-}
diff --git a/dump/src/reader/snapshots/dump__reader__test__import_dump_v1-5.snap b/dump/src/reader/snapshots/dump__reader__test__import_dump_v1-5.snap
deleted file mode 100644
index b0b54c136..000000000
--- a/dump/src/reader/snapshots/dump__reader__test__import_dump_v1-5.snap
+++ /dev/null
@@ -1,38 +0,0 @@
----
-source: dump/src/reader/mod.rs
-expression: products.settings().unwrap()
----
-{
- "displayedAttributes": [
- "*"
- ],
- "searchableAttributes": [
- "*"
- ],
- "filterableAttributes": [],
- "sortableAttributes": [],
- "rankingRules": [
- "typo",
- "words",
- "proximity",
- "attribute",
- "exactness"
- ],
- "stopWords": [],
- "synonyms": {
- "android": [
- "phone",
- "smartphone"
- ],
- "iphone": [
- "phone",
- "smartphone"
- ],
- "phone": [
- "android",
- "iphone",
- "smartphone"
- ]
- },
- "distinctAttribute": null
-}
diff --git a/dump/src/reader/snapshots/dump__reader__test__import_dump_v1-8.snap b/dump/src/reader/snapshots/dump__reader__test__import_dump_v1-8.snap
deleted file mode 100644
index 5c12a0438..000000000
--- a/dump/src/reader/snapshots/dump__reader__test__import_dump_v1-8.snap
+++ /dev/null
@@ -1,31 +0,0 @@
----
-source: dump/src/reader/mod.rs
-expression: movies.settings().unwrap()
----
-{
- "displayedAttributes": [
- "*"
- ],
- "searchableAttributes": [
- "*"
- ],
- "filterableAttributes": [
- "genres",
- "id"
- ],
- "sortableAttributes": [
- "genres",
- "id"
- ],
- "rankingRules": [
- "typo",
- "words",
- "proximity",
- "attribute",
- "exactness",
- "release_date:asc"
- ],
- "stopWords": [],
- "synonyms": {},
- "distinctAttribute": null
-}
diff --git a/dump/src/reader/v2/mod.rs b/dump/src/reader/v2/mod.rs
index 4016e6341..a0ff13a3b 100644
--- a/dump/src/reader/v2/mod.rs
+++ b/dump/src/reader/v2/mod.rs
@@ -46,6 +46,7 @@ pub type Checked = settings::Checked;
pub type Unchecked = settings::Unchecked;
pub type Task = updates::UpdateEntry;
+pub type Kind = updates::UpdateMeta;
// everything related to the errors
pub type ResponseError = errors::ResponseError;
@@ -107,8 +108,11 @@ impl V2Reader {
pub fn indexes(&self) -> Result<impl Iterator<Item = Result<V2IndexReader>> + '_> {
Ok(self.index_uuid.iter().map(|index| -> Result<_> {
V2IndexReader::new(
- index.uid.clone(),
&self.dump.path().join("indexes").join(format!("index-{}", index.uuid)),
+ index,
+ BufReader::new(
+ File::open(self.dump.path().join("updates").join("data.jsonl")).unwrap(),
+ ),
)
}))
}
@@ -143,16 +147,41 @@ pub struct V2IndexReader {
}
impl V2IndexReader {
- pub fn new(name: String, path: &Path) -> Result<Self> {
+ pub fn new(path: &Path, index_uuid: &IndexUuid, tasks: BufReader<File>) -> Result<Self> {
let meta = File::open(path.join("meta.json"))?;
let meta: DumpMeta = serde_json::from_reader(meta)?;
+ let mut created_at = None;
+ let mut updated_at = None;
+
+ for line in tasks.lines() {
+ let task: Task = serde_json::from_str(&line?)?;
+ if !(task.uuid == index_uuid.uuid && task.is_finished()) {
+ continue;
+ }
+
+ let new_created_at = match task.update.meta() {
+ Kind::DocumentsAddition { .. } | Kind::Settings(_) => task.update.finished_at(),
+ _ => None,
+ };
+ let new_updated_at = task.update.finished_at();
+
+ if created_at.is_none() || created_at > new_created_at {
+ created_at = new_created_at;
+ }
+
+ if updated_at.is_none() || updated_at < new_updated_at {
+ updated_at = new_updated_at;
+ }
+ }
+
+ let current_time = OffsetDateTime::now_utc();
+
let metadata = IndexMetadata {
- uid: name,
+ uid: index_uuid.uid.clone(),
primary_key: meta.primary_key,
- // FIXME: Iterate over the whole task queue to find the creation and last update date.
- created_at: OffsetDateTime::now_utc(),
- updated_at: OffsetDateTime::now_utc(),
+ created_at: created_at.unwrap_or(current_time),
+ updated_at: updated_at.unwrap_or(current_time),
};
let ret = V2IndexReader {
@@ -248,12 +277,12 @@ pub(crate) mod test {
assert!(indexes.is_empty());
// products
- insta::assert_json_snapshot!(products.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
+ insta::assert_json_snapshot!(products.metadata(), @r###"
{
"uid": "products",
"primaryKey": "sku",
- "createdAt": "[now]",
- "updatedAt": "[now]"
+ "createdAt": "2022-10-09T20:27:22.688964637Z",
+ "updatedAt": "2022-10-09T20:27:23.951017769Z"
}
"###);
@@ -263,12 +292,12 @@ pub(crate) mod test {
meili_snap::snapshot_hash!(format!("{:#?}", documents), @"548284a84de510f71e88e6cdea495cf5");
// movies
- insta::assert_json_snapshot!(movies.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
+ insta::assert_json_snapshot!(movies.metadata(), @r###"
{
"uid": "movies",
"primaryKey": "id",
- "createdAt": "[now]",
- "updatedAt": "[now]"
+ "createdAt": "2022-10-09T20:27:22.197788495Z",
+ "updatedAt": "2022-10-09T20:28:01.93111053Z"
}
"###);
@@ -293,12 +322,12 @@ pub(crate) mod test {
meili_snap::snapshot_hash!(format!("{:#?}", documents), @"d751713988987e9331980363e24189ce");
// spells
- insta::assert_json_snapshot!(spells.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
+ insta::assert_json_snapshot!(spells.metadata(), @r###"
{
"uid": "dnd_spells",
"primaryKey": "index",
- "createdAt": "[now]",
- "updatedAt": "[now]"
+ "createdAt": "2022-10-09T20:27:24.242683494Z",
+ "updatedAt": "2022-10-09T20:27:24.312809641Z"
}
"###);
@@ -340,12 +369,12 @@ pub(crate) mod test {
assert!(indexes.is_empty());
// products
- insta::assert_json_snapshot!(products.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
+ insta::assert_json_snapshot!(products.metadata(), @r###"
{
"uid": "products",
"primaryKey": "sku",
- "createdAt": "[now]",
- "updatedAt": "[now]"
+ "createdAt": "2023-01-30T16:25:56.595257Z",
+ "updatedAt": "2023-01-30T16:25:58.70348Z"
}
"###);
@@ -355,12 +384,12 @@ pub(crate) mod test {
meili_snap::snapshot_hash!(format!("{:#?}", documents), @"548284a84de510f71e88e6cdea495cf5");
// movies
- insta::assert_json_snapshot!(movies.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
+ insta::assert_json_snapshot!(movies.metadata(), @r###"
{
"uid": "movies",
"primaryKey": "id",
- "createdAt": "[now]",
- "updatedAt": "[now]"
+ "createdAt": "2023-01-30T16:25:56.192178Z",
+ "updatedAt": "2023-01-30T16:25:56.455714Z"
}
"###);
@@ -370,12 +399,12 @@ pub(crate) mod test {
meili_snap::snapshot_hash!(format!("{:#?}", documents), @"0227598af846e574139ee0b80e03a720");
// spells
- insta::assert_json_snapshot!(spells.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
+ insta::assert_json_snapshot!(spells.metadata(), @r###"
{
"uid": "dnd_spells",
"primaryKey": "index",
- "createdAt": "[now]",
- "updatedAt": "[now]"
+ "createdAt": "2023-01-30T16:25:58.876405Z",
+ "updatedAt": "2023-01-30T16:25:59.079906Z"
}
"###);
diff --git a/dump/src/reader/v2/updates.rs b/dump/src/reader/v2/updates.rs
index 33d88d46f..bf1227c7a 100644
--- a/dump/src/reader/v2/updates.rs
+++ b/dump/src/reader/v2/updates.rs
@@ -227,4 +227,14 @@ impl UpdateStatus {
_ => None,
}
}
+
+ pub fn finished_at(&self) -> Option<OffsetDateTime> {
+ match self {
+ UpdateStatus::Processing(_) => None,
+ UpdateStatus::Enqueued(_) => None,
+ UpdateStatus::Processed(u) => Some(u.processed_at),
+ UpdateStatus::Aborted(_) => None,
+ UpdateStatus::Failed(u) => Some(u.failed_at),
+ }
+ }
}
diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs
index aa93cda2a..661285325 100644
--- a/index-scheduler/src/batch.rs
+++ b/index-scheduler/src/batch.rs
@@ -24,14 +24,13 @@ use std::fs::{self, File};
use std::io::BufWriter;
use dump::IndexMetadata;
-use log::{debug, error, info};
+use log::{debug, error, info, trace};
use meilisearch_types::error::Code;
use meilisearch_types::heed::{RoTxn, RwTxn};
use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader};
use meilisearch_types::milli::heed::CompactionOption;
use meilisearch_types::milli::update::{
- DeleteDocuments, DocumentDeletionResult, IndexDocumentsConfig, IndexDocumentsMethod,
- Settings as MilliSettings,
+ IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings as MilliSettings,
};
use meilisearch_types::milli::{self, Filter, BEU32};
use meilisearch_types::settings::{apply_settings_to_builder, Settings, Unchecked};
@@ -44,7 +43,7 @@ use uuid::Uuid;
use crate::autobatcher::{self, BatchKind};
use crate::utils::{self, swap_index_uid_in_task};
-use crate::{Error, IndexScheduler, ProcessingTasks, Result, TaskId};
+use crate::{Error, IndexScheduler, MustStopProcessing, ProcessingTasks, Result, TaskId};
/// Represents a combination of tasks that can all be processed at the same time.
///
@@ -105,12 +104,6 @@ pub(crate) enum IndexOperation {
operations: Vec<DocumentOperation>,
tasks: Vec<Task>,
},
- DocumentDeletion {
- index_uid: String,
- // The vec associated with each document deletion tasks.
- documents: Vec<Vec<String>>,
- tasks: Vec,
- },
IndexDocumentDeletionByFilter {
index_uid: String,
task: Task,
@@ -162,7 +155,6 @@ impl Batch {
}
Batch::IndexOperation { op, .. } => match op {
IndexOperation::DocumentOperation { tasks, .. }
- | IndexOperation::DocumentDeletion { tasks, .. }
| IndexOperation::Settings { tasks, .. }
| IndexOperation::DocumentClear { tasks, .. } => {
tasks.iter().map(|task| task.uid).collect()
@@ -227,7 +219,6 @@ impl IndexOperation {
pub fn index_uid(&self) -> &str {
match self {
IndexOperation::DocumentOperation { index_uid, .. }
- | IndexOperation::DocumentDeletion { index_uid, .. }
| IndexOperation::IndexDocumentDeletionByFilter { index_uid, .. }
| IndexOperation::DocumentClear { index_uid, .. }
| IndexOperation::Settings { index_uid, .. }
@@ -243,9 +234,6 @@ impl fmt::Display for IndexOperation {
IndexOperation::DocumentOperation { .. } => {
f.write_str("IndexOperation::DocumentOperation")
}
- IndexOperation::DocumentDeletion { .. } => {
- f.write_str("IndexOperation::DocumentDeletion")
- }
IndexOperation::IndexDocumentDeletionByFilter { .. } => {
f.write_str("IndexOperation::IndexDocumentDeletionByFilter")
}
@@ -348,18 +336,27 @@ impl IndexScheduler {
BatchKind::DocumentDeletion { deletion_ids } => {
let tasks = self.get_existing_tasks(rtxn, deletion_ids)?;
- let mut documents = Vec::new();
+ let mut operations = Vec::with_capacity(tasks.len());
+ let mut documents_counts = Vec::with_capacity(tasks.len());
for task in &tasks {
match task.kind {
KindWithContent::DocumentDeletion { ref documents_ids, .. } => {
- documents.push(documents_ids.clone())
+ operations.push(DocumentOperation::Delete(documents_ids.clone()));
+ documents_counts.push(documents_ids.len() as u64);
}
_ => unreachable!(),
}
}
Ok(Some(Batch::IndexOperation {
- op: IndexOperation::DocumentDeletion { index_uid, documents, tasks },
+ op: IndexOperation::DocumentOperation {
+ index_uid,
+ primary_key: None,
+ method: IndexDocumentsMethod::ReplaceDocuments,
+ documents_counts,
+ operations,
+ tasks,
+ },
must_create_index,
}))
}
@@ -825,6 +822,10 @@ impl IndexScheduler {
// 2. dump the tasks
let mut dump_tasks = dump.create_tasks_queue()?;
for ret in self.all_tasks.iter(&rtxn)? {
+ if self.must_stop_processing.get() {
+ return Err(Error::AbortedTask);
+ }
+
let (_, mut t) = ret?;
let status = t.status;
let content_file = t.content_uuid();
@@ -845,6 +846,9 @@ impl IndexScheduler {
// 2.1. Dump the `content_file` associated with the task if there is one and the task is not finished yet.
if let Some(content_file) = content_file {
+ if self.must_stop_processing.get() {
+ return Err(Error::AbortedTask);
+ }
if status == Status::Enqueued {
let content_file = self.file_store.get_update(content_file)?;
@@ -884,6 +888,9 @@ impl IndexScheduler {
// 3.1. Dump the documents
for ret in index.all_documents(&rtxn)? {
+ if self.must_stop_processing.get() {
+ return Err(Error::AbortedTask);
+ }
let (_id, doc) = ret?;
let document = milli::obkv_to_json(&all_fields, &fields_ids_map, doc)?;
index_dumper.push_document(&document)?;
@@ -903,6 +910,9 @@ impl IndexScheduler {
"[year repr:full][month repr:numerical][day padding:zero]-[hour padding:zero][minute padding:zero][second padding:zero][subsecond digits:3]"
)).unwrap();
+ if self.must_stop_processing.get() {
+ return Err(Error::AbortedTask);
+ }
let path = self.dumps_path.join(format!("{}.dump", dump_uid));
let file = File::create(path)?;
dump.persist_to(BufWriter::new(file))?;
@@ -1195,7 +1205,7 @@ impl IndexScheduler {
index,
indexer_config,
config,
- |indexing_step| debug!("update: {:?}", indexing_step),
+ |indexing_step| trace!("update: {:?}", indexing_step),
|| must_stop_processing.get(),
)?;
@@ -1242,7 +1252,8 @@ impl IndexScheduler {
let (new_builder, user_result) =
builder.remove_documents(document_ids)?;
builder = new_builder;
-
+ // Uses Invariant: remove documents actually always returns Ok for the inner result
+ let count = user_result.unwrap();
let provided_ids =
if let Some(Details::DocumentDeletion { provided_ids, .. }) =
task.details
@@ -1253,23 +1264,11 @@ impl IndexScheduler {
unreachable!();
};
- match user_result {
- Ok(count) => {
- task.status = Status::Succeeded;
- task.details = Some(Details::DocumentDeletion {
- provided_ids,
- deleted_documents: Some(count),
- });
- }
- Err(e) => {
- task.status = Status::Failed;
- task.details = Some(Details::DocumentDeletion {
- provided_ids,
- deleted_documents: Some(0),
- });
- task.error = Some(milli::Error::from(e).into());
- }
- }
+ task.status = Status::Succeeded;
+ task.details = Some(Details::DocumentDeletion {
+ provided_ids,
+ deleted_documents: Some(count),
+ });
}
}
}
@@ -1284,31 +1283,13 @@ impl IndexScheduler {
milli::update::Settings::new(index_wtxn, index, indexer_config);
builder.reset_primary_key();
builder.execute(
- |indexing_step| debug!("update: {:?}", indexing_step),
+ |indexing_step| trace!("update: {:?}", indexing_step),
|| must_stop_processing.clone().get(),
)?;
}
Ok(tasks)
}
- IndexOperation::DocumentDeletion { index_uid: _, documents, mut tasks } => {
- let mut builder = milli::update::DeleteDocuments::new(index_wtxn, index)?;
- documents.iter().flatten().for_each(|id| {
- builder.delete_external_id(id);
- });
-
- let DocumentDeletionResult { deleted_documents, .. } = builder.execute()?;
-
- for (task, documents) in tasks.iter_mut().zip(documents) {
- task.status = Status::Succeeded;
- task.details = Some(Details::DocumentDeletion {
- provided_ids: documents.len(),
- deleted_documents: Some(deleted_documents.min(documents.len() as u64)),
- });
- }
-
- Ok(tasks)
- }
IndexOperation::IndexDocumentDeletionByFilter { mut task, index_uid: _ } => {
let filter =
if let KindWithContent::DocumentDeletionByFilter { filter_expr, .. } =
@@ -1318,7 +1299,13 @@ impl IndexScheduler {
} else {
unreachable!()
};
- let deleted_documents = delete_document_by_filter(index_wtxn, filter, index);
+ let deleted_documents = delete_document_by_filter(
+ index_wtxn,
+ filter,
+ self.index_mapper.indexer_config(),
+ self.must_stop_processing.clone(),
+ index,
+ );
let original_filter = if let Some(Details::DocumentDeletionByFilter {
original_filter,
deleted_documents: _,
@@ -1552,6 +1539,8 @@ impl IndexScheduler {
fn delete_document_by_filter<'a>(
wtxn: &mut RwTxn<'a, '_>,
filter: &serde_json::Value,
+ indexer_config: &IndexerConfig,
+ must_stop_processing: MustStopProcessing,
index: &'a Index,
) -> Result<u64> {
let filter = Filter::from_json(filter)?;
@@ -1562,9 +1551,26 @@ fn delete_document_by_filter<'a>(
}
e => e.into(),
})?;
- let mut delete_operation = DeleteDocuments::new(wtxn, index)?;
- delete_operation.delete_documents(&candidates);
- delete_operation.execute().map(|result| result.deleted_documents)?
+
+ let config = IndexDocumentsConfig {
+ update_method: IndexDocumentsMethod::ReplaceDocuments,
+ ..Default::default()
+ };
+
+ let mut builder = milli::update::IndexDocuments::new(
+ wtxn,
+ index,
+ indexer_config,
+ config,
+ |indexing_step| debug!("update: {:?}", indexing_step),
+ || must_stop_processing.get(),
+ )?;
+
+ let (new_builder, count) = builder.remove_documents_from_db_no_batch(&candidates)?;
+ builder = new_builder;
+
+ let _ = builder.execute()?;
+ count
} else {
0
})
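
The dump-processing hunks above add repeated `must_stop_processing` checks so that a registered `TaskCancelation` can abort a dump mid-way through with the new `Error::AbortedTask`, which the scheduler then handles like an aborted indexation and re-schedules. A simplified, self-contained sketch of that pattern (the `MustStopProcessing` and `Error` types here are stand-ins for the index-scheduler ones):

```rust
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;

// Hypothetical stand-in for the scheduler's shared stop flag.
#[derive(Clone, Default)]
struct MustStopProcessing(Arc<AtomicBool>);

impl MustStopProcessing {
    fn get(&self) -> bool {
        self.0.load(Ordering::Relaxed)
    }
    fn must_stop(&self) {
        self.0.store(true, Ordering::Relaxed);
    }
}

#[derive(Debug)]
enum Error {
    AbortedTask,
}

// Every loop iteration checks the flag and bails out with `AbortedTask`,
// mirroring the checks added to the task/document dumping loops above.
fn dump_documents(docs: &[&str], must_stop: &MustStopProcessing) -> Result<usize, Error> {
    let mut dumped = 0;
    for _doc in docs {
        if must_stop.get() {
            return Err(Error::AbortedTask);
        }
        dumped += 1;
    }
    Ok(dumped)
}

fn main() {
    let flag = MustStopProcessing::default();
    assert_eq!(dump_documents(&["a", "b"], &flag).unwrap(), 2);
    flag.must_stop();
    assert!(matches!(dump_documents(&["a"], &flag), Err(Error::AbortedTask)));
}
```
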
diff --git a/index-scheduler/src/error.rs b/index-scheduler/src/error.rs
index ddc6960f7..bbe526460 100644
--- a/index-scheduler/src/error.rs
+++ b/index-scheduler/src/error.rs
@@ -108,6 +108,8 @@ pub enum Error {
TaskDeletionWithEmptyQuery,
#[error("Query parameters to filter the tasks to cancel are missing. Available query parameters are: `uids`, `indexUids`, `statuses`, `types`, `canceledBy`, `beforeEnqueuedAt`, `afterEnqueuedAt`, `beforeStartedAt`, `afterStartedAt`, `beforeFinishedAt`, `afterFinishedAt`.")]
TaskCancelationWithEmptyQuery,
+ #[error("Aborted task")]
+ AbortedTask,
#[error(transparent)]
Dump(#[from] dump::Error),
@@ -175,6 +177,7 @@ impl Error {
| Error::TaskNotFound(_)
| Error::TaskDeletionWithEmptyQuery
| Error::TaskCancelationWithEmptyQuery
+ | Error::AbortedTask
| Error::Dump(_)
| Error::Heed(_)
| Error::Milli(_)
@@ -236,6 +239,9 @@ impl ErrorCode for Error {
Error::TaskDatabaseUpdate(_) => Code::Internal,
Error::CreateBatch(_) => Code::Internal,
+ // This one should never be seen by the end user
+ Error::AbortedTask => Code::Internal,
+
#[cfg(test)]
Error::PlannedFailure => Code::Internal,
}
diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs
index 95902aa15..896c06c99 100644
--- a/index-scheduler/src/lib.rs
+++ b/index-scheduler/src/lib.rs
@@ -1183,7 +1183,8 @@ impl IndexScheduler {
// If we have an abortion error we must stop the tick here and re-schedule tasks.
Err(Error::Milli(milli::Error::InternalError(
milli::InternalError::AbortedIndexation,
- ))) => {
+ )))
+ | Err(Error::AbortedTask) => {
#[cfg(test)]
self.breakpoint(Breakpoint::AbortedIndexation);
wtxn.abort().map_err(Error::HeedTransaction)?;
@@ -4339,4 +4340,26 @@ mod tests {
}
"###);
}
+
+ #[test]
+ fn cancel_processing_dump() {
+ let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]);
+
+ let dump_creation = KindWithContent::DumpCreation { keys: Vec::new(), instance_uid: None };
+ let dump_cancellation = KindWithContent::TaskCancelation {
+ query: "cancel dump".to_owned(),
+ tasks: RoaringBitmap::from_iter([0]),
+ };
+ let _ = index_scheduler.register(dump_creation).unwrap();
+ snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_dump_register");
+ handle.advance_till([Start, BatchCreated, InsideProcessBatch]);
+
+ let _ = index_scheduler.register(dump_cancellation).unwrap();
+ snapshot!(snapshot_index_scheduler(&index_scheduler), name: "cancel_registered");
+
+ snapshot!(format!("{:?}", handle.advance()), @"AbortedIndexation");
+
+ handle.advance_one_successful_batch();
+ snapshot!(snapshot_index_scheduler(&index_scheduler), name: "cancel_processed");
+ }
}
diff --git a/index-scheduler/src/snapshots/lib.rs/cancel_processing_dump/after_dump_register.snap b/index-scheduler/src/snapshots/lib.rs/cancel_processing_dump/after_dump_register.snap
new file mode 100644
index 000000000..ce0343975
--- /dev/null
+++ b/index-scheduler/src/snapshots/lib.rs/cancel_processing_dump/after_dump_register.snap
@@ -0,0 +1,35 @@
+---
+source: index-scheduler/src/lib.rs
+---
+### Autobatching Enabled = true
+### Processing Tasks:
+[]
+----------------------------------------------------------------------
+### All Tasks:
+0 {uid: 0, status: enqueued, details: { dump_uid: None }, kind: DumpCreation { keys: [], instance_uid: None }}
+----------------------------------------------------------------------
+### Status:
+enqueued [0,]
+----------------------------------------------------------------------
+### Kind:
+"dumpCreation" [0,]
+----------------------------------------------------------------------
+### Index Tasks:
+----------------------------------------------------------------------
+### Index Mapper:
+
+----------------------------------------------------------------------
+### Canceled By:
+
+----------------------------------------------------------------------
+### Enqueued At:
+[timestamp] [0,]
+----------------------------------------------------------------------
+### Started At:
+----------------------------------------------------------------------
+### Finished At:
+----------------------------------------------------------------------
+### File Store:
+
+----------------------------------------------------------------------
+
diff --git a/index-scheduler/src/snapshots/lib.rs/cancel_processing_dump/cancel_processed.snap b/index-scheduler/src/snapshots/lib.rs/cancel_processing_dump/cancel_processed.snap
new file mode 100644
index 000000000..f3d7b363f
--- /dev/null
+++ b/index-scheduler/src/snapshots/lib.rs/cancel_processing_dump/cancel_processed.snap
@@ -0,0 +1,45 @@
+---
+source: index-scheduler/src/lib.rs
+---
+### Autobatching Enabled = true
+### Processing Tasks:
+[]
+----------------------------------------------------------------------
+### All Tasks:
+0 {uid: 0, status: canceled, canceled_by: 1, details: { dump_uid: None }, kind: DumpCreation { keys: [], instance_uid: None }}
+1 {uid: 1, status: succeeded, details: { matched_tasks: 1, canceled_tasks: Some(0), original_filter: "cancel dump" }, kind: TaskCancelation { query: "cancel dump", tasks: RoaringBitmap<[0]> }}
+----------------------------------------------------------------------
+### Status:
+enqueued []
+succeeded [1,]
+canceled [0,]
+----------------------------------------------------------------------
+### Kind:
+"taskCancelation" [1,]
+"dumpCreation" [0,]
+----------------------------------------------------------------------
+### Index Tasks:
+----------------------------------------------------------------------
+### Index Mapper:
+
+----------------------------------------------------------------------
+### Canceled By:
+1 [0,]
+
+----------------------------------------------------------------------
+### Enqueued At:
+[timestamp] [0,]
+[timestamp] [1,]
+----------------------------------------------------------------------
+### Started At:
+[timestamp] [0,]
+[timestamp] [1,]
+----------------------------------------------------------------------
+### Finished At:
+[timestamp] [0,]
+[timestamp] [1,]
+----------------------------------------------------------------------
+### File Store:
+
+----------------------------------------------------------------------
+
diff --git a/index-scheduler/src/snapshots/lib.rs/cancel_processing_dump/cancel_registered.snap b/index-scheduler/src/snapshots/lib.rs/cancel_processing_dump/cancel_registered.snap
new file mode 100644
index 000000000..72ae58e00
--- /dev/null
+++ b/index-scheduler/src/snapshots/lib.rs/cancel_processing_dump/cancel_registered.snap
@@ -0,0 +1,38 @@
+---
+source: index-scheduler/src/lib.rs
+---
+### Autobatching Enabled = true
+### Processing Tasks:
+[0,]
+----------------------------------------------------------------------
+### All Tasks:
+0 {uid: 0, status: enqueued, details: { dump_uid: None }, kind: DumpCreation { keys: [], instance_uid: None }}
+1 {uid: 1, status: enqueued, details: { matched_tasks: 1, canceled_tasks: None, original_filter: "cancel dump" }, kind: TaskCancelation { query: "cancel dump", tasks: RoaringBitmap<[0]> }}
+----------------------------------------------------------------------
+### Status:
+enqueued [0,1,]
+----------------------------------------------------------------------
+### Kind:
+"taskCancelation" [1,]
+"dumpCreation" [0,]
+----------------------------------------------------------------------
+### Index Tasks:
+----------------------------------------------------------------------
+### Index Mapper:
+
+----------------------------------------------------------------------
+### Canceled By:
+
+----------------------------------------------------------------------
+### Enqueued At:
+[timestamp] [0,]
+[timestamp] [1,]
+----------------------------------------------------------------------
+### Started At:
+----------------------------------------------------------------------
+### Finished At:
+----------------------------------------------------------------------
+### File Store:
+
+----------------------------------------------------------------------
+
diff --git a/meilisearch-types/src/error.rs b/meilisearch-types/src/error.rs
index 4b6711601..afe9c5189 100644
--- a/meilisearch-types/src/error.rs
+++ b/meilisearch-types/src/error.rs
@@ -324,7 +324,6 @@ impl ErrorCode for milli::Error {
UserError::SerdeJson(_)
| UserError::InvalidLmdbOpenOptions
| UserError::DocumentLimitReached
- | UserError::AccessingSoftDeletedDocument { .. }
| UserError::UnknownInternalDocumentId { .. } => Code::Internal,
UserError::InvalidStoreFile => Code::InvalidStoreFile,
UserError::NoSpaceLeftOnDevice => Code::NoSpaceLeftOnDevice,
diff --git a/meilisearch/src/lib.rs b/meilisearch/src/lib.rs
index 603d8ff86..16c08c6c2 100644
--- a/meilisearch/src/lib.rs
+++ b/meilisearch/src/lib.rs
@@ -362,7 +362,7 @@ fn import_dump(
update_method: IndexDocumentsMethod::ReplaceDocuments,
..Default::default()
},
- |indexing_step| log::debug!("update: {:?}", indexing_step),
+ |indexing_step| log::trace!("update: {:?}", indexing_step),
|| false,
)?;
diff --git a/meilisearch/src/routes/indexes/documents.rs b/meilisearch/src/routes/indexes/documents.rs
index 2afc1b5fb..b6950ae6e 100644
--- a/meilisearch/src/routes/indexes/documents.rs
+++ b/meilisearch/src/routes/indexes/documents.rs
fn retrieve_document<S: AsRef<str>>(
let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect();
let internal_id = index
- .external_documents_ids(&txn)?
- .get(doc_id.as_bytes())
+ .external_documents_ids()
+ .get(&txn, doc_id)?
.ok_or_else(|| MeilisearchHttpError::DocumentNotFound(doc_id.to_string()))?;
let document = index
diff --git a/meilisearch/tests/documents/delete_documents.rs b/meilisearch/tests/documents/delete_documents.rs
index b3f04aea0..5a15e95ff 100644
--- a/meilisearch/tests/documents/delete_documents.rs
+++ b/meilisearch/tests/documents/delete_documents.rs
@@ -397,7 +397,7 @@ async fn delete_document_by_complex_filter() {
"canceledBy": null,
"details": {
"providedIds": 0,
- "deletedDocuments": 4,
+ "deletedDocuments": 2,
"originalFilter": "[[\"color = green\",\"color NOT EXISTS\"]]"
},
"error": null,
diff --git a/milli/Cargo.toml b/milli/Cargo.toml
index cf5fe9726..1d8517e73 100644
--- a/milli/Cargo.toml
+++ b/milli/Cargo.toml
@@ -26,8 +26,8 @@ flatten-serde-json = { path = "../flatten-serde-json" }
fst = "0.4.7"
fxhash = "0.2.1"
geoutils = "0.5.1"
-grenad = { version = "0.4.4", default-features = false, features = [
- "tempfile",
+grenad = { version = "0.4.5", default-features = false, features = [
+ "rayon", "tempfile"
] }
heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.7", default-features = false, features = [
"lmdb", "read-txn-no-tls"
@@ -79,6 +79,7 @@ big_s = "1.0.2"
insta = "1.29.0"
maplit = "1.0.2"
md5 = "0.7.0"
+meili-snap = { path = "../meili-snap" }
rand = { version = "0.8.5", features = ["small_rng"] }
[features]
diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs
index 7c037b3bf..a874ac17e 100644
--- a/milli/src/documents/mod.rs
+++ b/milli/src/documents/mod.rs
@@ -1,5 +1,6 @@
mod builder;
mod enriched;
+mod primary_key;
mod reader;
mod serde_impl;
@@ -11,6 +12,7 @@ use bimap::BiHashMap;
pub use builder::DocumentsBatchBuilder;
pub use enriched::{EnrichedDocument, EnrichedDocumentsBatchCursor, EnrichedDocumentsBatchReader};
use obkv::KvReader;
+pub use primary_key::{DocumentIdExtractionError, FieldIdMapper, PrimaryKey, DEFAULT_PRIMARY_KEY};
pub use reader::{DocumentsBatchCursor, DocumentsBatchCursorError, DocumentsBatchReader};
use serde::{Deserialize, Serialize};
@@ -87,6 +89,12 @@ impl DocumentsBatchIndex {
}
}
+impl FieldIdMapper for DocumentsBatchIndex {
+ fn id(&self, name: &str) -> Option<FieldId> {
+ self.id(name)
+ }
+}
+
#[derive(Debug, thiserror::Error)]
pub enum Error {
#[error("Error parsing number {value:?} at line {line}: {error}")]
diff --git a/milli/src/documents/primary_key.rs b/milli/src/documents/primary_key.rs
new file mode 100644
index 000000000..16a95c21f
--- /dev/null
+++ b/milli/src/documents/primary_key.rs
@@ -0,0 +1,172 @@
+use std::iter;
+use std::result::Result as StdResult;
+
+use serde_json::Value;
+
+use crate::{FieldId, InternalError, Object, Result, UserError};
+
+/// The symbol used to define levels in a nested primary key.
+const PRIMARY_KEY_SPLIT_SYMBOL: char = '.';
+
+/// The default primary that is used when not specified.
+pub const DEFAULT_PRIMARY_KEY: &str = "id";
+
+/// Trait for objects that can map the name of a field to its [`FieldId`].
+pub trait FieldIdMapper {
+ /// Attempts to map the passed name to its [`FieldId`].
+ ///
+ /// `None` if the field with this name was not found.
+ fn id(&self, name: &str) -> Option<FieldId>;
+}
+
+/// A type that represent the type of primary key that has been set
+/// for this index, a classic flat one or a nested one.
+#[derive(Debug, Clone, Copy)]
+pub enum PrimaryKey<'a> {
+ Flat { name: &'a str, field_id: FieldId },
+ Nested { name: &'a str },
+}
+
+pub enum DocumentIdExtractionError {
+ InvalidDocumentId(UserError),
+ MissingDocumentId,
+ TooManyDocumentIds(usize),
+}
+
+impl<'a> PrimaryKey<'a> {
+ pub fn new(path: &'a str, fields: &impl FieldIdMapper) -> Option<Self> {
+ Some(if path.contains(PRIMARY_KEY_SPLIT_SYMBOL) {
+ Self::Nested { name: path }
+ } else {
+ let field_id = fields.id(path)?;
+ Self::Flat { name: path, field_id }
+ })
+ }
+
+ pub fn name(&self) -> &str {
+ match self {
+ PrimaryKey::Flat { name, .. } => name,
+ PrimaryKey::Nested { name } => name,
+ }
+ }
+
+ pub fn document_id(
+ &self,
+ document: &obkv::KvReader<FieldId>,
+ fields: &impl FieldIdMapper,
+ ) -> Result<StdResult<String, DocumentIdExtractionError>> {
+ match self {
+ PrimaryKey::Flat { name: _, field_id } => match document.get(*field_id) {
+ Some(document_id_bytes) => {
+ let document_id = serde_json::from_slice(document_id_bytes)
+ .map_err(InternalError::SerdeJson)?;
+ match validate_document_id_value(document_id)? {
+ Ok(document_id) => Ok(Ok(document_id)),
+ Err(user_error) => {
+ Ok(Err(DocumentIdExtractionError::InvalidDocumentId(user_error)))
+ }
+ }
+ }
+ None => Ok(Err(DocumentIdExtractionError::MissingDocumentId)),
+ },
+ nested @ PrimaryKey::Nested { .. } => {
+ let mut matching_documents_ids = Vec::new();
+ for (first_level_name, right) in nested.possible_level_names() {
+ if let Some(field_id) = fields.id(first_level_name) {
+ if let Some(value_bytes) = document.get(field_id) {
+ let object = serde_json::from_slice(value_bytes)
+ .map_err(InternalError::SerdeJson)?;
+ fetch_matching_values(object, right, &mut matching_documents_ids);
+
+ if matching_documents_ids.len() >= 2 {
+ return Ok(Err(DocumentIdExtractionError::TooManyDocumentIds(
+ matching_documents_ids.len(),
+ )));
+ }
+ }
+ }
+ }
+
+ match matching_documents_ids.pop() {
+ Some(document_id) => match validate_document_id_value(document_id)? {
+ Ok(document_id) => Ok(Ok(document_id)),
+ Err(user_error) => {
+ Ok(Err(DocumentIdExtractionError::InvalidDocumentId(user_error)))
+ }
+ },
+ None => Ok(Err(DocumentIdExtractionError::MissingDocumentId)),
+ }
+ }
+ }
+ }
+
+ /// Returns an `Iterator` that gives all the possible fields names the primary key
+ /// can have depending of the first level name and depth of the objects.
+ pub fn possible_level_names(&self) -> impl Iterator<Item = (&str, &str)> + '_ {
+ let name = self.name();
+ name.match_indices(PRIMARY_KEY_SPLIT_SYMBOL)
+ .map(move |(i, _)| (&name[..i], &name[i + PRIMARY_KEY_SPLIT_SYMBOL.len_utf8()..]))
+ .chain(iter::once((name, "")))
+ }
+}
+
+fn fetch_matching_values(value: Value, selector: &str, output: &mut Vec<Value>) {
) {
+ match value {
+ Value::Object(object) => fetch_matching_values_in_object(object, selector, "", output),
+ otherwise => output.push(otherwise),
+ }
+}
+
+fn fetch_matching_values_in_object(
+ object: Object,
+ selector: &str,
+ base_key: &str,
+ output: &mut Vec<Value>,
+) {
+ for (key, value) in object {
+ let base_key = if base_key.is_empty() {
+ key.to_string()
+ } else {
+ format!("{}{}{}", base_key, PRIMARY_KEY_SPLIT_SYMBOL, key)
+ };
+
+ if starts_with(selector, &base_key) {
+ match value {
+ Value::Object(object) => {
+ fetch_matching_values_in_object(object, selector, &base_key, output)
+ }
+ value => output.push(value),
+ }
+ }
+ }
+}
+
+fn starts_with(selector: &str, key: &str) -> bool {
+ selector.strip_prefix(key).map_or(false, |tail| {
+ tail.chars().next().map(|c| c == PRIMARY_KEY_SPLIT_SYMBOL).unwrap_or(true)
+ })
+}
+
+// FIXME: move to a DocumentId struct
+
+fn validate_document_id(document_id: &str) -> Option<&str> {
+ if !document_id.is_empty()
+ && document_id.chars().all(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_'))
+ {
+ Some(document_id)
+ } else {
+ None
+ }
+}
+
+pub fn validate_document_id_value(document_id: Value) -> Result<StdResult<String, UserError>> {
+ match document_id {
+ Value::String(string) => match validate_document_id(&string) {
+ Some(s) if s.len() == string.len() => Ok(Ok(string)),
+ Some(s) => Ok(Ok(s.to_string())),
+ None => Ok(Err(UserError::InvalidDocumentId { document_id: Value::String(string) })),
+ },
+ Value::Number(number) if number.is_i64() => Ok(Ok(number.to_string())),
+ content => Ok(Err(UserError::InvalidDocumentId { document_id: content })),
+ }
+}
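
To make the nested-key resolution in `PrimaryKey::document_id` above concrete, here is the same `possible_level_names` logic extracted as a free function, together with the candidate splits it produces for a hypothetical `"metadata.product.id"` key:

```rust
// Illustrative sketch: every prefix ending at a `.` is paired with the
// remaining path, plus the full name paired with an empty remainder.
const PRIMARY_KEY_SPLIT_SYMBOL: char = '.';

fn possible_level_names(name: &str) -> impl Iterator<Item = (&str, &str)> + '_ {
    name.match_indices(PRIMARY_KEY_SPLIT_SYMBOL)
        .map(move |(i, _)| (&name[..i], &name[i + PRIMARY_KEY_SPLIT_SYMBOL.len_utf8()..]))
        .chain(std::iter::once((name, "")))
}

fn main() {
    let levels: Vec<_> = possible_level_names("metadata.product.id").collect();
    assert_eq!(
        levels,
        vec![
            ("metadata", "product.id"),
            ("metadata.product", "id"),
            ("metadata.product.id", ""),
        ]
    );
}
```
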
diff --git a/milli/src/error.rs b/milli/src/error.rs
index e9e1fddd3..b249f2977 100644
--- a/milli/src/error.rs
+++ b/milli/src/error.rs
@@ -89,8 +89,6 @@ pub enum FieldIdMapMissingEntry {
#[derive(Error, Debug)]
pub enum UserError {
- #[error("A soft deleted internal document id have been used: `{document_id}`.")]
- AccessingSoftDeletedDocument { document_id: DocumentId },
#[error("A document cannot contain more than 65,535 fields.")]
AttributeLimitReached,
#[error(transparent)]
diff --git a/milli/src/external_documents_ids.rs b/milli/src/external_documents_ids.rs
index 36b147336..ec419446c 100644
--- a/milli/src/external_documents_ids.rs
+++ b/milli/src/external_documents_ids.rs
@@ -1,159 +1,75 @@
-use std::borrow::Cow;
use std::collections::HashMap;
-use std::convert::TryInto;
-use std::{fmt, str};
-use fst::map::IndexedValue;
-use fst::{IntoStreamer, Streamer};
-use roaring::RoaringBitmap;
+use heed::types::{OwnedType, Str};
+use heed::{Database, RoIter, RoTxn, RwTxn};
-const DELETED_ID: u64 = u64::MAX;
+use crate::{DocumentId, BEU32};
-pub struct ExternalDocumentsIds<'a> {
- pub(crate) hard: fst::Map<Cow<'a, [u8]>>,
- pub(crate) soft: fst::Map<Cow<'a, [u8]>>,
- soft_deleted_docids: RoaringBitmap,
+pub enum DocumentOperationKind {
+ Create,
+ Delete,
}
-impl<'a> ExternalDocumentsIds<'a> {
- pub fn new(
- hard: fst::Map<Cow<'a, [u8]>>,
- soft: fst::Map<Cow<'a, [u8]>>,
- soft_deleted_docids: RoaringBitmap,
- ) -> ExternalDocumentsIds<'a> {
- ExternalDocumentsIds { hard, soft, soft_deleted_docids }
- }
+pub struct DocumentOperation {
+ pub external_id: String,
+ pub internal_id: DocumentId,
+ pub kind: DocumentOperationKind,
+}
- pub fn into_static(self) -> ExternalDocumentsIds<'static> {
- ExternalDocumentsIds {
- hard: self.hard.map_data(|c| Cow::Owned(c.into_owned())).unwrap(),
- soft: self.soft.map_data(|c| Cow::Owned(c.into_owned())).unwrap(),
- soft_deleted_docids: self.soft_deleted_docids,
- }
+pub struct ExternalDocumentsIds(Database<Str, OwnedType<BEU32>>);
+
+impl ExternalDocumentsIds {
+ pub fn new(db: Database<Str, OwnedType<BEU32>>) -> ExternalDocumentsIds {
+ ExternalDocumentsIds(db)
}
/// Returns `true` if hard and soft external documents lists are empty.
- pub fn is_empty(&self) -> bool {
- self.hard.is_empty() && self.soft.is_empty()
+ pub fn is_empty(&self, rtxn: &RoTxn) -> heed::Result<bool> {
+ self.0.is_empty(rtxn).map_err(Into::into)
}
- pub fn get<A: AsRef<str>>(&self, external_id: A) -> Option<u32> {
- let external_id = external_id.as_ref();
- match self.soft.get(external_id).or_else(|| self.hard.get(external_id)) {
- Some(id) if id != DELETED_ID && !self.soft_deleted_docids.contains(id as u32) => {
- Some(id.try_into().unwrap())
- }
- _otherwise => None,
- }
- }
-
- /// Rebuild the internal FSTs in the ExternalDocumentsIds structure such that they
- /// don't contain any soft deleted document id.
- pub fn delete_soft_deleted_documents_ids_from_fsts(&mut self) -> fst::Result<()> {
- let mut new_hard_builder = fst::MapBuilder::memory();
-
- let union_op = self.hard.op().add(&self.soft).r#union();
- let mut iter = union_op.into_stream();
- while let Some((external_id, docids)) = iter.next() {
- // prefer selecting the ids from soft, always
- let id = indexed_last_value(docids).unwrap();
- if id != DELETED_ID && !self.soft_deleted_docids.contains(id as u32) {
- new_hard_builder.insert(external_id, id)?;
- }
- }
- drop(iter);
-
- // Delete soft map completely
- self.soft = fst::Map::default().map_data(Cow::Owned)?;
- // We save the new map as the new hard map.
- self.hard = new_hard_builder.into_map().map_data(Cow::Owned)?;
-
- Ok(())
- }
-
- pub fn insert_ids<A: AsRef<[u8]>>(&mut self, other: &fst::Map<A>) -> fst::Result<()> {
- let union_op = self.soft.op().add(other).r#union();
-
- let mut new_soft_builder = fst::MapBuilder::memory();
- let mut iter = union_op.into_stream();
- while let Some((external_id, marked_docids)) = iter.next() {
- let id = indexed_last_value(marked_docids).unwrap();
- new_soft_builder.insert(external_id, id)?;
- }
-
- drop(iter);
-
- // We save the new map as the new soft map.
- self.soft = new_soft_builder.into_map().map_data(Cow::Owned)?;
- self.merge_soft_into_hard()
+ pub fn get<A: AsRef<str>>(&self, rtxn: &RoTxn, external_id: A) -> heed::Result<Option<DocumentId>> {
+ Ok(self.0.get(rtxn, external_id.as_ref())?.map(|x| x.get()))
}
/// An helper function to debug this type, returns an `HashMap` of both,
/// soft and hard fst maps, combined.
- pub fn to_hash_map(&self) -> HashMap<String, u32> {
- let mut map = HashMap::new();
-
- let union_op = self.hard.op().add(&self.soft).r#union();
- let mut iter = union_op.into_stream();
- while let Some((external_id, marked_docids)) = iter.next() {
- let id = indexed_last_value(marked_docids).unwrap();
- if id != DELETED_ID {
- let external_id = str::from_utf8(external_id).unwrap();
- map.insert(external_id.to_owned(), id.try_into().unwrap());
- }
+ pub fn to_hash_map(&self, rtxn: &RoTxn) -> heed::Result<HashMap<String, u32>> {
+ let mut map = HashMap::default();
+ for result in self.0.iter(rtxn)? {
+ let (external, internal) = result?;
+ map.insert(external.to_owned(), internal.get());
}
-
- map
+ Ok(map)
}
- /// Return an fst of the combined hard and soft deleted ID.
- pub fn to_fst<'b>(&'b self) -> fst::Result<Cow<'b, fst::Map<Cow<'a, [u8]>>>> {
- if self.soft.is_empty() {
- return Ok(Cow::Borrowed(&self.hard));
- }
- let union_op = self.hard.op().add(&self.soft).r#union();
-
- let mut iter = union_op.into_stream();
- let mut new_hard_builder = fst::MapBuilder::memory();
- while let Some((external_id, marked_docids)) = iter.next() {
- let value = indexed_last_value(marked_docids).unwrap();
- if value != DELETED_ID {
- new_hard_builder.insert(external_id, value)?;
+ /// Applies the list of operations passed as argument, modifying the current external to internal id mapping.
+ ///
+ /// If the list contains multiple operations on the same external id, then the result is unspecified.
+ ///
+ /// # Panics
+ ///
+ /// - If attempting to delete a document that doesn't exist
+ /// - If attempting to create a document that already exists
+ pub fn apply(&self, wtxn: &mut RwTxn, operations: Vec<DocumentOperation>) -> heed::Result<()> {
+ for DocumentOperation { external_id, internal_id, kind } in operations {
+ match kind {
+ DocumentOperationKind::Create => {
+ self.0.put(wtxn, &external_id, &BEU32::new(internal_id))?;
+ }
+ DocumentOperationKind::Delete => {
+ if !self.0.delete(wtxn, &external_id)? {
+ panic!("Attempting to delete a non-existing document")
+ }
+ }
}
}
- drop(iter);
-
- Ok(Cow::Owned(new_hard_builder.into_map().map_data(Cow::Owned)?))
- }
-
- fn merge_soft_into_hard(&mut self) -> fst::Result<()> {
- if self.soft.len() >= self.hard.len() / 2 {
- self.hard = self.to_fst()?.into_owned();
- self.soft = fst::Map::default().map_data(Cow::Owned)?;
- }
-
Ok(())
}
-}
-impl fmt::Debug for ExternalDocumentsIds<'_> {
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
- f.debug_tuple("ExternalDocumentsIds").field(&self.to_hash_map()).finish()
+ /// Returns an iterator over all the external ids.
+ pub fn iter<'t>(&self, rtxn: &'t RoTxn) -> heed::Result<RoIter<'t, Str, OwnedType<BEU32>>> {
+ self.0.iter(rtxn)
}
}
-
-impl Default for ExternalDocumentsIds<'static> {
- fn default() -> Self {
- ExternalDocumentsIds {
- hard: fst::Map::default().map_data(Cow::Owned).unwrap(),
- soft: fst::Map::default().map_data(Cow::Owned).unwrap(),
- soft_deleted_docids: RoaringBitmap::new(),
- }
- }
-}
-
-/// Returns the value of the `IndexedValue` with the highest _index_.
-fn indexed_last_value(indexed_values: &[IndexedValue]) -> Option<u64> {
- indexed_values.iter().copied().max_by_key(|iv| iv.index).map(|iv| iv.value)
-}
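
The new `ExternalDocumentsIds` above is a plain LMDB database keyed by external id, and `apply` replays a list of `DocumentOperation`s against it. A minimal model of those semantics, with a `HashMap` standing in for `Database<Str, OwnedType<BEU32>>`:

```rust
use std::collections::HashMap;

// Simplified model: creations insert a mapping, deletions must refer to an
// existing entry (the real code panics otherwise); the old hard/soft FST
// split is gone entirely.
enum DocumentOperationKind {
    Create,
    Delete,
}

struct DocumentOperation {
    external_id: String,
    internal_id: u32,
    kind: DocumentOperationKind,
}

fn apply(map: &mut HashMap<String, u32>, operations: Vec<DocumentOperation>) {
    for DocumentOperation { external_id, internal_id, kind } in operations {
        match kind {
            DocumentOperationKind::Create => {
                map.insert(external_id, internal_id);
            }
            DocumentOperationKind::Delete => {
                if map.remove(&external_id).is_none() {
                    panic!("Attempting to delete a non-existing document");
                }
            }
        }
    }
}

fn main() {
    let mut ids = HashMap::new();
    apply(
        &mut ids,
        vec![
            DocumentOperation { external_id: "42".into(), internal_id: 0, kind: DocumentOperationKind::Create },
            DocumentOperation { external_id: "43".into(), internal_id: 1, kind: DocumentOperationKind::Create },
            DocumentOperation { external_id: "42".into(), internal_id: 0, kind: DocumentOperationKind::Delete },
        ],
    );
    assert_eq!(ids.get("43"), Some(&1));
    assert!(!ids.contains_key("42"));
}
```
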
diff --git a/milli/src/fields_ids_map.rs b/milli/src/fields_ids_map.rs
index 810ff755b..9c1c87f82 100644
--- a/milli/src/fields_ids_map.rs
+++ b/milli/src/fields_ids_map.rs
@@ -81,6 +81,12 @@ impl Default for FieldsIdsMap {
}
}
+impl crate::documents::FieldIdMapper for FieldsIdsMap {
+ fn id(&self, name: &str) -> Option<FieldId> {
+ self.id(name)
+ }
+}
+
#[cfg(test)]
mod tests {
use super::*;
diff --git a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs
index bf76287d8..f635e55af 100644
--- a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs
+++ b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs
@@ -6,6 +6,7 @@ use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt};
use roaring::RoaringBitmap;
use crate::heed_codec::BytesDecodeOwned;
+use crate::update::del_add::{DelAdd, KvReaderDelAdd};
/// This is the limit where using a byteorder became less size efficient
/// than using a direct roaring encoding, it is also the point where we are able
@@ -60,12 +61,16 @@ impl CboRoaringBitmapCodec {
/// if the merged values length is under the threshold, values are directly
/// serialized in the buffer else a RoaringBitmap is created from the
/// values and is serialized in the buffer.
- pub fn merge_into(slices: &[Cow<[u8]>], buffer: &mut Vec<u8>) -> io::Result<()> {
+ pub fn merge_into<I, A>(slices: I, buffer: &mut Vec<u8>) -> io::Result<()>
+ where
+ I: IntoIterator<Item = A>,
+ A: AsRef<[u8]>,
+ {
let mut roaring = RoaringBitmap::new();
let mut vec = Vec::new();
for bytes in slices {
- if bytes.len() <= THRESHOLD * size_of::<u32>() {
() {
+ if bytes.as_ref().len() <= THRESHOLD * size_of::() {
let mut reader = bytes.as_ref();
while let Ok(integer) = reader.read_u32::() {
vec.push(integer);
@@ -85,7 +90,7 @@ impl CboRoaringBitmapCodec {
}
} else {
// We can unwrap safely because the vector is sorted upper.
- let roaring = RoaringBitmap::from_sorted_iter(vec.into_iter()).unwrap();
+ let roaring = RoaringBitmap::from_sorted_iter(vec).unwrap();
roaring.serialize_into(buffer)?;
}
} else {
@@ -95,6 +100,33 @@ impl CboRoaringBitmapCodec {
Ok(())
}
+
+ /// Merges a DelAdd delta into a CboRoaringBitmap.
+ pub fn merge_deladd_into<'a>(
+ deladd: KvReaderDelAdd<'_>,
+ previous: &[u8],
+ buffer: &'a mut Vec<u8>,
+ ) -> io::Result<Option<&'a [u8]>> {
+ // Deserialize the bitmap that is already there
+ let mut previous = Self::deserialize_from(previous)?;
+
+ // Remove integers we no more want in the previous bitmap
+ if let Some(value) = deladd.get(DelAdd::Deletion) {
+ previous -= Self::deserialize_from(value)?;
+ }
+
+ // Insert the new integers we want in the previous bitmap
+ if let Some(value) = deladd.get(DelAdd::Addition) {
+ previous |= Self::deserialize_from(value)?;
+ }
+
+ if previous.is_empty() {
+ return Ok(None);
+ }
+
+ Self::serialize_into(&previous, buffer);
+ Ok(Some(&buffer[..]))
+ }
}
impl heed::BytesDecode<'_> for CboRoaringBitmapCodec {
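
And for the new `merge_deladd_into` helper above: the deletion side of a `KvReaderDelAdd` entry is subtracted from the stored bitmap and the addition side is unioned, with an empty result signalling that the entry can be dropped. A sketch of that merge using in-memory `RoaringBitmap`s instead of serialized payloads:

```rust
use roaring::RoaringBitmap;

// Illustrative sketch of the del/add merge: start from the bitmap already in
// the database, subtract deletions, union additions, and report `None` when
// the merged bitmap ends up empty.
fn merge_deladd(
    previous: &RoaringBitmap,
    deletion: Option<&RoaringBitmap>,
    addition: Option<&RoaringBitmap>,
) -> Option<RoaringBitmap> {
    let mut merged = previous.clone();
    if let Some(del) = deletion {
        merged -= del;
    }
    if let Some(add) = addition {
        merged |= add;
    }
    (!merged.is_empty()).then_some(merged)
}

fn main() {
    let previous: RoaringBitmap = (0..4).collect();
    let deletion: RoaringBitmap = [1, 2].into_iter().collect();
    let addition: RoaringBitmap = [7].into_iter().collect();
    let merged = merge_deladd(&previous, Some(&deletion), Some(&addition)).unwrap();
    assert_eq!(merged.iter().collect::<Vec<_>>(), vec![0, 3, 7]);
}
```
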
diff --git a/milli/src/index.rs b/milli/src/index.rs
index d563f852b..f8cceb0ef 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -1,7 +1,6 @@
use std::borrow::Cow;
use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
use std::fs::File;
-use std::mem::size_of;
use std::path::Path;
use charabia::{Language, Script};
@@ -13,8 +12,8 @@ use rstar::RTree;
use time::OffsetDateTime;
use crate::distance::NDotProductPoint;
+use crate::documents::PrimaryKey;
use crate::error::{InternalError, UserError};
-use crate::facet::FacetType;
use crate::fields_ids_map::FieldsIdsMap;
use crate::heed_codec::facet::{
FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec,
@@ -42,7 +41,6 @@ pub mod main_key {
pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields";
pub const DISTINCT_FIELD_KEY: &str = "distinct-field-key";
pub const DOCUMENTS_IDS_KEY: &str = "documents-ids";
- pub const SOFT_DELETED_DOCUMENTS_IDS_KEY: &str = "soft-deleted-documents-ids";
pub const HIDDEN_FACETED_FIELDS_KEY: &str = "hidden-faceted-fields";
pub const FILTERABLE_FIELDS_KEY: &str = "filterable-fields";
pub const SORTABLE_FIELDS_KEY: &str = "sortable-fields";
@@ -54,17 +52,13 @@ pub mod main_key {
/// It is concatenated with a big-endian encoded number (non-human readable).
/// e.g. vector-hnsw0x0032.
pub const VECTOR_HNSW_KEY_PREFIX: &str = "vector-hnsw";
- pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids";
- pub const NUMBER_FACETED_DOCUMENTS_IDS_PREFIX: &str = "number-faceted-documents-ids";
pub const PRIMARY_KEY_KEY: &str = "primary-key";
pub const SEARCHABLE_FIELDS_KEY: &str = "searchable-fields";
pub const USER_DEFINED_SEARCHABLE_FIELDS_KEY: &str = "user-defined-searchable-fields";
- pub const SOFT_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "soft-external-documents-ids";
pub const STOP_WORDS_KEY: &str = "stop-words";
pub const NON_SEPARATOR_TOKENS_KEY: &str = "non-separator-tokens";
pub const SEPARATOR_TOKENS_KEY: &str = "separator-tokens";
pub const DICTIONARY_KEY: &str = "dictionary";
- pub const STRING_FACETED_DOCUMENTS_IDS_PREFIX: &str = "string-faceted-documents-ids";
pub const SYNONYMS_KEY: &str = "synonyms";
pub const USER_DEFINED_SYNONYMS_KEY: &str = "user-defined-synonyms";
pub const WORDS_FST_KEY: &str = "words-fst";
@@ -87,10 +81,9 @@ pub mod db_name {
pub const EXACT_WORD_DOCIDS: &str = "exact-word-docids";
pub const WORD_PREFIX_DOCIDS: &str = "word-prefix-docids";
pub const EXACT_WORD_PREFIX_DOCIDS: &str = "exact-word-prefix-docids";
+ pub const EXTERNAL_DOCUMENTS_IDS: &str = "external-documents-ids";
pub const DOCID_WORD_POSITIONS: &str = "docid-word-positions";
pub const WORD_PAIR_PROXIMITY_DOCIDS: &str = "word-pair-proximity-docids";
- pub const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS: &str = "word-prefix-pair-proximity-docids";
- pub const PREFIX_WORD_PAIR_PROXIMITY_DOCIDS: &str = "prefix-word-pair-proximity-docids";
pub const WORD_POSITION_DOCIDS: &str = "word-position-docids";
pub const WORD_FIELD_ID_DOCIDS: &str = "word-field-id-docids";
pub const WORD_PREFIX_POSITION_DOCIDS: &str = "word-prefix-position-docids";
@@ -118,24 +111,23 @@ pub struct Index {
/// Contains many different types (e.g. the fields ids map).
pub(crate) main: PolyDatabase,
+ /// Maps the external documents ids with the internal document id.
+    pub external_documents_ids: Database<Str, OwnedType<BEU32>>,
+
/// A word and all the documents ids containing the word.
-    pub word_docids: Database<Str, RoaringBitmapCodec>,
+    pub word_docids: Database<Str, CboRoaringBitmapCodec>,
    /// A word and all the documents ids containing the word, from attributes for which typos are not allowed.
-    pub exact_word_docids: Database<Str, RoaringBitmapCodec>,
+    pub exact_word_docids: Database<Str, CboRoaringBitmapCodec>,
    /// A prefix of word and all the documents ids containing this prefix.
-    pub word_prefix_docids: Database<Str, RoaringBitmapCodec>,
+    pub word_prefix_docids: Database<Str, CboRoaringBitmapCodec>,
    /// A prefix of word and all the documents ids containing this prefix, from attributes for which typos are not allowed.
-    pub exact_word_prefix_docids: Database<Str, RoaringBitmapCodec>,
+    pub exact_word_prefix_docids: Database<Str, CboRoaringBitmapCodec>,
    /// Maps the proximity between a pair of words with all the docids where this relation appears.
    pub word_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>,
-    /// Maps the proximity between a pair of word and prefix with all the docids where this relation appears.
-    pub word_prefix_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>,
-    /// Maps the proximity between a pair of prefix and word with all the docids where this relation appears.
-    pub prefix_word_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>,
    /// Maps the word and the position with the docids that corresponds to it.
    pub word_position_docids: Database<StrBEU16Codec, CboRoaringBitmapCodec>,
@@ -189,13 +181,15 @@ impl Index {
    ) -> Result<Index> {
use db_name::*;
- options.max_dbs(25);
+ options.max_dbs(24);
unsafe { options.flag(Flags::MdbAlwaysFreePages) };
let env = options.open(path)?;
let mut wtxn = env.write_txn()?;
let main = env.create_poly_database(&mut wtxn, Some(MAIN))?;
let word_docids = env.create_database(&mut wtxn, Some(WORD_DOCIDS))?;
+ let external_documents_ids =
+ env.create_database(&mut wtxn, Some(EXTERNAL_DOCUMENTS_IDS))?;
let exact_word_docids = env.create_database(&mut wtxn, Some(EXACT_WORD_DOCIDS))?;
let word_prefix_docids = env.create_database(&mut wtxn, Some(WORD_PREFIX_DOCIDS))?;
let exact_word_prefix_docids =
@@ -204,10 +198,6 @@ impl Index {
env.create_database(&mut wtxn, Some(WORD_PAIR_PROXIMITY_DOCIDS))?;
let script_language_docids =
env.create_database(&mut wtxn, Some(SCRIPT_LANGUAGE_DOCIDS))?;
- let word_prefix_pair_proximity_docids =
- env.create_database(&mut wtxn, Some(WORD_PREFIX_PAIR_PROXIMITY_DOCIDS))?;
- let prefix_word_pair_proximity_docids =
- env.create_database(&mut wtxn, Some(PREFIX_WORD_PAIR_PROXIMITY_DOCIDS))?;
let word_position_docids = env.create_database(&mut wtxn, Some(WORD_POSITION_DOCIDS))?;
let word_fid_docids = env.create_database(&mut wtxn, Some(WORD_FIELD_ID_DOCIDS))?;
let field_id_word_count_docids =
@@ -241,14 +231,13 @@ impl Index {
Ok(Index {
env,
main,
+ external_documents_ids,
word_docids,
exact_word_docids,
word_prefix_docids,
exact_word_prefix_docids,
word_pair_proximity_docids,
script_language_docids,
- word_prefix_pair_proximity_docids,
- prefix_word_pair_proximity_docids,
word_position_docids,
word_fid_docids,
word_prefix_position_docids,
@@ -372,29 +361,6 @@ impl Index {
Ok(count.unwrap_or_default())
}
- /* deleted documents ids */
-
- /// Writes the soft deleted documents ids.
- pub(crate) fn put_soft_deleted_documents_ids(
- &self,
- wtxn: &mut RwTxn,
- docids: &RoaringBitmap,
- ) -> heed::Result<()> {
- self.main.put::<_, Str, RoaringBitmapCodec>(
- wtxn,
- main_key::SOFT_DELETED_DOCUMENTS_IDS_KEY,
- docids,
- )
- }
-
- /// Returns the soft deleted documents ids.
-    pub(crate) fn soft_deleted_documents_ids(&self, rtxn: &RoTxn) -> heed::Result<RoaringBitmap> {
- Ok(self
- .main
- .get::<_, Str, RoaringBitmapCodec>(rtxn, main_key::SOFT_DELETED_DOCUMENTS_IDS_KEY)?
- .unwrap_or_default())
- }
-
/* primary key */
/// Writes the documents primary key, this is the field name that is used to store the id.
@@ -415,45 +381,10 @@ impl Index {
/* external documents ids */
- /// Writes the external documents ids and internal ids (i.e. `u32`).
- pub(crate) fn put_external_documents_ids(
- &self,
- wtxn: &mut RwTxn,
- external_documents_ids: &ExternalDocumentsIds<'_>,
- ) -> heed::Result<()> {
- let ExternalDocumentsIds { hard, soft, .. } = external_documents_ids;
- let hard = hard.as_fst().as_bytes();
- let soft = soft.as_fst().as_bytes();
- self.main.put::<_, Str, ByteSlice>(
- wtxn,
- main_key::HARD_EXTERNAL_DOCUMENTS_IDS_KEY,
- hard,
- )?;
- self.main.put::<_, Str, ByteSlice>(
- wtxn,
- main_key::SOFT_EXTERNAL_DOCUMENTS_IDS_KEY,
- soft,
- )?;
- Ok(())
- }
-
/// Returns the external documents ids map which associate the external ids
/// with the internal ids (i.e. `u32`).
-    pub fn external_documents_ids<'t>(&self, rtxn: &'t RoTxn) -> Result<ExternalDocumentsIds<'t>> {
- let hard =
- self.main.get::<_, Str, ByteSlice>(rtxn, main_key::HARD_EXTERNAL_DOCUMENTS_IDS_KEY)?;
- let soft =
- self.main.get::<_, Str, ByteSlice>(rtxn, main_key::SOFT_EXTERNAL_DOCUMENTS_IDS_KEY)?;
- let hard = match hard {
- Some(hard) => fst::Map::new(hard)?.map_data(Cow::Borrowed)?,
- None => fst::Map::default().map_data(Cow::Owned)?,
- };
- let soft = match soft {
- Some(soft) => fst::Map::new(soft)?.map_data(Cow::Borrowed)?,
- None => fst::Map::default().map_data(Cow::Owned)?,
- };
- let soft_deleted_docids = self.soft_deleted_documents_ids(rtxn)?;
- Ok(ExternalDocumentsIds::new(hard, soft, soft_deleted_docids))
+ pub fn external_documents_ids(&self) -> ExternalDocumentsIds {
+ ExternalDocumentsIds::new(self.external_documents_ids)
}
/* fields ids map */
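
Since `external_documents_ids()` now returns a thin wrapper around the dedicated `external-documents-ids` database instead of decoding FSTs out of the main database, a caller-side sketch might look like the following (the `get` method name and signature on `ExternalDocumentsIds` are assumptions; they are not shown in this diff):

    // Hedged sketch, not part of the patch.
    fn internal_id(index: &Index, rtxn: &heed::RoTxn, external_id: &str) -> heed::Result<Option<u32>> {
        // Look the external id up directly in the LMDB database, no FST involved anymore.
        index.external_documents_ids().get(rtxn, external_id)
    }
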
@@ -926,44 +857,6 @@ impl Index {
/* faceted documents ids */
- /// Writes the documents ids that are faceted under this field id for the given facet type.
- pub fn put_faceted_documents_ids(
- &self,
- wtxn: &mut RwTxn,
- field_id: FieldId,
- facet_type: FacetType,
- docids: &RoaringBitmap,
- ) -> heed::Result<()> {
- let key = match facet_type {
- FacetType::String => main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX,
- FacetType::Number => main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX,
- };
-        let mut buffer = vec![0u8; key.len() + size_of::<FieldId>()];
- buffer[..key.len()].copy_from_slice(key.as_bytes());
- buffer[key.len()..].copy_from_slice(&field_id.to_be_bytes());
- self.main.put::<_, ByteSlice, RoaringBitmapCodec>(wtxn, &buffer, docids)
- }
-
- /// Retrieve all the documents ids that are faceted under this field id for the given facet type.
- pub fn faceted_documents_ids(
- &self,
- rtxn: &RoTxn,
- field_id: FieldId,
- facet_type: FacetType,
-    ) -> heed::Result<RoaringBitmap> {
- let key = match facet_type {
- FacetType::String => main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX,
- FacetType::Number => main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX,
- };
-        let mut buffer = vec![0u8; key.len() + size_of::<FieldId>()];
- buffer[..key.len()].copy_from_slice(key.as_bytes());
- buffer[key.len()..].copy_from_slice(&field_id.to_be_bytes());
- match self.main.get::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &buffer)? {
- Some(docids) => Ok(docids),
- None => Ok(RoaringBitmap::new()),
- }
- }
-
/// Retrieve all the documents which contain this field id set as null
pub fn null_faceted_documents_ids(
&self,
@@ -1246,12 +1139,7 @@ impl Index {
rtxn: &'t RoTxn,
        ids: impl IntoIterator<Item = DocumentId> + 'a,
    ) -> Result<impl Iterator<Item = Result<(DocumentId, obkv::KvReaderU16<'t>)>> + 'a> {
- let soft_deleted_documents = self.soft_deleted_documents_ids(rtxn)?;
-
Ok(ids.into_iter().map(move |id| {
- if soft_deleted_documents.contains(id) {
- return Err(UserError::AccessingSoftDeletedDocument { document_id: id })?;
- }
let kv = self
.documents
.get(rtxn, &BEU32::new(id))?
@@ -1277,6 +1165,36 @@ impl Index {
self.iter_documents(rtxn, self.documents_ids(rtxn)?)
}
+ pub fn external_id_of<'a, 't: 'a>(
+ &'a self,
+ rtxn: &'t RoTxn,
+        ids: impl IntoIterator<Item = DocumentId> + 'a,
+    ) -> Result<impl IntoIterator<Item = Result<String>> + 'a> {
+ let fields = self.fields_ids_map(rtxn)?;
+
+ // uses precondition "never called on an empty index"
+ let primary_key = self.primary_key(rtxn)?.ok_or(InternalError::DatabaseMissingEntry {
+ db_name: db_name::MAIN,
+ key: Some(main_key::PRIMARY_KEY_KEY),
+ })?;
+ let primary_key = PrimaryKey::new(primary_key, &fields).ok_or_else(|| {
+ InternalError::FieldIdMapMissingEntry(crate::FieldIdMapMissingEntry::FieldName {
+ field_name: primary_key.to_owned(),
+ process: "external_id_of",
+ })
+ })?;
+ Ok(self.iter_documents(rtxn, ids)?.map(move |entry| -> Result<_> {
+ let (_docid, obkv) = entry?;
+ match primary_key.document_id(&obkv, &fields)? {
+ Ok(document_id) => Ok(document_id),
+ Err(_) => Err(InternalError::DocumentsError(
+ crate::documents::Error::InvalidDocumentFormat,
+ )
+ .into()),
+ }
+ }))
+ }
+
pub fn facets_distribution<'a>(&'a self, rtxn: &'a RoTxn) -> FacetDistribution<'a> {
FacetDistribution::new(rtxn, self)
}
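
One more hedged sketch, this time for the new `external_id_of` iterator added above: a helper that maps a set of internal docids back to the user-facing document ids. Only `external_id_of` comes from this diff; the imports and the helper itself are illustrative.

    // Hedged sketch, not part of the patch.
    use heed::RoTxn;
    use roaring::RoaringBitmap;
    use crate::{Index, Result};

    fn external_ids(index: &Index, rtxn: &RoTxn, docids: &RoaringBitmap) -> Result<Vec<String>> {
        // Each internal id is resolved through the primary key field of its document.
        index.external_id_of(rtxn, docids.iter())?.into_iter().collect()
    }
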
@@ -1477,14 +1395,10 @@ impl Index {
rtxn: &RoTxn,
key: &(Script, Language),
    ) -> heed::Result<Option<RoaringBitmap>> {
- let soft_deleted_documents = self.soft_deleted_documents_ids(rtxn)?;
- let doc_ids = self.script_language_docids.get(rtxn, key)?;
- Ok(doc_ids.map(|ids| ids - soft_deleted_documents))
+ self.script_language_docids.get(rtxn, key)
}
    pub fn script_language(&self, rtxn: &RoTxn) -> heed::Result<HashMap<Script, Vec<Language>>> {
- let soft_deleted_documents = self.soft_deleted_documents_ids(rtxn)?;
-
        let mut script_language: HashMap<Script, Vec<Language>>