diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index 4ce2db180..1a7f49aae 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -23,7 +23,8 @@ A clear and concise description of what you expected to happen. **Screenshots** If applicable, add screenshots to help explain your problem. -**Meilisearch version:** [e.g. v0.20.0] +**Meilisearch version:** +[e.g. v0.20.0] **Additional context** Additional information that may be relevant to the issue. diff --git a/.github/ISSUE_TEMPLATE/sprint_issue.md b/.github/ISSUE_TEMPLATE/sprint_issue.md new file mode 100644 index 000000000..f6303e362 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/sprint_issue.md @@ -0,0 +1,34 @@ +--- +name: New sprint issue +about: ⚠️ Should only be used by the engine team ⚠️ +title: '' +labels: '' +assignees: '' + +--- + +Related product team resources: [roadmap card]() (_internal only_) and [PRD]() (_internal only_) +Related product discussion: +Related spec: WIP + +## Motivation + + + +## Usage + + + +Refer to the final spec to know the details and the final decisions about the usage. + +## TODO + + + +- [ ] Release a prototype +- [ ] If prototype validated, merge changes into `main` +- [ ] Update the spec + +## Impacted teams + + diff --git a/.github/workflows/manual_benchmarks.yml b/.github/workflows/benchmarks-manual.yml similarity index 99% rename from .github/workflows/manual_benchmarks.yml rename to .github/workflows/benchmarks-manual.yml index 76c6fe0fe..44793fc17 100644 --- a/.github/workflows/manual_benchmarks.yml +++ b/.github/workflows/benchmarks-manual.yml @@ -1,4 +1,4 @@ -name: Benchmarks +name: Benchmarks (manual) on: workflow_dispatch: diff --git a/.github/workflows/push_benchmarks_indexing.yml b/.github/workflows/benchmarks-push-indexing.yml similarity index 98% rename from .github/workflows/push_benchmarks_indexing.yml rename to .github/workflows/benchmarks-push-indexing.yml index 12f9f6eda..a966570e6 100644 --- a/.github/workflows/push_benchmarks_indexing.yml +++ b/.github/workflows/benchmarks-push-indexing.yml @@ -1,4 +1,4 @@ -name: Benchmarks indexing (push) +name: Benchmarks of indexing (push) on: push: diff --git a/.github/workflows/push_benchmarks_search_geo.yml b/.github/workflows/benchmarks-push-search-geo.yml similarity index 98% rename from .github/workflows/push_benchmarks_search_geo.yml rename to .github/workflows/benchmarks-push-search-geo.yml index 02661061f..1b5cacfd1 100644 --- a/.github/workflows/push_benchmarks_search_geo.yml +++ b/.github/workflows/benchmarks-push-search-geo.yml @@ -1,4 +1,4 @@ -name: Benchmarks search geo (push) +name: Benchmarks of search for geo (push) on: push: diff --git a/.github/workflows/push_benchmarks_search_songs.yml b/.github/workflows/benchmarks-push-search-songs.yml similarity index 98% rename from .github/workflows/push_benchmarks_search_songs.yml rename to .github/workflows/benchmarks-push-search-songs.yml index 92684a907..02cd10472 100644 --- a/.github/workflows/push_benchmarks_search_songs.yml +++ b/.github/workflows/benchmarks-push-search-songs.yml @@ -1,4 +1,4 @@ -name: Benchmarks search songs (push) +name: Benchmarks of search for songs (push) on: push: diff --git a/.github/workflows/push_benchmarks_search_wiki.yml b/.github/workflows/benchmarks-push-search-wiki.yml similarity index 98% rename from .github/workflows/push_benchmarks_search_wiki.yml rename to .github/workflows/benchmarks-push-search-wiki.yml index 0f6511337..455aaa95d 100644 --- a/.github/workflows/push_benchmarks_search_wiki.yml +++ b/.github/workflows/benchmarks-push-search-wiki.yml @@ -1,4 +1,4 @@ -name: Benchmarks search wikipedia articles (push) +name: Benchmarks of search for Wikipedia articles (push) on: push: diff --git a/.github/workflows/create-issue-dependencies.yml b/.github/workflows/create-issue-dependencies.yml deleted file mode 100644 index 3ad1be910..000000000 --- a/.github/workflows/create-issue-dependencies.yml +++ /dev/null @@ -1,28 +0,0 @@ -name: Create issue to upgrade dependencies -on: - schedule: - # Run the first of the month, every 3 month - - cron: '0 0 1 */3 *' - workflow_dispatch: - -jobs: - create-issue: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - name: Create an issue - uses: actions-ecosystem/action-create-issue@v1 - with: - github_token: ${{ secrets.MEILI_BOT_GH_PAT }} - title: Upgrade dependencies - body: | - This issue is about updating Meilisearch dependencies: - - [ ] Cargo toml dependencies of Meilisearch; but also the main engine-team repositories that Meilisearch depends on (charabia, heed...) - - [ ] If new Rust versions have been released, update the Rust version in the Clippy job of this [GitHub Action file](./.github/workflows/rust.yml) - - ⚠️ To avoid last minute bugs, this issue should only be done at the beginning of the sprint! - - The GitHub action dependencies are managed by [Dependabot](./.github/dependabot.yml) - labels: | - dependencies - maintenance diff --git a/.github/workflows/dependency-issue.yml b/.github/workflows/dependency-issue.yml new file mode 100644 index 000000000..941cc4e53 --- /dev/null +++ b/.github/workflows/dependency-issue.yml @@ -0,0 +1,24 @@ +name: Create issue to upgrade dependencies + +on: + schedule: + # Run the first of the month, every 3 month + - cron: '0 0 1 */3 *' + workflow_dispatch: + +jobs: + create-issue: + runs-on: ubuntu-latest + env: + ISSUE_TEMPLATE: issue-template.md + GH_TOKEN: ${{ secrets.MEILI_BOT_GH_PAT }} + steps: + - uses: actions/checkout@v3 + - name: Download the issue template + run: curl -s https://raw.githubusercontent.com/meilisearch/engine-team/main/issue-templates/dependency-issue.md > $ISSUE_TEMPLATE + - name: Create issue + run: | + gh issue create \ + --title 'Upgrade dependencies' \ + --label 'dependencies,maintenance' \ + --body-file $ISSUE_TEMPLATE diff --git a/.github/workflows/flaky.yml b/.github/workflows/flaky-tests.yml similarity index 100% rename from .github/workflows/flaky.yml rename to .github/workflows/flaky-tests.yml diff --git a/.github/workflows/publish-deb-brew-pkg.yml b/.github/workflows/publish-apt-brew-pkg.yml similarity index 94% rename from .github/workflows/publish-deb-brew-pkg.yml rename to .github/workflows/publish-apt-brew-pkg.yml index 13b08d071..e24d8ccf1 100644 --- a/.github/workflows/publish-deb-brew-pkg.yml +++ b/.github/workflows/publish-apt-brew-pkg.yml @@ -1,4 +1,4 @@ -name: Publish to APT repository & Homebrew +name: Publish to APT & Homebrew on: release: @@ -35,7 +35,7 @@ jobs: - name: Build deb package run: cargo deb -p meilisearch -o target/debian/meilisearch.deb - name: Upload debian pkg to release - uses: svenstaro/upload-release-action@2.4.0 + uses: svenstaro/upload-release-action@2.5.0 with: repo_token: ${{ secrets.MEILI_BOT_GH_PAT }} file: target/debian/meilisearch.deb diff --git a/.github/workflows/publish-binaries.yml b/.github/workflows/publish-binaries.yml index 76dde74d1..76558f3b1 100644 --- a/.github/workflows/publish-binaries.yml +++ b/.github/workflows/publish-binaries.yml @@ -1,3 +1,5 @@ +name: Publish binaries to GitHub release + on: workflow_dispatch: schedule: @@ -5,8 +7,6 @@ on: release: types: [published] -name: Publish binaries to release - jobs: check-version: name: Check the version validity @@ -54,7 +54,7 @@ jobs: # No need to upload binaries for dry run (cron) - name: Upload binaries to release if: github.event_name == 'release' - uses: svenstaro/upload-release-action@2.4.0 + uses: svenstaro/upload-release-action@2.5.0 with: repo_token: ${{ secrets.MEILI_BOT_GH_PAT }} file: target/release/meilisearch @@ -87,7 +87,7 @@ jobs: # No need to upload binaries for dry run (cron) - name: Upload binaries to release if: github.event_name == 'release' - uses: svenstaro/upload-release-action@2.4.0 + uses: svenstaro/upload-release-action@2.5.0 with: repo_token: ${{ secrets.MEILI_BOT_GH_PAT }} file: target/release/${{ matrix.artifact_name }} @@ -121,7 +121,7 @@ jobs: - name: Upload the binary to release # No need to upload binaries for dry run (cron) if: github.event_name == 'release' - uses: svenstaro/upload-release-action@2.4.0 + uses: svenstaro/upload-release-action@2.5.0 with: repo_token: ${{ secrets.MEILI_BOT_GH_PAT }} file: target/${{ matrix.target }}/release/meilisearch @@ -183,7 +183,7 @@ jobs: - name: Upload the binary to release # No need to upload binaries for dry run (cron) if: github.event_name == 'release' - uses: svenstaro/upload-release-action@2.4.0 + uses: svenstaro/upload-release-action@2.5.0 with: repo_token: ${{ secrets.MEILI_BOT_GH_PAT }} file: target/${{ matrix.target }}/release/meilisearch diff --git a/.github/workflows/publish-docker-images.yml b/.github/workflows/publish-docker-images.yml index 39bab4d0d..9ceeaaaa4 100644 --- a/.github/workflows/publish-docker-images.yml +++ b/.github/workflows/publish-docker-images.yml @@ -1,4 +1,5 @@ ---- +name: Publish images to Docker Hub + on: push: # Will run for every tag pushed except `latest` @@ -12,8 +13,6 @@ on: - cron: '0 23 * * *' # Every day at 11:00pm workflow_dispatch: -name: Publish tagged images to Docker Hub - jobs: docker: runs-on: docker diff --git a/.github/workflows/rust.yml b/.github/workflows/test-suite.yml similarity index 50% rename from .github/workflows/rust.yml rename to .github/workflows/test-suite.yml index 5f783ca9e..820fcb656 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/test-suite.yml @@ -1,4 +1,4 @@ -name: Rust +name: Test suite on: workflow_dispatch: @@ -25,36 +25,35 @@ jobs: # Use ubuntu-18.04 to compile with glibc 2.27, which are the production expectations image: ubuntu:18.04 steps: - - uses: actions/checkout@v3 - - name: Install needed dependencies - run: | - apt-get update && apt-get install -y curl - apt-get install build-essential -y - - name: Run test with Rust stable - if: github.event_name != 'schedule' - uses: actions-rs/toolchain@v1 - with: - toolchain: stable - override: true - - name: Run test with Rust nightly - if: github.event_name == 'schedule' - uses: actions-rs/toolchain@v1 - with: - toolchain: nightly - override: true - # Disable cache due to disk space issues with Windows workers in CI - # - name: Cache dependencies - # uses: Swatinem/rust-cache@v2.2.0 - - name: Run cargo check without any default features - uses: actions-rs/cargo@v1 - with: - command: build - args: --locked --release --no-default-features --all - - name: Run cargo test - uses: actions-rs/cargo@v1 - with: - command: test - args: --locked --release --all + - uses: actions/checkout@v3 + - name: Install needed dependencies + run: | + apt-get update && apt-get install -y curl + apt-get install build-essential -y + - name: Run test with Rust stable + if: github.event_name != 'schedule' + uses: actions-rs/toolchain@v1 + with: + toolchain: stable + override: true + - name: Run test with Rust nightly + if: github.event_name == 'schedule' + uses: actions-rs/toolchain@v1 + with: + toolchain: nightly + override: true + - name: Cache dependencies + uses: Swatinem/rust-cache@v2.2.1 + - name: Run cargo check without any default features + uses: actions-rs/cargo@v1 + with: + command: build + args: --locked --release --no-default-features --all + - name: Run cargo test + uses: actions-rs/cargo@v1 + with: + command: test + args: --locked --release --all test-others: name: Tests on ${{ matrix.os }} @@ -64,19 +63,47 @@ jobs: matrix: os: [macos-12, windows-2022] steps: - - uses: actions/checkout@v3 -# - name: Cache dependencies -# uses: Swatinem/rust-cache@v2.2.0 - - name: Run cargo check without any default features - uses: actions-rs/cargo@v1 - with: - command: build - args: --locked --release --no-default-features --all - - name: Run cargo test - uses: actions-rs/cargo@v1 - with: - command: test - args: --locked --release --all + - uses: actions/checkout@v3 + - name: Cache dependencies + uses: Swatinem/rust-cache@v2.2.1 + - name: Run cargo check without any default features + uses: actions-rs/cargo@v1 + with: + command: build + args: --locked --release --no-default-features --all + - name: Run cargo test + uses: actions-rs/cargo@v1 + with: + command: test + args: --locked --release --all + + test-all-features: + name: Tests all features on cron schedule only + runs-on: ubuntu-latest + container: + # Use ubuntu-18.04 to compile with glibc 2.27, which are the production expectations + image: ubuntu:18.04 + if: github.event_name == 'schedule' + steps: + - uses: actions/checkout@v3 + - name: Install needed dependencies + run: | + apt-get update + apt-get install --assume-yes build-essential curl + - uses: actions-rs/toolchain@v1 + with: + toolchain: stable + override: true + - name: Run cargo build with all features + uses: actions-rs/cargo@v1 + with: + command: build + args: --workspace --locked --release --all-features + - name: Run cargo test with all features + uses: actions-rs/cargo@v1 + with: + command: test + args: --workspace --locked --release --all-features # We run tests in debug also, to make sure that the debug_assertions are hit test-debug: @@ -95,8 +122,8 @@ jobs: with: toolchain: stable override: true - # - name: Cache dependencies - # uses: Swatinem/rust-cache@v2.2.0 + - name: Cache dependencies + uses: Swatinem/rust-cache@v2.2.1 - name: Run tests in debug uses: actions-rs/cargo@v1 with: @@ -114,8 +141,8 @@ jobs: toolchain: 1.67.0 override: true components: clippy - # - name: Cache dependencies - # uses: Swatinem/rust-cache@v2.2.0 + - name: Cache dependencies + uses: Swatinem/rust-cache@v2.2.1 - name: Run cargo clippy uses: actions-rs/cargo@v1 with: @@ -134,8 +161,8 @@ jobs: toolchain: nightly override: true components: rustfmt - # - name: Cache dependencies - # uses: Swatinem/rust-cache@v2.2.0 + - name: Cache dependencies + uses: Swatinem/rust-cache@v2.2.1 - name: Run cargo fmt # Since we never ran the `build.rs` script in the benchmark directory we are missing one auto-generated import file. # Since we want to trigger (and fail) this action as fast as possible, instead of building the benchmark crate diff --git a/.github/workflows/uffizzi-build.yml b/.github/workflows/uffizzi-build.yml index 934d91522..1e2ae7018 100644 --- a/.github/workflows/uffizzi-build.yml +++ b/.github/workflows/uffizzi-build.yml @@ -23,7 +23,7 @@ jobs: target: x86_64-unknown-linux-musl - name: Cache dependencies - uses: Swatinem/rust-cache@v2.2.0 + uses: Swatinem/rust-cache@v2.2.1 - name: Run cargo check without any default features uses: actions-rs/cargo@v1 @@ -46,14 +46,14 @@ jobs: - name: Docker metadata id: meta - uses: docker/metadata-action@v3 + uses: docker/metadata-action@v4 with: images: registry.uffizzi.com/${{ env.UUID_TAG }} tags: | type=raw,value=60d - name: Build Image - uses: docker/build-push-action@v3 + uses: docker/build-push-action@v4 with: context: ./ file: .github/uffizzi/Dockerfile diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index b402985e3..d70062c97 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -31,6 +31,7 @@ mod uuid_codec; pub type Result = std::result::Result; pub type TaskId = u32; +use std::collections::HashMap; use std::ops::{Bound, RangeBounds}; use std::path::{Path, PathBuf}; use std::sync::atomic::AtomicBool; @@ -43,7 +44,7 @@ pub use error::Error; use file_store::FileStore; use meilisearch_types::error::ResponseError; use meilisearch_types::heed::types::{OwnedType, SerdeBincode, SerdeJson, Str}; -use meilisearch_types::heed::{self, Database, Env, RoTxn}; +use meilisearch_types::heed::{self, Database, Env, RoTxn, RwTxn}; use meilisearch_types::milli::documents::DocumentsBatchBuilder; use meilisearch_types::milli::update::IndexerConfig; use meilisearch_types::milli::{self, CboRoaringBitmapCodec, Index, RoaringBitmapCodec, BEU32}; @@ -882,127 +883,8 @@ impl IndexScheduler { /// Register a new task coming from a dump in the scheduler. /// By taking a mutable ref we're pretty sure no one will ever import a dump while actix is running. - pub fn register_dumped_task( - &mut self, - task: TaskDump, - content_file: Option>, - ) -> Result { - // Currently we don't need to access the tasks queue while loading a dump thus I can block everything. - let mut wtxn = self.env.write_txn()?; - - let content_uuid = match content_file { - Some(content_file) if task.status == Status::Enqueued => { - let (uuid, mut file) = self.create_update_file()?; - let mut builder = DocumentsBatchBuilder::new(file.as_file_mut()); - for doc in content_file { - builder.append_json_object(&doc?)?; - } - builder.into_inner()?; - file.persist()?; - - Some(uuid) - } - // If the task isn't `Enqueued` then just generate a recognisable `Uuid` - // in case we try to open it later. - _ if task.status != Status::Enqueued => Some(Uuid::nil()), - _ => None, - }; - - let task = Task { - uid: task.uid, - enqueued_at: task.enqueued_at, - started_at: task.started_at, - finished_at: task.finished_at, - error: task.error, - canceled_by: task.canceled_by, - details: task.details, - status: task.status, - kind: match task.kind { - KindDump::DocumentImport { - primary_key, - method, - documents_count, - allow_index_creation, - } => KindWithContent::DocumentAdditionOrUpdate { - index_uid: task.index_uid.ok_or(Error::CorruptedDump)?, - primary_key, - method, - content_file: content_uuid.ok_or(Error::CorruptedDump)?, - documents_count, - allow_index_creation, - }, - KindDump::DocumentDeletion { documents_ids } => KindWithContent::DocumentDeletion { - documents_ids, - index_uid: task.index_uid.ok_or(Error::CorruptedDump)?, - }, - KindDump::DocumentClear => KindWithContent::DocumentClear { - index_uid: task.index_uid.ok_or(Error::CorruptedDump)?, - }, - KindDump::Settings { settings, is_deletion, allow_index_creation } => { - KindWithContent::SettingsUpdate { - index_uid: task.index_uid.ok_or(Error::CorruptedDump)?, - new_settings: settings, - is_deletion, - allow_index_creation, - } - } - KindDump::IndexDeletion => KindWithContent::IndexDeletion { - index_uid: task.index_uid.ok_or(Error::CorruptedDump)?, - }, - KindDump::IndexCreation { primary_key } => KindWithContent::IndexCreation { - index_uid: task.index_uid.ok_or(Error::CorruptedDump)?, - primary_key, - }, - KindDump::IndexUpdate { primary_key } => KindWithContent::IndexUpdate { - index_uid: task.index_uid.ok_or(Error::CorruptedDump)?, - primary_key, - }, - KindDump::IndexSwap { swaps } => KindWithContent::IndexSwap { swaps }, - KindDump::TaskCancelation { query, tasks } => { - KindWithContent::TaskCancelation { query, tasks } - } - KindDump::TasksDeletion { query, tasks } => { - KindWithContent::TaskDeletion { query, tasks } - } - KindDump::DumpCreation { keys, instance_uid } => { - KindWithContent::DumpCreation { keys, instance_uid } - } - KindDump::SnapshotCreation => KindWithContent::SnapshotCreation, - }, - }; - - self.all_tasks.put(&mut wtxn, &BEU32::new(task.uid), &task)?; - - for index in task.indexes() { - self.update_index(&mut wtxn, index, |bitmap| { - bitmap.insert(task.uid); - })?; - } - - self.update_status(&mut wtxn, task.status, |bitmap| { - bitmap.insert(task.uid); - })?; - - self.update_kind(&mut wtxn, task.kind.as_kind(), |bitmap| { - (bitmap.insert(task.uid)); - })?; - - utils::insert_task_datetime(&mut wtxn, self.enqueued_at, task.enqueued_at, task.uid)?; - - // we can't override the started_at & finished_at, so we must only set it if the tasks is finished and won't change - if matches!(task.status, Status::Succeeded | Status::Failed | Status::Canceled) { - if let Some(started_at) = task.started_at { - utils::insert_task_datetime(&mut wtxn, self.started_at, started_at, task.uid)?; - } - if let Some(finished_at) = task.finished_at { - utils::insert_task_datetime(&mut wtxn, self.finished_at, finished_at, task.uid)?; - } - } - - wtxn.commit()?; - self.wake_up.signal(); - - Ok(task) + pub fn register_dumped_task(&mut self) -> Result { + Dump::new(self) } /// Create a new index without any associated task. @@ -1237,6 +1119,184 @@ impl IndexScheduler { } } +pub struct Dump<'a> { + index_scheduler: &'a IndexScheduler, + wtxn: RwTxn<'a, 'a>, + + indexes: HashMap, + statuses: HashMap, + kinds: HashMap, +} + +impl<'a> Dump<'a> { + pub(crate) fn new(index_scheduler: &'a mut IndexScheduler) -> Result { + // While loading a dump no one should be able to access the scheduler thus I can block everything. + let wtxn = index_scheduler.env.write_txn()?; + + Ok(Dump { + index_scheduler, + wtxn, + indexes: HashMap::new(), + statuses: HashMap::new(), + kinds: HashMap::new(), + }) + } + + /// Register a new task coming from a dump in the scheduler. + /// By taking a mutable ref we're pretty sure no one will ever import a dump while actix is running. + pub fn register_dumped_task( + &mut self, + task: TaskDump, + content_file: Option>, + ) -> Result { + let content_uuid = match content_file { + Some(content_file) if task.status == Status::Enqueued => { + let (uuid, mut file) = self.index_scheduler.create_update_file()?; + let mut builder = DocumentsBatchBuilder::new(file.as_file_mut()); + for doc in content_file { + builder.append_json_object(&doc?)?; + } + builder.into_inner()?; + file.persist()?; + + Some(uuid) + } + // If the task isn't `Enqueued` then just generate a recognisable `Uuid` + // in case we try to open it later. + _ if task.status != Status::Enqueued => Some(Uuid::nil()), + _ => None, + }; + + let task = Task { + uid: task.uid, + enqueued_at: task.enqueued_at, + started_at: task.started_at, + finished_at: task.finished_at, + error: task.error, + canceled_by: task.canceled_by, + details: task.details, + status: task.status, + kind: match task.kind { + KindDump::DocumentImport { + primary_key, + method, + documents_count, + allow_index_creation, + } => KindWithContent::DocumentAdditionOrUpdate { + index_uid: task.index_uid.ok_or(Error::CorruptedDump)?, + primary_key, + method, + content_file: content_uuid.ok_or(Error::CorruptedDump)?, + documents_count, + allow_index_creation, + }, + KindDump::DocumentDeletion { documents_ids } => KindWithContent::DocumentDeletion { + documents_ids, + index_uid: task.index_uid.ok_or(Error::CorruptedDump)?, + }, + KindDump::DocumentClear => KindWithContent::DocumentClear { + index_uid: task.index_uid.ok_or(Error::CorruptedDump)?, + }, + KindDump::Settings { settings, is_deletion, allow_index_creation } => { + KindWithContent::SettingsUpdate { + index_uid: task.index_uid.ok_or(Error::CorruptedDump)?, + new_settings: settings, + is_deletion, + allow_index_creation, + } + } + KindDump::IndexDeletion => KindWithContent::IndexDeletion { + index_uid: task.index_uid.ok_or(Error::CorruptedDump)?, + }, + KindDump::IndexCreation { primary_key } => KindWithContent::IndexCreation { + index_uid: task.index_uid.ok_or(Error::CorruptedDump)?, + primary_key, + }, + KindDump::IndexUpdate { primary_key } => KindWithContent::IndexUpdate { + index_uid: task.index_uid.ok_or(Error::CorruptedDump)?, + primary_key, + }, + KindDump::IndexSwap { swaps } => KindWithContent::IndexSwap { swaps }, + KindDump::TaskCancelation { query, tasks } => { + KindWithContent::TaskCancelation { query, tasks } + } + KindDump::TasksDeletion { query, tasks } => { + KindWithContent::TaskDeletion { query, tasks } + } + KindDump::DumpCreation { keys, instance_uid } => { + KindWithContent::DumpCreation { keys, instance_uid } + } + KindDump::SnapshotCreation => KindWithContent::SnapshotCreation, + }, + }; + + self.index_scheduler.all_tasks.put(&mut self.wtxn, &BEU32::new(task.uid), &task)?; + + for index in task.indexes() { + match self.indexes.get_mut(index) { + Some(bitmap) => { + bitmap.insert(task.uid); + } + None => { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(task.uid); + self.indexes.insert(index.to_string(), bitmap); + } + }; + } + + utils::insert_task_datetime( + &mut self.wtxn, + self.index_scheduler.enqueued_at, + task.enqueued_at, + task.uid, + )?; + + // we can't override the started_at & finished_at, so we must only set it if the tasks is finished and won't change + if matches!(task.status, Status::Succeeded | Status::Failed | Status::Canceled) { + if let Some(started_at) = task.started_at { + utils::insert_task_datetime( + &mut self.wtxn, + self.index_scheduler.started_at, + started_at, + task.uid, + )?; + } + if let Some(finished_at) = task.finished_at { + utils::insert_task_datetime( + &mut self.wtxn, + self.index_scheduler.finished_at, + finished_at, + task.uid, + )?; + } + } + + self.statuses.entry(task.status).or_insert(RoaringBitmap::new()).insert(task.uid); + self.kinds.entry(task.kind.as_kind()).or_insert(RoaringBitmap::new()).insert(task.uid); + + Ok(task) + } + + /// Commit all the changes and exit the importing dump state + pub fn finish(mut self) -> Result<()> { + for (index, bitmap) in self.indexes { + self.index_scheduler.index_tasks.put(&mut self.wtxn, &index, &bitmap)?; + } + for (status, bitmap) in self.statuses { + self.index_scheduler.put_status(&mut self.wtxn, status, &bitmap)?; + } + for (kind, bitmap) in self.kinds { + self.index_scheduler.put_kind(&mut self.wtxn, kind, &bitmap)?; + } + + self.wtxn.commit()?; + self.index_scheduler.wake_up.signal(); + + Ok(()) + } +} + /// The outcome of calling the [`IndexScheduler::tick`] function. pub enum TickOutcome { /// The scheduler should immediately attempt another `tick`. diff --git a/meilisearch/src/lib.rs b/meilisearch/src/lib.rs index 13c236983..98e754e67 100644 --- a/meilisearch/src/lib.rs +++ b/meilisearch/src/lib.rs @@ -367,12 +367,14 @@ fn import_dump( log::info!("All documents successfully imported."); } + let mut index_scheduler_dump = index_scheduler.register_dumped_task()?; + // 4. Import the tasks. for ret in dump_reader.tasks()? { let (task, file) = ret?; - index_scheduler.register_dumped_task(task, file)?; + index_scheduler_dump.register_dumped_task(task, file)?; } - Ok(()) + Ok(index_scheduler_dump.finish()?) } pub fn configure_data( diff --git a/meilisearch/tests/documents/add_documents.rs b/meilisearch/tests/documents/add_documents.rs index 612a2cdb6..164d68582 100644 --- a/meilisearch/tests/documents/add_documents.rs +++ b/meilisearch/tests/documents/add_documents.rs @@ -279,6 +279,81 @@ async fn add_csv_document() { "###); } +#[actix_rt::test] +async fn add_csv_document_with_types() { + let server = Server::new().await; + let index = server.index("pets"); + + let document = "#id:number,name:string,race:string,age:number,cute:boolean +0,jean,bernese mountain,2.5,true +1,,,, +2,lilou,pug,-2,false"; + + let (response, code) = index.raw_update_documents(document, Some("text/csv"), "").await; + snapshot!(code, @"202 Accepted"); + snapshot!(json_string!(response, { ".enqueuedAt" => "[date]" }), @r###" + { + "taskUid": 0, + "indexUid": "pets", + "status": "enqueued", + "type": "documentAdditionOrUpdate", + "enqueuedAt": "[date]" + } + "###); + let response = index.wait_task(response["taskUid"].as_u64().unwrap()).await; + snapshot!(json_string!(response, { ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]", ".duration" => "[duration]" }), @r###" + { + "uid": 0, + "indexUid": "pets", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 3, + "indexedDocuments": 3 + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + let (documents, code) = index.get_all_documents(GetAllDocumentsOptions::default()).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(documents), @r###" + { + "results": [ + { + "#id": 0, + "name": "jean", + "race": "bernese mountain", + "age": 2.5, + "cute": true + }, + { + "#id": 1, + "name": null, + "race": null, + "age": null, + "cute": null + }, + { + "#id": 2, + "name": "lilou", + "race": "pug", + "age": -2, + "cute": false + } + ], + "offset": 0, + "limit": 20, + "total": 3 + } + "###); +} + #[actix_rt::test] async fn add_csv_document_with_custom_delimiter() { let server = Server::new().await; @@ -343,6 +418,40 @@ async fn add_csv_document_with_custom_delimiter() { "###); } +#[actix_rt::test] +async fn add_csv_document_with_types_error() { + let server = Server::new().await; + let index = server.index("pets"); + + let document = "#id:number,a:boolean,b:number +0,doggo,1"; + + let (response, code) = index.raw_update_documents(document, Some("text/csv"), "").await; + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response, { ".enqueuedAt" => "[date]" }), @r###" + { + "message": "The `csv` payload provided is malformed: `Error parsing boolean \"doggo\" at line 1: provided string was not `true` or `false``.", + "code": "malformed_payload", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#malformed_payload" + } + "###); + + let document = "#id:number,a:boolean,b:number +0,true,doggo"; + + let (response, code) = index.raw_update_documents(document, Some("text/csv"), "").await; + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response, { ".enqueuedAt" => "[date]" }), @r###" + { + "message": "The `csv` payload provided is malformed: `Error parsing number \"doggo\" at line 1: invalid float literal`.", + "code": "malformed_payload", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#malformed_payload" + } + "###); +} + /// any other content-type is must be refused #[actix_rt::test] async fn error_add_documents_test_bad_content_types() { diff --git a/milli/src/asc_desc.rs b/milli/src/asc_desc.rs index bbc49ea7d..bde0dd440 100644 --- a/milli/src/asc_desc.rs +++ b/milli/src/asc_desc.rs @@ -81,6 +81,8 @@ impl FromStr for Member { if is_reserved_keyword(text) || text.starts_with("_geoRadius(") || text.starts_with("_geoBoundingBox(") + || text.starts_with("_geo(") + || text.starts_with("_geoDistance(") { return Err(AscDescError::ReservedKeyword { name: text.to_string() })?; } @@ -265,6 +267,13 @@ mod tests { ("_geoPoint(0, -180.000001):desc", GeoError(BadGeoError::Lng(-180.000001))), ("_geoPoint(159.256, 130):asc", GeoError(BadGeoError::Lat(159.256))), ("_geoPoint(12, -2021):desc", GeoError(BadGeoError::Lng(-2021.))), + ("_geo(12, -2021):asc", ReservedKeyword { name: S("_geo(12, -2021)") }), + ("_geo(12, -2021):desc", ReservedKeyword { name: S("_geo(12, -2021)") }), + ("_geoDistance(12, -2021):asc", ReservedKeyword { name: S("_geoDistance(12, -2021)") }), + ( + "_geoDistance(12, -2021):desc", + ReservedKeyword { name: S("_geoDistance(12, -2021)") }, + ), ]; for (req, expected_error) in invalid_req { diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index 1fa59168e..e5124f67f 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -114,14 +114,15 @@ impl DocumentsBatchBuilder { self.value_buffer.clear(); let value = &record[*i]; + let trimmed_value = value.trim(); match type_ { AllowedType::Number => { - if value.trim().is_empty() { + if trimmed_value.is_empty() { to_writer(&mut self.value_buffer, &Value::Null)?; - } else if let Ok(integer) = value.trim().parse::() { + } else if let Ok(integer) = trimmed_value.parse::() { to_writer(&mut self.value_buffer, &integer)?; } else { - match value.trim().parse::() { + match trimmed_value.parse::() { Ok(float) => { to_writer(&mut self.value_buffer, &float)?; } @@ -135,6 +136,24 @@ impl DocumentsBatchBuilder { } } } + AllowedType::Boolean => { + if trimmed_value.is_empty() { + to_writer(&mut self.value_buffer, &Value::Null)?; + } else { + match trimmed_value.parse::() { + Ok(bool) => { + to_writer(&mut self.value_buffer, &bool)?; + } + Err(error) => { + return Err(Error::ParseBool { + error, + line, + value: value.to_string(), + }); + } + } + } + } AllowedType::String => { if value.is_empty() { to_writer(&mut self.value_buffer, &Value::Null)?; @@ -173,6 +192,7 @@ impl DocumentsBatchBuilder { #[derive(Debug)] enum AllowedType { String, + Boolean, Number, } @@ -181,6 +201,7 @@ fn parse_csv_header(header: &str) -> (&str, AllowedType) { match header.rsplit_once(':') { Some((field_name, field_type)) => match field_type { "string" => (field_name, AllowedType::String), + "boolean" => (field_name, AllowedType::Boolean), "number" => (field_name, AllowedType::Number), // if the pattern isn't reconized, we keep the whole field. _otherwise => (header, AllowedType::String), diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs index da3a07942..43b31187d 100644 --- a/milli/src/documents/mod.rs +++ b/milli/src/documents/mod.rs @@ -3,7 +3,7 @@ mod enriched; mod reader; mod serde_impl; -use std::fmt::{self, Debug}; +use std::fmt::Debug; use std::io; use std::str::Utf8Error; @@ -87,71 +87,30 @@ impl DocumentsBatchIndex { } } -#[derive(Debug)] +#[derive(Debug, thiserror::Error)] pub enum Error { + #[error("Error parsing number {value:?} at line {line}: {error}")] ParseFloat { error: std::num::ParseFloatError, line: usize, value: String }, + #[error("Error parsing boolean {value:?} at line {line}: {error}")] + ParseBool { error: std::str::ParseBoolError, line: usize, value: String }, + #[error("Invalid document addition format, missing the documents batch index.")] InvalidDocumentFormat, + #[error("Invalid enriched data.")] InvalidEnrichedData, - InvalidUtf8(Utf8Error), - Csv(csv::Error), - Json(serde_json::Error), + #[error(transparent)] + InvalidUtf8(#[from] Utf8Error), + #[error(transparent)] + Csv(#[from] csv::Error), + #[error(transparent)] + Json(#[from] serde_json::Error), + #[error(transparent)] Serialize(serde_json::Error), - Grenad(grenad::Error), - Io(io::Error), + #[error(transparent)] + Grenad(#[from] grenad::Error), + #[error(transparent)] + Io(#[from] io::Error), } -impl From for Error { - fn from(e: csv::Error) -> Self { - Self::Csv(e) - } -} - -impl From for Error { - fn from(other: io::Error) -> Self { - Self::Io(other) - } -} - -impl From for Error { - fn from(other: serde_json::Error) -> Self { - Self::Json(other) - } -} - -impl From for Error { - fn from(other: grenad::Error) -> Self { - Self::Grenad(other) - } -} - -impl From for Error { - fn from(other: Utf8Error) -> Self { - Self::InvalidUtf8(other) - } -} - -impl fmt::Display for Error { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - Error::ParseFloat { error, line, value } => { - write!(f, "Error parsing number {:?} at line {}: {}", value, line, error) - } - Error::InvalidDocumentFormat => { - f.write_str("Invalid document addition format, missing the documents batch index.") - } - Error::InvalidEnrichedData => f.write_str("Invalid enriched data."), - Error::InvalidUtf8(e) => write!(f, "{}", e), - Error::Io(e) => write!(f, "{}", e), - Error::Serialize(e) => write!(f, "{}", e), - Error::Grenad(e) => write!(f, "{}", e), - Error::Csv(e) => write!(f, "{}", e), - Error::Json(e) => write!(f, "{}", e), - } - } -} - -impl std::error::Error for Error {} - #[cfg(test)] pub fn objects_from_json_value(json: serde_json::Value) -> Vec { let documents = match json { @@ -274,6 +233,19 @@ mod test { ]); } + #[test] + fn csv_types_dont_panic() { + let csv1_content = + "id:number,b:boolean,c,d:number\n1,,,\n2,true,doggo,2\n3,false,the best doggo,-2\n4,,\"Hello, World!\",2.5"; + let csv1 = csv::Reader::from_reader(Cursor::new(csv1_content)); + + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv1).unwrap(); + let vector = builder.into_inner().unwrap(); + + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap(); + } + #[test] fn out_of_order_csv_fields() { let csv1_content = "id:number,b\n1,0"; diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 4f4fa25d6..3e271924b 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -565,8 +565,12 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.index.put_primary_key(self.wtxn, primary_key)?; Ok(()) } else { - let primary_key = self.index.primary_key(self.wtxn)?.unwrap(); - Err(UserError::PrimaryKeyCannotBeChanged(primary_key.to_string()).into()) + let curr_primary_key = self.index.primary_key(self.wtxn)?.unwrap().to_string(); + if primary_key == &curr_primary_key { + Ok(()) + } else { + Err(UserError::PrimaryKeyCannotBeChanged(curr_primary_key).into()) + } } } Setting::Reset => { @@ -1332,6 +1336,17 @@ mod tests { .unwrap(); wtxn.commit().unwrap(); + // Updating settings with the same primary key should do nothing + let mut wtxn = index.write_txn().unwrap(); + index + .update_settings_using_wtxn(&mut wtxn, |settings| { + settings.set_primary_key(S("mykey")); + }) + .unwrap(); + assert_eq!(index.primary_key(&wtxn).unwrap(), Some("mykey")); + wtxn.commit().unwrap(); + + // Updating the settings with a different (or no) primary key causes an error let mut wtxn = index.write_txn().unwrap(); let error = index .update_settings_using_wtxn(&mut wtxn, |settings| {